mirror of https://github.com/ollama/ollama
update to c45f89d55
This commit is contained in:
parent
79c7f54881
commit
d207c02d11
|
|
@ -1,6 +1,6 @@
|
|||
UPSTREAM=https://github.com/ggml-org/llama.cpp.git
|
||||
WORKDIR=llama/vendor
|
||||
FETCH_HEAD=380b4c984e06f8d8381392d15814b3392e39560e
|
||||
FETCH_HEAD=c45f89d5516221e51eef93e467e3bec6eb811994
|
||||
|
||||
.PHONY: help
|
||||
help:
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
int LLAMA_BUILD_NUMBER = 0;
|
||||
char const *LLAMA_COMMIT = "380b4c984e06f8d8381392d15814b3392e39560e";
|
||||
char const *LLAMA_COMMIT = "c45f89d5516221e51eef93e467e3bec6eb811994";
|
||||
char const *LLAMA_COMPILER = "";
|
||||
char const *LLAMA_BUILD_TARGET = "";
|
||||
|
|
|
|||
|
|
@ -1013,31 +1013,40 @@ bool tty_can_use_colors() {
|
|||
// Model utils
|
||||
//
|
||||
|
||||
static inline void common_init_sampler_from_model(
|
||||
// TODO: move to common/sampling
|
||||
static void common_init_sampler_from_model(
|
||||
const llama_model * model,
|
||||
common_params_sampling & sparams) {
|
||||
|
||||
const uint64_t config = sparams.user_sampling_config;
|
||||
|
||||
auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
|
||||
if (config & user_config) return;
|
||||
if (config & user_config) {
|
||||
return;
|
||||
}
|
||||
|
||||
char buf[64] = {0};
|
||||
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
|
||||
char * end = nullptr;
|
||||
int32_t v = strtol(buf, &end, 10);
|
||||
if (end && end != buf) dst = v;
|
||||
if (end && end != buf) {
|
||||
dst = v;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
|
||||
if (config & user_config) return;
|
||||
if (config & user_config) {
|
||||
return;
|
||||
}
|
||||
|
||||
char buf[128] = {0};
|
||||
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
|
||||
char * end = nullptr;
|
||||
float v = strtof(buf, &end);
|
||||
if (end && end != buf) dst = v;
|
||||
if (end && end != buf) {
|
||||
dst = v;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -1065,31 +1074,125 @@ static inline void common_init_sampler_from_model(
|
|||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
|
||||
}
|
||||
|
||||
struct common_init_result common_init_from_params(common_params & params) {
|
||||
common_init_result iparams;
|
||||
struct common_init_result::impl {
|
||||
impl() = default;
|
||||
~impl() = default;
|
||||
|
||||
llama_model_ptr model;
|
||||
llama_context_ptr context;
|
||||
|
||||
std::vector<llama_adapter_lora_ptr> lora;
|
||||
|
||||
std::vector<common_sampler_ptr> samplers;
|
||||
};
|
||||
|
||||
common_init_result::common_init_result(common_params & params) :
|
||||
pimpl(new impl{}) {
|
||||
auto mparams = common_model_params_to_llama(params);
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
|
||||
if (params.fit_params) {
|
||||
LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
|
||||
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
||||
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
|
||||
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
|
||||
}
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
|
||||
__func__, params.model.path.c_str());
|
||||
return iparams;
|
||||
return;
|
||||
}
|
||||
|
||||
common_init_sampler_from_model(model, params.sampling);
|
||||
pimpl->model.reset(model);
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
// updates params.sampling
|
||||
// TODO: fix naming
|
||||
common_init_sampler_from_model(model, params.sampling);
|
||||
|
||||
if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
||||
params.sampling.ignore_eos = false;
|
||||
}
|
||||
|
||||
// initialize once
|
||||
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
|
||||
if (llama_vocab_is_eog(vocab, i)) {
|
||||
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
|
||||
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
|
||||
}
|
||||
}
|
||||
|
||||
if (params.sampling.ignore_eos) {
|
||||
// add EOG biases to the active set of logit biases
|
||||
params.sampling.logit_bias.insert(
|
||||
params.sampling.logit_bias.end(),
|
||||
params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
|
||||
}
|
||||
|
||||
//if (params.sampling.penalty_last_n == -1) {
|
||||
// LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
||||
// params.sampling.penalty_last_n = llama_n_ctx(lctx);
|
||||
//}
|
||||
|
||||
//if (params.sampling.dry_penalty_last_n == -1) {
|
||||
// LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
||||
// params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
|
||||
//}
|
||||
|
||||
pimpl->samplers.resize(cparams.n_seq_max);
|
||||
|
||||
for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
|
||||
pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
|
||||
}
|
||||
|
||||
llama_context * lctx = llama_init_from_model(model, cparams);
|
||||
if (lctx == NULL) {
|
||||
LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
|
||||
__func__, params.model.path.c_str());
|
||||
llama_model_free(model);
|
||||
return iparams;
|
||||
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
pimpl->context.reset(lctx);
|
||||
}
|
||||
|
||||
llama_model * common_init_result::model() {
|
||||
return pimpl->model.get();
|
||||
}
|
||||
|
||||
llama_context * common_init_result::context() {
|
||||
return pimpl->context.get();
|
||||
}
|
||||
|
||||
common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
|
||||
return pimpl->samplers[seq_id].get();
|
||||
}
|
||||
|
||||
std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
|
||||
return pimpl->lora;
|
||||
}
|
||||
|
||||
void common_init_result::free_context() {
|
||||
pimpl->context.reset();
|
||||
}
|
||||
|
||||
common_init_result_ptr common_init_from_params(common_params & params) {
|
||||
common_init_result_ptr res(new common_init_result(params));
|
||||
|
||||
llama_model * model = res->model();
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
|
||||
return res;
|
||||
}
|
||||
|
||||
llama_context * lctx = res->context();
|
||||
if (lctx == NULL) {
|
||||
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
|
||||
return res;
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
|
||||
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
|
||||
params.ctx_shift = false;
|
||||
|
|
@ -1101,10 +1204,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
|
||||
const auto cvec = common_control_vector_load(params.control_vectors);
|
||||
if (cvec.n_embd == -1) {
|
||||
llama_free(lctx);
|
||||
llama_model_free(model);
|
||||
|
||||
return iparams;
|
||||
return res;
|
||||
}
|
||||
|
||||
int err = llama_apply_adapter_cvec(
|
||||
|
|
@ -1115,10 +1215,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
params.control_vector_layer_start,
|
||||
params.control_vector_layer_end);
|
||||
if (err) {
|
||||
llama_free(lctx);
|
||||
llama_model_free(model);
|
||||
|
||||
return iparams;
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1142,10 +1239,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
}
|
||||
|
||||
if (!ok) {
|
||||
llama_free(lctx);
|
||||
llama_model_free(model);
|
||||
|
||||
return iparams;
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1155,9 +1249,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
|
||||
if (lora == nullptr) {
|
||||
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
||||
llama_free(lctx);
|
||||
llama_model_free(model);
|
||||
return iparams;
|
||||
return res;
|
||||
}
|
||||
|
||||
char buf[1024];
|
||||
|
|
@ -1166,43 +1258,13 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
la.task_name = buf;
|
||||
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
|
||||
la.prompt_prefix = buf;
|
||||
iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
|
||||
res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
|
||||
}
|
||||
|
||||
if (!params.lora_init_without_apply) {
|
||||
common_set_adapter_lora(lctx, params.lora_adapters);
|
||||
}
|
||||
|
||||
if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
||||
params.sampling.ignore_eos = false;
|
||||
}
|
||||
|
||||
// initialize once
|
||||
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
|
||||
if (llama_vocab_is_eog(vocab, i)) {
|
||||
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
|
||||
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
|
||||
}
|
||||
}
|
||||
|
||||
if (params.sampling.ignore_eos) {
|
||||
// add EOG biases to the active set of logit biases
|
||||
params.sampling.logit_bias.insert(
|
||||
params.sampling.logit_bias.end(),
|
||||
params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
|
||||
}
|
||||
|
||||
if (params.sampling.penalty_last_n == -1) {
|
||||
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
||||
params.sampling.penalty_last_n = llama_n_ctx(lctx);
|
||||
}
|
||||
|
||||
if (params.sampling.dry_penalty_last_n == -1) {
|
||||
LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
||||
params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
|
||||
}
|
||||
|
||||
if (params.warmup) {
|
||||
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
||||
|
||||
|
|
@ -1241,12 +1303,11 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|||
llama_set_warmup(lctx, false);
|
||||
}
|
||||
|
||||
iparams.model.reset(model);
|
||||
iparams.context.reset(lctx);
|
||||
|
||||
return iparams;
|
||||
return res;
|
||||
}
|
||||
|
||||
common_init_result::~common_init_result() = default;
|
||||
|
||||
std::string get_model_endpoint() {
|
||||
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
|
||||
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
|
||||
|
|
@ -1255,7 +1316,9 @@ std::string get_model_endpoint() {
|
|||
std::string model_endpoint = "https://huggingface.co/";
|
||||
if (endpoint_env) {
|
||||
model_endpoint = endpoint_env;
|
||||
if (model_endpoint.back() != '/') model_endpoint += '/';
|
||||
if (model_endpoint.back() != '/') {
|
||||
model_endpoint += '/';
|
||||
}
|
||||
}
|
||||
return model_endpoint;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -99,6 +99,7 @@ enum llama_example {
|
|||
LLAMA_EXAMPLE_TTS,
|
||||
LLAMA_EXAMPLE_DIFFUSION,
|
||||
LLAMA_EXAMPLE_FINETUNE,
|
||||
LLAMA_EXAMPLE_FIT_PARAMS,
|
||||
|
||||
LLAMA_EXAMPLE_COUNT,
|
||||
};
|
||||
|
|
@ -195,7 +196,6 @@ struct common_params_sampling {
|
|||
|
||||
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
|
||||
|
||||
|
||||
std::vector<enum common_sampler_type> samplers = {
|
||||
COMMON_SAMPLER_TYPE_PENALTIES,
|
||||
COMMON_SAMPLER_TYPE_DRY,
|
||||
|
|
@ -216,6 +216,10 @@ struct common_params_sampling {
|
|||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
|
||||
|
||||
bool has_logit_bias() const {
|
||||
return !logit_bias.empty();
|
||||
}
|
||||
|
||||
// print the parameters into a string
|
||||
std::string print() const;
|
||||
};
|
||||
|
|
@ -303,8 +307,8 @@ struct lr_opt {
|
|||
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
|
||||
|
||||
struct common_params {
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 4096; // context size
|
||||
int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit
|
||||
int32_t n_ctx = 0; // context size, 0 == context the model was trained with
|
||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
|
|
@ -325,9 +329,12 @@ struct common_params {
|
|||
// offload params
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
|
||||
size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
|
||||
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
|
||||
|
||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
|
||||
|
|
@ -669,15 +676,29 @@ bool tty_can_use_colors();
|
|||
// Model utils
|
||||
//
|
||||
|
||||
// note: defines object's lifetime
|
||||
struct common_init_result {
|
||||
llama_model_ptr model;
|
||||
llama_context_ptr context;
|
||||
struct common_sampler;
|
||||
|
||||
std::vector<llama_adapter_lora_ptr> lora;
|
||||
// note: defines the model, context, samplers, ets. lifetimes
|
||||
struct common_init_result {
|
||||
common_init_result(common_params & params);
|
||||
~common_init_result();
|
||||
|
||||
llama_model * model();
|
||||
llama_context * context();
|
||||
common_sampler * sampler(llama_seq_id seq_id);
|
||||
|
||||
std::vector<llama_adapter_lora_ptr> & lora();
|
||||
|
||||
void free_context();
|
||||
|
||||
private:
|
||||
struct impl;
|
||||
std::unique_ptr<impl> pimpl;
|
||||
};
|
||||
|
||||
struct common_init_result common_init_from_params(common_params & params);
|
||||
using common_init_result_ptr = std::unique_ptr<common_init_result>;
|
||||
|
||||
common_init_result_ptr common_init_from_params(common_params & params);
|
||||
|
||||
struct llama_model_params common_model_params_to_llama ( common_params & params);
|
||||
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
||||
|
|
|
|||
|
|
@ -104,9 +104,10 @@ struct ring_buffer {
|
|||
struct common_sampler {
|
||||
common_params_sampling params;
|
||||
|
||||
struct llama_sampler * grmr;
|
||||
struct llama_sampler * chain;
|
||||
|
||||
bool grammar;
|
||||
|
||||
ring_buffer<llama_token> prev;
|
||||
|
||||
std::vector<llama_token_data> cur;
|
||||
|
|
@ -116,7 +117,6 @@ struct common_sampler {
|
|||
void reset() {
|
||||
prev.clear();
|
||||
|
||||
llama_sampler_reset(grmr);
|
||||
llama_sampler_reset(chain);
|
||||
}
|
||||
|
||||
|
|
@ -167,10 +167,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
|||
|
||||
lparams.no_perf = params.no_perf;
|
||||
|
||||
struct llama_sampler * grmr;
|
||||
llama_sampler * chain = llama_sampler_chain_init(lparams);
|
||||
|
||||
bool grammar = false;
|
||||
std::vector<llama_sampler *> samplers;
|
||||
|
||||
if (params.grammar.compare(0, 11, "%llguidance") == 0) {
|
||||
#ifdef LLAMA_USE_LLGUIDANCE
|
||||
grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
|
||||
samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
|
||||
grammar = true;
|
||||
#else
|
||||
GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
|
||||
#endif // LLAMA_USE_LLGUIDANCE
|
||||
|
|
@ -217,30 +222,23 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
|||
trigger_patterns_c.push_back(regex.c_str());
|
||||
}
|
||||
|
||||
grmr = params.grammar_lazy
|
||||
? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
|
||||
trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
trigger_tokens.data(), trigger_tokens.size())
|
||||
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
|
||||
if (!grmr) {
|
||||
return nullptr;
|
||||
if (!params.grammar.empty()) {
|
||||
if (params.grammar_lazy) {
|
||||
samplers.push_back(
|
||||
llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
|
||||
trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
trigger_tokens.data(), trigger_tokens.size()));
|
||||
} else {
|
||||
samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
|
||||
}
|
||||
|
||||
grammar = true;
|
||||
}
|
||||
}
|
||||
|
||||
auto * result = new common_sampler {
|
||||
/* .params = */ params,
|
||||
/* .grmr = */ grmr,
|
||||
/* .chain = */ llama_sampler_chain_init(lparams),
|
||||
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
|
||||
/* .cur = */ {},
|
||||
/* .cur_p = */ {},
|
||||
};
|
||||
|
||||
llama_sampler_chain_add(result->chain,
|
||||
llama_sampler_init_logit_bias(
|
||||
llama_vocab_n_tokens(vocab),
|
||||
params.logit_bias.size(),
|
||||
params.logit_bias.data()));
|
||||
if (params.has_logit_bias()) {
|
||||
samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
|
||||
}
|
||||
|
||||
if (params.mirostat == 0) {
|
||||
for (const auto & cnstr : params.samplers) {
|
||||
|
|
@ -253,58 +251,70 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
|||
c_breakers.push_back(str.c_str());
|
||||
}
|
||||
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
||||
}
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||
samplers.push_back(llama_sampler_init_top_k (params.top_k));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
samplers.push_back(llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
|
||||
samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
samplers.push_back(llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_XTC:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
samplers.push_back(llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
samplers.push_back(llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
samplers.push_back(llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_INFILL:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
|
||||
samplers.push_back(llama_sampler_init_infill (vocab));
|
||||
break;
|
||||
case COMMON_SAMPLER_TYPE_PENALTIES:
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||
samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "unknown sampler type");
|
||||
}
|
||||
}
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
||||
|
||||
samplers.push_back(llama_sampler_init_dist(params.seed));
|
||||
} else if (params.mirostat == 1) {
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
|
||||
samplers.push_back(llama_sampler_init_temp(params.temp));
|
||||
samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
|
||||
} else if (params.mirostat == 2) {
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
||||
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
|
||||
samplers.push_back(llama_sampler_init_temp(params.temp));
|
||||
samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
|
||||
} else {
|
||||
GGML_ASSERT(false && "unknown mirostat version");
|
||||
}
|
||||
|
||||
for (auto * smpl : samplers) {
|
||||
llama_sampler_chain_add(chain, smpl);
|
||||
}
|
||||
|
||||
auto * result = new common_sampler {
|
||||
/* .params = */ params,
|
||||
/* .chain = */ chain,
|
||||
/* .grammar = */ grammar,
|
||||
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
|
||||
/* .cur = */ {},
|
||||
/* .cur_p = */ {},
|
||||
};
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void common_sampler_free(struct common_sampler * gsmpl) {
|
||||
if (gsmpl) {
|
||||
llama_sampler_free(gsmpl->grmr);
|
||||
|
||||
llama_sampler_free(gsmpl->chain);
|
||||
|
||||
delete gsmpl;
|
||||
|
|
@ -314,11 +324,24 @@ void common_sampler_free(struct common_sampler * gsmpl) {
|
|||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
if (accept_grammar) {
|
||||
llama_sampler_accept(gsmpl->grmr, token);
|
||||
}
|
||||
if (gsmpl->grammar) {
|
||||
const int n_smpl = llama_sampler_chain_n(gsmpl->chain);
|
||||
|
||||
llama_sampler_accept(gsmpl->chain, token);
|
||||
for (int i = 0; i < n_smpl; i++) {
|
||||
auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
|
||||
|
||||
// the grammar sampler is always the first one
|
||||
if (i == 0) {
|
||||
if (accept_grammar) {
|
||||
llama_sampler_accept(smpl, token);
|
||||
}
|
||||
} else {
|
||||
llama_sampler_accept(smpl, token);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
llama_sampler_accept(gsmpl->chain, token);
|
||||
}
|
||||
|
||||
gsmpl->prev.push_back(token);
|
||||
}
|
||||
|
|
@ -329,12 +352,12 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
|
|||
|
||||
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
||||
return new common_sampler {
|
||||
/* .params = */ gsmpl->params,
|
||||
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
|
||||
/* .chain = */ llama_sampler_clone(gsmpl->chain),
|
||||
/* .prev = */ gsmpl->prev,
|
||||
/* .cur = */ gsmpl->cur,
|
||||
/* .cur_p = */ gsmpl->cur_p,
|
||||
/* .params = */ gsmpl->params,
|
||||
/* .chain = */ llama_sampler_clone(gsmpl->chain),
|
||||
/* .grammar = */ gsmpl->grammar,
|
||||
/* .prev = */ gsmpl->prev,
|
||||
/* .cur = */ gsmpl->cur,
|
||||
/* .cur_p = */ gsmpl->cur_p,
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -383,58 +406,33 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
|
|||
}
|
||||
}
|
||||
|
||||
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
|
||||
struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
|
||||
return gsmpl->chain;
|
||||
}
|
||||
|
||||
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
|
||||
llama_synchronize(ctx);
|
||||
|
||||
// start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
llama_token id = LLAMA_TOKEN_NULL;
|
||||
|
||||
auto & grmr = gsmpl->grmr;
|
||||
auto & chain = gsmpl->chain;
|
||||
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
|
||||
|
||||
if (grammar_first) {
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
}
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
llama_sampler_apply(chain, &cur_p);
|
||||
|
||||
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
|
||||
|
||||
const llama_token id = cur_p.data[cur_p.selected].id;
|
||||
id = cur_p.data[cur_p.selected].id;
|
||||
|
||||
if (grammar_first) {
|
||||
return id;
|
||||
}
|
||||
|
||||
// check if it the sampled token fits the grammar
|
||||
{
|
||||
llama_token_data single_token_data = { id, 1.0f, 0.0f };
|
||||
llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
|
||||
|
||||
llama_sampler_apply(grmr, &single_token_data_array);
|
||||
|
||||
const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
|
||||
if (is_valid) {
|
||||
return id;
|
||||
}
|
||||
}
|
||||
|
||||
// resampling:
|
||||
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
llama_sampler_apply(chain, &cur_p);
|
||||
|
||||
GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
|
||||
|
||||
return cur_p.data[cur_p.selected].id;
|
||||
return id;
|
||||
}
|
||||
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
|
||||
GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
|
||||
|
||||
std::vector<llama_token> result;
|
||||
|
|
@ -442,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
|
|||
|
||||
size_t i = 0;
|
||||
for (; i < draft.size(); i++) {
|
||||
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
|
||||
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
|
||||
|
||||
common_sampler_accept(gsmpl, id, true);
|
||||
|
||||
|
|
@ -454,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
|
|||
}
|
||||
|
||||
if (i == draft.size()) {
|
||||
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
|
||||
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);
|
||||
|
||||
common_sampler_accept(gsmpl, id, true);
|
||||
|
||||
|
|
@ -464,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
|
|||
return result;
|
||||
}
|
||||
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
|
||||
std::vector<int> idxs(draft.size() + 1);
|
||||
for (size_t i = 0; i < idxs.size(); ++i) {
|
||||
idxs[i] = i;
|
||||
}
|
||||
|
||||
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
|
||||
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
|
||||
}
|
||||
|
||||
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
||||
|
|
@ -515,7 +513,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) {
|
|||
|
||||
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
|
||||
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
|
||||
result += std::string("-> ") + llama_sampler_name(smpl) + " ";
|
||||
result += std::string("-> ");
|
||||
result += std::string(llama_sampler_name(smpl)) + " ";
|
||||
}
|
||||
|
||||
return result;
|
||||
|
|
|
|||
|
|
@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
|
|||
// arguments can be nullptr to skip printing
|
||||
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
|
||||
|
||||
struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
|
||||
|
||||
// extended sampling implementation:
|
||||
//
|
||||
// - set logits
|
||||
|
|
@ -55,10 +57,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
|
|||
// - check if the token fits the grammar (if any)
|
||||
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
|
||||
//
|
||||
// if grammar_first is true, the grammar is applied before the samplers (slower)
|
||||
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
|
||||
//
|
||||
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
|
||||
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);
|
||||
|
||||
// generalized version of common_sampler_sample
|
||||
//
|
||||
|
|
@ -76,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
|||
//
|
||||
// returns at least 1 token, up to idxs.size()
|
||||
//
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);
|
||||
|
||||
// assume idxs == [ 0, 1, 2, ..., draft.size() ]
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);
|
||||
|
||||
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
|
||||
|
||||
|
|
@ -107,3 +106,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:
|
|||
|
||||
llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
|
||||
const char * grammar_kind, const char * grammar_data);
|
||||
|
||||
struct common_sampler_deleter {
|
||||
void operator()(common_sampler * s) { common_sampler_free(s); }
|
||||
};
|
||||
|
||||
typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
|
||||
|
|
|
|||
|
|
@ -313,6 +313,7 @@ extern "C" {
|
|||
bool check_tensors; // validate model tensor data
|
||||
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
|
||||
bool no_host; // bypass host buffer allowing extra buffers to be used
|
||||
bool no_alloc; // only load metadata and simulate memory allocations
|
||||
};
|
||||
|
||||
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
|
||||
|
|
@ -466,10 +467,24 @@ extern "C" {
|
|||
// Frees all allocated memory
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
|
||||
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
|
||||
// returns true if the parameters could be successfully modified to fit device memory
|
||||
// this function is NOT thread safe because it modifies the global llama logger state
|
||||
LLAMA_API bool llama_params_fit(
|
||||
const char * path_model,
|
||||
struct llama_model_params * mparams,
|
||||
struct llama_context_params * cparams,
|
||||
float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
|
||||
struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
|
||||
size_t margin, // margin of memory to leave per device in bytes
|
||||
uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
|
||||
enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
|
||||
|
||||
LLAMA_API int64_t llama_time_us(void);
|
||||
|
||||
LLAMA_API size_t llama_max_devices(void);
|
||||
LLAMA_API size_t llama_max_parallel_sequences(void);
|
||||
LLAMA_API size_t llama_max_tensor_buft_overrides(void);
|
||||
|
||||
LLAMA_API bool llama_supports_mmap (void);
|
||||
LLAMA_API bool llama_supports_mlock (void);
|
||||
|
|
@ -1354,7 +1369,9 @@ extern "C" {
|
|||
|
||||
// Set callback for all future logging events.
|
||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||
LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
|
||||
// The logger state is global so these functions are NOT thread safe.
|
||||
LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
|
||||
LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
//
|
||||
// Performance utils
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@
|
|||
#include "llama-model.h"
|
||||
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <stdexcept>
|
||||
|
|
@ -72,6 +73,43 @@ llama_context::llama_context(
|
|||
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
|
||||
}
|
||||
|
||||
if (cparams.yarn_ext_factor != 0) {
|
||||
static auto get_mscale = [](float scale, float mscale) {
|
||||
return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
|
||||
};
|
||||
|
||||
const float factor = 1.0f / cparams.rope_freq_scale;
|
||||
|
||||
// ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
|
||||
if (hparams.rope_yarn_log_mul != 0.0f) {
|
||||
// note: here we assume `mscale == 1.0f`
|
||||
// TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
|
||||
float mscale = 1.0f;
|
||||
const float mscale_all_dims = hparams.rope_yarn_log_mul;
|
||||
|
||||
// [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
|
||||
// special-case DEEPSEEK v2:
|
||||
// https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
|
||||
if (model.arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
|
||||
mscale = mscale_all_dims;
|
||||
}
|
||||
|
||||
cparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
|
||||
|
||||
LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
|
||||
__func__, cparams.yarn_attn_factor, mscale, mscale_all_dims);
|
||||
} else {
|
||||
cparams.yarn_attn_factor = get_mscale(factor, 1.0f);
|
||||
}
|
||||
|
||||
// when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
|
||||
// https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
|
||||
//
|
||||
// ref: https://github.com/ggml-org/llama.cpp/discussions/7416
|
||||
// https://github.com/ggml-org/llama.cpp/pull/17945
|
||||
cparams.yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor));
|
||||
}
|
||||
|
||||
cparams.yarn_attn_factor *= hparams.rope_attn_factor;
|
||||
|
||||
if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
|
||||
|
|
@ -220,6 +258,7 @@ llama_context::llama_context(
|
|||
|
||||
backend_buft.clear();
|
||||
backend_ptrs.clear();
|
||||
backend_buf_exp_size.clear();
|
||||
|
||||
for (auto & backend : backends) {
|
||||
auto * buft = ggml_backend_get_default_buffer_type(backend.get());
|
||||
|
|
@ -236,6 +275,7 @@ llama_context::llama_context(
|
|||
|
||||
backend_buft.push_back(buft);
|
||||
backend_ptrs.push_back(backend.get());
|
||||
backend_buf_exp_size.push_back(0);
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
|
||||
|
|
@ -351,7 +391,8 @@ llama_context::llama_context(
|
|||
|
||||
// reserve pp (prompt processing) graph first so that buffers are only allocated once
|
||||
{
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
|
||||
model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
|
||||
if (!gf) {
|
||||
if (pipeline_parallel) {
|
||||
LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
|
||||
|
|
@ -369,7 +410,7 @@ llama_context::llama_context(
|
|||
|
||||
// reserve with tg (token generation) graph to get the number of splits and nodes
|
||||
{
|
||||
auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
|
||||
auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute tg buffers");
|
||||
}
|
||||
|
|
@ -384,7 +425,7 @@ llama_context::llama_context(
|
|||
//
|
||||
// auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
|
||||
//
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
|
||||
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to allocate compute pp buffers");
|
||||
}
|
||||
|
|
@ -393,11 +434,13 @@ llama_context::llama_context(
|
|||
for (size_t i = 0; i < backend_ptrs.size(); ++i) {
|
||||
ggml_backend_t backend = backend_ptrs[i];
|
||||
ggml_backend_buffer_type_t buft = backend_buft[i];
|
||||
size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
||||
if (size > 1) {
|
||||
if (!model.hparams.no_alloc) {
|
||||
backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
||||
}
|
||||
if (backend_buf_exp_size[i] > 1) {
|
||||
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
||||
ggml_backend_buft_name(buft),
|
||||
size / 1024.0 / 1024.0);
|
||||
backend_buf_exp_size[i] / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -416,6 +459,23 @@ llama_context::llama_context(
|
|||
}
|
||||
|
||||
llama_context::~llama_context() {
|
||||
// FIXME this currently results in a use-after-free bug if the model is freed before the context
|
||||
// if (!model.hparams.no_alloc) {
|
||||
// for (size_t i = 0; i < backend_ptrs.size(); ++i) {
|
||||
// ggml_backend_t backend = backend_ptrs[i];
|
||||
// ggml_backend_buffer_type_t buft = backend_buft[i];
|
||||
|
||||
// const size_t size_exp = backend_buf_exp_size[i];
|
||||
// const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
||||
// if (size_exp == size_act) {
|
||||
// LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
|
||||
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
|
||||
// } else {
|
||||
// LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
|
||||
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
ggml_opt_free(opt_ctx);
|
||||
}
|
||||
|
||||
|
|
@ -1317,6 +1377,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
|||
// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
|
||||
LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||
#endif
|
||||
synchronize();
|
||||
buf_output = nullptr;
|
||||
logits = nullptr;
|
||||
embd = nullptr;
|
||||
|
|
@ -1388,7 +1449,8 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
|
|||
return static_cast<llm_graph_result *>(gf_res_reserve.get());
|
||||
}
|
||||
|
||||
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
|
||||
ggml_cgraph * llama_context::graph_reserve(
|
||||
uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
|
||||
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
GGML_ASSERT(n_outputs >= 1);
|
||||
|
||||
|
|
@ -1425,8 +1487,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
|
|||
|
||||
// initialize scheduler with the specified graph
|
||||
if (split_only) {
|
||||
ggml_backend_sched_split_graph(sched.get(), gf);
|
||||
if (sizes) {
|
||||
ggml_backend_sched_reserve_size(sched.get(), gf, sizes);
|
||||
} else {
|
||||
ggml_backend_sched_split_graph(sched.get(), gf);
|
||||
}
|
||||
} else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
|
||||
GGML_ASSERT(!sizes);
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
|
@ -2048,15 +2115,26 @@ void llama_context::perf_reset() {
|
|||
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
|
||||
for (const auto & buft_size : model.memory_breakdown()) {
|
||||
ret[buft_size.first].model += buft_size.second;
|
||||
for (const auto & [buft, size] : model.memory_breakdown()) {
|
||||
ret[buft].model += size;
|
||||
}
|
||||
for (const auto & buft_size : memory->memory_breakdown()) {
|
||||
ret[buft_size.first].context += buft_size.second;
|
||||
if (memory) {
|
||||
for (const auto & [buft, size] : memory->memory_breakdown()) {
|
||||
ret[buft].context += size;
|
||||
}
|
||||
}
|
||||
for (const auto & backend_ptr : backends) {
|
||||
ggml_backend_t backend = backend_ptr.get();
|
||||
ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
||||
if (model.hparams.no_alloc) {
|
||||
for (size_t i = 0; i < backends.size(); ++i) {
|
||||
ggml_backend_t backend = backends[i].get();
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
|
||||
ret[buft].compute += backend_buf_exp_size[i];
|
||||
}
|
||||
} else {
|
||||
for (const auto & backend_ptr : backends) {
|
||||
ggml_backend_t backend = backend_ptr.get();
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
|
||||
ret[buft].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,6 +26,10 @@ struct llama_memory_breakdown_data {
|
|||
size_t model = 0; // memory allocated for the model
|
||||
size_t context = 0; // memory allocated for the context
|
||||
size_t compute = 0; // memory allocated for temporary compute buffers
|
||||
|
||||
size_t total() const {
|
||||
return model + context + compute;
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_context {
|
||||
|
|
@ -206,7 +210,8 @@ public:
|
|||
ggml_status graph_compute(ggml_cgraph * gf, bool batched);
|
||||
|
||||
// reserve a graph with a dummy ubatch of the specified size
|
||||
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
|
||||
ggml_cgraph * graph_reserve(
|
||||
uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
|
||||
|
||||
private:
|
||||
llm_graph_params graph_params(
|
||||
|
|
@ -281,9 +286,10 @@ private:
|
|||
|
||||
std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
|
||||
|
||||
// buffer types used for the compute buffer of each backend
|
||||
// pointers and buffer types used for the compute buffer of each backend
|
||||
std::vector<ggml_backend_t> backend_ptrs;
|
||||
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
||||
std::vector<size_t> backend_buf_exp_size; // expected buffer sizes
|
||||
|
||||
llm_graph_result_ptr gf_res_prev;
|
||||
llm_graph_result_ptr gf_res_reserve;
|
||||
|
|
|
|||
|
|
@ -78,7 +78,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
|
|||
for (int i = 0; i < n_tokens; ++i) {
|
||||
const float pos = ubatch->pos[i];
|
||||
attn_scale_data[i] = std::log(
|
||||
std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0
|
||||
std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
|
||||
) * f_attn_temp_scale + 1.0;
|
||||
}
|
||||
|
||||
|
|
@ -574,7 +574,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
|
|||
freq_base (cparams.rope_freq_base),
|
||||
freq_scale (cparams.rope_freq_scale),
|
||||
ext_factor (cparams.yarn_ext_factor),
|
||||
attn_factor (llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor)),
|
||||
attn_factor (cparams.yarn_attn_factor),
|
||||
beta_fast (cparams.yarn_beta_fast),
|
||||
beta_slow (cparams.yarn_beta_slow),
|
||||
norm_eps (hparams.f_norm_eps),
|
||||
|
|
@ -1203,7 +1203,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
|
|||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
|
||||
auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
|
||||
auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset);
|
||||
|
||||
auto & cur = inp->attn_scale;
|
||||
|
||||
|
|
|
|||
|
|
@ -132,8 +132,8 @@ public:
|
|||
// temperature tuning, used by llama4
|
||||
class llm_graph_input_attn_temp : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
|
||||
: n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
|
||||
llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
|
||||
: n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
|
||||
virtual ~llm_graph_input_attn_temp() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
|
@ -142,6 +142,7 @@ public:
|
|||
|
||||
const uint32_t n_attn_temp_floor_scale;
|
||||
const float f_attn_temp_scale;
|
||||
const float f_attn_temp_offset;
|
||||
};
|
||||
|
||||
class llm_graph_input_pos_bucket : public llm_graph_input_i {
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@
|
|||
#include "ggml.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
|
||||
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
|
||||
if (dense_first) {
|
||||
|
|
@ -239,13 +238,3 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama
|
|||
|
||||
return false;
|
||||
}
|
||||
|
||||
float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor) {
|
||||
GGML_ASSERT(ext_factor >= 0.0f);
|
||||
|
||||
if (ext_factor != 0.0f) {
|
||||
attn_factor *= 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
|
||||
}
|
||||
|
||||
return attn_factor;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ struct llama_hparams_convnext {
|
|||
|
||||
struct llama_hparams {
|
||||
bool vocab_only;
|
||||
bool no_alloc;
|
||||
bool rope_finetuned;
|
||||
bool use_par_res;
|
||||
bool swin_norm;
|
||||
|
|
@ -167,6 +168,7 @@ struct llama_hparams {
|
|||
uint32_t n_no_rope_layer_step = 4;
|
||||
uint32_t n_attn_temp_floor_scale = 0;
|
||||
float f_attn_temp_scale = 0.0f;
|
||||
float f_attn_temp_offset = 0.0f; // offset position index
|
||||
|
||||
// gemma3n altup
|
||||
uint32_t n_altup = 4; // altup_num_inputs
|
||||
|
|
@ -273,13 +275,6 @@ struct llama_hparams {
|
|||
// TODO: think of a better place for this function
|
||||
// TODO: pack the SWA params in a struct?
|
||||
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
|
||||
|
||||
// when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
|
||||
// https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
|
||||
//
|
||||
// ref: https://github.com/ggml-org/llama.cpp/discussions/7416
|
||||
// https://github.com/ggml-org/llama.cpp/pull/17945
|
||||
static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor);
|
||||
};
|
||||
|
||||
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
||||
|
|
|
|||
|
|
@ -25,6 +25,10 @@ time_meas::~time_meas() {
|
|||
}
|
||||
}
|
||||
|
||||
void llama_log_get(ggml_log_callback * log_callback, void ** user_data) {
|
||||
ggml_log_get(log_callback, user_data);
|
||||
}
|
||||
|
||||
void llama_log_set(ggml_log_callback log_callback, void * user_data) {
|
||||
ggml_log_set(log_callback, user_data);
|
||||
g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
|
||||
|
|
|
|||
|
|
@ -175,7 +175,15 @@ llama_kv_cache::llama_kv_cache(
|
|||
|
||||
// allocate tensors and initialize the buffers to avoid NaNs in the padding
|
||||
for (auto & [buft, ctx] : ctx_map) {
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
|
||||
ggml_backend_buffer_t buf;
|
||||
if (model.hparams.no_alloc) {
|
||||
buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
|
||||
t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
|
||||
}
|
||||
} else {
|
||||
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
|
||||
}
|
||||
if (!buf) {
|
||||
throw std::runtime_error("failed to allocate buffer for kv cache");
|
||||
}
|
||||
|
|
@ -482,9 +490,18 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
|
|||
|
||||
std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
|
||||
std::map<ggml_backend_buffer_type_t, size_t> ret;
|
||||
for (const auto & [_, buf] : ctxs_bufs) {
|
||||
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
|
||||
for (const auto & [ctx, buf] : ctxs_bufs) {
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get());
|
||||
|
||||
if (hparams.no_alloc) {
|
||||
GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr);
|
||||
ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
|
||||
} else {
|
||||
// GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
|
||||
ret[buft] += ggml_backend_buffer_get_size(buf.get());
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
@ -1372,7 +1389,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
|
|||
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
|
||||
const auto & yarn_beta_fast = cparams.yarn_beta_fast;
|
||||
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
|
||||
const auto & yarn_attn_factor = llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor);
|
||||
const auto & yarn_attn_factor = cparams.yarn_attn_factor;
|
||||
|
||||
const auto & n_rot = hparams.n_rot;
|
||||
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
|
||||
|
|
@ -1544,9 +1561,11 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama
|
|||
|
||||
const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id];
|
||||
|
||||
slot_info sinfo;
|
||||
|
||||
bool res = true;
|
||||
res = res && state_read_meta(io, strm, cell_count, seq_id);
|
||||
res = res && state_read_data(io, strm, cell_count);
|
||||
res = res && state_read_meta(io, strm, cell_count, sinfo, seq_id);
|
||||
res = res && state_read_data(io, strm, cell_count, sinfo);
|
||||
|
||||
if (!res) {
|
||||
if (seq_id == -1) {
|
||||
|
|
@ -1685,7 +1704,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
|
|||
}
|
||||
}
|
||||
|
||||
bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
|
||||
bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id) {
|
||||
auto & cells = v_cells[strm];
|
||||
auto & head = v_heads[strm];
|
||||
|
||||
|
|
@ -1722,7 +1741,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
|
|||
ubatch.seq_id[i] = &dest_seq_id;
|
||||
}
|
||||
|
||||
const auto sinfo = find_slot(ubatch, true);
|
||||
sinfo = find_slot(ubatch, false);
|
||||
if (sinfo.empty()) {
|
||||
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
|
||||
return false;
|
||||
|
|
@ -1732,20 +1751,16 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
|
|||
// see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
|
||||
apply_ubatch(sinfo, ubatch);
|
||||
|
||||
const auto head_cur = sinfo.head();
|
||||
LLAMA_LOG_DEBUG("%s: cell_count = %d, dest_seq_id = %d\n", __func__, cell_count, dest_seq_id);
|
||||
|
||||
// keep the head at the old position because we will read the KV data into it in state_read_data()
|
||||
head = head_cur;
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: head_cur = %d, head = %d, cell_count = %d, dest_seq_id = %d\n", __func__, head_cur, head, cell_count, dest_seq_id);
|
||||
|
||||
// DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values)
|
||||
// Assume that this is one contiguous block of cells
|
||||
GGML_ASSERT(head_cur + cell_count <= cells.size());
|
||||
GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]);
|
||||
GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]);
|
||||
GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id));
|
||||
GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id));
|
||||
// DEBUG CHECK: verify that all cells were allocated and have correct seq_id and pos values
|
||||
GGML_ASSERT(sinfo.n_stream() == 1);
|
||||
GGML_ASSERT(sinfo.idxs[0].size() == cell_count);
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
const uint32_t idx = sinfo.idxs[0][i];
|
||||
GGML_ASSERT(cells.pos_get(idx) == ubatch.pos[i]);
|
||||
GGML_ASSERT(cells.seq_has(idx, dest_seq_id));
|
||||
}
|
||||
} else {
|
||||
// whole KV cache restore
|
||||
|
||||
|
|
@ -1778,15 +1793,24 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
|
|||
}
|
||||
}
|
||||
|
||||
// Create contiguous slot_info for whole cache restore
|
||||
sinfo.s0 = strm;
|
||||
sinfo.s1 = strm;
|
||||
sinfo.resize(1);
|
||||
sinfo.strm[0] = strm;
|
||||
sinfo.idxs[0].resize(cell_count);
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
sinfo.idxs[0][i] = i;
|
||||
}
|
||||
|
||||
head = 0;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
|
||||
bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo) {
|
||||
auto & cells = v_cells[strm];
|
||||
auto & head = v_heads[strm];
|
||||
|
||||
uint32_t v_trans;
|
||||
uint32_t n_layer;
|
||||
|
|
@ -1836,8 +1860,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
|
|||
}
|
||||
|
||||
if (cell_count) {
|
||||
// Read and set the keys for the whole cell range
|
||||
ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
|
||||
if (sinfo.is_contiguous()) {
|
||||
// Fast path: contiguous cells, single memcpy
|
||||
ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), sinfo.head() * k_size_row, cell_count * k_size_row);
|
||||
} else {
|
||||
// Slow path: scatter to non-contiguous positions
|
||||
const void * src = io.read(cell_count * k_size_row);
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
const size_t dst_offset = sinfo.idxs[0][i] * k_size_row;
|
||||
ggml_backend_tensor_set(k, (const char*)src + i * k_size_row, dst_offset, k_size_row);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1868,8 +1901,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
|
|||
}
|
||||
|
||||
if (cell_count) {
|
||||
// Read and set the values for the whole cell range
|
||||
ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
|
||||
if (sinfo.is_contiguous()) {
|
||||
// Fast path: contiguous cells, single memcpy
|
||||
ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), sinfo.head() * v_size_row, cell_count * v_size_row);
|
||||
} else {
|
||||
// Slow path: scatter to non-contiguous positions
|
||||
const void * src = io.read(cell_count * v_size_row);
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
const size_t dst_offset = sinfo.idxs[0][i] * v_size_row;
|
||||
ggml_backend_tensor_set(v, (const char*)src + i * v_size_row, dst_offset, v_size_row);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
|
@ -1908,10 +1950,22 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
|
|||
}
|
||||
|
||||
if (cell_count) {
|
||||
// For each row in the transposed matrix, read the values for the whole cell range
|
||||
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||
const size_t dst_offset = (head + j * cells.size()) * v_size_el;
|
||||
ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
|
||||
if (sinfo.is_contiguous()) {
|
||||
// Fast path: contiguous cells
|
||||
const uint32_t h = sinfo.head();
|
||||
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||
const size_t dst_offset = (h + j * cells.size()) * v_size_el;
|
||||
ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
|
||||
}
|
||||
} else {
|
||||
// Slow path: scatter to non-contiguous positions
|
||||
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||
const void * src = io.read(cell_count * v_size_el);
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
const size_t dst_offset = (sinfo.idxs[0][i] + j * cells.size()) * v_size_el;
|
||||
ggml_backend_tensor_set(v, (const char*)src + i * v_size_el, dst_offset, v_size_el);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -72,6 +72,23 @@ public:
|
|||
void clear() {
|
||||
idxs.clear();
|
||||
}
|
||||
|
||||
// check if indices are contiguous starting from head()
|
||||
bool is_contiguous() const {
|
||||
if (idxs.empty() || idxs[0].empty()) {
|
||||
return true;
|
||||
}
|
||||
if (idxs.size() > 1) {
|
||||
return false;
|
||||
}
|
||||
const uint32_t h = idxs[0][0];
|
||||
for (size_t i = 0; i < idxs[0].size(); ++i) {
|
||||
if (idxs[0][i] != h + i) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
using slot_info_vec_t = std::vector<slot_info>;
|
||||
|
|
@ -264,8 +281,8 @@ private:
|
|||
void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
|
||||
void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;
|
||||
|
||||
bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
|
||||
bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
|
||||
bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id = -1);
|
||||
bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo);
|
||||
};
|
||||
|
||||
class llama_kv_cache_context : public llama_memory_context_i {
|
||||
|
|
|
|||
|
|
@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader(
|
|||
std::vector<std::string> & splits,
|
||||
bool use_mmap,
|
||||
bool check_tensors,
|
||||
bool no_alloc,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
|
||||
int trace = 0;
|
||||
|
|
@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader(
|
|||
|
||||
this->use_mmap = use_mmap;
|
||||
this->check_tensors = check_tensors;
|
||||
this->no_alloc = no_alloc;
|
||||
}
|
||||
|
||||
std::string llama_model_loader::get_arch_name() const {
|
||||
|
|
|
|||
|
|
@ -71,6 +71,7 @@ struct llama_model_loader {
|
|||
|
||||
bool use_mmap = false;
|
||||
bool check_tensors;
|
||||
bool no_alloc;
|
||||
|
||||
llama_files files;
|
||||
llama_ftype ftype;
|
||||
|
|
@ -97,6 +98,7 @@ struct llama_model_loader {
|
|||
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
||||
bool use_mmap,
|
||||
bool check_tensors,
|
||||
bool no_alloc,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
|
||||
|
||||
|
|
|
|||
|
|
@ -668,6 +668,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
hparams.n_swa = 8192;
|
||||
hparams.n_attn_temp_floor_scale = 8192;
|
||||
hparams.f_attn_temp_scale = 0.1f;
|
||||
hparams.f_attn_temp_offset = 1.0f;
|
||||
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
|
||||
}
|
||||
|
||||
|
|
@ -1646,6 +1647,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
|
||||
|
||||
hparams.f_attn_temp_offset = 0.0f;
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 27: type = LLM_TYPE_16B; break;
|
||||
case 60: type = LLM_TYPE_236B; break;
|
||||
|
|
@ -2291,6 +2294,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
|
||||
|
||||
hparams.f_attn_temp_offset = 0.0f;
|
||||
|
||||
// TODO: maybe add n_attn_temp_floor_scale as a separate KV?
|
||||
if (hparams.f_attn_temp_scale != 0.0f) {
|
||||
hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
|
||||
|
|
@ -2309,32 +2314,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
default: throw std::runtime_error("unsupported model architecture");
|
||||
}
|
||||
|
||||
// ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
|
||||
if (hparams.rope_yarn_log_mul != 0.0f) {
|
||||
const float factor = 1.0f / hparams.rope_freq_scale_train;
|
||||
|
||||
// note: here we assume `mscale == 1.0f`
|
||||
// TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
|
||||
float mscale = 1.0f;
|
||||
const float mscale_all_dims = hparams.rope_yarn_log_mul;
|
||||
|
||||
// [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
|
||||
// special-case DEEPSEEK v2:
|
||||
// https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
|
||||
if (arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
|
||||
mscale = mscale_all_dims;
|
||||
}
|
||||
|
||||
static auto get_mscale = [](float scale, float mscale) {
|
||||
return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
|
||||
};
|
||||
|
||||
hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
|
||||
|
||||
LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
|
||||
__func__, hparams.yarn_attn_factor, mscale, mscale_all_dims);
|
||||
}
|
||||
|
||||
pimpl->n_bytes = ml.n_bytes;
|
||||
|
||||
pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
|
||||
|
|
@ -3424,9 +3403,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
|
||||
// optional bias tensors
|
||||
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
|
||||
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
|
||||
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
|
||||
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
||||
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
|
|
@ -6670,9 +6649,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
|
||||
GGML_ASSERT(!ml.no_alloc);
|
||||
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
||||
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
||||
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
||||
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
|
||||
// then we could just use metal for all layers
|
||||
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
|
||||
void * addr = nullptr;
|
||||
size_t first, last; // NOLINT
|
||||
|
|
@ -6688,9 +6669,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
bufs.emplace_back(buf);
|
||||
buf_map.emplace(idx, buf);
|
||||
}
|
||||
}
|
||||
else {
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
||||
} else {
|
||||
ggml_backend_buffer_t buf;
|
||||
if (ml.no_alloc) {
|
||||
buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||
t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
|
||||
}
|
||||
} else {
|
||||
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
|
||||
}
|
||||
if (buf == nullptr) {
|
||||
throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
|
||||
}
|
||||
|
|
@ -6745,6 +6733,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
}
|
||||
}
|
||||
|
||||
if (ml.no_alloc) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// load tensor data
|
||||
for (auto & [ctx, buf_map] : ctx_buf_maps) {
|
||||
if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
|
||||
|
|
@ -6787,9 +6779,18 @@ size_t llama_model::n_devices() const {
|
|||
|
||||
std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
|
||||
std::map<ggml_backend_buffer_type_t, size_t> ret;
|
||||
for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
|
||||
for (const auto & buf : bufs) {
|
||||
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
|
||||
for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
|
||||
if (hparams.no_alloc) {
|
||||
GGML_ASSERT(bufs.size() == 1);
|
||||
ggml_backend_buffer_t buf = bufs[0].get();
|
||||
GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
|
||||
ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
|
||||
} else {
|
||||
for (const auto & buf : bufs) {
|
||||
// GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
|
||||
ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
|
|
@ -6834,6 +6835,7 @@ void llama_model::print_info() const {
|
|||
// hparams
|
||||
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
|
||||
LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
|
||||
LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
|
||||
|
||||
if (!hparams.vocab_only) {
|
||||
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
|
||||
|
|
@ -7686,6 +7688,7 @@ llama_model_params llama_model_default_params() {
|
|||
/*.check_tensors =*/ false,
|
||||
/*.use_extra_bufts =*/ true,
|
||||
/*.no_host =*/ false,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
|
||||
return result;
|
||||
|
|
|
|||
|
|
@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
}
|
||||
|
||||
std::vector<std::string> splits = {};
|
||||
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
|
||||
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
||||
ml.init_mappings(false); // no prefetching
|
||||
|
||||
llama_model model(llama_model_default_params());
|
||||
|
|
|
|||
|
|
@ -1884,7 +1884,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "qwen2" ||
|
||||
tokenizer_pre == "deepseek-r1-qwen") {
|
||||
tokenizer_pre == "deepseek-r1-qwen" ||
|
||||
tokenizer_pre == "kormo") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
#include "llama.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
|
||||
#include "llama-chat.h"
|
||||
#include "llama-context.h"
|
||||
#include "llama-mmap.h"
|
||||
#include "llama-vocab.h"
|
||||
#include "llama-model-loader.h"
|
||||
|
|
@ -11,11 +14,14 @@
|
|||
#include "ggml-backend.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <stdexcept>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
|
|
@ -37,6 +43,643 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty
|
|||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
struct llama_device_memory_data {
|
||||
int64_t total;
|
||||
int64_t free;
|
||||
llama_memory_breakdown_data mb;
|
||||
};
|
||||
|
||||
static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
||||
const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams,
|
||||
std::vector<ggml_backend_dev_t> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
|
||||
const ggml_log_level log_level) {
|
||||
struct user_data_t {
|
||||
struct {
|
||||
ggml_log_callback callback;
|
||||
void * user_data;
|
||||
} original_logger;
|
||||
ggml_log_level min_level; // prints below this log level go to debug log
|
||||
};
|
||||
user_data_t ud;
|
||||
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
|
||||
ud.min_level = log_level;
|
||||
|
||||
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
|
||||
const user_data_t * ud = (const user_data_t *) user_data;
|
||||
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
||||
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
|
||||
}, &ud);
|
||||
|
||||
llama_model_params mparams_copy = *mparams;
|
||||
mparams_copy.no_alloc = true;
|
||||
mparams_copy.use_mmap = false;
|
||||
|
||||
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
|
||||
if (model == nullptr) {
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
throw std::runtime_error("failed to load model");
|
||||
}
|
||||
|
||||
llama_context * ctx = llama_init_from_model(model, *cparams);
|
||||
if (ctx == nullptr) {
|
||||
llama_model_free(model);
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
throw std::runtime_error("failed to create llama_context from model");
|
||||
}
|
||||
|
||||
std::vector<llama_device_memory_data> ret(model->devices.size());
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
|
||||
|
||||
for (const auto & [buft, mb] : memory_breakdown) {
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (!dev) {
|
||||
continue;
|
||||
}
|
||||
for (size_t i = 0; i < ret.size(); i++) {
|
||||
if (model->devices[i] == dev) {
|
||||
ret[i].mb.model += mb.model;
|
||||
ret[i].mb.context += mb.context;
|
||||
ret[i].mb.compute += mb.compute;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < ret.size(); i++) {
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(model->devices[i], &free, &total);
|
||||
ret[i].free = free;
|
||||
ret[i].total = total;
|
||||
}
|
||||
|
||||
devs = model->devices;
|
||||
hp_ngl = model->hparams.n_layer;
|
||||
hp_n_ctx_train = model->hparams.n_ctx_train;
|
||||
hp_n_expert = model->hparams.n_expert;
|
||||
|
||||
llama_memory_breakdown_print(ctx); // goes to debug log
|
||||
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
||||
return ret;
|
||||
}
|
||||
|
||||
// enum to identify part of a layer for distributing its tensors:
|
||||
enum layer_fraction_t {
|
||||
LAYER_FRACTION_NONE = 0, // nothing
|
||||
LAYER_FRACTION_ATTN = 1, // attention
|
||||
LAYER_FRACTION_UP = 2, // attention + up
|
||||
LAYER_FRACTION_GATE = 3, // attention + up + gate
|
||||
LAYER_FRACTION_MOE = 4, // everything but sparse MoE weights
|
||||
};
|
||||
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
|
||||
|
||||
static void llama_params_fit_impl(
|
||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
constexpr int64_t MiB = 1024*1024;
|
||||
const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
|
||||
typedef std::vector<llama_device_memory_data> dmds_t;
|
||||
const llama_model_params default_mparams = llama_model_default_params();
|
||||
|
||||
std::vector<ggml_backend_dev_t> devs;
|
||||
uint32_t hp_ngl = 0; // hparams.n_gpu_layers
|
||||
uint32_t hp_nct = 0; // hparams.n_ctx_train
|
||||
uint32_t hp_nex = 0; // hparams.n_expert
|
||||
|
||||
// step 1: get data for default parameters and check whether any changes are necessary in the first place
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
|
||||
const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
const size_t nd = devs.size(); // number of devices
|
||||
if (nd == 0) {
|
||||
LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<std::string> dev_names;
|
||||
{
|
||||
dev_names.reserve(nd);
|
||||
size_t max_length = 0;
|
||||
for (ggml_backend_dev_t dev : devs) {
|
||||
std::string name = ggml_backend_dev_name(dev);
|
||||
name += " (";
|
||||
name += ggml_backend_dev_description(dev);
|
||||
name += ")";
|
||||
dev_names.push_back(name);
|
||||
max_length = std::max(max_length, name.length());
|
||||
}
|
||||
for (std::string & dn : dev_names) {
|
||||
dn.insert(dn.end(), max_length - dn.length(), ' ');
|
||||
}
|
||||
}
|
||||
|
||||
int64_t sum_total = 0;
|
||||
int64_t sum_projected_free = 0;
|
||||
int64_t min_projected_free = INT64_MAX;
|
||||
int64_t sum_projected_used = 0;
|
||||
int64_t sum_projected_ctx = 0;
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
|
||||
}
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const llama_device_memory_data & dmd = dmds_full[id];
|
||||
|
||||
const int64_t projected_used = dmd.mb.total();
|
||||
const int64_t projected_free = dmd.free - projected_used;
|
||||
|
||||
sum_total += dmd.total;
|
||||
sum_projected_used += projected_used;
|
||||
sum_projected_free += projected_free;
|
||||
min_projected_free = std::min(min_projected_free, projected_free);
|
||||
sum_projected_ctx += dmd.mb.context;
|
||||
|
||||
if (nd > 1) {
|
||||
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
|
||||
__func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
|
||||
projected_free >= 0 ? "surplus" : "deficit");
|
||||
}
|
||||
}
|
||||
assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
|
||||
assert(sum_projected_used >= sum_projected_ctx);
|
||||
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
|
||||
__func__, sum_projected_used/MiB, sum_total/MiB);
|
||||
if (min_projected_free >= margin) {
|
||||
if (nd == 1) {
|
||||
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
|
||||
__func__, min_projected_free/MiB, margin/MiB);
|
||||
return;
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
|
||||
__func__, min_projected_free/MiB, margin/MiB);
|
||||
return;
|
||||
}
|
||||
|
||||
// step 2: try reducing memory use by reducing the context size
|
||||
|
||||
{
|
||||
int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
|
||||
if (global_surplus < 0) {
|
||||
LLAMA_LOG_INFO(nd == 1 ?
|
||||
"%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
|
||||
"%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
|
||||
__func__, margin/MiB, -global_surplus/MiB);
|
||||
if (cparams->n_ctx == 0) {
|
||||
if (hp_nct > n_ctx_min) {
|
||||
const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
|
||||
const uint32_t ctx_reduction = std::min(
|
||||
uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
|
||||
cparams->n_ctx = hp_nct - ctx_reduction;
|
||||
const int64_t memory_reduction = ctx_reduction * bytes_per_ctx;
|
||||
global_surplus += memory_reduction;
|
||||
LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
|
||||
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
|
||||
__func__, hp_nct, n_ctx_min);
|
||||
}
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
|
||||
}
|
||||
}
|
||||
if (global_surplus >= 0) {
|
||||
LLAMA_LOG_INFO("%s: entire model can be fit across devices by reducing context\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
|
||||
throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
|
||||
}
|
||||
if (nd > 1) {
|
||||
if (!tensor_split) {
|
||||
throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
|
||||
}
|
||||
if (mparams->tensor_split) {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
if (mparams->tensor_split[id] != 0.0f) {
|
||||
throw std::runtime_error("model_params::tensor_split already set by user, abort");
|
||||
}
|
||||
}
|
||||
}
|
||||
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
|
||||
}
|
||||
if (hp_ngl < 2*nd) {
|
||||
throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
|
||||
+ std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
|
||||
}
|
||||
}
|
||||
if (!tensor_buft_overrides) {
|
||||
throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
|
||||
}
|
||||
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
|
||||
throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
|
||||
}
|
||||
|
||||
// step 3: iteratively fill the back to front with "dense" layers
|
||||
// - for a dense model simply fill full layers, giving each device a contiguous slice of the model
|
||||
// - for a MoE model, same as dense model but with all MoE tensors in system memory
|
||||
|
||||
// utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
|
||||
auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * {
|
||||
constexpr size_t n_strings = 1000;
|
||||
if (il >= n_strings) {
|
||||
throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
|
||||
}
|
||||
switch (lf) {
|
||||
case LAYER_FRACTION_ATTN: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|gate|down).*";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
case LAYER_FRACTION_UP: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|down).*";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
case LAYER_FRACTION_GATE: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
case LAYER_FRACTION_MOE: {
|
||||
static std::array<std::string, n_strings> patterns;
|
||||
if (patterns[il].empty()) {
|
||||
patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate)_(ch|)exps";
|
||||
}
|
||||
return patterns[il].c_str();
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
};
|
||||
|
||||
struct ngl_t {
|
||||
uint32_t n_layer = 0; // number of total layers
|
||||
uint32_t n_part = 0; // number of partial layers, <= n_layer
|
||||
|
||||
// for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
|
||||
layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
|
||||
};
|
||||
|
||||
const size_t ntbo = llama_max_tensor_buft_overrides();
|
||||
|
||||
// utility function to set n_gpu_layers and tensor_split
|
||||
auto set_ngl_tensor_split_tbo = [&](
|
||||
const std::vector<ngl_t> & ngl_per_device,
|
||||
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
|
||||
llama_model_params & mparams,
|
||||
const bool add_nonrepeating) {
|
||||
mparams.n_gpu_layers = 0;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
mparams.n_gpu_layers += ngl_per_device[id].n_layer;
|
||||
if (nd > 1) {
|
||||
tensor_split[id] = ngl_per_device[id].n_layer;
|
||||
}
|
||||
}
|
||||
assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
|
||||
uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides
|
||||
|
||||
if (add_nonrepeating) {
|
||||
mparams.n_gpu_layers += 1;
|
||||
tensor_split[nd - 1] += 1;
|
||||
}
|
||||
mparams.tensor_split = tensor_split;
|
||||
|
||||
size_t itbo = 0;
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
|
||||
for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
|
||||
if (itbo + 1 >= ntbo) {
|
||||
tensor_buft_overrides[itbo].pattern = nullptr;
|
||||
tensor_buft_overrides[itbo].buft = nullptr;
|
||||
itbo++;
|
||||
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
||||
throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
|
||||
+ std::to_string(ntbo) + " is insufficient for model\n");
|
||||
}
|
||||
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
|
||||
tensor_buft_overrides[itbo].buft = overflow_bufts[id];
|
||||
itbo++;
|
||||
}
|
||||
il0 += ngl_per_device[id].n_part;
|
||||
}
|
||||
tensor_buft_overrides[itbo].pattern = nullptr;
|
||||
tensor_buft_overrides[itbo].buft = nullptr;
|
||||
itbo++;
|
||||
mparams.tensor_buft_overrides = tensor_buft_overrides;
|
||||
};
|
||||
|
||||
// utility function that returns the memory use per device for given numbers of layers per device
|
||||
auto get_memory_for_layers = [&](
|
||||
const char * func_name,
|
||||
const std::vector<ngl_t> & ngl_per_device,
|
||||
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
|
||||
const bool add_nonrepeating) -> std::vector<int64_t> {
|
||||
llama_model_params mparams_copy = *mparams;
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);
|
||||
|
||||
const dmds_t dmd_nl = llama_get_device_memory_data(
|
||||
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
const ngl_t & n = ngl_per_device[id];
|
||||
LLAMA_LOG_DEBUG(
|
||||
"%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
|
||||
func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
|
||||
}
|
||||
|
||||
std::vector<int64_t> ret;
|
||||
ret.reserve(nd);
|
||||
for (const llama_device_memory_data & dmd : dmd_nl) {
|
||||
ret.push_back(dmd.mb.total());
|
||||
}
|
||||
return ret;
|
||||
};
|
||||
|
||||
int64_t global_surplus_cpu_moe = 0;
|
||||
if (hp_nex > 0) {
|
||||
const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors
|
||||
ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
|
||||
tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
|
||||
tensor_buft_overrides[1] = {nullptr, nullptr};
|
||||
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
|
||||
const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
|
||||
path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
|
||||
|
||||
for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
|
||||
global_surplus_cpu_moe += dmd.free;
|
||||
global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
|
||||
}
|
||||
|
||||
if (global_surplus_cpu_moe > 0) {
|
||||
LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
|
||||
__func__, global_surplus_cpu_moe/MiB);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
|
||||
__func__, -global_surplus_cpu_moe/MiB);
|
||||
}
|
||||
|
||||
// reset
|
||||
tensor_buft_overrides[0] = {nullptr, nullptr};
|
||||
mparams->tensor_buft_overrides = tensor_buft_overrides;
|
||||
}
|
||||
|
||||
std::vector<int64_t> targets; // maximum acceptable memory use per device
|
||||
targets.reserve(nd);
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
targets.push_back(dmds_full[id].free - margin);
|
||||
LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
|
||||
}
|
||||
|
||||
// whether for the optimal memory use we expect to load at least some MoE tensors:
|
||||
const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
|
||||
|
||||
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
|
||||
overflow_bufts.reserve(nd);
|
||||
for (size_t id = 0; id < nd - 1; ++id) {
|
||||
overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
|
||||
}
|
||||
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
|
||||
|
||||
std::vector<ngl_t> ngl_per_device(nd);
|
||||
std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
|
||||
if (hp_nex > 0) {
|
||||
for (size_t id = 0; id < nd; id++) {
|
||||
ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
|
||||
}
|
||||
}
|
||||
|
||||
// optimize the number of layers per device using the method of false position:
|
||||
// - ngl_per_device has 0 layers for each device, lower bound
|
||||
// - try a "high" configuration where a device is given all unassigned layers
|
||||
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
|
||||
// - check memory use of our guess, replace either the low or high bound
|
||||
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
|
||||
if (hp_nex == 0) {
|
||||
LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
|
||||
}
|
||||
uint32_t n_unassigned = hp_ngl;
|
||||
for (int id = nd - 1; id >= 0; id--) {
|
||||
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
||||
ngl_per_device_high[id].n_layer = n_unassigned;
|
||||
if (hp_nex > 0) {
|
||||
ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
|
||||
}
|
||||
if (ngl_per_device_high[id].n_layer > 0) {
|
||||
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
|
||||
if (mem_high[id] > targets[id]) {
|
||||
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
||||
while (delta > 1) {
|
||||
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
||||
step_size = std::max(step_size, uint32_t(1));
|
||||
step_size = std::min(step_size, delta - 1);
|
||||
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
ngl_per_device_test[id].n_layer += step_size;
|
||||
if (hp_nex) {
|
||||
ngl_per_device_test[id].n_part += step_size;
|
||||
}
|
||||
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
|
||||
|
||||
if (mem_test[id] <= targets[id]) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
mem = mem_test;
|
||||
n_unassigned -= ngl_per_device[id].n_layer;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
} else {
|
||||
ngl_per_device_high = ngl_per_device_test;
|
||||
mem_high = mem_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
}
|
||||
delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
|
||||
}
|
||||
} else {
|
||||
ngl_per_device = ngl_per_device_high;
|
||||
n_unassigned -= ngl_per_device[id].n_layer;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LLAMA_LOG_INFO(
|
||||
"%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
|
||||
return;
|
||||
}
|
||||
|
||||
// step 4: for a MoE model where all dense tensors fit,
|
||||
// convert the dense-only layers in the back to full layers in the front until all devices are full
|
||||
// essentially the same procedure as for the dense-only layers except front-to-back
|
||||
// also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
|
||||
|
||||
size_t id_dense_start = nd;
|
||||
for (int id = nd - 1; id >= 0; id--) {
|
||||
if (ngl_per_device[id].n_layer > 0) {
|
||||
id_dense_start = id;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
assert(id_dense_start < nd);
|
||||
|
||||
LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
|
||||
for (size_t id = 0; id <= id_dense_start; id++) {
|
||||
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
|
||||
for (size_t jd = id_dense_start; jd < nd; jd++) {
|
||||
const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
|
||||
ngl_per_device_high[id].n_layer += n_layer_move;
|
||||
ngl_per_device_high[jd].n_layer -= n_layer_move;
|
||||
ngl_per_device_high[jd].n_part = 0;
|
||||
}
|
||||
size_t id_dense_start_high = nd - 1;
|
||||
std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
|
||||
|
||||
if (mem_high[id] > targets[id]) {
|
||||
assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
|
||||
assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
|
||||
assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
|
||||
>= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
|
||||
uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
|
||||
- (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
|
||||
while (delta > 1) {
|
||||
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
|
||||
step_size = std::max(step_size, uint32_t(1));
|
||||
step_size = std::min(step_size, delta - 1);
|
||||
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
size_t id_dense_start_test = id_dense_start;
|
||||
uint32_t n_converted_test = 0;
|
||||
for (;id_dense_start_test < nd; id_dense_start_test++) {
|
||||
const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
|
||||
ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
|
||||
ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
|
||||
ngl_per_device_test[id].n_layer += n_convert_jd;
|
||||
n_converted_test += n_convert_jd;
|
||||
|
||||
if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
|
||||
|
||||
if (mem_test[id] <= targets[id]) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
} else {
|
||||
ngl_per_device_high = ngl_per_device_test;
|
||||
mem_high = mem_test;
|
||||
id_dense_start_high = id_dense_start_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
|
||||
__func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
|
||||
}
|
||||
delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
|
||||
- (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
|
||||
}
|
||||
} else {
|
||||
ngl_per_device = ngl_per_device_high;
|
||||
id_dense_start = id_dense_start_high;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
|
||||
// try to fit at least part of one more layer
|
||||
if (ngl_per_device[id_dense_start].n_layer > 0) {
|
||||
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
|
||||
size_t id_dense_start_test = id_dense_start;
|
||||
ngl_per_device_test[id_dense_start_test].n_layer--;
|
||||
ngl_per_device_test[id_dense_start_test].n_part--;
|
||||
ngl_per_device_test[id].n_layer++;
|
||||
ngl_per_device_test[id].n_part++;
|
||||
if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
|
||||
id_dense_start_test++;
|
||||
}
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
|
||||
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
|
||||
std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
|
||||
if (mem_test[id] < targets[id]) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
|
||||
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
|
||||
if (mem_test[id] < targets[id]) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
} else {
|
||||
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
|
||||
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
|
||||
mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
|
||||
if (mem_test[id] < targets[id]) {
|
||||
ngl_per_device = ngl_per_device_test;
|
||||
mem = mem_test;
|
||||
id_dense_start = id_dense_start_test;
|
||||
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
|
||||
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t projected_margin = dmds_full[id].free - mem[id];
|
||||
LLAMA_LOG_INFO(
|
||||
"%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
|
||||
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
|
||||
}
|
||||
|
||||
set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
|
||||
}
|
||||
|
||||
bool llama_params_fit(
|
||||
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
|
||||
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
|
||||
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
|
||||
const int64_t t0_us = llama_time_us();
|
||||
bool ok = true;
|
||||
try {
|
||||
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
|
||||
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
|
||||
} catch (const std::runtime_error & e) {
|
||||
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
|
||||
ok = false;
|
||||
}
|
||||
const int64_t t1_us = llama_time_us();
|
||||
LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
|
||||
return ok;
|
||||
}
|
||||
|
||||
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
|
||||
struct llama_sampler_chain_params result = {
|
||||
/*.no_perf =*/ true,
|
||||
|
|
@ -49,6 +692,10 @@ size_t llama_max_devices(void) {
|
|||
return 16;
|
||||
}
|
||||
|
||||
size_t llama_max_tensor_buft_overrides() {
|
||||
return 4096;
|
||||
}
|
||||
|
||||
bool llama_supports_mmap(void) {
|
||||
return llama_mmap::SUPPORTED;
|
||||
}
|
||||
|
|
@ -108,11 +755,12 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
|
|||
model.t_start_us = tm.t_start_us;
|
||||
|
||||
try {
|
||||
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
|
||||
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
||||
|
||||
ml.print_info();
|
||||
|
||||
model.hparams.vocab_only = params.vocab_only;
|
||||
model.hparams.no_alloc = params.no_alloc;
|
||||
|
||||
try {
|
||||
model.load_arch(ml);
|
||||
|
|
|
|||
|
|
@ -31,16 +31,25 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para
|
|||
{
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
|
|
|||
|
|
@ -112,4 +112,8 @@ struct clip_graph {
|
|||
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
|
||||
// support dynamic resolution
|
||||
ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
|
||||
|
||||
// Generic function to stack frames for audio processing
|
||||
// Abstracts out the StackAudioFrames logic used by ultravox
|
||||
ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed);
|
||||
};
|
||||
|
|
|
|||
|
|
@ -157,6 +157,7 @@ enum projector_type {
|
|||
PROJECTOR_TYPE_INTERNVL,
|
||||
PROJECTOR_TYPE_LLAMA4,
|
||||
PROJECTOR_TYPE_QWEN2A,
|
||||
PROJECTOR_TYPE_GLMA,
|
||||
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
|
||||
PROJECTOR_TYPE_VOXTRAL,
|
||||
PROJECTOR_TYPE_LFM2,
|
||||
|
|
@ -183,6 +184,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
|
||||
{ PROJECTOR_TYPE_LLAMA4, "llama4"},
|
||||
{ PROJECTOR_TYPE_QWEN2A, "qwen2a"},
|
||||
{ PROJECTOR_TYPE_GLMA, "glma"},
|
||||
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
|
||||
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
|
||||
{ PROJECTOR_TYPE_LFM2, "lfm2"},
|
||||
|
|
|
|||
|
|
@ -65,6 +65,13 @@ struct clip_hparams {
|
|||
int32_t n_mel_bins = 0; // whisper preprocessor
|
||||
int32_t proj_stack_factor = 0; // ultravox
|
||||
|
||||
// audio-to-mel preprocessor params
|
||||
int32_t audio_chunk_len = -1; // in seconds
|
||||
int32_t audio_sample_rate = -1;
|
||||
int32_t audio_n_fft = -1;
|
||||
int32_t audio_window_len = -1;
|
||||
int32_t audio_hop_len = -1;
|
||||
|
||||
// legacy
|
||||
bool has_llava_projector = false;
|
||||
int minicpmv_version = 0;
|
||||
|
|
@ -256,6 +263,7 @@ struct clip_model {
|
|||
ggml_tensor * conv1d_2_w = nullptr;
|
||||
ggml_tensor * conv1d_2_b = nullptr;
|
||||
ggml_tensor * mm_norm_pre_w = nullptr;
|
||||
ggml_tensor * mm_norm_pre_b = nullptr;
|
||||
ggml_tensor * mm_norm_mid_w = nullptr;
|
||||
|
||||
// cogvlm
|
||||
|
|
@ -277,3 +285,5 @@ struct clip_model {
|
|||
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
|
||||
}
|
||||
};
|
||||
|
||||
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx);
|
||||
|
|
|
|||
|
|
@ -733,6 +733,32 @@ ggml_tensor * clip_graph::build_rope_2d(
|
|||
return cur;
|
||||
}
|
||||
|
||||
// Generic function to stack frames for audio processing
|
||||
// Abstracts out the StackAudioFrames logic used by ultravox
|
||||
ggml_tensor * clip_graph::build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) {
|
||||
if (stack_factor <= 1) {
|
||||
return cur;
|
||||
}
|
||||
|
||||
int64_t total_elements = ggml_nelements(cur);
|
||||
int64_t stride = n_embed * stack_factor;
|
||||
|
||||
// Calculate padded length
|
||||
int64_t padded_len = GGML_PAD(total_elements, stride);
|
||||
int64_t pad = padded_len - total_elements;
|
||||
|
||||
if (pad > 0) {
|
||||
// Pad the tensor to make it divisible by stride
|
||||
cur = ggml_view_1d(ctx0, cur, total_elements, 0);
|
||||
cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
|
||||
}
|
||||
|
||||
// Reshape to [stride, padded_len / stride]
|
||||
cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
|
||||
ggml_row_size(cur->type, stride), 0);
|
||||
return cur;
|
||||
}
|
||||
|
||||
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
|
||||
// support dynamic resolution
|
||||
ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
|
||||
|
|
@ -809,6 +835,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
|
||||
} break;
|
||||
|
|
@ -1153,16 +1180,22 @@ struct clip_model_loader {
|
|||
} break;
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
{
|
||||
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
|
||||
model.proj_type == PROJECTOR_TYPE_VOXTRAL;
|
||||
model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
|
||||
model.proj_type == PROJECTOR_TYPE_GLMA;
|
||||
get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
|
||||
if (hparams.n_mel_bins != 128) {
|
||||
throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
|
||||
}
|
||||
hparams.ffn_op = FFN_GELU_ERF;
|
||||
log_ffn_op = "gelu_erf"; // temporary solution for logging
|
||||
|
||||
// audio preprocessing params
|
||||
hparams.audio_chunk_len = 30; // in seconds
|
||||
hparams.audio_sample_rate = 16000;
|
||||
hparams.audio_n_fft = 400;
|
||||
hparams.audio_window_len = 400;
|
||||
hparams.audio_hop_len = 160;
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
|
|
@ -1200,6 +1233,11 @@ struct clip_model_loader {
|
|||
LOG_INF("\n--- audio hparams ---\n");
|
||||
LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
|
||||
LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
|
||||
LOG_INF("%s: audio_chunk_len: %d\n", __func__, hparams.audio_chunk_len);
|
||||
LOG_INF("%s: audio_sample_rate: %d\n", __func__, hparams.audio_sample_rate);
|
||||
LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft);
|
||||
LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len);
|
||||
LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||
|
|
@ -1527,6 +1565,21 @@ struct clip_model_loader {
|
|||
model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
|
||||
model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
{
|
||||
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
|
||||
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
|
||||
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
|
||||
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
|
||||
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
|
||||
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
|
||||
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
|
||||
model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
|
||||
model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
|
||||
model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight"));
|
||||
model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LLAMA4:
|
||||
{
|
||||
model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
|
||||
|
|
@ -2273,7 +2326,14 @@ struct llava_uhd {
|
|||
clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
|
||||
clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices
|
||||
std::vector<slice_coordinates> slices;
|
||||
|
||||
img_tool::resize_algo interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR;
|
||||
bool padding_overview = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6)
|
||||
std::array<uint8_t, 3> pad_color_overview = {0, 0, 0};
|
||||
|
||||
img_tool::resize_algo interpolation_refined = img_tool::RESIZE_ALGO_BICUBIC;
|
||||
bool padding_refined = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6)
|
||||
std::array<uint8_t, 3> pad_color_refined = {0, 0, 0};
|
||||
};
|
||||
|
||||
static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
|
||||
|
|
@ -2300,10 +2360,11 @@ struct llava_uhd {
|
|||
auto refine_size = llava_uhd::select_best_resolution(
|
||||
original_size,
|
||||
ctx->model.hparams.image_res_candidates);
|
||||
res.overview_size = clip_image_size{slice_size, slice_size};
|
||||
res.refined_size = refine_size;
|
||||
res.grid_size = clip_image_size{0, 0};
|
||||
res.padding_refined = true;
|
||||
res.overview_size = clip_image_size{slice_size, slice_size};
|
||||
res.refined_size = refine_size;
|
||||
res.grid_size = clip_image_size{0, 0};
|
||||
res.padding_refined = true;
|
||||
res.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; // preserve old behavior when padding
|
||||
|
||||
LOG_DBG("%s: using pinpoints for slicing\n", __func__);
|
||||
LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n",
|
||||
|
|
@ -2382,12 +2443,13 @@ struct llava_uhd {
|
|||
|
||||
static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
|
||||
std::vector<clip_image_u8_ptr> output;
|
||||
img_tool::resize_algo interpolation = img_tool::RESIZE_ALGO_BILINEAR; // TODO: make it configurable
|
||||
|
||||
// resize to overview size
|
||||
clip_image_u8_ptr resized_img(clip_image_u8_init());
|
||||
img_tool::resize(*img, *resized_img, inst.overview_size, interpolation);
|
||||
img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview,
|
||||
inst.padding_overview, inst.pad_color_overview);
|
||||
output.push_back(std::move(resized_img));
|
||||
|
||||
if (inst.slices.empty()) {
|
||||
// no slices, just return the resized image
|
||||
return output;
|
||||
|
|
@ -2395,13 +2457,8 @@ struct llava_uhd {
|
|||
|
||||
// resize to refined size
|
||||
clip_image_u8_ptr refined_img(clip_image_u8_init());
|
||||
if (inst.padding_refined) {
|
||||
img_tool::resize(*img, *refined_img, inst.refined_size, interpolation);
|
||||
} else {
|
||||
// only algo bicubic preserves the ratio; old models rely on this behavior
|
||||
// TODO: do we need to support other algos here?
|
||||
img_tool::resize(*img, *refined_img, inst.refined_size, img_tool::RESIZE_ALGO_BICUBIC, false);
|
||||
}
|
||||
img_tool::resize(*img, *refined_img, inst.refined_size, inst.interpolation_refined,
|
||||
inst.padding_refined, inst.pad_color_refined);
|
||||
|
||||
// create slices
|
||||
for (const auto & slice : inst.slices) {
|
||||
|
|
@ -2934,6 +2991,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
|||
n_patches /= 2;
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
{
|
||||
n_patches = img->nx;
|
||||
// whisper downscales input token by half after conv1d
|
||||
n_patches /= 2;
|
||||
// reshape by merge_factor
|
||||
n_patches /= ctx->model.hparams.proj_stack_factor;
|
||||
// for BOI and EOI token embeddings
|
||||
n_patches += 2;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
{
|
||||
n_patches += 2; // for BOI and EOI token embeddings
|
||||
|
|
@ -3269,6 +3336,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
|
|
@ -3379,6 +3447,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||
return ctx->model.mm_model_proj->ne[1];
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
return ctx->model.mm_fc_w->ne[1];
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
|
|
@ -3425,6 +3495,7 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
|
|||
bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_GLMA
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL;
|
||||
}
|
||||
|
||||
|
|
@ -3459,3 +3530,7 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
|
|||
batch->entries.push_back(clip_image_f32_ptr(audio));
|
||||
batch->is_audio = true;
|
||||
}
|
||||
|
||||
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
|
||||
return &ctx->model.hparams;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,7 +30,6 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
|
|||
GGML_ASSERT(model.layers[0].q_b);
|
||||
GGML_ASSERT(model.layers[0].v_b);
|
||||
GGML_ASSERT(!model.layers[0].k_b); // no bias for k
|
||||
GGML_ASSERT(model.post_ln_w && model.post_ln_b);
|
||||
|
||||
ggml_tensor * pos_embd_selected = ggml_view_2d(
|
||||
ctx0, model.position_embeddings,
|
||||
|
|
@ -49,15 +48,7 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
|
|||
if (model.audio_has_stack_frames()) {
|
||||
// StackAudioFrames
|
||||
// https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
|
||||
int64_t stride = n_embd * hparams.proj_stack_factor;
|
||||
int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
|
||||
int64_t pad = padded_len - ggml_nelements(cur);
|
||||
if (pad > 0) {
|
||||
cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
|
||||
cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
|
||||
}
|
||||
cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
|
||||
ggml_row_size(cur->type, stride), 0);
|
||||
cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
|
||||
cb(cur, "after_stacked", -1);
|
||||
}
|
||||
|
||||
|
|
@ -95,6 +86,14 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
|
|||
FFN_GELU_ERF,
|
||||
-1);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_GLMA) {
|
||||
cur = ggml_norm(ctx0, cur, hparams.eps);
|
||||
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
|
||||
cur = ggml_add(ctx0, cur, model.mm_norm_pre_b);
|
||||
cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
|
||||
cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0);
|
||||
cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
|
||||
cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
|
||||
} else {
|
||||
GGML_ABORT("%s: unknown projector type", __func__);
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "clip-model.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
|
@ -8,18 +9,7 @@
|
|||
|
||||
#define MTMD_INTERNAL_HEADER
|
||||
|
||||
#define WHISPER_ASSERT GGML_ASSERT
|
||||
|
||||
#define WHISPER_SAMPLE_RATE 16000
|
||||
#define WHISPER_N_FFT 400
|
||||
#define WHISPER_HOP_LENGTH 160
|
||||
#define WHISPER_CHUNK_SIZE 30
|
||||
|
||||
#define COMMON_SAMPLE_RATE 16000
|
||||
|
||||
namespace whisper_preprocessor {
|
||||
|
||||
struct whisper_mel {
|
||||
struct mtmd_audio_mel {
|
||||
int n_len;
|
||||
int n_len_org;
|
||||
int n_mel;
|
||||
|
|
@ -27,23 +17,18 @@ struct whisper_mel {
|
|||
std::vector<float> data;
|
||||
};
|
||||
|
||||
struct whisper_filters {
|
||||
int32_t n_mel;
|
||||
int32_t n_fft;
|
||||
struct mtmd_audio_preprocessor {
|
||||
const clip_hparams & hparams;
|
||||
|
||||
std::vector<float> data;
|
||||
mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
|
||||
|
||||
virtual ~mtmd_audio_preprocessor() = default;
|
||||
virtual void initialize() = 0; // NOT thread-safe
|
||||
virtual bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) = 0;
|
||||
};
|
||||
|
||||
bool preprocess_audio(
|
||||
const float * samples,
|
||||
size_t n_samples,
|
||||
const whisper_filters & filters,
|
||||
std::vector<whisper_mel> & output);
|
||||
|
||||
} // namespace whisper_preprocessor
|
||||
|
||||
namespace whisper_precalc_filters {
|
||||
|
||||
whisper_preprocessor::whisper_filters get_128_bins();
|
||||
|
||||
} // namespace whisper_precalc_filters
|
||||
struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor {
|
||||
mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
|
||||
void initialize() override;
|
||||
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -161,8 +161,7 @@ struct mtmd_context {
|
|||
// string template for slice image delimiters with row/col (idefics3)
|
||||
std::string sli_img_start_tmpl;
|
||||
|
||||
// for whisper, we pre-calculate the mel filter bank
|
||||
whisper_preprocessor::whisper_filters w_filters;
|
||||
std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
|
||||
|
||||
// TODO @ngxson : add timings
|
||||
|
||||
|
|
@ -327,14 +326,25 @@ struct mtmd_context {
|
|||
GGML_ASSERT(ctx_a != nullptr);
|
||||
projector_type proj = clip_get_projector_type(ctx_a);
|
||||
|
||||
if (clip_has_whisper_encoder(ctx_a)) {
|
||||
// TODO @ngxson : check if model n_mel is 128 or 80
|
||||
w_filters = whisper_precalc_filters::get_128_bins();
|
||||
}
|
||||
|
||||
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
|
||||
" https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
|
||||
|
||||
// set preprocessor
|
||||
switch (proj) {
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_QWEN25O:
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("unsupported audio projector type");
|
||||
}
|
||||
|
||||
// initialize audio preprocessor
|
||||
audio_preproc->initialize();
|
||||
|
||||
// set special tokens
|
||||
if (proj == PROJECTOR_TYPE_QWEN2A) {
|
||||
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
||||
aud_beg = "<|audio_bos|>";
|
||||
|
|
@ -663,11 +673,10 @@ struct mtmd_tokenizer {
|
|||
}
|
||||
|
||||
// preprocess audio
|
||||
GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
|
||||
std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
|
||||
std::vector<mtmd_audio_mel> mel_spec_chunks;
|
||||
const float * samples = (const float *)bitmap->data.data();
|
||||
size_t n_samples = bitmap->data.size() / sizeof(float);
|
||||
bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
|
||||
bool ok = ctx->audio_preproc->preprocess(samples, n_samples, mel_spec_chunks);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess audio\n");
|
||||
return 2;
|
||||
|
|
@ -873,8 +882,7 @@ int mtmd_get_audio_bitrate(mtmd_context * ctx) {
|
|||
if (!ctx->ctx_a) {
|
||||
return -1;
|
||||
}
|
||||
// for now, we assume that all audio models have the same bitrate
|
||||
return 16000; // 16kHz
|
||||
return clip_get_hparams(ctx->ctx_a)->audio_sample_rate;
|
||||
}
|
||||
|
||||
//
|
||||
|
|
|
|||
|
|
@ -23,10 +23,10 @@ problem.
|
|||
8 files changed, 21 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index 08681f35e..afde2f0b7 100644
|
||||
index 8547ecc84..9f37ca70c 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
@@ -112,7 +112,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
if (buffer->iface.free_buffer != NULL) {
|
||||
buffer->iface.free_buffer(buffer);
|
||||
}
|
||||
|
|
@ -34,7 +34,7 @@ index 08681f35e..afde2f0b7 100644
|
|||
}
|
||||
|
||||
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
|
||||
@@ -586,6 +585,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
||||
@@ -591,6 +590,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
||||
|
||||
free(ctx->buffers);
|
||||
free(ctx);
|
||||
|
|
@ -42,7 +42,7 @@ index 08681f35e..afde2f0b7 100644
|
|||
}
|
||||
|
||||
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
@@ -2106,6 +2106,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -2125,6 +2125,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
GGML_ASSERT(buffer);
|
||||
ggml_aligned_free(buffer->context, buffer->size);
|
||||
|
|
@ -54,7 +54,7 @@ index 08681f35e..afde2f0b7 100644
|
|||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
@@ -2158,7 +2163,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
||||
@@ -2177,7 +2182,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
||||
};
|
||||
|
||||
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
||||
|
|
@ -156,10 +156,10 @@ index 18a45d2d9..89041805e 100644
|
|||
|
||||
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
index 7449a9160..e69a1ff5f 100644
|
||||
index e996d98be..84b679315 100644
|
||||
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
@@ -355,6 +355,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
|
||||
@@ -356,6 +356,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
|
||||
ggml_sycl_set_device(ctx->device);
|
||||
|
||||
delete ctx;
|
||||
|
|
@ -167,7 +167,7 @@ index 7449a9160..e69a1ff5f 100644
|
|||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
@@ -816,6 +817,7 @@ struct ggml_backend_sycl_split_buffer_context {
|
||||
@@ -817,6 +818,7 @@ struct ggml_backend_sycl_split_buffer_context {
|
||||
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
|
|
@ -175,7 +175,7 @@ index 7449a9160..e69a1ff5f 100644
|
|||
}
|
||||
|
||||
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -1158,6 +1160,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
|
||||
@@ -1159,6 +1161,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
|
||||
|
||||
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_sycl_host_free(buffer->context);
|
||||
|
|
@ -184,10 +184,10 @@ index 7449a9160..e69a1ff5f 100644
|
|||
|
||||
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index c6f5809cc..c801d2fd2 100644
|
||||
index 34ec09d40..120191ca0 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -12271,6 +12271,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
@@ -12365,6 +12365,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||
ggml_vk_destroy_buffer(ctx->dev_buffer);
|
||||
delete ctx;
|
||||
|
|
@ -195,7 +195,7 @@ index c6f5809cc..c801d2fd2 100644
|
|||
}
|
||||
|
||||
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -12414,6 +12415,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
|
||||
@@ -12508,6 +12509,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
|
||||
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
|
||||
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ logs instead of throwing an error
|
|||
1 file changed, 3 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index e2cca66e4..8246a0a14 100644
|
||||
index 7b01a2edf..63250cdf1 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -1825,16 +1825,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|
|
@ -31,7 +31,7 @@ index e2cca66e4..8246a0a14 100644
|
|||
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (
|
||||
tokenizer_pre == "llama3" ||
|
||||
@@ -2014,7 +2005,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
@@ -2015,7 +2006,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
|
||||
clean_spaces = false;
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ filesystems for paths that include wide characters
|
|||
1 file changed, 39 insertions(+)
|
||||
|
||||
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
|
||||
index bb922e30b..9d88dcc62 100644
|
||||
index fee49e465..38684e4d5 100644
|
||||
--- a/tools/mtmd/clip.cpp
|
||||
+++ b/tools/mtmd/clip.cpp
|
||||
@@ -24,6 +24,19 @@
|
||||
|
|
@ -33,7 +33,7 @@ index bb922e30b..9d88dcc62 100644
|
|||
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
|
||||
|
||||
//#define CLIP_DEBUG_FUNCTIONS
|
||||
@@ -1542,7 +1555,29 @@ struct clip_model_loader {
|
||||
@@ -1595,7 +1608,29 @@ struct clip_model_loader {
|
||||
{
|
||||
std::vector<uint8_t> read_buf;
|
||||
|
||||
|
|
@ -63,7 +63,7 @@ index bb922e30b..9d88dcc62 100644
|
|||
if (!fin) {
|
||||
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
|
||||
}
|
||||
@@ -1569,7 +1604,11 @@ struct clip_model_loader {
|
||||
@@ -1622,7 +1657,11 @@ struct clip_model_loader {
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -112,10 +112,10 @@ index e11318002..ec9e3a6df 100644
|
|||
LLM_TENSOR_CONVNEXT_DW,
|
||||
LLM_TENSOR_CONVNEXT_NORM,
|
||||
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
|
||||
index 277d0bcfd..b6bccd419 100644
|
||||
index 96c9598c2..2ce398f65 100644
|
||||
--- a/src/llama-hparams.cpp
|
||||
+++ b/src/llama-hparams.cpp
|
||||
@@ -163,6 +163,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
|
||||
@@ -162,6 +162,14 @@ uint32_t llama_hparams::n_pos_per_embd() const {
|
||||
return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
|
||||
}
|
||||
|
||||
|
|
@ -131,10 +131,10 @@ index 277d0bcfd..b6bccd419 100644
|
|||
if (il < n_layer) {
|
||||
return swa_layers[il];
|
||||
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
|
||||
index c9960e916..05dbe63df 100644
|
||||
index cecb476e9..4ea71526d 100644
|
||||
--- a/src/llama-hparams.h
|
||||
+++ b/src/llama-hparams.h
|
||||
@@ -64,6 +64,8 @@ struct llama_hparams {
|
||||
@@ -65,6 +65,8 @@ struct llama_hparams {
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
|
||||
|
||||
|
|
@ -143,7 +143,7 @@ index c9960e916..05dbe63df 100644
|
|||
uint32_t n_layer_dense_lead = 0;
|
||||
uint32_t n_lora_q = 0;
|
||||
uint32_t n_lora_kv = 0;
|
||||
@@ -257,6 +259,9 @@ struct llama_hparams {
|
||||
@@ -259,6 +261,9 @@ struct llama_hparams {
|
||||
|
||||
uint32_t n_pos_per_embd() const;
|
||||
|
||||
|
|
@ -154,7 +154,7 @@ index c9960e916..05dbe63df 100644
|
|||
|
||||
bool has_kv(uint32_t il) const;
|
||||
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
|
||||
index aa3a65f87..ee303bd58 100644
|
||||
index ca2ea2461..8916a6242 100644
|
||||
--- a/src/llama-model-loader.cpp
|
||||
+++ b/src/llama-model-loader.cpp
|
||||
@@ -466,7 +466,7 @@ namespace GGUFMeta {
|
||||
|
|
@ -167,10 +167,10 @@ index aa3a65f87..ee303bd58 100644
|
|||
llama_model_loader::llama_model_loader(
|
||||
const std::string & fname,
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
index e4808b1e1..b7d1c7c93 100644
|
||||
index 050735afc..0c08e3e10 100644
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -1981,6 +1981,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -1984,6 +1984,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
|
|
@ -192,7 +192,7 @@ index e4808b1e1..b7d1c7c93 100644
|
|||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
@@ -5421,6 +5436,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
@@ -5400,6 +5415,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
|
|
@ -227,7 +227,7 @@ index e4808b1e1..b7d1c7c93 100644
|
|||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
@@ -7500,6 +7543,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
@@ -7502,6 +7545,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
{
|
||||
llm = std::make_unique<llm_build_chameleon>(*this, params);
|
||||
} break;
|
||||
|
|
@ -238,7 +238,7 @@ index e4808b1e1..b7d1c7c93 100644
|
|||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
|
||||
@@ -7763,6 +7810,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
@@ -7766,6 +7813,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_GRANITE_HYBRID:
|
||||
case LLM_ARCH_CHAMELEON:
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ regex
|
|||
2 files changed, 22 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index 8246a0a14..dfba7778b 100644
|
||||
index 63250cdf1..dd86a1745 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
|
|
|
|||
|
|
@ -53,7 +53,7 @@ index b165d8bdc..f91d4faba 100644
|
|||
}
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index dfba7778b..f72f321b9 100644
|
||||
index dd86a1745..d63ce9c84 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -1781,9 +1781,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|
|
|
|||
|
|
@ -11,10 +11,10 @@ Subject: [PATCH] graph memory reporting on failure
|
|||
4 files changed, 40 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
|
||||
index 2cb150fd2..7ab3f0192 100644
|
||||
index 78aa059dd..7fa8403b3 100644
|
||||
--- a/ggml/include/ggml-alloc.h
|
||||
+++ b/ggml/include/ggml-alloc.h
|
||||
@@ -65,6 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n(
|
||||
@@ -72,6 +72,7 @@ GGML_API bool ggml_gallocr_reserve_n(
|
||||
GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
|
||||
|
||||
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
|
||||
|
|
@ -23,10 +23,10 @@ index 2cb150fd2..7ab3f0192 100644
|
|||
// Utils
|
||||
// Create a buffer and allocate all the tensors in a ggml_context
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index f1b740785..c54ff98bf 100644
|
||||
index 4ed5f3577..a7ebe5dcd 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -318,6 +318,7 @@ extern "C" {
|
||||
@@ -319,6 +319,7 @@ extern "C" {
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
|
|
@ -35,7 +35,7 @@ index f1b740785..c54ff98bf 100644
|
|||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
||||
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
|
||||
index ec16cbda9..72f4b9f49 100644
|
||||
index 41419b617..73b39bfea 100644
|
||||
--- a/ggml/src/ggml-alloc.c
|
||||
+++ b/ggml/src/ggml-alloc.c
|
||||
@@ -485,6 +485,7 @@ struct node_alloc {
|
||||
|
|
@ -64,7 +64,7 @@ index ec16cbda9..72f4b9f49 100644
|
|||
free(galloc->buffers);
|
||||
free(galloc->buf_tallocs);
|
||||
free(galloc->node_allocs);
|
||||
@@ -901,6 +906,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
@@ -904,6 +909,8 @@ static bool ggml_gallocr_reserve_n_impl(
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -73,18 +73,19 @@ index ec16cbda9..72f4b9f49 100644
|
|||
// reallocate buffers if needed
|
||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
||||
// if the buffer type is used multiple times, we reuse the same buffer
|
||||
@@ -935,14 +942,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
#endif
|
||||
ggml_vbuffer_free(galloc->buffers[i]);
|
||||
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
- if (galloc->buffers[i] == NULL) {
|
||||
+ if (galloc->buffers[i]) {
|
||||
+ galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
|
||||
+ } else {
|
||||
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
||||
- return false;
|
||||
+ galloc->buffer_sizes[i] = new_size;
|
||||
+ success = false;
|
||||
@@ -940,15 +947,20 @@ static bool ggml_gallocr_reserve_n_impl(
|
||||
galloc->buffers[i] = NULL;
|
||||
} else {
|
||||
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
- if (galloc->buffers[i] == NULL) {
|
||||
+ if (galloc->buffers[i]) {
|
||||
+ galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
|
||||
+ } else {
|
||||
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
||||
- return false;
|
||||
+ galloc->buffer_sizes[i] = new_size;
|
||||
+ success = false;
|
||||
}
|
||||
}
|
||||
+ } else {
|
||||
+ galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
|
||||
|
|
@ -95,8 +96,8 @@ index ec16cbda9..72f4b9f49 100644
|
|||
+ return success;
|
||||
}
|
||||
|
||||
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
||||
@@ -1097,6 +1109,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
void ggml_gallocr_reserve_n_size(
|
||||
@@ -1118,6 +1130,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
return ggml_vbuffer_size(galloc->buffers[buffer_id]);
|
||||
}
|
||||
|
||||
|
|
@ -120,10 +121,10 @@ index ec16cbda9..72f4b9f49 100644
|
|||
|
||||
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index afde2f0b7..dbf8486a0 100644
|
||||
index 9f37ca70c..1459d16dd 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -1840,6 +1840,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
||||
@@ -1859,6 +1859,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
||||
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ Subject: [PATCH] ggml: Export GPU UUIDs
|
|||
3 files changed, 63 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index c54ff98bf..229bf387b 100644
|
||||
index a7ebe5dcd..03557bb31 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -158,6 +158,7 @@ extern "C" {
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
|
|||
2 files changed, 13 insertions(+)
|
||||
|
||||
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
|
||||
index d06fa42e6..0f5712e21 100644
|
||||
index c63f299cd..fc8e0a637 100644
|
||||
--- a/tools/mtmd/mtmd.cpp
|
||||
+++ b/tools/mtmd/mtmd.cpp
|
||||
@@ -87,6 +87,16 @@ enum mtmd_slice_tmpl {
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ consistent performance.
|
|||
8 files changed, 58 insertions(+), 32 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index 229bf387b..2763f2bd6 100644
|
||||
index 03557bb31..93c95602d 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -98,7 +98,7 @@ extern "C" {
|
||||
|
|
@ -40,8 +40,8 @@ index 229bf387b..2763f2bd6 100644
|
|||
+ GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size);
|
||||
+
|
||||
// Initialize backend buffers from a measure graph
|
||||
GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
|
||||
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
|
||||
|
||||
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
|
||||
index 6792ba986..0f5b03cef 100644
|
||||
--- a/ggml/src/ggml-backend-impl.h
|
||||
|
|
@ -58,10 +58,10 @@ index 6792ba986..0f5b03cef 100644
|
|||
// (optional) event synchronization
|
||||
// record an event on this stream
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index dbf8486a0..312ca873c 100644
|
||||
index 1459d16dd..498186a7c 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -348,14 +348,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba
|
||||
@@ -353,14 +353,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba
|
||||
}
|
||||
|
||||
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
|
|
@ -79,7 +79,7 @@ index dbf8486a0..312ca873c 100644
|
|||
}
|
||||
|
||||
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
@@ -722,6 +722,8 @@ struct ggml_backend_sched {
|
||||
@@ -727,6 +727,8 @@ struct ggml_backend_sched {
|
||||
|
||||
bool op_offload;
|
||||
|
||||
|
|
@ -88,7 +88,7 @@ index dbf8486a0..312ca873c 100644
|
|||
int debug;
|
||||
|
||||
// used for debugging graph reallocations [GGML_SCHED_DEBUG_REALLOC]
|
||||
@@ -820,7 +822,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
||||
@@ -825,7 +827,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
||||
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
||||
// check if a backend with higher prio wants to offload the op
|
||||
|
|
@ -97,7 +97,7 @@ index dbf8486a0..312ca873c 100644
|
|||
for (int b = 0; b < src_backend_id; b++) {
|
||||
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
|
||||
SET_CAUSE(tensor, "1.off");
|
||||
@@ -1572,7 +1574,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
@@ -1577,7 +1579,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
}
|
||||
|
||||
if (!sched->callback_eval) {
|
||||
|
|
@ -106,7 +106,7 @@ index dbf8486a0..312ca873c 100644
|
|||
if (ec != GGML_STATUS_SUCCESS) {
|
||||
return ec;
|
||||
}
|
||||
@@ -1594,7 +1596,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
@@ -1599,7 +1601,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
|
||||
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
||||
|
||||
|
|
@ -115,7 +115,7 @@ index dbf8486a0..312ca873c 100644
|
|||
if (ec != GGML_STATUS_SUCCESS) {
|
||||
return ec;
|
||||
}
|
||||
@@ -1684,6 +1686,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
@@ -1689,6 +1691,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
|
||||
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
|
||||
sched->op_offload = op_offload;
|
||||
|
|
@ -123,7 +123,7 @@ index dbf8486a0..312ca873c 100644
|
|||
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
@@ -1715,6 +1718,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
@@ -1720,6 +1723,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
free(sched);
|
||||
}
|
||||
|
||||
|
|
@ -278,10 +278,10 @@ index 8fc1c2fb5..ba95b4acc 100644
|
|||
|
||||
static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index c801d2fd2..b2c0d0cee 100644
|
||||
index 120191ca0..5349bce24 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -13006,7 +13006,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
|
||||
@@ -13099,7 +13099,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
|
||||
return num_adds;
|
||||
}
|
||||
|
||||
|
|
@ -290,7 +290,7 @@ index c801d2fd2..b2c0d0cee 100644
|
|||
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
|
||||
@@ -13241,6 +13241,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
@@ -13334,6 +13334,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
return GGML_STATUS_SUCCESS;
|
||||
|
||||
UNUSED(backend);
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ Subject: [PATCH] fix mtmd-audio.cpp build on windows
|
|||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
|
||||
index 4d053895c..84bdc2777 100644
|
||||
index f68829a61..2024d3d37 100644
|
||||
--- a/tools/mtmd/mtmd-audio.cpp
|
||||
+++ b/tools/mtmd/mtmd-audio.cpp
|
||||
@@ -1,6 +1,6 @@
|
||||
|
|
|
|||
|
|
@ -10,13 +10,13 @@ must be recreated with no-alloc set to false before loading data.
|
|||
---
|
||||
ggml/include/ggml-backend.h | 1 +
|
||||
ggml/src/ggml-backend-impl.h | 16 +++
|
||||
ggml/src/ggml-backend.cpp | 72 +++++++++-
|
||||
ggml/src/ggml-backend.cpp | 75 ++++++++++-
|
||||
ggml/src/ggml-cuda/common.cuh | 62 ++++++++-
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 224 ++++++++++++++++++++++++++------
|
||||
5 files changed, 331 insertions(+), 44 deletions(-)
|
||||
5 files changed, 333 insertions(+), 45 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index 2763f2bd6..b3b5b356a 100644
|
||||
index 93c95602d..dbbb61d9c 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -305,6 +305,7 @@ extern "C" {
|
||||
|
|
@ -75,13 +75,19 @@ index 0f5b03cef..7bdf9d81f 100644
|
|||
|
||||
struct ggml_backend {
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index 312ca873c..4092dfe8a 100644
|
||||
index 498186a7c..7746e8b92 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
|
||||
@@ -36,11 +36,25 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
- GGML_ASSERT(buft);
|
||||
if (size == 0) {
|
||||
// return a dummy buffer for zero-sized allocations
|
||||
return ggml_backend_buffer_init(buft, {}, NULL, 0);
|
||||
}
|
||||
|
||||
+
|
||||
+ if (buft->no_alloc) {
|
||||
+ ggml_backend_buffer_t buf;
|
||||
+
|
||||
|
|
@ -95,10 +101,11 @@ index 312ca873c..4092dfe8a 100644
|
|||
+ return buf;
|
||||
+ }
|
||||
+
|
||||
GGML_ASSERT(buft);
|
||||
+ GGML_ASSERT(buft);
|
||||
return buft->iface.alloc_buffer(buft, size);
|
||||
}
|
||||
@@ -95,7 +108,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
|
||||
@@ -94,7 +108,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
/* .buft = */ buft,
|
||||
/* .context = */ context,
|
||||
/* .size = */ size,
|
||||
|
|
@ -108,7 +115,7 @@ index 312ca873c..4092dfe8a 100644
|
|||
};
|
||||
|
||||
return buffer;
|
||||
@@ -127,6 +141,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -126,6 +141,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
@ -118,10 +125,10 @@ index 312ca873c..4092dfe8a 100644
|
|||
+ return (void *)ggml_backend_buffer_get_alignment(buffer);
|
||||
+ }
|
||||
+
|
||||
void * base = buffer->iface.get_base(buffer);
|
||||
|
||||
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
|
||||
@@ -731,6 +751,12 @@ struct ggml_backend_sched {
|
||||
// FIXME JG: a multi_buffer has a non-zero size, according to the above comment get_base is not optional,
|
||||
// I don't know whether the above comment is correct
|
||||
if (!buffer->iface.get_base) {
|
||||
@@ -736,6 +757,12 @@ struct ggml_backend_sched {
|
||||
int debug_realloc;
|
||||
int debug_graph_size;
|
||||
int debug_prev_graph_size;
|
||||
|
|
@ -134,7 +141,7 @@ index 312ca873c..4092dfe8a 100644
|
|||
};
|
||||
|
||||
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
||||
@@ -1630,6 +1656,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
@@ -1635,6 +1662,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
size_t graph_size,
|
||||
bool parallel,
|
||||
bool op_offload) {
|
||||
|
|
@ -152,7 +159,7 @@ index 312ca873c..4092dfe8a 100644
|
|||
GGML_ASSERT(n_backends > 0);
|
||||
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
||||
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
@@ -1682,11 +1719,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
@@ -1687,11 +1725,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
|
||||
}
|
||||
}
|
||||
|
|
@ -167,7 +174,7 @@ index 312ca873c..4092dfe8a 100644
|
|||
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
@@ -1701,6 +1741,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
@@ -1706,6 +1747,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
ggml_backend_event_free(sched->events[b][c]);
|
||||
}
|
||||
|
|
@ -178,7 +185,7 @@ index 312ca873c..4092dfe8a 100644
|
|||
}
|
||||
ggml_gallocr_free(sched->galloc);
|
||||
ggml_free(sched->ctx);
|
||||
@@ -1746,6 +1790,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
@@ -1765,6 +1810,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -203,7 +210,7 @@ index 312ca873c..4092dfe8a 100644
|
|||
ggml_backend_sched_reset(sched);
|
||||
|
||||
return true;
|
||||
@@ -1851,7 +1913,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
|
||||
@@ -1870,7 +1933,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
|
||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||
|
||||
|
|
|
|||
|
|
@ -8,10 +8,10 @@ Subject: [PATCH] decode: disable output_all
|
|||
1 file changed, 1 insertion(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index 2692297dc..7e3c43f4c 100644
|
||||
index 8786d4ee3..9e6998272 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -991,8 +991,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
@@ -1051,8 +1051,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
const int64_t n_vocab = vocab.n_tokens();
|
||||
const int64_t n_embd = hparams.n_embd_inp();
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ unused then it can be reset to free these data structures.
|
|||
6 files changed, 32 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index b3b5b356a..69223c488 100644
|
||||
index dbbb61d9c..92ca32a4b 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -178,6 +178,7 @@ extern "C" {
|
||||
|
|
@ -43,10 +43,10 @@ index 7bdf9d81f..21b35ac5c 100644
|
|||
|
||||
struct ggml_backend_device {
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index 4092dfe8a..a1a19fe51 100644
|
||||
index 7746e8b92..189e97170 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -526,6 +526,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
|
||||
@@ -532,6 +532,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
|
||||
return device->iface.init_backend(device, params);
|
||||
}
|
||||
|
||||
|
|
@ -122,10 +122,10 @@ index 951a88d56..4e162258d 100644
|
|||
#define cudaError_t hipError_t
|
||||
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index ab2e9868a..74c49e651 100644
|
||||
index 7ed34b80a..0664c8513 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -270,10 +270,12 @@ static struct llama_model * llama_model_load_from_file_impl(
|
||||
@@ -918,10 +918,12 @@ static struct llama_model * llama_model_load_from_file_impl(
|
||||
for (auto * dev : model->devices) {
|
||||
ggml_backend_dev_props props;
|
||||
ggml_backend_dev_get_props(dev, &props);
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ fix vulkan PCI ID and ID handling
|
|||
create mode 100644 ggml/src/mem_nvml.cpp
|
||||
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index 69223c488..6510e0cba 100644
|
||||
index 92ca32a4b..6ad583f09 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -169,6 +169,12 @@ extern "C" {
|
||||
|
|
@ -243,7 +243,7 @@ index ba95b4acc..f6f8f7a10 100644
|
|||
/* .async = */ true,
|
||||
/* .host_buffer = */ false,
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index b2c0d0cee..d9f4d34f5 100644
|
||||
index 5349bce24..d43d46d1d 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -236,6 +236,7 @@ class vk_memory_logger;
|
||||
|
|
@ -254,7 +254,7 @@ index b2c0d0cee..d9f4d34f5 100644
|
|||
|
||||
static constexpr uint32_t mul_mat_vec_max_cols = 8;
|
||||
static constexpr uint32_t p021_max_gqa_ratio = 8;
|
||||
@@ -12256,6 +12257,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
|
||||
@@ -12350,6 +12351,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
|
||||
snprintf(description, description_size, "%s", props.deviceName.data());
|
||||
}
|
||||
|
||||
|
|
@ -284,7 +284,7 @@ index b2c0d0cee..d9f4d34f5 100644
|
|||
// backend interface
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
@@ -13535,15 +13559,72 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
|
||||
@@ -13628,15 +13652,72 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
|
||||
ggml_vk_get_device_description(dev_idx, description, description_size);
|
||||
}
|
||||
|
||||
|
|
@ -361,7 +361,7 @@ index b2c0d0cee..d9f4d34f5 100644
|
|||
|
||||
if (membudget_supported) {
|
||||
memprops.pNext = &budgetprops;
|
||||
@@ -13595,8 +13676,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
@@ -13688,8 +13769,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -376,7 +376,7 @@ index b2c0d0cee..d9f4d34f5 100644
|
|||
}
|
||||
|
||||
vk::PhysicalDeviceProperties2 props = {};
|
||||
@@ -13613,19 +13699,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
@@ -13706,19 +13792,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
|
||||
char pci_bus_id[16] = {};
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
|
||||
|
|
@ -410,7 +410,7 @@ index b2c0d0cee..d9f4d34f5 100644
|
|||
|
||||
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
@@ -13637,9 +13728,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
|
||||
@@ -13730,9 +13821,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
|
|
@ -426,7 +426,7 @@ index b2c0d0cee..d9f4d34f5 100644
|
|||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
@@ -13663,8 +13759,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
@@ -13756,8 +13852,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
|
||||
props->name = ggml_backend_vk_device_get_name(dev);
|
||||
props->description = ggml_backend_vk_device_get_description(dev);
|
||||
|
|
@ -437,7 +437,7 @@ index b2c0d0cee..d9f4d34f5 100644
|
|||
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* .async = */ false,
|
||||
@@ -13672,6 +13769,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
@@ -13765,6 +13862,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
/* .buffer_from_host_ptr = */ false,
|
||||
/* .events = */ false,
|
||||
};
|
||||
|
|
@ -451,7 +451,7 @@ index b2c0d0cee..d9f4d34f5 100644
|
|||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
@@ -14236,6 +14340,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
@@ -14331,6 +14435,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
|
|
@ -460,7 +460,7 @@ index b2c0d0cee..d9f4d34f5 100644
|
|||
for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
|
||||
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
|
||||
char desc[256];
|
||||
@@ -14244,12 +14350,41 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
@@ -14339,12 +14445,41 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
ctx->name = GGML_VK_NAME + std::to_string(i);
|
||||
ctx->description = desc;
|
||||
ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ index 1c07e767a..0da3e065b 100644
|
|||
#ifdef __cplusplus
|
||||
}
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index d9f4d34f5..8a83427fb 100644
|
||||
index d43d46d1d..df79f9f79 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -74,6 +74,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
|
||||
|
|
@ -49,7 +49,7 @@ index d9f4d34f5..8a83427fb 100644
|
|||
|
||||
typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR {
|
||||
VkStructureType sType;
|
||||
@@ -13576,6 +13577,7 @@ struct ggml_backend_vk_device_context {
|
||||
@@ -13669,6 +13670,7 @@ struct ggml_backend_vk_device_context {
|
||||
std::string pci_id;
|
||||
std::string id;
|
||||
std::string uuid;
|
||||
|
|
@ -57,7 +57,7 @@ index d9f4d34f5..8a83427fb 100644
|
|||
int major;
|
||||
int minor;
|
||||
int driver_major;
|
||||
@@ -13594,6 +13596,20 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
|
||||
@@ -13687,6 +13689,20 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
|
||||
|
||||
vk::PhysicalDeviceProperties2 props2;
|
||||
vkdev.getProperties2(&props2);
|
||||
|
|
@ -78,7 +78,7 @@ index d9f4d34f5..8a83427fb 100644
|
|||
|
||||
if (!is_integrated_gpu)
|
||||
{
|
||||
@@ -13625,7 +13641,6 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
|
||||
@@ -13718,7 +13734,6 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
|
||||
}
|
||||
// else fallback to memory budget if supported
|
||||
|
||||
|
|
@ -86,7 +86,7 @@ index d9f4d34f5..8a83427fb 100644
|
|||
if (membudget_supported) {
|
||||
memprops.pNext = &budgetprops;
|
||||
}
|
||||
@@ -14357,7 +14372,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
@@ -14452,7 +14467,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
/* .reg = */ reg,
|
||||
/* .context = */ ctx,
|
||||
});
|
||||
|
|
@ -94,7 +94,7 @@ index d9f4d34f5..8a83427fb 100644
|
|||
// Gather additional information about the device
|
||||
int dev_idx = vk_instance.device_indices[i];
|
||||
vk::PhysicalDeviceProperties props1;
|
||||
@@ -14380,6 +14394,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
@@ -14475,6 +14489,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
}
|
||||
}
|
||||
ctx->uuid = oss.str();
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ Subject: [PATCH] win: exit instead of abort
|
|||
1 file changed, 6 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
||||
index f0913cd35..ecd887583 100644
|
||||
index eb3ae72ea..c9242a15a 100644
|
||||
--- a/ggml/src/ggml.c
|
||||
+++ b/ggml/src/ggml.c
|
||||
@@ -250,8 +250,13 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
|
||||
|
|
|
|||
|
|
@ -9,10 +9,10 @@ Rever to prior logic of assuming an empty projector type is mlp
|
|||
1 file changed, 4 insertions(+)
|
||||
|
||||
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
|
||||
index 9d88dcc62..97c182abf 100644
|
||||
index 38684e4d5..ec26d99a3 100644
|
||||
--- a/tools/mtmd/clip.cpp
|
||||
+++ b/tools/mtmd/clip.cpp
|
||||
@@ -934,6 +934,10 @@ struct clip_model_loader {
|
||||
@@ -961,6 +961,10 @@ struct clip_model_loader {
|
||||
if (proj_type.empty()) {
|
||||
if (modality == CLIP_MODALITY_VISION) {
|
||||
get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ struct llama_vocab * llama_load_vocab_from_file(const char * fname) {
|
|||
try {
|
||||
const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
|
||||
std::vector<std::string> splits = {};
|
||||
llama_model_loader ml(std::string(fname), splits, false, false, nullptr, nullptr);
|
||||
llama_model_loader ml(std::string(fname), splits, false, false, false, nullptr, nullptr);
|
||||
vocab->load(ml, kv);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
|
||||
|
|
|
|||
|
|
@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
|
|||
// call with a worst-case graph to avoid buffer reallocations
|
||||
// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
|
||||
// returns false if the buffer allocation failed
|
||||
// ggml_gallocr_resrve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
|
||||
GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
|
||||
GGML_API void ggml_gallocr_reserve_n_size(
|
||||
ggml_gallocr_t galloc,
|
||||
struct ggml_cgraph * graph,
|
||||
const int * node_buffer_ids,
|
||||
const int * leaf_buffer_ids,
|
||||
size_t * sizes);
|
||||
GGML_API bool ggml_gallocr_reserve_n(
|
||||
ggml_gallocr_t galloc,
|
||||
struct ggml_cgraph * graph,
|
||||
|
|
@ -69,6 +76,8 @@ GGML_API size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, in
|
|||
|
||||
// Utils
|
||||
// Create a buffer and allocate all the tensors in a ggml_context
|
||||
// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
|
||||
GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
||||
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
||||
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
|
||||
|
||||
|
|
|
|||
|
|
@ -319,6 +319,7 @@ extern "C" {
|
|||
GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size);
|
||||
|
||||
// Initialize backend buffers from a measure graph
|
||||
GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
|
||||
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
|
||||
|
||||
GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
|
||||
|
|
|
|||
|
|
@ -2615,7 +2615,8 @@ extern "C" {
|
|||
|
||||
// Set callback for all future logging events.
|
||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||
GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
|
||||
GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
|
||||
GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||
|
||||
|
|
|
|||
|
|
@ -599,7 +599,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
|||
}
|
||||
|
||||
static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
||||
return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
|
||||
return t->data != NULL // tensor data already set externally
|
||||
|| t->buffer // tensor on external buffer (but not yet allocated)
|
||||
|| ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
|
||||
}
|
||||
|
||||
// free the extra space at the end if the new tensor is smaller
|
||||
|
|
@ -828,7 +830,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
|||
}
|
||||
}
|
||||
|
||||
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
|
||||
static bool ggml_gallocr_reserve_n_impl(
|
||||
ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) {
|
||||
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
|
||||
// add 25% margin to avoid hash collisions
|
||||
min_hash_size += min_hash_size / 4;
|
||||
|
|
@ -935,19 +938,22 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||
size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
|
||||
if (cur_size > 0) {
|
||||
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
|
||||
__func__, ggml_backend_buft_name(galloc->bufts[i]),
|
||||
cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||
__func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
ggml_vbuffer_free(galloc->buffers[i]);
|
||||
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
if (galloc->buffers[i]) {
|
||||
galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
|
||||
if (no_alloc) {
|
||||
galloc->buffers[i] = NULL;
|
||||
} else {
|
||||
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
||||
galloc->buffer_sizes[i] = new_size;
|
||||
success = false;
|
||||
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
if (galloc->buffers[i]) {
|
||||
galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
|
||||
} else {
|
||||
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
||||
galloc->buffer_sizes[i] = new_size;
|
||||
success = false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
galloc->buffer_sizes[i] = ggml_vbuffer_size(galloc->buffers[i]);
|
||||
|
|
@ -957,6 +963,21 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||
return success;
|
||||
}
|
||||
|
||||
void ggml_gallocr_reserve_n_size(
|
||||
ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) {
|
||||
GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true));
|
||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
||||
sizes[i] = 0;
|
||||
for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
|
||||
sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
|
||||
return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false);
|
||||
}
|
||||
|
||||
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
||||
return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
|
||||
}
|
||||
|
|
@ -1175,7 +1196,8 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
|||
return true;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
|
||||
static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
|
||||
struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) {
|
||||
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
|
||||
|
||||
size_t alignment = ggml_backend_buft_get_alignment(buft);
|
||||
|
|
@ -1183,6 +1205,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|||
|
||||
ggml_backend_buffer_t * buffers = NULL;
|
||||
size_t n_buffers = 0;
|
||||
*nbytes_total = 0;
|
||||
|
||||
size_t cur_buf_size = 0;
|
||||
struct ggml_tensor * first = ggml_get_first_tensor(ctx);
|
||||
|
|
@ -1194,10 +1217,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|||
|
||||
if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
|
||||
// allocate tensors in the current buffer
|
||||
if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
|
||||
if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
|
||||
return NULL;
|
||||
}
|
||||
first = t;
|
||||
*nbytes_total += cur_buf_size;
|
||||
cur_buf_size = this_size;
|
||||
} else {
|
||||
cur_buf_size += this_size;
|
||||
|
|
@ -1206,15 +1230,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|||
|
||||
// allocate remaining tensors
|
||||
if (cur_buf_size > 0) {
|
||||
if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
|
||||
*nbytes_total += cur_buf_size;
|
||||
if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (no_alloc) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (n_buffers == 0) {
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
|
||||
#endif
|
||||
GGML_ASSERT(!buffers);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
@ -1224,10 +1254,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|||
} else {
|
||||
buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
|
||||
}
|
||||
free(buffers);
|
||||
if (buffers) {
|
||||
free(buffers); // can be NULL if context is empty or no_alloc
|
||||
}
|
||||
return buffer;
|
||||
}
|
||||
|
||||
size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
|
||||
size_t nbytes_total = 0;
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true);
|
||||
GGML_ASSERT(!buf);
|
||||
return nbytes_total;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
|
||||
size_t nbytes_total = 0;
|
||||
return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
|
||||
return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -147,6 +147,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|||
return (void *)ggml_backend_buffer_get_alignment(buffer);
|
||||
}
|
||||
|
||||
// FIXME JG: a multi_buffer has a non-zero size, according to the above comment get_base is not optional,
|
||||
// I don't know whether the above comment is correct
|
||||
if (!buffer->iface.get_base) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void * base = buffer->iface.get_base(buffer);
|
||||
|
||||
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
|
||||
|
|
@ -1786,6 +1792,20 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
|||
sched->is_alloc = false;
|
||||
}
|
||||
|
||||
void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) {
|
||||
GGML_ASSERT(sched);
|
||||
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
||||
GGML_ASSERT(sizes);
|
||||
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
ggml_backend_sched_synchronize(sched);
|
||||
|
||||
ggml_backend_sched_split_graph(sched, measure_graph);
|
||||
|
||||
ggml_gallocr_reserve_n_size(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids, sizes);
|
||||
}
|
||||
|
||||
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
||||
GGML_ASSERT(sched);
|
||||
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_NEON) && (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD))
|
||||
static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
|
||||
int16x8_t * out_mins,
|
||||
int8_t * out_scales) {
|
||||
|
|
@ -46,6 +47,7 @@ static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
|
|||
scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
|
||||
memcpy(out_scales, scales_u32, 8);
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
||||
assert(QK8_0 == 32);
|
||||
|
|
|
|||
|
|
@ -769,9 +769,16 @@ ggml_metal_device_t ggml_metal_device_init(void) {
|
|||
#endif
|
||||
|
||||
dev->props.use_shared_buffers = dev->props.has_unified_memory;
|
||||
#if TARGET_OS_OSX
|
||||
// In case of eGPU, shared memory may be preferable.
|
||||
dev->props.use_shared_buffers |= [dev->mtl_device location] == MTLDeviceLocationExternal;
|
||||
#endif
|
||||
if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
|
||||
dev->props.use_shared_buffers = false;
|
||||
}
|
||||
if (getenv("GGML_METAL_SHARED_BUFFERS_ENABLE") != NULL) {
|
||||
dev->props.use_shared_buffers = true;
|
||||
}
|
||||
|
||||
dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
|
||||
|
||||
|
|
|
|||
|
|
@ -661,6 +661,7 @@ struct vk_device_struct {
|
|||
vk_pipeline pipeline_cos_f32;
|
||||
vk_pipeline pipeline_log[2];
|
||||
vk_pipeline pipeline_tri[2];
|
||||
vk_pipeline pipeline_diag[2];
|
||||
vk_pipeline pipeline_clamp_f32;
|
||||
vk_pipeline pipeline_pad_f32;
|
||||
vk_pipeline pipeline_roll_f32;
|
||||
|
|
@ -724,6 +725,11 @@ struct vk_device_struct {
|
|||
vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
|
||||
vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512;
|
||||
vk_pipeline pipeline_soft_max_back_f32;
|
||||
|
||||
vk_pipeline pipeline_soft_max_large1_f32, pipeline_soft_max_large1_f32_f16;
|
||||
vk_pipeline pipeline_soft_max_large2_f32, pipeline_soft_max_large2_f32_f16;
|
||||
vk_pipeline pipeline_soft_max_large3_f32, pipeline_soft_max_large3_f32_f16;
|
||||
|
||||
vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16;
|
||||
vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16;
|
||||
vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16;
|
||||
|
|
@ -759,7 +765,8 @@ struct vk_device_struct {
|
|||
|
||||
vk_pipeline pipeline_flash_attn_split_k_reduce;
|
||||
|
||||
vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT];
|
||||
// [2] is for whether to take n_experts from spec constant (0) or push constant (1)
|
||||
vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT][2];
|
||||
|
||||
std::vector<vk_pipeline_ref> all_pipelines;
|
||||
|
||||
|
|
@ -1151,6 +1158,7 @@ static_assert(sizeof(vk_op_multi_add_push_constants) <= 256);
|
|||
|
||||
struct vk_op_topk_moe_push_constants {
|
||||
uint32_t n_rows;
|
||||
uint32_t n_experts_push;
|
||||
uint32_t n_expert_used;
|
||||
float clamp_min;
|
||||
float clamp_max;
|
||||
|
|
@ -3732,6 +3740,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs", get_rows_iq4_xs_len, get_rows_iq4_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4], "get_rows_mxfp4", get_rows_mxfp4_len, get_rows_mxfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_I32], "get_rows_i32", get_rows_i32_len, get_rows_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
|
||||
|
|
@ -3919,6 +3928,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
ggml_vk_create_pipeline(device, device->pipeline_tri[0], "tri_f32", tri_f32_len, tri_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_tri[1], "tri_f16", tri_f16_len, tri_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_diag[0], "diag_f32", diag_f32_len, diag_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_diag[1], "diag_f16", diag_f16_len, diag_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
|
@ -3998,6 +4010,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32, "soft_max_large1_f32", soft_max_large1_f32_len, soft_max_large1_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32, "soft_max_large2_f32", soft_max_large2_f32_len, soft_max_large2_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32, "soft_max_large3_f32", soft_max_large3_f32_len, soft_max_large3_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32_f16, "soft_max_large1_f32_f16", soft_max_large1_f32_f16_len, soft_max_large1_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32_f16, "soft_max_large2_f32_f16", soft_max_large2_f32_f16_len, soft_max_large2_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32_f16, "soft_max_large3_f32_f16", soft_max_large3_f32_f16_len, soft_max_large3_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
|
|
@ -4206,10 +4225,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|||
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
|
||||
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0}, 1, true, true, device->subgroup_size);
|
||||
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0}, 1, true, true, device->subgroup_size);
|
||||
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1}, 1, true, true, device->subgroup_size);
|
||||
for (uint32_t use_push = 0; use_push < 2; ++use_push) {
|
||||
for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
|
||||
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX][use_push], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0, use_push}, 1, true, true, device->subgroup_size);
|
||||
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM][use_push], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0, use_push}, 1, true, true, device->subgroup_size);
|
||||
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX][use_push], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1, use_push}, 1, true, true, device->subgroup_size);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto &c : compiles) {
|
||||
|
|
@ -8276,6 +8297,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|||
switch (op) {
|
||||
case GGML_OP_GET_ROWS:
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
||||
if (src0->type == GGML_TYPE_I32) {
|
||||
// i32 src only supports i32 result
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_I32);
|
||||
return ctx->device->pipeline_get_rows[src0->type];
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
return ctx->device->pipeline_get_rows[src0->type];
|
||||
}
|
||||
|
|
@ -8402,6 +8428,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|||
return ctx->device->pipeline_tri[dst->type == GGML_TYPE_F16];
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_DIAG:
|
||||
if (src0->type == dst->type &&
|
||||
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) {
|
||||
return ctx->device->pipeline_diag[dst->type == GGML_TYPE_F16];
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_CLAMP:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_clamp_f32;
|
||||
|
|
@ -8556,7 +8588,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|||
uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
|
||||
GGML_ASSERT(idx < num_topk_moe_pipelines);
|
||||
topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
|
||||
return ctx->device->pipeline_topk_moe[idx][mode];
|
||||
// use n_experts from push constant if it's not equal to the power of two spec constant
|
||||
bool use_push = dst->ne[0] != (1u << idx);
|
||||
return ctx->device->pipeline_topk_moe[idx][mode][use_push];
|
||||
}
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
|
||||
|
|
@ -9093,6 +9127,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|||
case GGML_OP_COS:
|
||||
case GGML_OP_LOG:
|
||||
case GGML_OP_TRI:
|
||||
case GGML_OP_DIAG:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_ROLL:
|
||||
|
|
@ -9780,6 +9815,12 @@ static void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
|||
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TRI, std::move(p));
|
||||
}
|
||||
|
||||
static void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
|
||||
|
||||
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG, std::move(p));
|
||||
}
|
||||
|
||||
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
|
||||
p.param1 = ggml_get_op_params_f32(dst, 0);
|
||||
|
|
@ -10113,7 +10154,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, {
|
||||
vk_op_soft_max_push_constants pc {
|
||||
ncols,
|
||||
src1 != nullptr ? nrows_y : (uint32_t)0,
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],
|
||||
|
|
@ -10124,7 +10165,55 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|||
n_head_log2,
|
||||
nrows_x,
|
||||
src2 != nullptr
|
||||
});
|
||||
};
|
||||
|
||||
if (ncols <= 16384) {
|
||||
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, std::move(pc));
|
||||
} else {
|
||||
|
||||
vk_subbuffer buf_a = ggml_vk_tensor_subbuffer(ctx, src0);
|
||||
vk_subbuffer buf_b = src1 ? ggml_vk_tensor_subbuffer(ctx, src1) : buf_a;
|
||||
vk_subbuffer buf_c = src2 ? ggml_vk_tensor_subbuffer(ctx, src2) : buf_a;
|
||||
vk_subbuffer buf_d = ggml_vk_tensor_subbuffer(ctx, dst);
|
||||
|
||||
uint32_t elems_per_wg = 128 * 4;
|
||||
uint32_t num_wgs = CEIL_DIV(ncols, elems_per_wg);
|
||||
size_t tmp_size = num_wgs * nrows_x * sizeof(float);
|
||||
|
||||
if (ctx->prealloc_size_x < tmp_size) {
|
||||
ctx->prealloc_size_x = tmp_size;
|
||||
ggml_vk_preallocate_buffers(ctx, subctx);
|
||||
}
|
||||
if (ctx->prealloc_size_y < tmp_size) {
|
||||
ctx->prealloc_size_y = tmp_size;
|
||||
ggml_vk_preallocate_buffers(ctx, subctx);
|
||||
}
|
||||
if (ctx->prealloc_x_need_sync || ctx->prealloc_y_need_sync) {
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
}
|
||||
|
||||
vk_subbuffer buf_x = { ctx->prealloc_x, 0, tmp_size };
|
||||
vk_subbuffer buf_y = { ctx->prealloc_y, 0, tmp_size };
|
||||
|
||||
std::array<uint32_t, 3> elements = { num_wgs, nrows_x, 1 };
|
||||
|
||||
vk_pipeline pipeline1 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large1_f32_f16 : ctx->device->pipeline_soft_max_large1_f32;
|
||||
vk_pipeline pipeline2 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large2_f32_f16 : ctx->device->pipeline_soft_max_large2_f32;
|
||||
vk_pipeline pipeline3 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large3_f32_f16 : ctx->device->pipeline_soft_max_large3_f32;
|
||||
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline3, 1);
|
||||
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
|
||||
ggml_vk_sync_buffers(ctx, subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline3, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
|
||||
|
||||
ctx->prealloc_x_need_sync = true;
|
||||
ctx->prealloc_y_need_sync = true;
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
|
|
@ -10160,6 +10249,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|||
|
||||
vk_op_topk_moe_push_constants pc {};
|
||||
pc.n_rows = n_rows;
|
||||
pc.n_experts_push = n_experts;
|
||||
pc.n_expert_used = n_expert_used;
|
||||
if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) {
|
||||
ggml_tensor * clamp = cgraph->nodes[node_idx + 7];
|
||||
|
|
@ -11859,6 +11949,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
|||
case GGML_OP_TRI:
|
||||
ggml_vk_tri(ctx, compute_ctx, src0, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_DIAG:
|
||||
ggml_vk_diag(ctx, compute_ctx, src0, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_CLAMP:
|
||||
ggml_vk_clamp(ctx, compute_ctx, src0, node);
|
||||
|
|
@ -12859,8 +12953,7 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
|
|||
}
|
||||
|
||||
const int n_expert = softmax->ne[0];
|
||||
// n_expert must be a power of 2
|
||||
if (!is_pow2(n_expert) || n_expert > (1 << (num_topk_moe_pipelines-1))) {
|
||||
if (n_expert > (1 << (num_topk_moe_pipelines-1))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -13999,6 +14092,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_I32:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
|
@ -14123,6 +14217,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_LOG:
|
||||
case GGML_OP_TRI:
|
||||
case GGML_OP_DIAG:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
|
||||
op->type == op->src[0]->type;
|
||||
case GGML_OP_ARGSORT:
|
||||
|
|
@ -14751,6 +14846,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
|||
tensor_clone = ggml_log(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_TRI) {
|
||||
tensor_clone = ggml_tri(ggml_ctx, src_clone[0], ggml_get_op_params_i32(tensor, 0));
|
||||
} else if (tensor->op == GGML_OP_DIAG) {
|
||||
tensor_clone = ggml_diag(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_CLAMP) {
|
||||
const float * params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,29 @@
|
|||
#version 450
|
||||
|
||||
#include "rte.glsl"
|
||||
#include "types.glsl"
|
||||
#include "generic_unary_head.glsl"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
void main() {
|
||||
const uint idx = get_idx();
|
||||
|
||||
if (idx >= p.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
|
||||
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
|
||||
const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
|
||||
const uint i12_offset = i12*p.ne11*p.ne10;
|
||||
const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
|
||||
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
|
||||
|
||||
if (i10 == i11) {
|
||||
const float val = float(data_a[get_aoffset() + i13*p.nb03 + i12*p.nb02 + 0*p.nb01 + i10*p.nb00]);
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val);
|
||||
} else {
|
||||
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(0);
|
||||
}
|
||||
}
|
||||
|
|
@ -256,6 +256,9 @@ void main() {
|
|||
barrier();
|
||||
}
|
||||
|
||||
// prevent race on tmpsh
|
||||
barrier();
|
||||
|
||||
// reduce across threads
|
||||
|
||||
[[unroll]] for (uint32_t r = 0; r < Br; ++r) {
|
||||
|
|
|
|||
|
|
@ -302,6 +302,9 @@ void main() {
|
|||
barrier();
|
||||
}
|
||||
|
||||
// prevent race on tmpsh
|
||||
barrier();
|
||||
|
||||
// reduce across threads
|
||||
|
||||
float rowmaxf[rows_per_thread], eMf[rows_per_thread], Moldf[rows_per_thread];
|
||||
|
|
|
|||
|
|
@ -26,9 +26,9 @@ void main() {
|
|||
const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
|
||||
|
||||
#if defined(DATA_A_BF16)
|
||||
FLOAT_TYPE v = FLOAT_TYPE(bf16_to_fp32(data_a[a_offset + i00]));
|
||||
TEMP_TYPE v = TEMP_TYPE(bf16_to_fp32(data_a[a_offset + i00]));
|
||||
#else
|
||||
FLOAT_TYPE v = FLOAT_TYPE(data_a[a_offset + i00]);
|
||||
TEMP_TYPE v = TEMP_TYPE(data_a[a_offset + i00]);
|
||||
#endif
|
||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||
data_d[d_offset + i00] = D_TYPE(v);
|
||||
|
|
|
|||
|
|
@ -7,34 +7,50 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
|||
|
||||
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
|
||||
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
|
||||
const uint y_idx = i * QUANT_K + 32 * ib32;
|
||||
|
||||
uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const float d = float(data_a[ibi].d);
|
||||
const uint qh = data_a[ibi].qh[ib32];
|
||||
const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1);
|
||||
const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
|
||||
|
||||
void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i,
|
||||
const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
|
||||
const uint y_idx_base = i * QUANT_K + 32 * ib32;
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
const uint base_b_idx = (j * p.batch_stride_b + b_offset + y_idx_base) / 4;
|
||||
[[unroll]] for (uint l = 0; l < 4; ++l) {
|
||||
const uint qs = data_a[ibi].qs[4 * ib32 + l];
|
||||
const uint idxhi = bitfieldExtract(qh, 3 * int(l), 3);
|
||||
const int16_t grid = int16_t(iq1s_grid[qs | (idxhi << 8)]);
|
||||
const vec4 b_val_0 = vec4(data_b_v4[base_b_idx + 2 * l]);
|
||||
const vec4 b_val_1 = vec4(data_b_v4[base_b_idx + 2 * l + 1]);
|
||||
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
|
||||
vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
|
||||
// index for data_a
|
||||
uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
|
||||
|
||||
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
|
||||
const float d = float(data_a[ibi].d);
|
||||
const uint qh = data_a[ibi].qh[ib32];
|
||||
|
||||
const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1);
|
||||
const uint qs = data_a[ibi].qs[4 * ib32 + l];
|
||||
const uint idxhi = bitfieldExtract(qh, 3 * int(l), 3);
|
||||
const uint16_t grid = uint16_t(iq1s_grid[qs | (idxhi << 8)]);
|
||||
|
||||
const float delta_val = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
|
||||
const vec4 delta_v = vec4(delta_val);
|
||||
const vec4 fbits0 = vec4(
|
||||
float(bitfieldExtract(grid, 0, 2)),
|
||||
float(bitfieldExtract(grid, 2, 2)),
|
||||
float(bitfieldExtract(grid, 4, 2)),
|
||||
float(bitfieldExtract(grid, 6, 2))
|
||||
);
|
||||
const vec4 fbits1 = vec4(
|
||||
float(bitfieldExtract(grid, 8, 2)),
|
||||
float(bitfieldExtract(grid, 10, 2)),
|
||||
float(bitfieldExtract(grid, 12, 2)),
|
||||
float(bitfieldExtract(grid, 14, 2))
|
||||
);
|
||||
|
||||
vec4 sum_v = fma(b_val_0, fbits0 + delta_v, vec4(0.0));
|
||||
sum_v = fma(b_val_1, fbits1 + delta_v, sum_v);
|
||||
FLOAT_TYPE sum = dot(sum_v, vec4(1.0));
|
||||
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||
[[unroll]] for (int k = 0; k < 4; ++k) {
|
||||
sum = fma(FLOAT_TYPE(b0[k]), bitfieldExtract(grid, 2 * k, 2) + delta,
|
||||
fma(FLOAT_TYPE(b4[k]), bitfieldExtract(grid, 8 + 2 * k, 2) + delta, sum));
|
||||
}
|
||||
temp[j][n] = fma(dl, sum, temp[j][n]);
|
||||
ibi += num_blocks_per_row;
|
||||
}
|
||||
}
|
||||
ibi += num_blocks_per_row;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -244,17 +244,20 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
|||
const uint iqs = idx % 128; // 0..127
|
||||
|
||||
const uint n = iqs / 64; // 0,1
|
||||
const uint b = (iqs % 64) / 32; // 0,1
|
||||
const uint b = ((iqs % 64) / 32) * 4; // 0,4
|
||||
const uint is_b = (iqs % 16) / 8; // 0,1
|
||||
const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
|
||||
const uint is = 8 * n + qhshift + is_b; // 0..15
|
||||
const uint qsi = n * 64 + (iqs % 32) * 2; // 0,2,4..126
|
||||
const uint qhi = n * 32 + (iqs % 16) * 2; // 0,2,4..62
|
||||
const uint qsi = n * 32 + (iqs % 32); // 0..63
|
||||
const uint qhi = n * 16 + (iqs % 16); // 0..31
|
||||
|
||||
const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);
|
||||
|
||||
buf_a[buf_idx] = FLOAT_TYPE_VEC2(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32),
|
||||
dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
|
||||
const uint ql = (uint(data_a_packed16[ib].ql[qsi]) >> b) & 0x0F0F;
|
||||
const uint qh = (uint(data_a_packed16[ib].qh[qhi]) >> qhshift) & 0x0303;
|
||||
const vec2 q = (vec2(unpack8(ql | (qh << 4)).xy) - 32) * dscale;
|
||||
|
||||
buf_a[buf_idx] = FLOAT_TYPE_VEC2(q.x, q.y);
|
||||
#elif defined(DATA_A_IQ1_S)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,62 @@
|
|||
#version 450
|
||||
|
||||
#include "soft_max_large_common.glsl"
|
||||
|
||||
void main() {
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint rowx = gl_WorkGroupID.y;
|
||||
const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters;
|
||||
|
||||
const uint32_t i03 = rowx / (p.ne01 * p.ne02);
|
||||
const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01;
|
||||
const uint32_t i01 = rowx % p.ne01;
|
||||
|
||||
uint rowy_start = 0;
|
||||
if (p.KY > 0) {
|
||||
rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13;
|
||||
}
|
||||
|
||||
if (rowx >= p.nrows_x) {
|
||||
return;
|
||||
}
|
||||
|
||||
float slope = get_slope(rowx);
|
||||
|
||||
// Find max
|
||||
FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02];
|
||||
|
||||
[[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
|
||||
const uint col = col0 + tid;
|
||||
|
||||
FLOAT_TYPE a = FLOAT_TYPE(0);
|
||||
if (col < p.KX) {
|
||||
a = data_a[rowx * p.KX + col];
|
||||
}
|
||||
|
||||
FLOAT_TYPE b = FLOAT_TYPE(0);
|
||||
if (p.KY > 0 && col < p.KX) {
|
||||
b = data_b[rowy_start + col];
|
||||
}
|
||||
|
||||
FLOAT_TYPE v = a * p.scale + slope * b;
|
||||
|
||||
if (col < p.KX) {
|
||||
max_val = max(max_val, v);
|
||||
}
|
||||
}
|
||||
|
||||
// reduce across the workgroup
|
||||
vals[tid] = max_val;
|
||||
barrier();
|
||||
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
vals[tid] = max(vals[tid], vals[tid + s]);
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
max_val = vals[0];
|
||||
data_m[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = max_val;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
#version 450
|
||||
|
||||
#include "soft_max_large_common.glsl"
|
||||
|
||||
void main() {
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint rowx = gl_WorkGroupID.y;
|
||||
const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters;
|
||||
|
||||
const uint32_t i03 = rowx / (p.ne01 * p.ne02);
|
||||
const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01;
|
||||
const uint32_t i01 = rowx % p.ne01;
|
||||
|
||||
uint rowy_start = 0;
|
||||
if (p.KY > 0) {
|
||||
rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13;
|
||||
}
|
||||
|
||||
if (rowx >= p.nrows_x) {
|
||||
return;
|
||||
}
|
||||
|
||||
float slope = get_slope(rowx);
|
||||
|
||||
// Find max
|
||||
FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02];
|
||||
|
||||
[[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) {
|
||||
if (i + tid < gl_NumWorkGroups.x) {
|
||||
max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]);
|
||||
}
|
||||
}
|
||||
|
||||
// reduce across the workgroup
|
||||
vals[tid] = max_val;
|
||||
barrier();
|
||||
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
vals[tid] = max(max_val, vals[tid + s]);
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
max_val = vals[0];
|
||||
barrier();
|
||||
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0f);
|
||||
|
||||
// Compute sum{exp(x - max)}
|
||||
[[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
|
||||
const uint col = col0 + tid;
|
||||
|
||||
if (col >= p.KX) {
|
||||
break;
|
||||
}
|
||||
|
||||
// compute exp(a*scale+b*slope), add it to sum
|
||||
const uint i = rowx * p.KX + col;
|
||||
FLOAT_TYPE val;
|
||||
val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy_start + col]) : FLOAT_TYPE(0.0f)) - max_val);
|
||||
sum += val;
|
||||
data_d[i] = D_TYPE(val);
|
||||
}
|
||||
|
||||
// reduce across the workgroup
|
||||
vals[tid] = sum;
|
||||
barrier();
|
||||
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
vals[tid] += vals[tid + s];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
sum = vals[0];
|
||||
data_s[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = sum;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,65 @@
|
|||
#version 450
|
||||
|
||||
#include "soft_max_large_common.glsl"
|
||||
|
||||
shared FLOAT_TYPE sumsh[BLOCK_SIZE];
|
||||
|
||||
void main() {
|
||||
const uint tid = gl_LocalInvocationID.x;
|
||||
const uint rowx = gl_WorkGroupID.y;
|
||||
const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters;
|
||||
|
||||
const uint32_t i03 = rowx / (p.ne01 * p.ne02);
|
||||
const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01;
|
||||
const uint32_t i01 = rowx % p.ne01;
|
||||
|
||||
uint rowy_start = 0;
|
||||
if (p.KY > 0) {
|
||||
rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13;
|
||||
}
|
||||
|
||||
if (rowx >= p.nrows_x) {
|
||||
return;
|
||||
}
|
||||
|
||||
FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02];
|
||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0f);
|
||||
|
||||
[[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) {
|
||||
if (i + tid < gl_NumWorkGroups.x) {
|
||||
max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]);
|
||||
sum += data_s[rowx * gl_NumWorkGroups.x + i + tid];
|
||||
}
|
||||
}
|
||||
|
||||
// reduce across the workgroup
|
||||
vals[tid] = max_val;
|
||||
sumsh[tid] = sum;
|
||||
barrier();
|
||||
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
|
||||
if (tid < s) {
|
||||
vals[tid] = max(max_val, vals[tid + s]);
|
||||
sumsh[tid] += sumsh[tid + s];
|
||||
}
|
||||
barrier();
|
||||
}
|
||||
|
||||
max_val = vals[0];
|
||||
sum = sumsh[0];
|
||||
|
||||
if (p.has_sinks != 0) {
|
||||
sum += FLOAT_TYPE(exp(FLOAT_TYPE(data_c[i02]) - max_val));
|
||||
}
|
||||
|
||||
FLOAT_TYPE rcpdivisor = 1.0/sum;
|
||||
|
||||
[[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
|
||||
const uint col = col0 + tid;
|
||||
|
||||
if (col >= p.KX) {
|
||||
continue;
|
||||
}
|
||||
|
||||
data_d[rowx*p.KX + col] *= D_TYPE(rcpdivisor);
|
||||
}
|
||||
}
|
||||
53
ml/backend/ggml/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl
vendored
Normal file
53
ml/backend/ggml/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl
vendored
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
#extension GL_EXT_control_flow_attributes : enable
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint KX;
|
||||
uint KY;
|
||||
uint ne00;
|
||||
uint ne01;
|
||||
uint ne02;
|
||||
uint ne12;
|
||||
uint ne13;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb13;
|
||||
float scale;
|
||||
float max_bias;
|
||||
float m0;
|
||||
float m1;
|
||||
uint n_head_log2;
|
||||
uint nrows_x;
|
||||
uint has_sinks;
|
||||
} p;
|
||||
|
||||
#include "types.glsl"
|
||||
|
||||
layout(constant_id = 0) const uint BLOCK_SIZE = 128;
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
layout(constant_id = 1) const uint num_iters = 4;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer Y {B_TYPE data_b[];};
|
||||
layout (binding = 2) readonly buffer Z {float data_c[];};
|
||||
layout (binding = 3) buffer D {D_TYPE data_d[];};
|
||||
layout (binding = 4) buffer M {float data_m[];};
|
||||
layout (binding = 5) buffer S {float data_s[];};
|
||||
|
||||
shared FLOAT_TYPE vals[BLOCK_SIZE];
|
||||
|
||||
float get_slope(uint rowx) {
|
||||
float slope = 1.0f;
|
||||
|
||||
// ALiBi
|
||||
if (p.max_bias > 0.0f) {
|
||||
const uint h = (rowx / p.ne01) % p.ne02; // head index
|
||||
|
||||
const float base = h < p.n_head_log2 ? p.m0 : p.m1;
|
||||
const uint exp = h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1;
|
||||
|
||||
slope = pow(base, exp);
|
||||
}
|
||||
|
||||
return slope;
|
||||
}
|
||||
|
|
@ -10,6 +10,7 @@
|
|||
layout (push_constant) uniform parameter
|
||||
{
|
||||
uint n_rows;
|
||||
uint n_experts_push;
|
||||
uint n_expert_used;
|
||||
float clamp_min;
|
||||
float clamp_max;
|
||||
|
|
@ -18,11 +19,16 @@ layout (push_constant) uniform parameter
|
|||
layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;
|
||||
|
||||
layout(constant_id = 0) const uint WARP_SIZE = 32;
|
||||
layout(constant_id = 1) const uint n_experts = 512;
|
||||
layout(constant_id = 1) const uint n_experts_spec = 512;
|
||||
layout(constant_id = 2) const bool with_norm = true;
|
||||
layout(constant_id = 3) const bool late_softmax = false;
|
||||
layout(constant_id = 4) const bool nexperts_use_push = false;
|
||||
|
||||
const uint experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1;
|
||||
uint n_experts = nexperts_use_push ? n_experts_push : n_experts_spec;
|
||||
|
||||
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
|
||||
|
||||
const uint experts_per_thread = CEIL_DIV(n_experts_spec, WARP_SIZE);
|
||||
|
||||
layout (binding = 0, std430) readonly buffer Logits {float logits[];};
|
||||
layout (binding = 1, std430) writeonly buffer Weights {float weights[];};
|
||||
|
|
@ -94,7 +100,7 @@ void main() {
|
|||
}
|
||||
|
||||
if (!late_softmax) {
|
||||
softmax_warp_inplace(wt, n_experts, lane, false);
|
||||
softmax_warp_inplace(wt, n_experts, lane, nexperts_use_push);
|
||||
}
|
||||
|
||||
// at this point, each thread holds a portion of softmax,
|
||||
|
|
|
|||
|
|
@ -704,13 +704,15 @@ void process_shaders() {
|
|||
shader = (tname == "f32" || tname == "f16" || tname == "bf16") ? "get_rows.comp" : "get_rows_quant.comp";
|
||||
|
||||
if (tname == "f16") {
|
||||
string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}));
|
||||
string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}));
|
||||
} else {
|
||||
string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}));
|
||||
string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}));
|
||||
}
|
||||
string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}));
|
||||
}
|
||||
|
||||
string_to_spv("get_rows_i32", "get_rows.comp", {{"TEMP_TYPE", "uint"}, {"A_TYPE", "uint"}, {"B_TYPE", "int"}, {"D_TYPE", "uint"}});
|
||||
|
||||
string_to_spv("mul_mat_vec_p021_f16_f32_subgroup_add", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}});
|
||||
string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
|
||||
|
|
@ -854,6 +856,8 @@ void process_shaders() {
|
|||
|
||||
string_to_spv("tri_f16", "tri.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("tri_f32", "tri.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("diag_f16", "diag.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("diag_f32", "diag.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("softplus_f16", "softplus.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("softplus_f32", "softplus.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
|
@ -899,6 +903,13 @@ void process_shaders() {
|
|||
string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("soft_max_back_f32", "soft_max_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("soft_max_large1_f32", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("soft_max_large2_f32", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("soft_max_large3_f32", "soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("soft_max_large1_f32_f16", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("soft_max_large2_f32_f16", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("soft_max_large3_f32_f16", "soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}});
|
||||
string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}});
|
||||
string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
|
||||
|
|
|
|||
|
|
@ -7571,6 +7571,11 @@ size_t ggml_quantize_chunk(
|
|||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void ggml_log_get(ggml_log_callback * log_callback, void ** user_data) {
|
||||
*log_callback = g_logger_state.log_callback;
|
||||
*user_data = g_logger_state.log_callback_user_data;
|
||||
}
|
||||
|
||||
void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
|
||||
g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
|
||||
g_logger_state.log_callback_user_data = user_data;
|
||||
|
|
|
|||
Loading…
Reference in New Issue