@@ -475,6 +475,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
+    // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
+    hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
+    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+
     ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
 
     // non-transformer models do not have attention heads
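Note on this hunk: the two new `*_swa` fields simply default to the regular training values and are only overridden where an architecture trains its sliding-window layers differently, as the Gemma 3 hunk below does. A minimal sketch of that default-then-override flow (the struct and concrete values are illustrative, not the actual llama.cpp hparams layout):

```cpp
// sketch of the default-then-override flow for the new *_swa fields;
// struct and values are illustrative, not the real llama.cpp hparams
struct rope_train_params {
    float freq_base_train      = 1000000.0f;
    float freq_scale_train     = 1.0f;
    float freq_base_train_swa  = 0.0f;
    float freq_scale_train_swa = 0.0f;
};

void load_rope_defaults(rope_train_params & hp, bool arch_overrides_swa) {
    // default: sliding-window layers reuse the non-SWA training values
    hp.freq_base_train_swa  = hp.freq_base_train;
    hp.freq_scale_train_swa = hp.freq_scale_train;

    if (arch_overrides_swa) {
        // e.g. the Gemma 3 case below pins its SWA layers to a 10k base
        hp.freq_base_train_swa  = 10000.0f;
        hp.freq_scale_train_swa = 1.0f;
    }
}
```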
@@ -877,6 +881,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 hparams.n_swa_pattern = 6;
 
+                hparams.rope_freq_base_train_swa  = 10000.0f;
+                hparams.rope_freq_scale_train_swa = 1.0f;
+
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
@@ -1346,13 +1353,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
+        const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
-            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(cpu_dev));
+            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
             return {cpu_dev, &pimpl->cpu_buft_list};
         }
         const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
         auto * dev = devices.at(layer_gpu);
-        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(dev));
+        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
         return {dev, &pimpl->gpu_buft_list.at(dev)};
     };
 
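Side note on the device-assignment lambda above: assuming `splits` holds cumulative per-device fractions ending at 1.0 (which is what the `std::upper_bound` usage suggests), a layer's relative position `(il - i_gpu_start)/act_gpu_layers` selects the first device whose cumulative share exceeds it. A small standalone sketch with made-up split values:

```cpp
// sketch of mapping a layer index to a device via cumulative split fractions,
// mirroring the std::upper_bound call above; the splits values are made up
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    // cumulative per-device fractions of the offloaded layers (end at 1.0)
    const std::vector<float> splits = {0.25f, 0.75f, 1.0f}; // 3 hypothetical GPUs

    const int i_gpu_start    = 0; // first offloaded layer
    const int act_gpu_layers = 8; // number of offloaded layers

    for (int il = i_gpu_start; il < i_gpu_start + act_gpu_layers; ++il) {
        const float frac = float(il - i_gpu_start)/act_gpu_layers;
        const int   dev  = std::upper_bound(splits.begin(), splits.end(), frac) - splits.begin();
        std::printf("layer %d -> device %d (frac = %.3f)\n", il, dev, frac);
    }
    return 0;
}
```

With the example splits {0.25, 0.75, 1.0}, the 8 offloaded layers land 2/4/2 on devices 0/1/2.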
@@ -7381,10 +7389,10 @@ struct llm_build_gemma3 : public llm_graph_context {
         auto * inp_attn = build_attn_inp_kv_unified(true, true);
 
         for (int il = 0; il < n_layer; ++il) {
-            const bool is_sliding = hparams.is_sliding(il);
+            const bool is_swa = hparams.is_swa(il);
 
-            const float freq_base_l  = is_sliding ? 10000.0f : freq_base;
-            const float freq_scale_l = is_sliding ? 1.0f     : freq_scale;
+            const float freq_base_l  = is_swa ? hparams.rope_freq_base_train_swa  : cparams.rope_freq_base;
+            const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
 
             // norm
             cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
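To make the per-layer selection above concrete: the sketch below uses a hypothetical stand-in for `hparams.is_swa(il)` following Gemma 3's 5-sliding : 1-full interleaving; the 10k base for sliding-window layers matches the load_hparams hunk above, while the 1e6 base for full-attention layers is an assumption for illustration.

```cpp
// sketch of per-layer RoPE parameter selection for an SWA-interleaved model;
// is_swa() and the concrete values are illustrative, not llama.cpp internals
#include <cstdio>

struct swa_rope_params {
    float rope_freq_base      = 1000000.0f; // full-attention layers (assumed)
    float rope_freq_scale     = 1.0f;
    float rope_freq_base_swa  = 10000.0f;   // sliding-window layers (per the hunk above)
    float rope_freq_scale_swa = 1.0f;
    int   n_swa_pattern       = 6;          // every 6th layer uses full attention

    // hypothetical stand-in for hparams.is_swa(il)
    bool is_swa(int il) const { return n_swa_pattern > 0 && il % n_swa_pattern < n_swa_pattern - 1; }
};

int main() {
    swa_rope_params hp;
    for (int il = 0; il < 12; ++il) {
        const bool  is_swa       = hp.is_swa(il);
        const float freq_base_l  = is_swa ? hp.rope_freq_base_swa  : hp.rope_freq_base;
        const float freq_scale_l = is_swa ? hp.rope_freq_scale_swa : hp.rope_freq_scale;
        std::printf("layer %2d: is_swa = %d, freq_base = %9.1f, freq_scale = %.2f\n",
                    il, is_swa, freq_base_l, freq_scale_l);
    }
    return 0;
}
```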
@@ -7973,7 +7981,7 @@ struct llm_build_cohere2 : public llm_graph_context {
         auto * inp_attn = build_attn_inp_kv_unified(true, true);
 
         for (int il = 0; il < n_layer; ++il) {
-            const bool is_sliding = hparams.is_sliding(il);
+            const bool is_swa = hparams.is_swa(il);
 
             // norm
             cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
@@ -8007,7 +8015,7 @@ struct llm_build_cohere2 : public llm_graph_context {
                 cb(Vcur, "Vcur", il);
             }
 
-            if (is_sliding) {
+            if (is_swa) {
                 Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor,
                     beta_fast, beta_slow);