mac committed
Commit 385c000 · 1 Parent(s): e102133

update_readme

eagle/model/ea_model.py CHANGED
@@ -21,7 +21,6 @@ from .cnets1 import Model as Model1
 from .configs import EConfig
 
 """ Modified to support Eagle-3, marked by <mod> xxx </mod> """
-# from .modeling_minicpm_kv import HackConvertMiniCPMForCausalLM as KVMiniCPMForCausalLM # <mod> convert opensource impl to llama </mod>
 from .modeling_minicpm_kv import MiniCPMForCausalLM as KVMiniCPMForCausalLM # <mod> use modified opensource impl </mod>
 
 class EaModel(nn.Module):
 
eagle/model/modeling_minicpm_kv.py CHANGED
@@ -2443,45 +2443,3 @@ class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
             attentions=transformer_outputs.attentions,
         )
 
-
-
-# hack version
-from .modeling_llama_kv import LlamaForCausalLM as KVLlamaForCausalLM
-
-class HackConvertMiniCPMForCausalLM:
-    def from_pretrained(model_path, **kwargs):
-        model = KVLlamaForCausalLM.from_pretrained(model_path, **kwargs)
-
-        state_dict = model.state_dict()
-        scale_emb = 12
-        dim_model_base = 256
-        scale_depth = 1.4
-        num_layers = 32
-        hidden_size = 4096
-
-        print(state_dict["model.embed_tokens.weight"])
-        embedding = state_dict["model.embed_tokens.weight"]
-        # model.embed_tokens.weight * scale_emb
-        new_emb = embedding.clone() * scale_emb
-        state_dict["model.embed_tokens.weight"] = new_emb
-
-        # lm_head.weight / (hidden_size / dim_model_base)
-        new_emb = state_dict["lm_head.weight"].clone() / (hidden_size / dim_model_base)
-        state_dict["lm_head.weight"] = new_emb
-
-        # model.layers.{i}.self_attn.o_proj.weight * (scale_depth / sqrt(num_layers))
-        for i in range(num_layers):
-            attn_out_name = f"model.layers.{i}.self_attn.o_proj.weight"
-            new_weight = state_dict[attn_out_name] * (scale_depth / math.sqrt(num_layers))
-            state_dict[attn_out_name] = new_weight
-
-            ffn_down_proj_name = f"model.layers.{i}.mlp.down_proj.weight"
-            new_weight = state_dict[ffn_down_proj_name] * (scale_depth / math.sqrt(num_layers))
-            state_dict[ffn_down_proj_name] = new_weight
-
-        print(f"Converting: reload from converted state_dict.\nCheck sd:\n{model}")
-
-        model.load_state_dict(state_dict)
-        print(f"Convert to llama: DONE.")
-
-        return model
 
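For reference, the removed HackConvertMiniCPMForCausalLM worked by folding MiniCPM's runtime activation scalings into the weights of a llama-format checkpoint, so that a vanilla llama forward pass reproduces MiniCPM's outputs: embeddings are multiplied by scale_emb, the LM head absorbs the 1/(hidden_size / dim_model_base) pre-logit scaling, and each residual branch absorbs scale_depth / sqrt(num_layers). Below is a minimal standalone sketch of that rescaling, assuming the MiniCPM-2B-style constants hard-coded in the removed code; the helper name convert_minicpm_to_llama_weights is illustrative and not part of this repository.

import math

def convert_minicpm_to_llama_weights(state_dict,
                                     scale_emb=12,
                                     dim_model_base=256,
                                     scale_depth=1.4,
                                     num_layers=32,
                                     hidden_size=4096):
    # Hypothetical helper mirroring the removed hack: fold MiniCPM's
    # runtime scalings into llama-format weights (tensors in state_dict).
    sd = dict(state_dict)

    # MiniCPM multiplies token embeddings by scale_emb at runtime;
    # bake that factor into the embedding matrix instead.
    sd["model.embed_tokens.weight"] = sd["model.embed_tokens.weight"].clone() * scale_emb

    # MiniCPM scales hidden states by 1 / (hidden_size / dim_model_base)
    # before the LM head; fold that division into lm_head.weight.
    sd["lm_head.weight"] = sd["lm_head.weight"].clone() / (hidden_size / dim_model_base)

    # Each residual branch (attention output projection and MLP down
    # projection) is scaled by scale_depth / sqrt(num_layers); apply per layer.
    depth_scale = scale_depth / math.sqrt(num_layers)
    for i in range(num_layers):
        for name in (f"model.layers.{i}.self_attn.o_proj.weight",
                     f"model.layers.{i}.mlp.down_proj.weight"):
            sd[name] = sd[name].clone() * depth_scale

    return sd

In the removed hack, the rescaled state dict was pushed back into a KVLlamaForCausalLM via model.load_state_dict(...); this commit drops that conversion path in favor of the modified MiniCPM implementation imported directly in ea_model.py.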