{
  "architectures": [
    "LlamaForCausalLMWithGNN"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "gnn_config": {
    "GIN_edge_weight_scaling": true,
    "GIN_hidden_dim_multiplier": 2,
    "GIN_use_MLP": true,
    "GIN_use_norm": true,
    "LlamaAttentionHierarchicalPerceiverAR_use_rope": true,
    "LlamaAttentionHierarchicalVariant_2_PerceiverAR_use_skip": true,
    "MLP_type": "standard_MLP",
    "N_GNN_from_attention_layers": 3,
    "activation": "relu",
    "add_rope": false,
    "adj_construction_method": "threshold_any",
    "adj_transform_hidden_dim": 128,
    "attention_GIN_MLP_GIN_MLP_mode": "shared",
    "attention_GIN_MLP_GIN_binary_scale": 1.0,
    "attention_GIN_MLP_GIN_learnable_threshold": false,
    "attention_GIN_MLP_GIN_softmax_temperature": 1.0,
    "attention_GIN_MLP_GIN_threshold_mode": "none",
    "attention_GIN_MLP_GIN_threshold_value": 0.2,
    "attention_GIN_MLP_GIN_top_k_fraction_of_sequence_length": 0.1,
    "attention_GIN_MLP_GIN_use_softmax": false,
    "attention_GIN_MLP_attention_mix_mode": "A",
    "attention_GIN_MLP_multiplier": 2,
    "attention_GIN_MLP_o_proj_at_end": false,
    "attention_GIN_MLP_scoring_hidden_dim": 512,
    "attention_GIN_MLP_second_order_factor": 0.1,
    "attention_GIN_MLP_separate_attention": false,
    "attention_GIN_MLP_use_scoring_fnct": true,
    "attention_GIN_MLP_use_second_order": false,
    "attention_epsilon_strategy": "default",
    "attention_epsilon_uniform_value": 0.5,
    "combined_norm": false,
    "continuous_transform_alpha": 10.0,
    "distance_scaling_method": "power",
    "distance_weight_strength": 1.0,
    "dropout": 0.1,
    "enforce_causality": true,
    "epsilon_threshold": 0.6,
    "gnn_logic": "before_MLP",
    "gnn_mode": "single",
    "gnn_residual": false,
    "gnn_type": "causal_gin",
    "group_tokens_for_coarse_graining": false,
    "hidden_dim": 128,
    "hierarchical_enc_dec_type": "PerceiverAR",
    "initial_sharpening_value": 1.0,
    "lambda_GNN": 1.0,
    "lambda_GNN_initial": null,
    "learnable_aggregate_activation": "softmax",
    "max_position_embeddings": 2048,
    "mix_weights_initial": 0.5,
    "model_type": "",
    "norm_to_hidden_states": false,
    "num_latent_layers": 4,
    "num_latents": 32,
    "num_latents_list": [
      64,
      32,
      8
    ],
    "num_layers": 2,
    "per_head_ff": false,
    "plot_for_debugging": false,
    "remove_self_connections": true,
    "residual_epsilon_strategy": "default",
    "residual_epsilon_uniform_value": 0.1,
    "rms_norm_eps": 1e-05,
    "sharpening_value_init": "value",
    "soft_masking_initial_threshold": 0.01,
    "soft_masking_k": 10.0,
    "threshold": 0.2,
    "threshold_any_tau": 0.1,
    "tokenizer": null,
    "use_GNN_from_attention": "none",
    "use_GNN_from_attention_add_RoPE_at_every_layer": false,
    "use_differential_attention": false,
    "use_distance_scaling": false,
    "use_fixed_number_of_tokens_per_latent": false,
    "use_graph_property_modulation": false,
    "use_graph_property_modulation_with_norm": false,
    "use_graph_property_modulation_with_norm_use_causal_clustering": true,
    "use_hierarchical_attention": false,
    "use_layer_norm": false,
    "use_layer_norm_in_GIN_MLP": false,
    "use_no_norm_in_GIN_MLP": false,
    "use_original_hidden_states": false,
    "use_original_hidden_states_add_attention": false,
    "use_projection": true,
    "use_sharpening": false,
    "use_soft_masking": false,
    "zero_below_epsilon_threshold": true
  },
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.46.3",
  "use_cache": false,
  "vocab_size": 128256
}