Change LoRA size from 256 to 512, also back to bidirectional_masked
llama_diffusion_model.py  (+3 -3)

@@ -192,7 +192,7 @@ class CustomTransformerModel(PreTrainedModel):
         self.llama.resize_token_embeddings(config.vocab_size)

         for i, layer in enumerate(self.llama.model.layers):
-            layer.self_attn = BidirectionalLlamaAttention(layer.self_attn, masking='
+            layer.self_attn = BidirectionalLlamaAttention(layer.self_attn, masking='bidirectional_masked')

         # Freeze Llama to retain pre-trained knowledge
         for param in self.llama.parameters():
@@ -202,8 +202,8 @@ class CustomTransformerModel(PreTrainedModel):
             param.requires_grad = True

         lora_config = LoraConfig(
-            r=256,
-            lora_alpha=256,
+            r=512,
+            lora_alpha=512,
             lora_dropout=0.0,
             target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # Llama-3 uses these attention modules
             bias="none",
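The first hunk switches the patched attention back to masking='bidirectional_masked'. The internals of BidirectionalLlamaAttention are not part of this diff, so the snippet below is only an assumed illustration of the underlying idea: a causal mask lets token i attend to positions j <= i, while a bidirectional mask lets every token attend to every position.

import torch

# Illustration only: BidirectionalLlamaAttention's implementation is not shown
# in this diff. This just contrasts a causal mask with a fully bidirectional one.
seq_len = 5
causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))  # token i attends to j <= i
bidirectional = torch.ones(seq_len, seq_len, dtype=torch.bool)       # every token attends to every token

print(causal.int())         # lower-triangular pattern
print(bidirectional.int())  # all ones

The 'bidirectional_masked' variant presumably combines this full visibility with masking of selected target tokens, but that logic lives in the attention class itself, not in this change.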
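The second hunk doubles the LoRA rank and scaling from 256 to 512. For context, here is a minimal sketch of how a LoraConfig with these values is typically attached using peft's get_peft_model; the base model id below is a placeholder, not taken from this repo.

from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Minimal sketch, not the repo's actual wiring: the model id is a placeholder.
base = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B")

lora_config = LoraConfig(
    r=512,            # LoRA rank, raised from 256 in this commit
    lora_alpha=512,   # scaling factor, kept equal to r
    lora_dropout=0.0,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # Llama-3 attention projections
    bias="none",
)

model = get_peft_model(base, lora_config)
model.print_trainable_parameters()  # only the injected LoRA adapters are trainable

Because lora_alpha is kept equal to r, the adapter scaling factor lora_alpha / r stays at 1.0; the change increases adapter capacity (and trainable parameter count) without altering the update scale.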