fix-typo (#4)
Move zero_grad (d09175152afd14b31ed261363a3eda417138ba56)
- code_samples/base/accelerate +1 -1
- code_samples/base/basic +2 -2
- code_samples/base/calculating_metrics +1 -1
- code_samples/base/checkpointing +1 -1
- code_samples/base/experiment_tracking +1 -1
- code_samples/base/gradient_accumulation +2 -2
- code_samples/base/initial +2 -2
- code_samples/base/initial_with_metrics +1 -1
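
Every file gets the same edit: `optimizer.zero_grad()` moves from the top of the training loop to the end, after `optimizer.step()` and `scheduler.step()`. For orientation, here is a minimal runnable sketch of the resulting plain-PyTorch pattern; the toy model, optimizer, and data below are placeholders, not taken from the samples:

import torch
from torch import nn, optim

# Toy setup so the loop runs end to end (assumed names, not from the samples).
model = nn.Linear(10, 2)
optimizer = optim.SGD(model.parameters(), lr=0.1)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1)
loss_function = nn.CrossEntropyLoss()
dataloader = [(torch.randn(8, 10), torch.randint(0, 2, (8,))) for _ in range(3)]

for batch in dataloader:
    inputs, targets = batch
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()  # moved here: gradients are cleared after the step

Either placement works for a standard loop, since gradients only need to be cleared before the next `backward()`; clearing at the end simply leaves them reset going into the next iteration.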
code_samples/base/accelerate
CHANGED
@@ -7,11 +7,11 @@ train_dataloader, model, optimizer scheduler = accelerator.prepare(
 
 model.train()
 for batch in train_dataloader:
-    optimizer.zero_grad()
     inputs, targets = batch
     outputs = model(inputs)
     loss = loss_function(outputs, targets)
     accelerator.backward(loss)
     optimizer.step()
     scheduler.step()
+    optimizer.zero_grad()
 </pre>
code_samples/base/basic
CHANGED
@@ -7,7 +7,6 @@
 +)
 
 for batch in dataloader:
-    optimizer.zero_grad()
     inputs, targets = batch
 -   inputs = inputs.to(device)
 -   targets = targets.to(device)
@@ -16,7 +15,8 @@ for batch in dataloader:
 -   loss.backward()
 +   accelerator.backward(loss)
     optimizer.step()
-    scheduler.step()</pre>
+    scheduler.step()
+    optimizer.zero_grad()</pre>
 ##
 Everything around `accelerate` occurs with the `Accelerator` class. To use it, first make an object.
 Then call `.prepare` passing in the PyTorch objects that you would normally train with. This will
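
The context above cuts off mid-sentence, so as a reference only, here is a minimal sketch of the `Accelerator` pattern it describes: create the object, pass the usual PyTorch training objects through `.prepare`, and let `accelerator.backward` replace `loss.backward()`. The toy model and data are placeholders, not from the samples:

import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator()

# Toy objects standing in for a real model/dataset (assumed, not from the samples).
model = nn.Linear(10, 2)
optimizer = optim.SGD(model.parameters(), lr=0.1)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1)
loss_function = nn.CrossEntropyLoss()
dataloader = DataLoader(
    TensorDataset(torch.randn(24, 10), torch.randint(0, 2, (24,))), batch_size=8
)

# .prepare wraps each object for the current device / distributed setup.
dataloader, model, optimizer, scheduler = accelerator.prepare(
    dataloader, model, optimizer, scheduler
)

model.train()
for batch in dataloader:
    inputs, targets = batch               # no manual .to(device) needed
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    accelerator.backward(loss)            # replaces loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()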
code_samples/base/calculating_metrics
CHANGED
@@ -11,7 +11,6 @@ import evaluate
 +)
 metric = evaluate.load("accuracy")
 for batch in train_dataloader:
-    optimizer.zero_grad()
     inputs, targets = batch
 -   inputs = inputs.to(device)
 -   targets = targets.to(device)
@@ -20,6 +19,7 @@ for batch in train_dataloader:
     loss.backward()
     optimizer.step()
     scheduler.step()
+    optimizer.zero_grad()
 
 model.eval()
 for batch in eval_dataloader:
code_samples/base/checkpointing
CHANGED
@@ -7,13 +7,13 @@ dataloader, model, optimizer scheduler = accelerator.prepare(
 )
 
 for batch in dataloader:
-    optimizer.zero_grad()
     inputs, targets = batch
     outputs = model(inputs)
     loss = loss_function(outputs, targets)
     accelerator.backward(loss)
     optimizer.step()
     scheduler.step()
+    optimizer.zero_grad()
 +accelerator.save_state("checkpoint_dir")
 +accelerator.load_state("checkpoint_dir")</pre>
 ##
code_samples/base/experiment_tracking
CHANGED
@@ -9,7 +9,6 @@ train_dataloader, model, optimizer scheduler = accelerator.prepare(
 +accelerator.init_trackers()
 model.train()
 for batch in train_dataloader:
-    optimizer.zero_grad()
     inputs, targets = batch
     outputs = model(inputs)
     loss = loss_function(outputs, targets)
@@ -17,6 +16,7 @@ for batch in train_dataloader:
     accelerator.backward(loss)
     optimizer.step()
     scheduler.step()
+    optimizer.zero_grad()
 +accelerator.end_training()
 </pre>
 ##
code_samples/base/gradient_accumulation
CHANGED
@@ -10,13 +10,13 @@ dataloader, model, optimizer scheduler = accelerator.prepare(
 
 for batch in dataloader:
 +   with accelerator.accumulate(model):
-        optimizer.zero_grad()
         inputs, targets = batch
         outputs = model(inputs)
         loss = loss_function(outputs, targets)
         accelerator.backward(loss)
         optimizer.step()
-        scheduler.step()</pre>
+        scheduler.step()
+        optimizer.zero_grad()</pre>
 
 ##
 When performing gradient accumulation in a distributed setup, there are many opportunities for efficiency mistakes
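
The explanation above is truncated by the hunk, but the sample is about gradient accumulation. As a sketch only, the `accelerator.accumulate` pattern shown in the diff might be used like this, assuming `gradient_accumulation_steps` is set on the `Accelerator` and with placeholder model and data not taken from the samples:

import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

# Accumulate gradients over 4 batches before each real optimizer step (assumed value).
accelerator = Accelerator(gradient_accumulation_steps=4)

model = nn.Linear(10, 2)
optimizer = optim.SGD(model.parameters(), lr=0.1)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1)
loss_function = nn.CrossEntropyLoss()
dataloader = DataLoader(
    TensorDataset(torch.randn(32, 10), torch.randint(0, 2, (32,))), batch_size=8
)

dataloader, model, optimizer, scheduler = accelerator.prepare(
    dataloader, model, optimizer, scheduler
)

for batch in dataloader:
    # Inside accumulate(), Accelerate skips gradient synchronization and the real
    # optimizer step until the accumulation boundary is reached.
    with accelerator.accumulate(model):
        inputs, targets = batch
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        accelerator.backward(loss)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

Wrapping the step in `accumulate(model)` is what avoids the per-batch gradient all-reduce that a naive distributed accumulation loop would trigger.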
code_samples/base/initial
CHANGED
@@ -1,6 +1,5 @@
 <pre>
 for batch in dataloader:
-    optimizer.zero_grad()
     inputs, targets = batch
     inputs = inputs.to(device)
     targets = targets.to(device)
@@ -8,4 +7,5 @@ for batch in dataloader:
     loss = loss_function(outputs, targets)
     loss.backward()
     optimizer.step()
-    scheduler.step()</pre>
+    scheduler.step()
+    optimizer.zero_grad()</pre>
code_samples/base/initial_with_metrics
CHANGED
@@ -2,7 +2,6 @@
 import evaluate
 metric = evaluate.load("accuracy")
 for batch in train_dataloader:
-    optimizer.zero_grad()
     inputs, targets = batch
     inputs = inputs.to(device)
     targets = targets.to(device)
@@ -11,6 +10,7 @@ for batch in train_dataloader:
     loss.backward()
     optimizer.step()
     scheduler.step()
+    optimizer.zero_grad()
 
 model.eval()
 for batch in eval_dataloader: