derek-thomas committed
Commit 303cbb8 · 1 Parent(s): 86f872b

Comment out print statements

- memory_states.py +2 -2
- plot.py +3 -3
- utilities.py +21 -21
memory_states.py CHANGED

@@ -24,10 +24,10 @@ def get_my_memory_states(proj_dir, dataset, my_collection):
     prediction.reset_index(drop=True, inplace=True)
     prediction.sort_values(by=['r_history'], inplace=True)
     prediction.to_csv(proj_dir / "prediction.tsv", sep='\t', index=None)
-    print("prediction.tsv saved.")
+    # print("prediction.tsv saved.")
     prediction['difficulty'] = prediction['difficulty'].map(lambda x: int(round(x)))
     difficulty_distribution = prediction.groupby(by=['difficulty'])['count'].sum() / prediction['count'].sum()
-    print(difficulty_distribution)
+    # print(difficulty_distribution)
     difficulty_distribution_padding = np.zeros(10)
     for i in range(10):
         if i + 1 in difficulty_distribution.index:
plot.py CHANGED

@@ -41,7 +41,7 @@ def make_plot(proj_dir, type_sequence, w, difficulty_distribution_padding, progr
         return w[9] * np.power(d, w[10]) * np.power(s, w[11]) * np.exp((1 - r) * w[12])

     stability_list = np.array([np.power(base, i - index_offset) for i in range(index_len)])
-    print(f"terminal stability: {stability_list.max(): .2f}")
+    # print(f"terminal stability: {stability_list.max(): .2f}")
     df = pd.DataFrame(columns=["retention", "difficulty", "repetitions"])

     for percentage in trange(96, 70, -2, desc='Repetition vs Retention plot'):

@@ -73,7 +73,7 @@ def make_plot(proj_dir, type_sequence, w, difficulty_distribution_padding, progr

     df.sort_values(by=["difficulty", "retention"], inplace=True)
     df.to_csv(proj_dir/"expected_repetitions.csv", index=False)
-    print("expected_repetitions.csv saved.")
+    # print("expected_repetitions.csv saved.")

     optimal_retention_list = np.zeros(10)
     df2 = pd.DataFrame()

@@ -87,6 +87,6 @@ def make_plot(proj_dir, type_sequence, w, difficulty_distribution_padding, progr

     fig = px.line(df2, x="retention", y="expected repetitions", color='d', log_y=True)

-    print(f"\n-----suggested retention: {np.inner(difficulty_distribution_padding, optimal_retention_list):.2f}-----")
+    # print(f"\n-----suggested retention: {np.inner(difficulty_distribution_padding, optimal_retention_list):.2f}-----")
     suggested_retention_markdown = f"""# Suggested Retention: `{np.inner(difficulty_distribution_padding, optimal_retention_list):.2f}`"""
     return fig, suggested_retention_markdown
utilities.py CHANGED

@@ -26,7 +26,7 @@ def extract(file, prefix):
     proj_dir = Path(f'projects/{prefix}_{file.orig_name.replace(".", "_").replace("@", "_")}')
     with ZipFile(file, 'r') as zip_ref:
         zip_ref.extractall(proj_dir)
-    print(f"Extracted {file.orig_name} successfully!")
+    # print(f"Extracted {file.orig_name} successfully!")
     return proj_dir


@@ -63,7 +63,7 @@ def create_time_series_features(revlog_start_date, timezone, next_day_starts_at,
     df.sort_values(by=['cid', 'id'], inplace=True, ignore_index=True)
     type_sequence = np.array(df['type'])
     df.to_csv(proj_dir / "revlog.csv", index=False)
-    print("revlog.csv saved.")
+    # print("revlog.csv saved.")
     df = df[(df['type'] == 0) | (df['type'] == 1)].copy()
     df['real_days'] = df['review_date'] - timedelta(hours=next_day_starts_at)
     df['real_days'] = pd.DatetimeIndex(df['real_days'].dt.floor('D')).to_julian_date()

@@ -94,7 +94,7 @@ def create_time_series_features(revlog_start_date, timezone, next_day_starts_at,
     df["t_history"] = df["t_history"].map(lambda x: x[1:] if len(x) > 1 else x)
     df["r_history"] = df["r_history"].map(lambda x: x[1:] if len(x) > 1 else x)
     df.to_csv(proj_dir / 'revlog_history.tsv', sep="\t", index=False)
-    print("Trainset saved.")
+    # print("Trainset saved.")

     def cal_retention(group: pd.DataFrame) -> pd.DataFrame:
         group['retention'] = round(group['r'].map(lambda x: {1: 0, 2: 1, 3: 1, 4: 1}[x]).mean(), 4)

@@ -103,7 +103,7 @@ def create_time_series_features(revlog_start_date, timezone, next_day_starts_at,

     tqdm.pandas(desc='Calculating Retention')
     df = df.groupby(by=['r_history', 'delta_t']).progress_apply(cal_retention)
-    print("Retention calculated.")
+    # print("Retention calculated.")
     df = df.drop(columns=['id', 'cid', 'usn', 'ivl', 'last_lvl', 'factor', 'time', 'type', 'create_date', 'review_date',
                           'real_days', 'r', 't_history'])
     df.drop_duplicates(inplace=True)

@@ -128,7 +128,7 @@ def create_time_series_features(revlog_start_date, timezone, next_day_starts_at,

     tqdm.pandas(desc='Calculating Stability')
     df = df.groupby(by=['r_history']).progress_apply(cal_stability)
-    print("Stability calculated.")
+    # print("Stability calculated.")
     df.reset_index(drop=True, inplace=True)
     df.drop_duplicates(inplace=True)
     df.sort_values(by=['r_history'], inplace=True, ignore_index=True)

@@ -144,11 +144,11 @@ def create_time_series_features(revlog_start_date, timezone, next_day_starts_at,
     df['last_recall'] = df['r_history'].map(lambda x: x[-1])
     df = df[df.groupby(['i', 'r_history'])['group_cnt'].transform(max) == df['group_cnt']]
     df.to_csv(proj_dir / 'stability_for_analysis.tsv', sep='\t', index=None)
-    print("1:again, 2:hard, 3:good, 4:easy\n")
-    print(df[df['r_history'].str.contains(r'^[1-4][^124]*$', regex=True)][
-        ['r_history', 'avg_interval', 'avg_retention', 'stability', 'factor', 'group_cnt']].to_string(
-        index=False))
-    print("Analysis saved!")
+    # print("1:again, 2:hard, 3:good, 4:easy\n")
+    # print(df[df['r_history'].str.contains(r'^[1-4][^124]*$', regex=True)][
+    #     ['r_history', 'avg_interval', 'avg_retention', 'stability', 'factor', 'group_cnt']].to_string(
+    #     index=False))
+    # print("Analysis saved!")

     df_out = df[df['r_history'].str.contains(r'^[1-4][^124]*$', regex=True)][
         ['r_history', 'avg_interval', 'avg_retention', 'stability', 'factor', 'group_cnt']]

@@ -168,7 +168,7 @@ def train_model(proj_dir, progress=gr.Progress(track_tqdm=True)):
     tqdm.pandas(desc='Tensorizing Line')
     dataset['tensor'] = dataset.progress_apply(lambda x: lineToTensor(list(zip([x['t_history']], [x['r_history']]))[0]),
                                                axis=1)
-    print("Tensorized!")
+    # print("Tensorized!")

     pre_train_set = dataset[dataset['i'] == 2]
     # pretrain

@@ -187,7 +187,7 @@ def train_model(proj_dir, progress=gr.Progress(track_tqdm=True)):
                              {1: 0, 2: 1, 3: 1, 4: 1}[row['r']])
         if np.isnan(loss.data.item()):
             # Exception Case
-            print(row, output_t)
+            # print(row, output_t)
             raise Exception('error case')
         loss.backward()
         optimizer.step()

@@ -195,7 +195,7 @@ def train_model(proj_dir, progress=gr.Progress(track_tqdm=True)):
         pbar.update()
     pbar.close()
     for name, param in model.named_parameters():
-        print(f"{name}: {list(map(lambda x: round(float(x), 4), param))}")
+        # print(f"{name}: {list(map(lambda x: round(float(x), 4), param))}")

     train_set = dataset[dataset['i'] > 2]
     epoch_len = len(train_set)

@@ -214,7 +214,7 @@ def train_model(proj_dir, progress=gr.Progress(track_tqdm=True)):
                              {1: 0, 2: 1, 3: 1, 4: 1}[row['r']])
         if np.isnan(loss.data.item()):
             # Exception Case
-            print(row, output_t)
+            # print(row, output_t)
             raise Exception('error case')
         loss.backward()
         for param in model.parameters():

@@ -223,15 +223,15 @@ def train_model(proj_dir, progress=gr.Progress(track_tqdm=True)):
         model.apply(clipper)
         pbar.update()

-        if (k * epoch_len + i) % print_len == 0:
-            print(f"iteration: {k * epoch_len + i + 1}")
-            for name, param in model.named_parameters():
-                print(f"{name}: {list(map(lambda x: round(float(x), 4), param))}")
+        # if (k * epoch_len + i) % print_len == 0:
+        #     print(f"iteration: {k * epoch_len + i + 1}")
+        #     for name, param in model.named_parameters():
+        #         print(f"{name}: {list(map(lambda x: round(float(x), 4), param))}")
     pbar.close()

     w = list(map(lambda x: round(float(x), 4), dict(model.named_parameters())['w'].data))

-    print("\nTraining finished!")
+    # print("\nTraining finished!")
     return w, dataset


@@ -271,12 +271,12 @@ def my_loss(dataset, w):
     my_collection = Collection(init_w)
     tqdm.pandas(desc='Calculating Loss before Training')
     dataset = dataset.progress_apply(partial(log_loss, my_collection), axis=1)
-    print(f"Loss before training: {dataset['log_loss'].mean():.4f}")
+    # print(f"Loss before training: {dataset['log_loss'].mean():.4f}")
     loss_before = f"{dataset['log_loss'].mean():.4f}"
     my_collection = Collection(w)
     tqdm.pandas(desc='Calculating Loss After Training')
     dataset = dataset.progress_apply(partial(log_loss, my_collection), axis=1)
-    print(f"Loss after training: {dataset['log_loss'].mean():.4f}")
+    # print(f"Loss after training: {dataset['log_loss'].mean():.4f}")
     loss_after = f"{dataset['log_loss'].mean():.4f}"
     return f"""
     *Loss before training*: {loss_before}