# Columns whose raw values are later replaced by frequency-rank labels.
list_frequency_encoding = [
    'AppVersion',
    'AvSigVersion',
    'Census_OSVersion',
    'EngineVersion',
    'OsBuildLab',
]

# Combine the train and test frames into a single DataFrame so that value
# frequencies are counted over both sets together.
# NOTE(review): presumably `train` and `test` share the same columns -- confirm.
frames = [train, test]
df = pd.concat(frames)
def frequency_encoding(feature):
    """Build a frequency-rank label mapping for one column of the global ``df``.

    Distinct values are ranked by descending occurrence count, so the most
    frequent value gets label 0, the next one label 1, and so on. Every value
    that occurs exactly once shares a single label: one greater than the
    highest rank given to a repeated value (0 when no value repeats).

    The mapping is computed straight from the ``value_counts()`` Series, so it
    does not depend on the column names that ``reset_index()`` produces (these
    differ between pandas 1.x and pandas 2.x).

    Args:
        feature: Name of the column in ``df`` to encode.

    Returns:
        dict mapping each distinct value counted by ``value_counts()`` to its
        integer label. Missing values are not counted by ``value_counts()``
        and therefore have no entry.
    """
    # value_counts() sorts by descending count, so values seen more than once
    # occupy the leading positions and all singletons sit at the tail.
    counts = df[feature].value_counts()

    # Number of repeated values == (highest rank of a repeated value) + 1.
    # This is the label shared by all singletons; it is 0 (not NaN) when every
    # value is a singleton, which keeps a later integer cast from failing.
    shared_label = int((counts.values > 1).sum())

    return {
        value: (rank if count > 1 else shared_label)
        for rank, (value, count) in enumerate(counts.items())
    }
# Replace each listed column in `df` with its frequency-rank labels.
for feature in tqdm(list_frequency_encoding):
    freq_enc_dict = frequency_encoding(feature)
    # Series.map accepts the dict directly: values without a matching key
    # become NaN, exactly as a per-element `dict.get(x, np.nan)` would give,
    # but without calling a Python lambda for every row.
    # astype('int64') raises if any entry was left unmapped (NaN).
    df[feature] = df[feature].map(freq_enc_dict).astype('int64')