Update data/preprocess/preprocess.py
Browse files
data/preprocess/preprocess.py
CHANGED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sklearn.preprocessing import MinMaxScaler
|
| 5 |
+
|
| 6 |
+
# Preprocessing pipeline for the raw BTC/USD price CSV:
#   1. load data/raw/BTC_USD_price.csv,
#   2. index it by its date/time column (sorted ascending),
#   3. coerce all other columns to numeric, forward-fill gaps and drop rows
#      that still contain NaN,
#   4. min-max scale every column and write the result to
#      data/processed/merged_features.csv.
#
# Raises FileNotFoundError if the input CSV is missing, and ValueError if no
# date/time column can be found or no rows survive cleaning.

# Project root, resolved two directories above this file so the script works
# regardless of the current working directory.
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))

price_path = os.path.join(BASE_DIR, "data", "raw", "BTC_USD_price.csv")
output_path = os.path.join(BASE_DIR, "data", "processed", "merged_features.csv")
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# NOTE(review): the leading "π" in the two status messages below looks like a
# mis-decoded emoji; it is kept as-is because the intended character cannot be
# determined from this file -- confirm against the original source.
print(f"π Reading price data from: {price_path}")
if not os.path.exists(price_path):
    raise FileNotFoundError(f"❌ File not found: {price_path}")

price_df = pd.read_csv(price_path)
print("π Available columns in CSV:", list(price_df.columns))

# The timestamp column is the first one whose name mentions "date" or "time"
# (case-insensitive).
date_col = next(
    (
        name
        for name in price_df.columns
        if "date" in name.lower() or "time" in name.lower()
    ),
    None,
)
if date_col is None:
    raise ValueError("❌ No date/time column found in the CSV.")

price_df[date_col] = pd.to_datetime(price_df[date_col])
price_df = price_df.set_index(date_col).sort_index()

# Coerce every remaining column to numeric; cells that cannot be parsed
# become NaN instead of raising.
price_df = price_df.apply(pd.to_numeric, errors="coerce")

# A column without a single numeric value (e.g. a text column or an empty
# "Unnamed" column) would make the row-wise dropna() below discard every row,
# so such columns are removed first.
numeric_df = price_df.dropna(axis=1, how="all")
dropped_cols = price_df.columns.difference(numeric_df.columns)
if not dropped_cols.empty:
    print("Dropping columns with no numeric data:", list(dropped_cols))
price_df = numeric_df

# Forward-fill gaps, then drop rows that still have NaN (only leading rows
# with nothing to fill from can remain).
price_df = price_df.ffill().dropna()
if price_df.empty:
    raise ValueError("❌ DataFrame is empty after cleaning.")

# Scale each column independently with MinMaxScaler (default settings),
# keeping the original column names and datetime index.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(price_df)
normalized_df = pd.DataFrame(
    scaled_values, columns=price_df.columns, index=price_df.index
)

normalized_df.to_csv(output_path)
print(f"✅ Preprocessed and saved to: {output_path}")
|