Seyomi committed on
Commit
f0488fb
·
verified ·
1 Parent(s): 3e965e6

Update data/preprocess/preprocess.py

Browse files
Files changed (1) hide show
  1. data/preprocess/preprocess.py +44 -0
data/preprocess/preprocess.py CHANGED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Clean and min-max normalise the raw BTC/USD price CSV.

Reads ``data/raw/BTC_USD_price.csv`` (relative to the repository root, two
directories above this file), indexes it by its date/time column, coerces the
remaining columns to numbers, forward-fills gaps, scales every column with
``MinMaxScaler`` and writes the result to
``data/processed/merged_features.csv``.

Raises:
    FileNotFoundError: the raw price CSV does not exist.
    ValueError: no date/time column is present, or no usable rows/columns
        remain after cleaning.
"""
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Repository root: this file is expected to live in <root>/data/preprocess/.
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))

price_path = os.path.join(BASE_DIR, "data", "raw", "BTC_USD_price.csv")
output_path = os.path.join(BASE_DIR, "data", "processed", "merged_features.csv")
os.makedirs(os.path.dirname(output_path), exist_ok=True)

print(f"📂 Reading price data from: {price_path}")
if not os.path.exists(price_path):
    raise FileNotFoundError(f"❌ File not found: {price_path}")

price_df = pd.read_csv(price_path)
print("📊 Available columns in CSV:", list(price_df.columns))

# The first column whose name mentions "date" or "time" becomes the index.
date_col = next(
    (col for col in price_df.columns
     if 'date' in col.lower() or 'time' in col.lower()),
    None,
)

if date_col is None:
    raise ValueError("❌ No date/time column found in the CSV.")

price_df[date_col] = pd.to_datetime(price_df[date_col])
price_df = price_df.set_index(date_col).sort_index()

# Anything that cannot be parsed as a number becomes NaN.
price_df = price_df.apply(pd.to_numeric, errors='coerce')

# A purely textual column (e.g. a ticker symbol) is all-NaN after coercion.
# ffill() cannot repair it, so the row-wise dropna() below would discard
# every row; remove such columns first.
non_numeric_cols = [col for col in price_df.columns if price_df[col].isna().all()]
if non_numeric_cols:
    print("⚠️ Dropping non-numeric columns:", non_numeric_cols)
    price_df = price_df.drop(columns=non_numeric_cols)

# Forward-fill gaps, then drop leading rows that have nothing to fill from.
price_df = price_df.ffill().dropna()

if price_df.empty:
    raise ValueError("❌ DataFrame is empty after cleaning.")

# NOTE(review): the scaler is fitted on the full history and is not saved;
# confirm downstream code does not need a train-only fit or inverse transform.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(price_df)
normalized_df = pd.DataFrame(scaled_values, columns=price_df.columns, index=price_df.index)

normalized_df.to_csv(output_path)
print(f"✅ Preprocessed and saved to: {output_path}")