From 24f0faa54062ea46fa5a9d7fe49d359e7921c4db Mon Sep 17 00:00:00 2001 From: 21in7 Date: Sat, 21 Mar 2026 18:31:11 +0900 Subject: [PATCH] fix(mlx): remove double normalization in walk-forward validation Add normalize=False parameter to MLXFilter.fit() so external callers can skip internal normalization. Remove the external normalization + manual _mean/_std reset hack from walk_forward_auc() in train_mlx_model.py. Co-Authored-By: Claude Sonnet 4.6 --- scripts/train_mlx_model.py | 17 +++-------------- src/mlx_filter.py | 22 ++++++++++++++-------- tests/test_ml_pipeline_fixes.py | 23 +++++++++++++++++++++++ 3 files changed, 40 insertions(+), 22 deletions(-) diff --git a/scripts/train_mlx_model.py b/scripts/train_mlx_model.py index 7ecf913..d04241c 100644 --- a/scripts/train_mlx_model.py +++ b/scripts/train_mlx_model.py @@ -219,17 +219,8 @@ def walk_forward_auc( y_tr_bal = y_tr[bal_idx] w_tr_bal = w_tr[bal_idx] - # 폴드별 정규화 (학습 데이터 기준으로 계산, 검증에도 동일 적용) - mean = X_tr_bal.mean(axis=0) - std = X_tr_bal.std(axis=0) + 1e-8 - X_tr_norm = (X_tr_bal - mean) / std - X_val_norm = (X_val_raw - mean) / std - - # DataFrame으로 래핑해서 MLXFilter.fit()에 전달 - # fit() 내부 정규화가 덮어쓰지 않도록 이미 정규화된 데이터를 넘기고 - # _mean=0, _std=1로 고정해 이중 정규화를 방지 - X_tr_df = pd.DataFrame(X_tr_norm, columns=FEATURE_COLS) - X_val_df = pd.DataFrame(X_val_norm, columns=FEATURE_COLS) + X_tr_df = pd.DataFrame(X_tr_bal, columns=FEATURE_COLS) + X_val_df = pd.DataFrame(X_val_raw, columns=FEATURE_COLS) model = MLXFilter( input_dim=len(FEATURE_COLS), @@ -239,9 +230,7 @@ def walk_forward_auc( batch_size=256, ) model.fit(X_tr_df, pd.Series(y_tr_bal), sample_weight=w_tr_bal) - # fit()이 내부에서 다시 정규화하므로 저장된 mean/std를 항등 변환으로 교체 - model._mean = np.zeros(len(FEATURE_COLS), dtype=np.float32) - model._std = np.ones(len(FEATURE_COLS), dtype=np.float32) + # fit() handles normalization internally, predict_proba() applies same mean/std proba = model.predict_proba(X_val_df) auc = roc_auc_score(y_val, proba) if len(np.unique(y_val)) > 1 else 0.5 diff --git a/src/mlx_filter.py b/src/mlx_filter.py index 639a914..7f1eb4b 100644 --- a/src/mlx_filter.py +++ b/src/mlx_filter.py @@ -141,18 +141,24 @@ class MLXFilter: X: pd.DataFrame, y: pd.Series, sample_weight: np.ndarray | None = None, + normalize: bool = True, ) -> "MLXFilter": X_np = X[FEATURE_COLS].values.astype(np.float32) y_np = y.values.astype(np.float32) - # nan-safe 정규화: nanmean/nanstd로 통계 계산 후 nan → 0.0 대치 - # (z-score 후 0.0 = 평균값, 신경망에 줄 수 있는 가장 무난한 결측 대치값) - mean_vals = np.nanmean(X_np, axis=0) - self._mean = np.nan_to_num(mean_vals, nan=0.0) # 전체-NaN 컬럼 → 평균 0.0 - std_vals = np.nanstd(X_np, axis=0) - self._std = np.nan_to_num(std_vals, nan=1.0) + 1e-8 # 전체-NaN 컬럼 → std 1.0 - X_np = (X_np - self._mean) / self._std - X_np = np.nan_to_num(X_np, nan=0.0) + if normalize: + # nan-safe 정규화: nanmean/nanstd로 통계 계산 후 nan → 0.0 대치 + # (z-score 후 0.0 = 평균값, 신경망에 줄 수 있는 가장 무난한 결측 대치값) + mean_vals = np.nanmean(X_np, axis=0) + self._mean = np.nan_to_num(mean_vals, nan=0.0) # 전체-NaN 컬럼 → 평균 0.0 + std_vals = np.nanstd(X_np, axis=0) + self._std = np.nan_to_num(std_vals, nan=1.0) + 1e-8 # 전체-NaN 컬럼 → std 1.0 + X_np = (X_np - self._mean) / self._std + X_np = np.nan_to_num(X_np, nan=0.0) + else: + self._mean = np.zeros(X_np.shape[1], dtype=np.float32) + self._std = np.ones(X_np.shape[1], dtype=np.float32) + X_np = np.nan_to_num(X_np, nan=0.0) w_np = sample_weight.astype(np.float32) if sample_weight is not None else None diff --git a/tests/test_ml_pipeline_fixes.py b/tests/test_ml_pipeline_fixes.py index fb7f0f7..24a1f6a 100644 --- a/tests/test_ml_pipeline_fixes.py +++ b/tests/test_ml_pipeline_fixes.py @@ -77,3 +77,26 @@ def test_equity_curve_includes_unrealized_pnl(): last = bt.equity_curve[-1] assert last["equity"] == 1050.0, f"Expected 1050.0 (1000+50), got {last['equity']}" + + +def test_mlx_no_double_normalization(): + """MLXFilter.fit()에 normalize=False를 전달하면 내부 정규화를 건너뛰어야 한다.""" + pytest.importorskip("mlx.core") + import numpy as np + import pandas as pd + from src.mlx_filter import MLXFilter + from src.ml_features import FEATURE_COLS + + n_features = len(FEATURE_COLS) + rng = np.random.default_rng(42) + X = pd.DataFrame( + rng.standard_normal((100, n_features)).astype(np.float32), + columns=FEATURE_COLS, + ) + y = pd.Series(rng.integers(0, 2, 100).astype(np.float32)) + + model = MLXFilter(input_dim=n_features, hidden_dim=16, epochs=1, batch_size=32) + model.fit(X, y, normalize=False) + + assert np.allclose(model._mean, 0.0), "normalize=False시 mean은 0이어야 한다" + assert np.allclose(model._std, 1.0), "normalize=False시 std는 1이어야 한다"