diff --git a/src/dataset_builder.py b/src/dataset_builder.py
index 39c1ee0..7d9f517 100644
--- a/src/dataset_builder.py
+++ b/src/dataset_builder.py
@@ -287,8 +287,18 @@ def _calc_features_vectorized(
     else:
         fr_raw = np.full(len(d), np.nan)
 
-    result["oi_change"] = _rolling_zscore(oi_raw.astype(np.float64))
-    result["funding_rate"] = _rolling_zscore(fr_raw.astype(np.float64))
+    oi_z = _rolling_zscore(oi_raw.astype(np.float64), window=96)
+    result["oi_change"] = oi_z
+    result["funding_rate"] = _rolling_zscore(fr_raw.astype(np.float64), window=96)
+
+    # --- OI-derived features ---
+    # 1. oi_change_ma5: 5-candle moving average of the OI change rate (short-term trend)
+    oi_series = pd.Series(oi_raw.astype(np.float64))
+    oi_ma5_raw = oi_series.rolling(window=5, min_periods=1).mean().values
+    result["oi_change_ma5"] = _rolling_zscore(oi_ma5_raw, window=96)
+
+    # 2. oi_price_spread: z-scored OI minus z-scored price return (continuous value)
+    result["oi_price_spread"] = oi_z - ret_1_z
 
     return result
 
@@ -384,7 +394,7 @@ def generate_dataset_vectorized(
     feat_all = _calc_features_vectorized(d, signal_arr, btc_df=btc_df, eth_df=eth_df)
 
     # 신호 발생 + NaN 없음 + 미래 데이터 충분한 인덱스만
-    OPTIONAL_COLS = {"oi_change", "funding_rate"}
+    OPTIONAL_COLS = {"oi_change", "funding_rate", "oi_change_ma5", "oi_price_spread"}
     available_cols_for_nan_check = [
         c for c in FEATURE_COLS
         if c in feat_all.columns and c not in OPTIONAL_COLS
diff --git a/src/ml_features.py b/src/ml_features.py
index f7a6224..a61073c 100644
--- a/src/ml_features.py
+++ b/src/ml_features.py
@@ -9,8 +9,9 @@ FEATURE_COLS = [
     "eth_ret_1", "eth_ret_3", "eth_ret_5",
     "xrp_btc_rs", "xrp_eth_rs",
     # 시장 미시구조: OI 변화율(z-score), 펀딩비(z-score)
-    # parquet에 oi_change/funding_rate 컬럼이 없으면 dataset_builder에서 0으로 채움
     "oi_change", "funding_rate",
+    # OI-derived features
+    "oi_change_ma5", "oi_price_spread",
     "adx",
 ]
 
@@ -37,12 +38,14 @@ def build_features(
     eth_df: pd.DataFrame | None = None,
     oi_change: float | None = None,
     funding_rate: float | None = None,
+    oi_change_ma5: float | None = None,
+    oi_price_spread: float | None = None,
 ) -> pd.Series:
     """
-    기술 지표가 계산된 DataFrame의 마지막 행에서 ML 피처를 추출한다.
-    btc_df, eth_df가 제공되면 24개 피처를, 없으면 16개 피처를 반환한다.
+    Extract ML features from the last row of a DataFrame with technical indicators computed.
+    Returns 26 features when btc_df and eth_df are provided, otherwise 18.
     signal: "LONG" | "SHORT"
-    oi_change, funding_rate: 실제 값이 제공되면 사용, 없으면 0.0으로 채운다.
+    oi_change, funding_rate, oi_change_ma5, oi_price_spread: used when provided, otherwise filled with 0.0.
     """
     last = df.iloc[-1]
     close = last["close"]
@@ -132,8 +135,10 @@ def build_features(
         })
 
     # 실시간에서 실제 값이 제공되면 사용, 없으면 0으로 채운다
     base["oi_change"] = float(oi_change) if oi_change is not None else 0.0
     base["funding_rate"] = float(funding_rate) if funding_rate is not None else 0.0
+    base["oi_change_ma5"] = float(oi_change_ma5) if oi_change_ma5 is not None else 0.0
+    base["oi_price_spread"] = float(oi_price_spread) if oi_price_spread is not None else 0.0
     base["adx"] = float(last.get("adx", 0))
 
     return pd.Series(base)
diff --git a/tests/test_dataset_builder.py b/tests/test_dataset_builder.py
index 0a29e6d..2899f48 100644
--- a/tests/test_dataset_builder.py
+++ b/tests/test_dataset_builder.py
@@ -266,3 +266,74 @@ def test_stratified_undersample_preserves_signal():
     signal_indices = np.where(source == "signal")[0]
     for si in signal_indices:
         assert si in idx, f"signal 인덱스 {si}가 누락됨"
+
+
+def test_oi_derived_features_present():
+    """Both OI-derived features must be present in the result."""
+    import numpy as np
+    import pandas as pd
+    from src.dataset_builder import _calc_features_vectorized, _calc_signals, _calc_indicators
+
+    n = 300
+    np.random.seed(42)
+    df = pd.DataFrame({
+        "open": np.random.uniform(1, 2, n),
+        "high": np.random.uniform(2, 3, n),
+        "low": np.random.uniform(0.5, 1, n),
+        "close": np.random.uniform(1, 2, n),
+        "volume": np.random.uniform(1000, 5000, n),
+        "oi_change": np.concatenate([np.zeros(100), np.random.uniform(-0.05, 0.05, 200)]),
+    })
+    d = _calc_indicators(df)
+    sig = _calc_signals(d)
+    feat = _calc_features_vectorized(d, sig)
+
+    assert "oi_change_ma5" in feat.columns, "oi_change_ma5 컬럼이 없음"
+    assert "oi_price_spread" in feat.columns, "oi_price_spread 컬럼이 없음"
+
+
+def test_oi_derived_features_nan_when_no_oi():
+    """Derived features must be NaN when the oi_change column is missing."""
+    import numpy as np
+    import pandas as pd
+    from src.dataset_builder import _calc_features_vectorized, _calc_signals, _calc_indicators
+
+    n = 200
+    np.random.seed(0)
+    df = pd.DataFrame({
+        "open": np.random.uniform(1, 2, n),
+        "high": np.random.uniform(2, 3, n),
+        "low": np.random.uniform(0.5, 1, n),
+        "close": np.random.uniform(1, 2, n),
+        "volume": np.random.uniform(1000, 5000, n),
+    })
+    d = _calc_indicators(df)
+    sig = _calc_signals(d)
+    feat = _calc_features_vectorized(d, sig)
+
+    assert feat["oi_change_ma5"].isna().all(), "oi_change 컬럼 없을 때 oi_change_ma5는 전부 nan이어야 함"
+    assert feat["oi_price_spread"].isna().all(), "oi_change 컬럼 없을 때 oi_price_spread는 전부 nan이어야 함"
+
+
+def test_oi_price_spread_is_continuous():
+    """oi_price_spread must be continuous, not binary."""
+    import numpy as np
+    import pandas as pd
+    from src.dataset_builder import _calc_features_vectorized, _calc_signals, _calc_indicators
+
+    n = 300
+    np.random.seed(42)
+    df = pd.DataFrame({
+        "open": np.random.uniform(1, 2, n),
+        "high": np.random.uniform(2, 3, n),
+        "low": np.random.uniform(0.5, 1, n),
+        "close": np.random.uniform(1, 2, n),
+        "volume": np.random.uniform(1000, 5000, n),
+        "oi_change": np.random.uniform(-0.05, 0.05, n),
+    })
+    d = _calc_indicators(df)
+    sig = _calc_signals(d)
+    feat = _calc_features_vectorized(d, sig)
+
+    valid = feat["oi_price_spread"].dropna()
+    assert len(valid.unique()) > 2, "oi_price_spread는 연속값이어야 함 (2개 초과 유니크 값)"
diff --git a/tests/test_ml_features.py b/tests/test_ml_features.py
index 0803aeb..48da37b 100644
--- a/tests/test_ml_features.py
+++ b/tests/test_ml_features.py
@@ -21,17 +21,17 @@ def _make_df(n=10, base_price=1.0):
     })
 
 
-def test_build_features_with_btc_eth_has_24_features():
+def test_build_features_with_btc_eth_has_26_features():
     xrp_df = _make_df(10, base_price=1.0)
     btc_df = _make_df(10, base_price=50000.0)
     eth_df = _make_df(10, base_price=3000.0)
     features = build_features(xrp_df, "LONG", btc_df=btc_df, eth_df=eth_df)
-    assert len(features) == 24
+    assert len(features) == 26
 
-def test_build_features_without_btc_eth_has_16_features():
+def test_build_features_without_btc_eth_has_18_features():
     xrp_df = _make_df(10, base_price=1.0)
     features = build_features(xrp_df, "LONG")
-    assert len(features) == 16
+    assert len(features) == 18
 
 def test_build_features_btc_ret_1_correct():
     xrp_df = _make_df(10, base_price=1.0)
@@ -51,8 +51,8 @@ def test_build_features_rs_zero_when_btc_ret_zero():
     assert features["xrp_btc_rs"] == 0.0
 
-def test_feature_cols_has_24_items():
+def test_feature_cols_has_26_items():
     from src.ml_features import FEATURE_COLS
-    assert len(FEATURE_COLS) == 24
+    assert len(FEATURE_COLS) == 26
 
 
 def make_df(n=100):
@@ -139,3 +139,26 @@ def test_build_features_defaults_to_zero_when_not_provided(sample_df_with_indica
     feat = build_features(sample_df_with_indicators, signal="LONG")
     assert feat["oi_change"] == pytest.approx(0.0)
     assert feat["funding_rate"] == pytest.approx(0.0)
+
+
+def test_build_features_with_oi_derived_params():
+    """oi_change_ma5 and oi_price_spread arguments are reflected in the features."""
+    xrp_df = _make_df(10, base_price=1.0)
+    btc_df = _make_df(10, base_price=50000.0)
+    eth_df = _make_df(10, base_price=3000.0)
+    features = build_features(
+        xrp_df, "LONG",
+        btc_df=btc_df, eth_df=eth_df,
+        oi_change=0.05, funding_rate=0.0002,
+        oi_change_ma5=0.03, oi_price_spread=0.12,
+    )
+    assert features["oi_change_ma5"] == pytest.approx(0.03)
+    assert features["oi_price_spread"] == pytest.approx(0.12)
+
+
+def test_build_features_oi_derived_defaults_to_zero():
+    """oi_change_ma5 and oi_price_spread default to 0.0 when not provided."""
+    xrp_df = _make_df(10, base_price=1.0)
+    features = build_features(xrp_df, "LONG")
+    assert features["oi_change_ma5"] == pytest.approx(0.0)
+    assert features["oi_price_spread"] == pytest.approx(0.0)