diff --git a/src/dataset_builder.py b/src/dataset_builder.py
index a906354..ff32f42 100644
--- a/src/dataset_builder.py
+++ b/src/dataset_builder.py
@@ -323,6 +323,8 @@ def _calc_labels_vectorized(
     d: pd.DataFrame,
     feat: pd.DataFrame,
     sig_idx: np.ndarray,
+    atr_sl_mult: float = ATR_SL_MULT,
+    atr_tp_mult: float = ATR_TP_MULT,
 ) -> tuple[np.ndarray, np.ndarray]:
     """
     label_builder.py build_labels() 로직을 numpy 2D 배열로 벡터화한다.
@@ -348,11 +350,11 @@ def _calc_labels_vectorized(
             continue
 
         if signal == "LONG":
-            sl = entry - atr * ATR_SL_MULT
-            tp = entry + atr * ATR_TP_MULT
+            sl = entry - atr * atr_sl_mult
+            tp = entry + atr * atr_tp_mult
         else:
-            sl = entry + atr * ATR_SL_MULT
-            tp = entry - atr * ATR_TP_MULT
+            sl = entry + atr * atr_sl_mult
+            tp = entry - atr * atr_tp_mult
 
         end = min(idx + 1 + LOOKAHEAD, n_total)
         fut_high = highs[idx + 1 : end]
@@ -391,6 +393,8 @@ def generate_dataset_vectorized(
     signal_threshold: int = 3,
     adx_threshold: float = 25,
     volume_multiplier: float = 2.5,
+    atr_sl_mult: float = ATR_SL_MULT,
+    atr_tp_mult: float = ATR_TP_MULT,
 ) -> pd.DataFrame:
     """
     전체 시계열을 1회 계산해 학습 데이터셋을 생성한다.
@@ -435,7 +439,10 @@ def generate_dataset_vectorized(
     print(f" 신호 발생 인덱스: {len(sig_idx):,}개")
 
     print(" [3/3] 레이블 계산...")
-    labels, valid_mask = _calc_labels_vectorized(d, feat_all, sig_idx)
+    labels, valid_mask = _calc_labels_vectorized(
+        d, feat_all, sig_idx,
+        atr_sl_mult=atr_sl_mult, atr_tp_mult=atr_tp_mult,
+    )
     final_sig_idx = sig_idx[valid_mask]
 
     available_feature_cols = [c for c in FEATURE_COLS if c in feat_all.columns]
diff --git a/tests/test_ml_pipeline_fixes.py b/tests/test_ml_pipeline_fixes.py
new file mode 100644
index 0000000..3405e3f
--- /dev/null
+++ b/tests/test_ml_pipeline_fixes.py
@@ -0,0 +1,57 @@
+import numpy as np
+import pandas as pd
+import pytest
+from src.dataset_builder import generate_dataset_vectorized, _calc_labels_vectorized
+
+
+@pytest.fixture
+def signal_df():
+    """Synthetic OHLCV data on which signals fire."""
+    rng = np.random.default_rng(7)
+    n = 800
+    trend = np.linspace(1.5, 3.0, n)
+    noise = np.cumsum(rng.normal(0, 0.04, n))
+    close = np.clip(trend + noise, 0.01, None)
+    high = close * (1 + rng.uniform(0, 0.015, n))
+    low = close * (1 - rng.uniform(0, 0.015, n))
+    volume = rng.uniform(1e6, 3e6, n)
+    volume[::30] *= 3.0
+    return pd.DataFrame({
+        "open": close, "high": high, "low": low,
+        "close": close, "volume": volume,
+    })
+
+
+def test_sltp_params_are_passed_through(signal_df):
+    """SL/TP multipliers must be accepted by generate_dataset_vectorized and passed through."""
+    # The key point is that the parameters are accepted (no TypeError).
+    r1 = generate_dataset_vectorized(
+        signal_df, atr_sl_mult=1.5, atr_tp_mult=2.0,
+        adx_threshold=0, volume_multiplier=1.5,
+    )
+    r2 = generate_dataset_vectorized(
+        signal_df, atr_sl_mult=2.0, atr_tp_mult=2.0,
+        adx_threshold=0, volume_multiplier=1.5,
+    )
+    # Both results must be DataFrames.
+    assert isinstance(r1, pd.DataFrame)
+    assert isinstance(r2, pd.DataFrame)
+    # With enough signals, a different SL multiplier should change the labels.
+    # np.array_equal is False on a length mismatch, so `==` never sees unequal shapes.
+    if len(r1) > 10 and len(r2) > 10:
+        assert not np.array_equal(r1["label"].values, r2["label"].values), \
+            "SL 배수가 다르면 레이블이 달라져야 한다"
+
+
+def test_default_sltp_backward_compatible(signal_df):
+    """When SL/TP parameters are omitted, the previous defaults (1.5, 2.0) must apply."""
+    r_default = generate_dataset_vectorized(
+        signal_df, adx_threshold=0, volume_multiplier=1.5,
+    )
+    r_explicit = generate_dataset_vectorized(
+        signal_df, atr_sl_mult=1.5, atr_tp_mult=2.0,
+        adx_threshold=0, volume_multiplier=1.5,
+    )
+    if len(r_default) > 0:
+        assert len(r_default) == len(r_explicit)
+        assert (r_default["label"].values == r_explicit["label"].values).all()