From 6cd54b46d9aef69fc1393c731a3a1f345e8ef537 Mon Sep 17 00:00:00 2001 From: 21in7 Date: Tue, 3 Mar 2026 00:03:09 +0900 Subject: [PATCH] feat: apply stratified undersampling to training pipeline Co-Authored-By: Claude Opus 4.6 --- scripts/train_model.py | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/scripts/train_model.py b/scripts/train_model.py index 34dafa7..03fda16 100644 --- a/scripts/train_model.py +++ b/scripts/train_model.py @@ -22,7 +22,7 @@ from sklearn.metrics import roc_auc_score, classification_report from src.indicators import Indicators from src.ml_features import build_features, FEATURE_COLS from src.label_builder import build_labels -from src.dataset_builder import generate_dataset_vectorized +from src.dataset_builder import generate_dataset_vectorized, stratified_undersample def _cgroup_cpu_count() -> int: """cgroup v1/v2 쿼터를 읽어 실제 할당된 CPU 수를 반환한다. @@ -214,7 +214,11 @@ def train(data_path: str, time_weight_decay: float = 2.0, tuned_params_path: str df = df_raw[base_cols].copy() print("데이터셋 생성 중...") - dataset = generate_dataset_vectorized(df, btc_df=btc_df, eth_df=eth_df, time_weight_decay=time_weight_decay) + dataset = generate_dataset_vectorized( + df, btc_df=btc_df, eth_df=eth_df, + time_weight_decay=time_weight_decay, + negative_ratio=5, + ) if dataset.empty or "label" not in dataset.columns: raise ValueError(f"데이터셋 생성 실패: 샘플 0개. 위 오류 메시지를 확인하세요.") @@ -229,6 +233,7 @@ def train(data_path: str, time_weight_decay: float = 2.0, tuned_params_path: str X = dataset[actual_feature_cols] y = dataset["label"] w = dataset["sample_weight"].values + source = dataset["source"].values if "source" in dataset.columns else np.full(len(X), "signal") split = int(len(X) * 0.8) X_train, X_val = X.iloc[:split], X.iloc[split:] @@ -238,21 +243,19 @@ def train(data_path: str, time_weight_decay: float = 2.0, tuned_params_path: str lgbm_params, weight_scale = _load_lgbm_params(tuned_params_path) w_train = (w[:split] * weight_scale).astype(np.float32) - # --- 클래스 불균형 처리: 언더샘플링 (시간 가중치 인덱스 보존) --- - pos_idx = np.where(y_train == 1)[0] - neg_idx = np.where(y_train == 0)[0] - - if len(neg_idx) > len(pos_idx): - np.random.seed(42) - neg_idx = np.random.choice(neg_idx, size=len(pos_idx), replace=False) - - balanced_idx = np.sort(np.concatenate([pos_idx, neg_idx])) # 시간 순서 유지 + # --- 계층적 샘플링: signal 전수 유지, HOLD negative만 양성 수 만큼 --- + source_train = source[:split] + balanced_idx = stratified_undersample(y_train.values, source_train, seed=42) X_train = X_train.iloc[balanced_idx] y_train = y_train.iloc[balanced_idx] w_train = w_train[balanced_idx] - print(f"\n언더샘플링 후 학습 데이터: {len(X_train)}개 (양성={y_train.sum()}, 음성={(y_train==0).sum()})") + sig_count = (source_train[balanced_idx] == "signal").sum() + hold_count = (source_train[balanced_idx] == "hold_negative").sum() + print(f"\n계층적 샘플링 후 학습 데이터: {len(X_train)}개 " + f"(Signal={sig_count}, HOLD={hold_count}, " + f"양성={int(y_train.sum())}, 음성={int((y_train==0).sum())})") print(f"검증 데이터: {len(X_val)}개 (양성={int(y_val.sum())}, 음성={int((y_val==0).sum())})") # --------------------------------------------------------------- @@ -354,13 +357,16 @@ def walk_forward_auc( df = df_raw[base_cols].copy() dataset = generate_dataset_vectorized( - df, btc_df=btc_df, eth_df=eth_df, time_weight_decay=time_weight_decay + df, btc_df=btc_df, eth_df=eth_df, + time_weight_decay=time_weight_decay, + negative_ratio=5, ) actual_feature_cols = [c for c in FEATURE_COLS if c in dataset.columns] X = dataset[actual_feature_cols].values y = dataset["label"].values w = dataset["sample_weight"].values n = len(dataset) + source = dataset["source"].values if "source" in dataset.columns else np.full(n, "signal") lgbm_params, weight_scale = _load_lgbm_params(tuned_params_path) w = (w * weight_scale).astype(np.float32) @@ -378,12 +384,8 @@ def walk_forward_auc( X_tr, y_tr, w_tr = X[:tr_end], y[:tr_end], w[:tr_end] X_val, y_val = X[tr_end:val_end], y[tr_end:val_end] - pos_idx = np.where(y_tr == 1)[0] - neg_idx = np.where(y_tr == 0)[0] - if len(neg_idx) > len(pos_idx): - np.random.seed(42) - neg_idx = np.random.choice(neg_idx, size=len(pos_idx), replace=False) - idx = np.sort(np.concatenate([pos_idx, neg_idx])) + source_tr = source[:tr_end] + idx = stratified_undersample(y_tr, source_tr, seed=42) model = lgb.LGBMClassifier(**lgbm_params, random_state=42, verbose=-1) with warnings.catch_warnings():