feat: apply stratified undersampling to training pipeline

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
21in7
2026-03-03 00:03:09 +09:00
parent 0af138d8ee
commit 6cd54b46d9

View File

@@ -22,7 +22,7 @@ from sklearn.metrics import roc_auc_score, classification_report
from src.indicators import Indicators from src.indicators import Indicators
from src.ml_features import build_features, FEATURE_COLS from src.ml_features import build_features, FEATURE_COLS
from src.label_builder import build_labels from src.label_builder import build_labels
from src.dataset_builder import generate_dataset_vectorized from src.dataset_builder import generate_dataset_vectorized, stratified_undersample
def _cgroup_cpu_count() -> int: def _cgroup_cpu_count() -> int:
"""cgroup v1/v2 쿼터를 읽어 실제 할당된 CPU 수를 반환한다. """cgroup v1/v2 쿼터를 읽어 실제 할당된 CPU 수를 반환한다.
@@ -214,7 +214,11 @@ def train(data_path: str, time_weight_decay: float = 2.0, tuned_params_path: str
df = df_raw[base_cols].copy() df = df_raw[base_cols].copy()
print("데이터셋 생성 중...") print("데이터셋 생성 중...")
dataset = generate_dataset_vectorized(df, btc_df=btc_df, eth_df=eth_df, time_weight_decay=time_weight_decay) dataset = generate_dataset_vectorized(
df, btc_df=btc_df, eth_df=eth_df,
time_weight_decay=time_weight_decay,
negative_ratio=5,
)
if dataset.empty or "label" not in dataset.columns: if dataset.empty or "label" not in dataset.columns:
raise ValueError(f"데이터셋 생성 실패: 샘플 0개. 위 오류 메시지를 확인하세요.") raise ValueError(f"데이터셋 생성 실패: 샘플 0개. 위 오류 메시지를 확인하세요.")
@@ -229,6 +233,7 @@ def train(data_path: str, time_weight_decay: float = 2.0, tuned_params_path: str
X = dataset[actual_feature_cols] X = dataset[actual_feature_cols]
y = dataset["label"] y = dataset["label"]
w = dataset["sample_weight"].values w = dataset["sample_weight"].values
source = dataset["source"].values if "source" in dataset.columns else np.full(len(X), "signal")
split = int(len(X) * 0.8) split = int(len(X) * 0.8)
X_train, X_val = X.iloc[:split], X.iloc[split:] X_train, X_val = X.iloc[:split], X.iloc[split:]
@@ -238,21 +243,19 @@ def train(data_path: str, time_weight_decay: float = 2.0, tuned_params_path: str
lgbm_params, weight_scale = _load_lgbm_params(tuned_params_path) lgbm_params, weight_scale = _load_lgbm_params(tuned_params_path)
w_train = (w[:split] * weight_scale).astype(np.float32) w_train = (w[:split] * weight_scale).astype(np.float32)
# --- 클래스 불균형 처리: 언더샘플링 (시간 가중치 인덱스 보존) --- # --- 계층적 샘플링: signal 전수 유지, HOLD negative만 양성 수 만큼 ---
pos_idx = np.where(y_train == 1)[0] source_train = source[:split]
neg_idx = np.where(y_train == 0)[0] balanced_idx = stratified_undersample(y_train.values, source_train, seed=42)
if len(neg_idx) > len(pos_idx):
np.random.seed(42)
neg_idx = np.random.choice(neg_idx, size=len(pos_idx), replace=False)
balanced_idx = np.sort(np.concatenate([pos_idx, neg_idx])) # 시간 순서 유지
X_train = X_train.iloc[balanced_idx] X_train = X_train.iloc[balanced_idx]
y_train = y_train.iloc[balanced_idx] y_train = y_train.iloc[balanced_idx]
w_train = w_train[balanced_idx] w_train = w_train[balanced_idx]
print(f"\n언더샘플링 후 학습 데이터: {len(X_train)}개 (양성={y_train.sum()}, 음성={(y_train==0).sum()})") sig_count = (source_train[balanced_idx] == "signal").sum()
hold_count = (source_train[balanced_idx] == "hold_negative").sum()
print(f"\n계층적 샘플링 후 학습 데이터: {len(X_train)}"
f"(Signal={sig_count}, HOLD={hold_count}, "
f"양성={int(y_train.sum())}, 음성={int((y_train==0).sum())})")
print(f"검증 데이터: {len(X_val)}개 (양성={int(y_val.sum())}, 음성={int((y_val==0).sum())})") print(f"검증 데이터: {len(X_val)}개 (양성={int(y_val.sum())}, 음성={int((y_val==0).sum())})")
# --------------------------------------------------------------- # ---------------------------------------------------------------
@@ -354,13 +357,16 @@ def walk_forward_auc(
df = df_raw[base_cols].copy() df = df_raw[base_cols].copy()
dataset = generate_dataset_vectorized( dataset = generate_dataset_vectorized(
df, btc_df=btc_df, eth_df=eth_df, time_weight_decay=time_weight_decay df, btc_df=btc_df, eth_df=eth_df,
time_weight_decay=time_weight_decay,
negative_ratio=5,
) )
actual_feature_cols = [c for c in FEATURE_COLS if c in dataset.columns] actual_feature_cols = [c for c in FEATURE_COLS if c in dataset.columns]
X = dataset[actual_feature_cols].values X = dataset[actual_feature_cols].values
y = dataset["label"].values y = dataset["label"].values
w = dataset["sample_weight"].values w = dataset["sample_weight"].values
n = len(dataset) n = len(dataset)
source = dataset["source"].values if "source" in dataset.columns else np.full(n, "signal")
lgbm_params, weight_scale = _load_lgbm_params(tuned_params_path) lgbm_params, weight_scale = _load_lgbm_params(tuned_params_path)
w = (w * weight_scale).astype(np.float32) w = (w * weight_scale).astype(np.float32)
@@ -378,12 +384,8 @@ def walk_forward_auc(
X_tr, y_tr, w_tr = X[:tr_end], y[:tr_end], w[:tr_end] X_tr, y_tr, w_tr = X[:tr_end], y[:tr_end], w[:tr_end]
X_val, y_val = X[tr_end:val_end], y[tr_end:val_end] X_val, y_val = X[tr_end:val_end], y[tr_end:val_end]
pos_idx = np.where(y_tr == 1)[0] source_tr = source[:tr_end]
neg_idx = np.where(y_tr == 0)[0] idx = stratified_undersample(y_tr, source_tr, seed=42)
if len(neg_idx) > len(pos_idx):
np.random.seed(42)
neg_idx = np.random.choice(neg_idx, size=len(pos_idx), replace=False)
idx = np.sort(np.concatenate([pos_idx, neg_idx]))
model = lgb.LGBMClassifier(**lgbm_params, random_state=42, verbose=-1) model = lgb.LGBMClassifier(**lgbm_params, random_state=42, verbose=-1)
with warnings.catch_warnings(): with warnings.catch_warnings():