From 4afc7506d73c11bff076ba82c6d1f9c3ad8c5118 Mon Sep 17 00:00:00 2001
From: 21in7
Date: Mon, 2 Mar 2026 14:45:15 +0900
Subject: [PATCH] feat: connect Optuna tuning results to train_model.py via --tuned-params
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- _load_lgbm_params() 헬퍼 추가: 기본 파라미터 반환, JSON 주어지면 덮어씀
- train(): tuned_params_path 인자 추가, weight_scale 적용
- walk_forward_auc(): tuned_params_path 인자 추가, weight_scale 적용
- main(): --tuned-params argparse 인자 추가, 두 함수에 전달
- training_log.json에 tuned_params_path, lgbm_params, weight_scale 기록

Made-with: Cursor
---
 scripts/train_model.py | 87 +++++++++++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 30 deletions(-)

diff --git a/scripts/train_model.py b/scripts/train_model.py
index 8d25de2..baab02e 100644
--- a/scripts/train_model.py
+++ b/scripts/train_model.py
@@ -146,7 +146,36 @@ def generate_dataset(df: pd.DataFrame, n_jobs: int | None = None) -> pd.DataFram
     return pd.DataFrame(rows)
 
 
-def train(data_path: str, time_weight_decay: float = 2.0):
+def _load_lgbm_params(tuned_params_path: str | None) -> tuple[dict, float]:
+    """기본 LightGBM 파라미터를 반환하고, 튜닝 JSON이 주어지면 덮어쓴다.
+    반환: (lgbm_params, weight_scale)
+    """
+    lgbm_params: dict = {
+        "n_estimators": 500,
+        "learning_rate": 0.05,
+        "num_leaves": 31,
+        "min_child_samples": 15,
+        "subsample": 0.8,
+        "colsample_bytree": 0.8,
+        "reg_alpha": 0.05,
+        "reg_lambda": 0.1,
+    }
+    weight_scale = 1.0
+
+    if tuned_params_path:
+        with open(tuned_params_path, "r", encoding="utf-8") as f:
+            tune_data = json.load(f)
+        best_params = dict(tune_data["best_trial"]["params"])
+        weight_scale = float(best_params.pop("weight_scale", 1.0))
+        lgbm_params.update(best_params)
+        print(f"\n[Optuna] 튜닝 파라미터 로드: {tuned_params_path}")
+        print(f"[Optuna] 적용 파라미터: {lgbm_params}")
+        print(f"[Optuna] weight_scale: {weight_scale}\n")
+
+    return lgbm_params, weight_scale
+
+
+def train(data_path: str, time_weight_decay: float = 2.0, tuned_params_path: str | None = None):
     print(f"데이터 로드: {data_path}")
     df_raw = pd.read_parquet(data_path)
     print(f"캔들 수: {len(df_raw)}, 컬럼: {list(df_raw.columns)}")
@@ -188,7 +217,10 @@ def train(data_path: str, time_weight_decay: float = 2.0):
     split = int(len(X) * 0.8)
     X_train, X_val = X.iloc[:split], X.iloc[split:]
     y_train, y_val = y.iloc[:split], y.iloc[split:]
-    w_train = w[:split]
+
+    # 튜닝 파라미터 로드 (없으면 기본값 사용)
+    lgbm_params, weight_scale = _load_lgbm_params(tuned_params_path)
+    w_train = (w[:split] * weight_scale).astype(np.float32)
 
     # --- 클래스 불균형 처리: 언더샘플링 (시간 가중치 인덱스 보존) ---
     pos_idx = np.where(y_train == 1)[0]
@@ -208,18 +240,7 @@ def train(data_path: str, time_weight_decay: float = 2.0):
     print(f"검증 데이터: {len(X_val)}개 (양성={int(y_val.sum())}, 음성={int((y_val==0).sum())})")
     # ---------------------------------------------------------------
 
-    model = lgb.LGBMClassifier(
-        n_estimators=500,
-        learning_rate=0.05,
-        num_leaves=31,
-        min_child_samples=15,
-        subsample=0.8,
-        colsample_bytree=0.8,
-        reg_alpha=0.05,
-        reg_lambda=0.1,
-        random_state=42,
-        verbose=-1,
-    )
+    model = lgb.LGBMClassifier(**lgbm_params, random_state=42, verbose=-1)
     model.fit(
         X_train, y_train,
         sample_weight=w_train,
@@ -268,7 +289,7 @@ def train(data_path: str, time_weight_decay: float = 2.0):
     if LOG_PATH.exists():
         with open(LOG_PATH) as f:
            log = json.load(f)
-    log.append({
+    log_entry: dict = {
         "date": datetime.now().isoformat(),
         "backend": "lgbm",
         "auc": round(auc, 4),
@@ -279,7 +300,11 @@ def train(data_path: str, time_weight_decay: float = 2.0):
         "features": len(actual_feature_cols),
         "time_weight_decay": time_weight_decay,
         "model_path": str(MODEL_PATH),
-    })
+        "tuned_params_path": tuned_params_path,
+        "lgbm_params": lgbm_params,
+        "weight_scale": weight_scale,
+    }
+    log.append(log_entry)
     with open(LOG_PATH, "w") as f:
         json.dump(log, f, indent=2)
 
@@ -291,6 +316,7 @@ def walk_forward_auc(
     time_weight_decay: float = 2.0,
     n_splits: int = 5,
     train_ratio: float = 0.6,
+    tuned_params_path: str | None = None,
 ) -> None:
     """Walk-Forward 검증: 슬라이딩 윈도우로 n_splits번 학습/검증 반복.
 
@@ -320,6 +346,9 @@ def walk_forward_auc(
     w = dataset["sample_weight"].values
     n = len(dataset)
 
+    lgbm_params, weight_scale = _load_lgbm_params(tuned_params_path)
+    w = (w * weight_scale).astype(np.float32)
+
     step = max(1, int(n * (1 - train_ratio) / n_splits))
     train_end_start = int(n * train_ratio)
 
@@ -340,18 +369,7 @@ def walk_forward_auc(
             neg_idx = np.random.choice(neg_idx, size=len(pos_idx), replace=False)
         idx = np.sort(np.concatenate([pos_idx, neg_idx]))
 
-        model = lgb.LGBMClassifier(
-            n_estimators=500,
-            learning_rate=0.05,
-            num_leaves=31,
-            min_child_samples=15,
-            subsample=0.8,
-            colsample_bytree=0.8,
-            reg_alpha=0.05,
-            reg_lambda=0.1,
-            random_state=42,
-            verbose=-1,
-        )
+        model = lgb.LGBMClassifier(**lgbm_params, random_state=42, verbose=-1)
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
             model.fit(X_tr[idx], y_tr[idx], sample_weight=w_tr[idx])
@@ -377,12 +395,21 @@ def main():
     )
     parser.add_argument("--wf", action="store_true", help="Walk-Forward 검증 실행")
    parser.add_argument("--wf-splits", type=int, default=5, help="Walk-Forward 폴드 수")
+    parser.add_argument(
+        "--tuned-params", type=str, default=None,
+        help="Optuna 튜닝 결과 JSON 경로 (지정 시 기본 파라미터를 덮어씀)",
+    )
     args = parser.parse_args()
 
     if args.wf:
-        walk_forward_auc(args.data, time_weight_decay=args.decay, n_splits=args.wf_splits)
+        walk_forward_auc(
+            args.data,
+            time_weight_decay=args.decay,
+            n_splits=args.wf_splits,
+            tuned_params_path=args.tuned_params,
+        )
     else:
-        train(args.data, time_weight_decay=args.decay)
+        train(args.data, time_weight_decay=args.decay, tuned_params_path=args.tuned_params)
 
 
 if __name__ == "__main__":