feat: enhance model training and deployment scripts with time-weighted sampling

- Updated `train_model.py` and `train_mlx_model.py` to include a time weight decay parameter for improved sample weighting during training.
- Modified dataset generation to incorporate sample weights based on time decay, enhancing model performance.
- Adjusted deployment scripts to support new backend options and improved error handling for model file transfers.
- Added new entries to the training log for better tracking of model performance metrics over time.
- Included ONNX model export functionality in the MLX filter for compatibility with Linux servers.
2026-03-01 21:25:06 +09:00
parent 301457ce57
commit db144750a3
10 changed files with 324 additions and 97 deletions
--- a/scripts/train_mlx_model.py
+++ b/scripts/train_mlx_model.py
@@ -25,14 +25,14 @@ MLX_MODEL_PATH = Path("models/mlx_filter.weights")
 LOG_PATH = Path("models/training_log.json")


-def train_mlx(data_path: str) -> float:
+def train_mlx(data_path: str, time_weight_decay: float = 2.0) -> float:
    print(f"데이터 로드: {data_path}")
    df = pd.read_parquet(data_path)
    print(f"캔들 수: {len(df)}")

    print("\n데이터셋 생성 중...")
    t0 = time.perf_counter()
-    dataset = generate_dataset_vectorized(df)
+    dataset = generate_dataset_vectorized(df, time_weight_decay=time_weight_decay)
    t1 = time.perf_counter()
    print(f"데이터셋 생성 완료: {t1 - t0:.1f}초, {len(dataset)}개 샘플")

@@ -46,10 +46,30 @@ def train_mlx(data_path: str) -> float:

    X = dataset[FEATURE_COLS]
    y = dataset["label"]
+    w = dataset["sample_weight"].values

    split = int(len(X) * 0.8)
    X_train, X_val = X.iloc[:split], X.iloc[split:]
    y_train, y_val = y.iloc[:split], y.iloc[split:]
+    w_train = w[:split]
+
+    # --- 클래스 불균형 처리: 언더샘플링 (가중치 인덱스 보존) ---
+    pos_idx = np.where(y_train == 1)[0]
+    neg_idx = np.where(y_train == 0)[0]
+
+    if len(neg_idx) > len(pos_idx):
+        np.random.seed(42)
+        neg_idx = np.random.choice(neg_idx, size=len(pos_idx), replace=False)
+
+    balanced_idx = np.concatenate([pos_idx, neg_idx])
+    np.random.shuffle(balanced_idx)
+
+    X_train = X_train.iloc[balanced_idx]
+    y_train = y_train.iloc[balanced_idx]
+    w_train = w_train[balanced_idx]
+
+    print(f"\n언더샘플링 적용 후 학습 데이터: {len(X_train)}개 (양성={y_train.sum()}, 음성={(y_train==0).sum()})")
+    # --------------------------------------

    print("\nMLX 신경망 학습 시작 (GPU)...")
    t2 = time.perf_counter()
@@ -60,7 +80,7 @@ def train_mlx(data_path: str) -> float:
        epochs=100,
        batch_size=256,
    )
-    model.fit(X_train, y_train)
+    model.fit(X_train, y_train, sample_weight=w_train)
    t3 = time.perf_counter()
    print(f"학습 완료: {t3 - t2:.1f}초")

@@ -83,6 +103,7 @@ def train_mlx(data_path: str) -> float:
        "auc": round(auc, 4),
        "samples": len(dataset),
        "train_sec": round(t3 - t2, 1),
+        "time_weight_decay": time_weight_decay,
        "model_path": str(MLX_MODEL_PATH),
    })
    with open(LOG_PATH, "w") as f:
@@ -94,8 +115,12 @@ def train_mlx(data_path: str) -> float:
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", default="data/xrpusdt_1m.parquet")
+    parser.add_argument(
+        "--decay", type=float, default=2.0,
+        help="시간 가중치 감쇠 강도 (0=균등, 2.0=최신이 ~7.4배 높음)",
+    )
    args = parser.parse_args()
-    train_mlx(args.data)
+    train_mlx(args.data, time_weight_decay=args.decay)


 if __name__ == "__main__":