feat: implement BTC/ETH correlation features for improved model accuracy

- Added a new design document outlining the integration of BTC/ETH candle data as additional features in the XRP ML filter, enhancing prediction accuracy. - Introduced `MultiSymbolStream` for combined WebSocket data retrieval of XRP, BTC, and ETH. - Expanded feature set from 13 to 21 by including 8 new BTC/ETH-related features. - Updated various scripts and modules to support the new feature set and data handling. - Enhanced training and deployment scripts to accommodate the new dataset structure. This commit lays the groundwork for improved model performance by leveraging the correlation between BTC and ETH with XRP.
2026-03-01 19:30:17 +09:00
parent c4062c39d3
commit d1af736bfc
15 changed files with 1448 additions and 68 deletions
--- a/scripts/train_model.py
+++ b/scripts/train_model.py
@@ -148,11 +148,28 @@ def generate_dataset(df: pd.DataFrame, n_jobs: int | None = None) -> pd.DataFram

 def train(data_path: str):
    print(f"데이터 로드: {data_path}")
-    df = pd.read_parquet(data_path)
-    print(f"캔들 수: {len(df)}")
+    df_raw = pd.read_parquet(data_path)
+    print(f"캔들 수: {len(df_raw)}, 컬럼: {list(df_raw.columns)}")
+
+    # 병합 데이터셋 여부 판별
+    btc_df = None
+    eth_df = None
+    base_cols = ["open", "high", "low", "close", "volume"]
+
+    if "close_btc" in df_raw.columns:
+        btc_df = df_raw[[c + "_btc" for c in base_cols]].copy()
+        btc_df.columns = base_cols
+        print("BTC 피처 활성화")
+
+    if "close_eth" in df_raw.columns:
+        eth_df = df_raw[[c + "_eth" for c in base_cols]].copy()
+        eth_df.columns = base_cols
+        print("ETH 피처 활성화")
+
+    df = df_raw[base_cols].copy()

    print("데이터셋 생성 중...")
-    dataset = generate_dataset_vectorized(df)
+    dataset = generate_dataset_vectorized(df, btc_df=btc_df, eth_df=eth_df)

    if dataset.empty or "label" not in dataset.columns:
        raise ValueError(f"데이터셋 생성 실패: 샘플 0개. 위 오류 메시지를 확인하세요.")
@@ -162,7 +179,9 @@ def train(data_path: str):
    if len(dataset) < 200:
        raise ValueError(f"학습 샘플 부족: {len(dataset)}개 (최소 200 필요)")

-    X = dataset[FEATURE_COLS]
+    actual_feature_cols = [c for c in FEATURE_COLS if c in dataset.columns]
+    print(f"사용 피처: {len(actual_feature_cols)}개 {actual_feature_cols}")
+    X = dataset[actual_feature_cols]
    y = dataset["label"]

    split = int(len(X) * 0.8)
@@ -208,6 +227,7 @@ def train(data_path: str):
        "date": datetime.now().isoformat(),
        "auc": round(auc, 4),
        "samples": len(dataset),
+        "features": len(actual_feature_cols),
        "model_path": str(MODEL_PATH),
    })
    with open(LOG_PATH, "w") as f: