feat: enhance data fetching and model training with OI and funding rate integration

- Updated `fetch_history.py` to collect open interest (OI) and funding rate data from Binance, improving the dataset for model training.
- Modified `train_and_deploy.sh` to include options for OI and funding rate collection during data fetching.
- Enhanced `dataset_builder.py` to incorporate OI change and funding rate features with rolling z-score normalization.
- Updated training logs to reflect new metrics and features, ensuring comprehensive tracking of model performance.
- Adjusted feature columns in `ml_features.py` to include OI and funding rate for improved model robustness.
2026-03-01 22:25:38 +09:00
parent 4245d7cdbf
commit 24d3ba9411
8 changed files with 232 additions and 13 deletions
--- a/scripts/fetch_history.py
+++ b/scripts/fetch_history.py
@@ -2,6 +2,11 @@
 바이낸스 선물 REST API로 과거 캔들 데이터를 수집해 parquet으로 저장한다.
 사용법: python scripts/fetch_history.py --symbol XRPUSDT --interval 1m --days 90
       python scripts/fetch_history.py --symbols XRPUSDT BTCUSDT ETHUSDT --days 90
+
+OI/펀딩비 수집 제약:
+  - OI 히스토리: 바이낸스 API 제한으로 최근 30일치만 제공 (period=15m, limit=500/req)
+  - 펀딩비: 8시간 주기 → 15분봉에 forward-fill 병합
+  - 30일 이전 구간은 oi_change=0, funding_rate=0으로 채움
 """
 import sys
 from pathlib import Path
@@ -9,6 +14,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent))

 import asyncio
 import argparse
+import aiohttp
 from datetime import datetime, timezone, timedelta
 import pandas as pd
 from binance import AsyncClient
@@ -21,6 +27,7 @@ load_dotenv()
 # 1500개씩 가져오므로 90일 1m 데이터 = ~65회 요청/심볼
 # 심볼 간 딜레이 없이 연속 요청하면 레이트 리밋(-1003) 발생
 _REQUEST_DELAY = 0.3  # 초당 ~3.3 req → 안전 마진 충분
+_FAPI_BASE = "https://fapi.binance.com"


 def _now_ms() -> int:
@@ -107,6 +114,151 @@ async def fetch_klines_all(
    return dfs


+async def _fetch_oi_hist(
+    session: aiohttp.ClientSession,
+    symbol: str,
+    period: str = "15m",
+) -> pd.DataFrame:
+    """
+    Collect open-interest (OI) history from Binance's /futures/data/openInterestHist endpoint and return a UTC-indexed frame with a single `oi_change` column.
+    API limits: only the most recent 30 days are served, at most 500 rows per request.
+    """
+    url = f"{_FAPI_BASE}/futures/data/openInterestHist"
+    all_rows = []
+    # Collect from 30 days ago up to now.
+    start_ts = int((datetime.now(timezone.utc) - timedelta(days=30)).timestamp() * 1000)
+    now_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
+
+    print(f"  [{symbol}] OI 히스토리 수집 중 (최근 30일)...")
+    while start_ts < now_ms:
+        params = {
+            "symbol": symbol,
+            "period": period,
+            "limit": 500,
+            "startTime": start_ts,
+        }
+        async with session.get(url, params=params) as resp:
+            data = await resp.json()
+
+        if not data or not isinstance(data, list):  # NOTE(review): empty or non-list payloads (presumably API error objects) end paging silently
+            break
+
+        all_rows.extend(data)
+        last_ts = int(data[-1]["timestamp"])
+        if last_ts >= now_ms or len(data) < 500:  # a page shorter than the requested limit ends the loop
+            break
+        start_ts = last_ts + 1  # resume 1 ms after the last row received
+        await asyncio.sleep(_REQUEST_DELAY)
+
+    if not all_rows:
+        print(f"  [{symbol}] OI 데이터 없음 — 빈 DataFrame 반환")
+        return pd.DataFrame(columns=["oi", "oi_value"])  # NOTE(review): columns differ from the normal return (["oi_change"])
+
+    df = pd.DataFrame(all_rows)
+    df["timestamp"] = pd.to_datetime(df["timestamp"].astype(int), unit="ms", utc=True)
+    df = df.set_index("timestamp")
+    df = df[["sumOpenInterest", "sumOpenInterestValue"]].copy()
+    df.columns = ["oi", "oi_value"]
+    df["oi"] = df["oi"].astype(float)
+    df["oi_value"] = df["oi_value"].astype(float)
+    # OI rate of change versus the previous row; NaN results (such as the first row) are filled with 0.
+    df["oi_change"] = df["oi"].pct_change(1).fillna(0)
+    print(f"  [{symbol}] OI 수집 완료: {len(df):,}행")
+    return df[["oi_change"]]
+
+
+async def _fetch_funding_rate(
+    session: aiohttp.ClientSession,
+    symbol: str,
+    days: int,
+) -> pd.DataFrame:
+    """
+    Collect funding-rate history for the last `days` days from Binance's /fapi/v1/fundingRate endpoint and return a UTC-indexed frame with a single `funding_rate` column.
+    The data arrives on an 8-hour cadence and is meant to be forward-filled onto the 15-minute candle index later.
+    """
+    url = f"{_FAPI_BASE}/fapi/v1/fundingRate"
+    all_rows = []
+    start_ts = int((datetime.now(timezone.utc) - timedelta(days=days)).timestamp() * 1000)
+    now_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
+
+    print(f"  [{symbol}] 펀딩비 히스토리 수집 중 ({days}일)...")
+    while start_ts < now_ms:
+        params = {
+            "symbol": symbol,
+            "startTime": start_ts,
+            "limit": 1000,
+        }
+        async with session.get(url, params=params) as resp:
+            data = await resp.json()
+
+        if not data or not isinstance(data, list):  # NOTE(review): empty or non-list payloads (presumably API error objects) end paging silently
+            break
+
+        all_rows.extend(data)
+        last_ts = int(data[-1]["fundingTime"])
+        if last_ts >= now_ms or len(data) < 1000:  # a page shorter than the requested limit ends the loop
+            break
+        start_ts = last_ts + 1  # resume 1 ms after the last row received
+        await asyncio.sleep(_REQUEST_DELAY)
+
+    if not all_rows:
+        print(f"  [{symbol}] 펀딩비 데이터 없음 — 빈 DataFrame 반환")
+        return pd.DataFrame(columns=["funding_rate"])
+
+    df = pd.DataFrame(all_rows)
+    df["timestamp"] = pd.to_datetime(df["fundingTime"].astype(int), unit="ms", utc=True)
+    df = df.set_index("timestamp")
+    df["funding_rate"] = df["fundingRate"].astype(float)
+    print(f"  [{symbol}] 펀딩비 수집 완료: {len(df):,}행")
+    return df[["funding_rate"]]
+
+
+def _merge_oi_funding(
+    candles: pd.DataFrame,
+    oi_df: pd.DataFrame,
+    funding_df: pd.DataFrame,
+) -> pd.DataFrame:
+    """
+    Merge the OI rate of change and the funding rate into a copy of the candle DataFrame.
+    - oi_change: nearest-timestamp match onto the candle index within 8 minutes (uncovered rows become 0)
+    - funding_rate: 8-hour cadence, forward-filled and then aligned to the candle index (uncovered rows become 0)
+    """
+    result = candles.copy()
+
+    # OI merge: reindex onto the candle timestamps with nearest fill, limited by the tolerance.
+    if not oi_df.empty:
+        oi_reindexed = oi_df.reindex(result.index, method="nearest", tolerance=pd.Timedelta("8min"))
+        result["oi_change"] = oi_reindexed["oi_change"].fillna(0).astype(float)
+    else:
+        result["oi_change"] = 0.0
+
+    # Funding merge: forward-fill (8-hour cadence, so the previous value holds until the next funding time).
+    if not funding_df.empty:
+        funding_reindexed = funding_df.reindex(
+            result.index.union(funding_df.index)  # keep funding timestamps that fall between candles so ffill can see them
+        ).sort_index()
+        funding_reindexed = funding_reindexed["funding_rate"].ffill()
+        result["funding_rate"] = funding_reindexed.reindex(result.index).fillna(0).astype(float)  # rows before the first funding value become 0
+    else:
+        result["funding_rate"] = 0.0
+
+    return result
+
+
+async def _fetch_oi_and_funding(
+    symbol: str,
+    days: int,
+    candles: pd.DataFrame,
+) -> pd.DataFrame:
+    """Fetch OI and funding-rate history for a single symbol and merge them into `candles`."""
+    async with aiohttp.ClientSession() as session:
+        oi_df = await _fetch_oi_hist(session, symbol)
+        await asyncio.sleep(1)  # pause between the two endpoints — presumably to stay under the rate limit
+        funding_df = await _fetch_funding_rate(session, symbol, days)
+
+    return _merge_oi_funding(candles, oi_df, funding_df)
+
+
 def main():
    parser = argparse.ArgumentParser(
        description="바이낸스 선물 과거 캔들 수집. 단일 심볼 또는 멀티 심볼 병합 저장."
@@ -116,6 +268,10 @@ def main():
    parser.add_argument("--interval", default="15m")
    parser.add_argument("--days",     type=int, default=365)
    parser.add_argument("--output",   default="data/combined_15m.parquet")
+    parser.add_argument(
+        "--no-oi", action="store_true",
+        help="OI/펀딩비 수집을 건너뜀 (캔들 데이터만 저장)",
+    )
    args = parser.parse_args()

    # 하위 호환: --symbol 단독 사용 시 symbols로 통합
@@ -124,8 +280,11 @@ def main():

    if len(args.symbols) == 1:
        df = asyncio.run(fetch_klines(args.symbols[0], args.interval, args.days))
+        if not args.no_oi:
+            print(f"\n[OI/펀딩비] {args.symbols[0]} 수집 중...")
+            df = asyncio.run(_fetch_oi_and_funding(args.symbols[0], args.days, df))
        df.to_parquet(args.output)
-        print(f"저장 완료: {args.output} ({len(df):,}행)")
+        print(f"저장 완료: {args.output} ({len(df):,}행, {len(df.columns)}컬럼)")
    else:
        # 멀티 심볼: 단일 클라이언트로 순차 수집 후 타임스탬프 기준 inner join 병합
        dfs = asyncio.run(fetch_klines_all(args.symbols, args.interval, args.days))
@@ -139,6 +298,11 @@ def main():
                how="inner",
            )

+        # 주 심볼(XRP)에 대해서만 OI/펀딩비 수집 후 병합
+        if not args.no_oi:
+            print(f"\n[OI/펀딩비] {primary} 수집 중...")
+            merged = asyncio.run(_fetch_oi_and_funding(primary, args.days, merged))
+
        output = args.output.replace("xrpusdt", "combined")
        merged.to_parquet(output)
        print(f"\n병합 저장 완료: {output} ({len(merged):,}행, {len(merged.columns)}컬럼)")
--- a/scripts/train_and_deploy.sh
+++ b/scripts/train_and_deploy.sh
@@ -1,10 +1,12 @@
 #!/usr/bin/env bash
 # 맥미니에서 전체 학습 파이프라인을 실행하고 LXC로 배포한다.
-# 사용법: bash scripts/train_and_deploy.sh [mlx|lgbm]
+# 사용법: bash scripts/train_and_deploy.sh [mlx|lgbm] [wf-splits]
 #
 # 예시:
-#   bash scripts/train_and_deploy.sh        # LightGBM (기본값)
-#   bash scripts/train_and_deploy.sh mlx    # MLX GPU 학습
+#   bash scripts/train_and_deploy.sh             # LightGBM + Walk-Forward 5폴드 (기본값)
+#   bash scripts/train_and_deploy.sh mlx         # MLX GPU 학습 + Walk-Forward 5폴드
+#   bash scripts/train_and_deploy.sh lgbm 3      # LightGBM + Walk-Forward 3폴드
+#   bash scripts/train_and_deploy.sh lgbm 0      # Walk-Forward 건너뜀 (단일 학습만)

 set -euo pipefail

@@ -20,10 +22,11 @@ else
 fi

 BACKEND="${1:-lgbm}"
+WF_SPLITS="${2:-5}"   # 두 번째 인자: Walk-Forward 폴드 수 (0이면 건너뜀)

 cd "$PROJECT_ROOT"

-echo "=== [1/3] 데이터 수집 (XRP + BTC + ETH 3심볼, 1년치) ==="
+echo "=== [1/3] 데이터 수집 (XRP + BTC + ETH 3심볼, 1년치 + OI/펀딩비) ==="
 python scripts/fetch_history.py \
    --symbols XRPUSDT BTCUSDT ETHUSDT \
    --interval 15m \
@@ -31,7 +34,7 @@ python scripts/fetch_history.py \
    --output data/combined_15m.parquet

 echo ""
-echo "=== [2/3] 모델 학습 (21개 피처: XRP 13 + BTC/ETH 상관관계 8) ==="
+echo "=== [2/3] 모델 학습 (23개 피처: XRP 13 + BTC/ETH 8 + OI/펀딩비 2) ==="
 DECAY="${TIME_WEIGHT_DECAY:-2.0}"
 if [ "$BACKEND" = "mlx" ]; then
    echo "  백엔드: MLX (Apple Silicon GPU), decay=${DECAY}"
@@ -41,6 +44,17 @@ else
    python scripts/train_model.py --data data/combined_15m.parquet --decay "$DECAY"
 fi

+# Walk-Forward validation: runs only when WF_SPLITS > 0 and the backend is not mlx. A non-integer WF_SPLITS makes the test fail quietly (2>/dev/null), so the step is skipped.
+if [ "$WF_SPLITS" -gt 0 ] 2>/dev/null && [ "$BACKEND" != "mlx" ]; then
+    echo ""
+    echo "=== [2.5/3] Walk-Forward 검증 (${WF_SPLITS}폴드) ==="
+    python scripts/train_model.py \
+        --data data/combined_15m.parquet \
+        --decay "$DECAY" \
+        --wf \
+        --wf-splits "$WF_SPLITS"
+fi
+
 echo ""
 echo "=== [3/3] LXC 배포 ==="
 bash scripts/deploy_model.sh "$BACKEND"