feat: enhance data handling and model training features

- Updated .gitignore to include .venv and .worktrees.
- Removed symlink for the virtual environment.
- Added new parquet files for dogeusdt, trxusdt, and xrpusdt datasets.
- Introduced model files and training logs for dogeusdt, trxusdt, and xrpusdt.
- Enhanced fetch_history.py to support caching of correlation symbols.
- Updated train_and_deploy.sh to manage correlation cache directory.
2026-03-05 23:57:44 +09:00
parent d92fae13f8
commit 2b3f39b5d1
13 changed files with 130 additions and 5 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -14,4 +14,5 @@ data/*.parquet
 .DS_Store
 .cursor/

-.worktrees/
+.worktrees/
+.venv
--- a/.venv
+++ b/.venv
@@ -1 +0,0 @@
-/Users/gihyeon/github/cointrader/.venv
--- a/data/dogeusdt/combined_15m.parquet
+++ b/data/dogeusdt/combined_15m.parquet
--- a/data/trxusdt/combined_15m.parquet
+++ b/data/trxusdt/combined_15m.parquet
--- a/data/xrpusdt/combined_15m.parquet
+++ b/data/xrpusdt/combined_15m.parquet
--- a/models/dogeusdt/lgbm_filter.pkl
+++ b/models/dogeusdt/lgbm_filter.pkl
--- a/models/dogeusdt/training_log.json
+++ b/models/dogeusdt/training_log.json
@@ -0,0 +1,27 @@
+[
+  {
+    "date": "2026-03-05T23:54:51.517734",
+    "backend": "lgbm",
+    "auc": 0.9565,
+    "best_threshold": 0.3318,
+    "best_precision": 0.548,
+    "best_recall": 0.489,
+    "samples": 3330,
+    "features": 26,
+    "time_weight_decay": 2.0,
+    "model_path": "models/dogeusdt/lgbm_filter.pkl",
+    "tuned_params_path": null,
+    "lgbm_params": {
+      "n_estimators": 434,
+      "learning_rate": 0.123659,
+      "max_depth": 6,
+      "num_leaves": 14,
+      "min_child_samples": 10,
+      "subsample": 0.929062,
+      "colsample_bytree": 0.94633,
+      "reg_alpha": 0.573971,
+      "reg_lambda": 0.000157
+    },
+    "weight_scale": 1.783105
+  }
+]
--- a/models/trxusdt/lgbm_filter.pkl
+++ b/models/trxusdt/lgbm_filter.pkl
--- a/models/trxusdt/training_log.json
+++ b/models/trxusdt/training_log.json
@@ -0,0 +1,27 @@
+[
+  {
+    "date": "2026-03-05T23:54:05.625978",
+    "backend": "lgbm",
+    "auc": 0.947,
+    "best_threshold": 0.2822,
+    "best_precision": 0.446,
+    "best_recall": 0.763,
+    "samples": 2940,
+    "features": 26,
+    "time_weight_decay": 2.0,
+    "model_path": "models/trxusdt/lgbm_filter.pkl",
+    "tuned_params_path": null,
+    "lgbm_params": {
+      "n_estimators": 434,
+      "learning_rate": 0.123659,
+      "max_depth": 6,
+      "num_leaves": 14,
+      "min_child_samples": 10,
+      "subsample": 0.929062,
+      "colsample_bytree": 0.94633,
+      "reg_alpha": 0.573971,
+      "reg_lambda": 0.000157
+    },
+    "weight_scale": 1.783105
+  }
+]
--- a/models/xrpusdt/lgbm_filter.pkl
+++ b/models/xrpusdt/lgbm_filter.pkl
--- a/models/xrpusdt/training_log.json
+++ b/models/xrpusdt/training_log.json
@@ -0,0 +1,27 @@
+[
+  {
+    "date": "2026-03-05T23:53:20.451588",
+    "backend": "lgbm",
+    "auc": 0.9428,
+    "best_threshold": 0.8486,
+    "best_precision": 0.583,
+    "best_recall": 0.171,
+    "samples": 3222,
+    "features": 26,
+    "time_weight_decay": 2.0,
+    "model_path": "models/xrpusdt/lgbm_filter.pkl",
+    "tuned_params_path": null,
+    "lgbm_params": {
+      "n_estimators": 434,
+      "learning_rate": 0.123659,
+      "max_depth": 6,
+      "num_leaves": 14,
+      "min_child_samples": 10,
+      "subsample": 0.929062,
+      "colsample_bytree": 0.94633,
+      "reg_alpha": 0.573971,
+      "reg_lambda": 0.000157
+    },
+    "weight_scale": 1.783105
+  }
+]
--- a/scripts/fetch_history.py
+++ b/scripts/fetch_history.py
@@ -331,6 +331,10 @@ def main():
        "--no-upsert", action="store_true",
        help="기존 parquet을 Upsert하지 않고 새로 덮어씀 (기본: Upsert 활성화)",
    )
+    parser.add_argument(
+        "--corr-cache-dir", default=None,
+        help="상관 심볼(BTC/ETH) 캐시 디렉토리. 첫 수집 시 저장, 이후 재사용",
+    )
    args = parser.parse_args()

    # --symbol 모드: 단일 거래 심볼 + 상관관계 심볼 자동 추가, 출력 경로 자동 결정
@@ -360,8 +364,43 @@ def main():
        df.to_parquet(args.output)
        print(f"{'Upsert' if not args.no_upsert else '저장'} 완료: {args.output} ({len(df):,}행, {len(df.columns)}컬럼)")
    else:
-        # 멀티 심볼: 단일 클라이언트로 순차 수집 후 타임스탬프 기준 inner join 병합
-        dfs = asyncio.run(fetch_klines_all(args.symbols, args.interval, args.days))
+        # 멀티 심볼: 상관 심볼 캐시 활용
+        corr_cache_dir = args.corr_cache_dir
+        cached_symbols = {}
+        symbols_to_fetch = list(args.symbols)
+
+        if corr_cache_dir:
+            os.makedirs(corr_cache_dir, exist_ok=True)
+            remaining = []
+            for sym in args.symbols:
+                cache_file = os.path.join(corr_cache_dir, f"{sym.lower()}_{args.interval}.parquet")
+                if os.path.exists(cache_file):
+                    print(f"  [{sym}] 캐시 사용: {cache_file}")
+                    cached_symbols[sym] = pd.read_parquet(cache_file)
+                else:
+                    remaining.append(sym)
+            symbols_to_fetch = remaining
+
+        if symbols_to_fetch:
+            dfs = asyncio.run(fetch_klines_all(symbols_to_fetch, args.interval, args.days))
+        else:
+            dfs = {}
+
+        # 캐시에 저장 (상관 심볼만)
+        if corr_cache_dir:
+            from src.config import Config
+            try:
+                corr_list = Config().correlation_symbols
+            except Exception:
+                corr_list = ["BTCUSDT", "ETHUSDT"]
+            for sym, df in dfs.items():
+                if sym in corr_list:
+                    cache_file = os.path.join(corr_cache_dir, f"{sym.lower()}_{args.interval}.parquet")
+                    df.to_parquet(cache_file)
+                    print(f"  [{sym}] 캐시 저장: {cache_file}")
+
+        # 캐시 + 새로 수집한 데이터 합치기
+        dfs.update(cached_symbols)

        primary = args.symbols[0]
        merged = dfs[primary].copy()
@@ -377,7 +416,7 @@ def main():
            print(f"\n[OI/펀딩비] {primary} 수집 중...")
            merged = asyncio.run(_fetch_oi_and_funding(primary, args.days, merged))

-        output = args.output.replace("xrpusdt", "combined")
+        output = args.output
        if not args.no_upsert:
            merged = upsert_parquet(output, merged)
        merged.to_parquet(output)
--- a/scripts/train_and_deploy.sh
+++ b/scripts/train_and_deploy.sh
@@ -68,6 +68,7 @@ else
 fi

 DECAY="${TIME_WEIGHT_DECAY:-2.0}"
+CORR_CACHE_DIR="data/.corr_cache"

 echo ""
 echo "========================================"
@@ -106,6 +107,7 @@ for SYM in "${TARGETS[@]}"; do
        --symbol "$SYM" \
        --interval 15m \
        --days "$FETCH_DAYS" \
+        --corr-cache-dir "$CORR_CACHE_DIR" \
        $UPSERT_FLAG

    # === [1.5/3] OI 파생 피처 A/B 비교 ===
@@ -154,6 +156,9 @@ for SYM in "${TARGETS[@]}"; do
    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
 done

+# 상관 심볼 캐시 정리
+rm -rf "$CORR_CACHE_DIR"
+
 echo ""
 echo "=== 전체 파이프라인 완료: $(date '+%Y-%m-%d %H:%M:%S %Z') ==="
 echo ""