feat: enhance data handling and model training features
- Updated .gitignore to include .venv and .worktrees.
- Removed symlink for the virtual environment.
- Added new parquet files for dogeusdt, trxusdt, and xrpusdt datasets.
- Introduced model files and training logs for dogeusdt, trxusdt, and xrpusdt.
- Enhanced fetch_history.py to support caching of correlation symbols.
- Updated train_and_deploy.sh to manage correlation cache directory.
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -14,4 +14,5 @@ data/*.parquet
|
||||
.DS_Store
|
||||
.cursor/
|
||||
|
||||
.worktrees/
|
||||
.worktrees/
|
||||
.venv
|
||||
|
||||
BIN
data/dogeusdt/combined_15m.parquet
Normal file
BIN
data/dogeusdt/combined_15m.parquet
Normal file
Binary file not shown.
BIN
data/trxusdt/combined_15m.parquet
Normal file
BIN
data/trxusdt/combined_15m.parquet
Normal file
Binary file not shown.
BIN
data/xrpusdt/combined_15m.parquet
Normal file
BIN
data/xrpusdt/combined_15m.parquet
Normal file
Binary file not shown.
BIN
models/dogeusdt/lgbm_filter.pkl
Normal file
BIN
models/dogeusdt/lgbm_filter.pkl
Normal file
Binary file not shown.
27
models/dogeusdt/training_log.json
Normal file
27
models/dogeusdt/training_log.json
Normal file
@@ -0,0 +1,27 @@
|
||||
[
|
||||
{
|
||||
"date": "2026-03-05T23:54:51.517734",
|
||||
"backend": "lgbm",
|
||||
"auc": 0.9565,
|
||||
"best_threshold": 0.3318,
|
||||
"best_precision": 0.548,
|
||||
"best_recall": 0.489,
|
||||
"samples": 3330,
|
||||
"features": 26,
|
||||
"time_weight_decay": 2.0,
|
||||
"model_path": "models/dogeusdt/lgbm_filter.pkl",
|
||||
"tuned_params_path": null,
|
||||
"lgbm_params": {
|
||||
"n_estimators": 434,
|
||||
"learning_rate": 0.123659,
|
||||
"max_depth": 6,
|
||||
"num_leaves": 14,
|
||||
"min_child_samples": 10,
|
||||
"subsample": 0.929062,
|
||||
"colsample_bytree": 0.94633,
|
||||
"reg_alpha": 0.573971,
|
||||
"reg_lambda": 0.000157
|
||||
},
|
||||
"weight_scale": 1.783105
|
||||
}
|
||||
]
|
||||
BIN
models/trxusdt/lgbm_filter.pkl
Normal file
BIN
models/trxusdt/lgbm_filter.pkl
Normal file
Binary file not shown.
27
models/trxusdt/training_log.json
Normal file
27
models/trxusdt/training_log.json
Normal file
@@ -0,0 +1,27 @@
|
||||
[
|
||||
{
|
||||
"date": "2026-03-05T23:54:05.625978",
|
||||
"backend": "lgbm",
|
||||
"auc": 0.947,
|
||||
"best_threshold": 0.2822,
|
||||
"best_precision": 0.446,
|
||||
"best_recall": 0.763,
|
||||
"samples": 2940,
|
||||
"features": 26,
|
||||
"time_weight_decay": 2.0,
|
||||
"model_path": "models/trxusdt/lgbm_filter.pkl",
|
||||
"tuned_params_path": null,
|
||||
"lgbm_params": {
|
||||
"n_estimators": 434,
|
||||
"learning_rate": 0.123659,
|
||||
"max_depth": 6,
|
||||
"num_leaves": 14,
|
||||
"min_child_samples": 10,
|
||||
"subsample": 0.929062,
|
||||
"colsample_bytree": 0.94633,
|
||||
"reg_alpha": 0.573971,
|
||||
"reg_lambda": 0.000157
|
||||
},
|
||||
"weight_scale": 1.783105
|
||||
}
|
||||
]
|
||||
BIN
models/xrpusdt/lgbm_filter.pkl
Normal file
BIN
models/xrpusdt/lgbm_filter.pkl
Normal file
Binary file not shown.
27
models/xrpusdt/training_log.json
Normal file
27
models/xrpusdt/training_log.json
Normal file
@@ -0,0 +1,27 @@
|
||||
[
|
||||
{
|
||||
"date": "2026-03-05T23:53:20.451588",
|
||||
"backend": "lgbm",
|
||||
"auc": 0.9428,
|
||||
"best_threshold": 0.8486,
|
||||
"best_precision": 0.583,
|
||||
"best_recall": 0.171,
|
||||
"samples": 3222,
|
||||
"features": 26,
|
||||
"time_weight_decay": 2.0,
|
||||
"model_path": "models/xrpusdt/lgbm_filter.pkl",
|
||||
"tuned_params_path": null,
|
||||
"lgbm_params": {
|
||||
"n_estimators": 434,
|
||||
"learning_rate": 0.123659,
|
||||
"max_depth": 6,
|
||||
"num_leaves": 14,
|
||||
"min_child_samples": 10,
|
||||
"subsample": 0.929062,
|
||||
"colsample_bytree": 0.94633,
|
||||
"reg_alpha": 0.573971,
|
||||
"reg_lambda": 0.000157
|
||||
},
|
||||
"weight_scale": 1.783105
|
||||
}
|
||||
]
|
||||
@@ -331,6 +331,10 @@ def main():
|
||||
"--no-upsert", action="store_true",
|
||||
help="기존 parquet을 Upsert하지 않고 새로 덮어씀 (기본: Upsert 활성화)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--corr-cache-dir", default=None,
|
||||
help="상관 심볼(BTC/ETH) 캐시 디렉토리. 첫 수집 시 저장, 이후 재사용",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# --symbol 모드: 단일 거래 심볼 + 상관관계 심볼 자동 추가, 출력 경로 자동 결정
|
||||
@@ -360,8 +364,43 @@ def main():
|
||||
df.to_parquet(args.output)
|
||||
print(f"{'Upsert' if not args.no_upsert else '저장'} 완료: {args.output} ({len(df):,}행, {len(df.columns)}컬럼)")
|
||||
else:
|
||||
# 멀티 심볼: 단일 클라이언트로 순차 수집 후 타임스탬프 기준 inner join 병합
|
||||
dfs = asyncio.run(fetch_klines_all(args.symbols, args.interval, args.days))
|
||||
# 멀티 심볼: 상관 심볼 캐시 활용
|
||||
corr_cache_dir = args.corr_cache_dir
|
||||
cached_symbols = {}
|
||||
symbols_to_fetch = list(args.symbols)
|
||||
|
||||
if corr_cache_dir:
|
||||
os.makedirs(corr_cache_dir, exist_ok=True)
|
||||
remaining = []
|
||||
for sym in args.symbols:
|
||||
cache_file = os.path.join(corr_cache_dir, f"{sym.lower()}_{args.interval}.parquet")
|
||||
if os.path.exists(cache_file):
|
||||
print(f" [{sym}] 캐시 사용: {cache_file}")
|
||||
cached_symbols[sym] = pd.read_parquet(cache_file)
|
||||
else:
|
||||
remaining.append(sym)
|
||||
symbols_to_fetch = remaining
|
||||
|
||||
if symbols_to_fetch:
|
||||
dfs = asyncio.run(fetch_klines_all(symbols_to_fetch, args.interval, args.days))
|
||||
else:
|
||||
dfs = {}
|
||||
|
||||
# 캐시에 저장 (상관 심볼만)
|
||||
if corr_cache_dir:
|
||||
from src.config import Config
|
||||
try:
|
||||
corr_list = Config().correlation_symbols
|
||||
except Exception:
|
||||
corr_list = ["BTCUSDT", "ETHUSDT"]
|
||||
for sym, df in dfs.items():
|
||||
if sym in corr_list:
|
||||
cache_file = os.path.join(corr_cache_dir, f"{sym.lower()}_{args.interval}.parquet")
|
||||
df.to_parquet(cache_file)
|
||||
print(f" [{sym}] 캐시 저장: {cache_file}")
|
||||
|
||||
# 캐시 + 새로 수집한 데이터 합치기
|
||||
dfs.update(cached_symbols)
|
||||
|
||||
primary = args.symbols[0]
|
||||
merged = dfs[primary].copy()
|
||||
@@ -377,7 +416,7 @@ def main():
|
||||
print(f"\n[OI/펀딩비] {primary} 수집 중...")
|
||||
merged = asyncio.run(_fetch_oi_and_funding(primary, args.days, merged))
|
||||
|
||||
output = args.output.replace("xrpusdt", "combined")
|
||||
output = args.output
|
||||
if not args.no_upsert:
|
||||
merged = upsert_parquet(output, merged)
|
||||
merged.to_parquet(output)
|
||||
|
||||
@@ -68,6 +68,7 @@ else
|
||||
fi
|
||||
|
||||
DECAY="${TIME_WEIGHT_DECAY:-2.0}"
|
||||
CORR_CACHE_DIR="data/.corr_cache"
|
||||
|
||||
echo ""
|
||||
echo "========================================"
|
||||
@@ -106,6 +107,7 @@ for SYM in "${TARGETS[@]}"; do
|
||||
--symbol "$SYM" \
|
||||
--interval 15m \
|
||||
--days "$FETCH_DAYS" \
|
||||
--corr-cache-dir "$CORR_CACHE_DIR" \
|
||||
$UPSERT_FLAG
|
||||
|
||||
# === [1.5/3] OI 파생 피처 A/B 비교 ===
|
||||
@@ -154,6 +156,9 @@ for SYM in "${TARGETS[@]}"; do
|
||||
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
||||
done
|
||||
|
||||
# 상관 심볼 캐시 정리
|
||||
rm -rf "$CORR_CACHE_DIR"
|
||||
|
||||
echo ""
|
||||
echo "=== 전체 파이프라인 완료: $(date '+%Y-%m-%d %H:%M:%S %Z') ==="
|
||||
echo ""
|
||||
|
||||
Reference in New Issue
Block a user