You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
356 lines
10 KiB
356 lines
10 KiB
|
3 weeks ago
|
{
 "cells": [
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"# CTA 1D Baseline XGBoost Model\n",
|
||
|
|
"\n",
|
||
|
|
"Train and evaluate a baseline XGBoost model for CTA 1-day return prediction.\n",
|
||
|
|
"\n",
|
||
|
|
"**Purpose**: Establish a baseline performance benchmark with standard configuration."
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"import pandas as pd\n",
|
||
|
|
"import numpy as np\n",
|
||
|
|
"import matplotlib.pyplot as plt\n",
|
||
|
|
"import json\n",
|
||
|
|
"from datetime import datetime\n",
|
||
|
|
"\n",
|
||
|
|
"from qshare.data.pandas.cta_1d import load_dataset\n",
|
||
|
|
"from qshare.algo.learning.cta_trainer import CTAXGBTrainer\n",
|
||
|
|
"from qshare.eval.cta.backtest import CTABacktester\n",
|
||
|
|
"\n",
|
||
|
|
"import sys\n",
|
||
|
|
"sys.path.insert(0, '../')\n",
|
||
|
|
"from common.plotting import setup_plot_style, plot_ic_series, plot_cumulative_returns\n",
|
||
|
|
"from common.paths import create_experiment_dir\n",
|
||
|
|
"from src.labels import get_blend_weights, describe_blend_config\n",
|
||
|
|
"\n",
|
||
|
|
"setup_plot_style()"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 1. Configuration\n",
|
||
|
|
"\n",
|
||
|
|
"Edit this cell to modify experiment parameters."
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"CONFIG = {\n",
|
||
|
|
" # Experiment\n",
|
||
|
|
" 'experiment_name': 'baseline_xgb', # Will be appended with timestamp\n",
|
||
|
|
" \n",
|
||
|
|
" # Date ranges\n",
|
||
|
|
" 'dt_range': ['2020-01-01', '2024-12-31'],\n",
|
||
|
|
" 'train_range': ['2020-01-01', '2022-12-31'],\n",
|
||
|
|
" 'test_range': ['2023-01-01', '2024-12-31'],\n",
|
||
|
|
" 'fit_range': ['2020-01-01', '2021-06-30'], # For normalization fitting\n",
|
||
|
|
" \n",
|
||
|
|
" # Data\n",
|
||
|
|
" 'feature_sets': ['alpha158', 'hffactor'],\n",
|
||
|
|
" 'return_type': 'o2c_twap1min',\n",
|
||
|
|
" 'normalization': 'dual',\n",
|
||
|
|
" 'blend_weights': None, # Use default [0.2, 0.1, 0.3, 0.4] or specify name/list\n",
|
||
|
|
" 'weight_factors': {'positive': 1.0, 'negative': 2.0},\n",
|
||
|
|
" \n",
|
||
|
|
" # Model\n",
|
||
|
|
" 'xgb_params': {\n",
|
||
|
|
" 'booster': 'gblinear',\n",
|
||
|
|
" 'eta': 0.5,\n",
|
||
|
|
" 'lambda_reg': 0.1,\n",
|
||
|
|
" 'num_round': 20,\n",
|
||
|
|
" },\n",
|
||
|
|
" \n",
|
||
|
|
" # Backtest\n",
|
||
|
|
" 'backtest_params': {\n",
|
||
|
|
" 'num_trades': 4,\n",
|
||
|
|
" 'signal_dist': 'normal',\n",
|
||
|
|
" 'pos_weight': True,\n",
|
||
|
|
" },\n",
|
||
|
|
" \n",
|
||
|
|
" # Output\n",
|
||
|
|
" 'save_results': True,\n",
|
||
|
|
"}\n",
|
||
|
|
"\n",
|
||
|
|
"print(\"Configuration:\")\n",
|
||
|
|
"print(f\" Experiment: {CONFIG['experiment_name']}\")\n",
|
||
|
|
"print(f\" Train: {CONFIG['train_range'][0]} to {CONFIG['train_range'][1]}\")\n",
|
||
|
|
"print(f\" Test: {CONFIG['test_range'][0]} to {CONFIG['test_range'][1]}\")\n",
|
||
|
|
"print(f\" Blend: {describe_blend_config(CONFIG['blend_weights'] or 'default')}\")"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 2. Load Dataset"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"print(\"Loading dataset...\")\n",
|
||
|
|
"df_full = load_dataset(\n",
|
||
|
|
" dt_range=CONFIG['dt_range'],\n",
|
||
|
|
" return_type=CONFIG['return_type'],\n",
|
||
|
|
" normalization=CONFIG['normalization'],\n",
|
||
|
|
" feature_sets=CONFIG['feature_sets'],\n",
|
||
|
|
" fit_range=CONFIG['fit_range'],\n",
|
||
|
|
" weight_factors=CONFIG['weight_factors'],\n",
|
||
|
|
" blend_weights=CONFIG['blend_weights'],\n",
|
||
|
|
")\n",
|
||
|
|
"\n",
|
||
|
|
"print(f\"\\nDataset shape: {df_full.shape}\")\n",
|
||
|
|
"print(f\"Columns: {len(df_full.columns)}\")"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"# Split train/test\n",
|
||
|
|
"df_train = df_full.loc[CONFIG['train_range'][0]:CONFIG['train_range'][1]]\n",
|
||
|
|
"df_test = df_full.loc[CONFIG['test_range'][0]:CONFIG['test_range'][1]]\n",
|
||
|
|
"\n",
|
||
|
|
"print(f\"Train: {df_train.shape}\")\n",
|
||
|
|
"print(f\"Test: {df_test.shape}\")\n",
|
||
|
|
"\n",
|
||
|
|
"# Get feature columns\n",
|
||
|
|
"feature_cols = [c for c in df_train.columns\n",
|
||
|
|
" if c.startswith(('alpha158_', 'hf_', 'f_'))]\n",
|
||
|
|
"print(f\"\\nFeatures: {len(feature_cols)}\")"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 3. Train Model"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"print(\"Training XGBoost model...\")\n",
|
||
|
|
"print(f\" Params: {CONFIG['xgb_params']}\")\n",
|
||
|
|
"\n",
|
||
|
|
"trainer = CTAXGBTrainer(**CONFIG['xgb_params'])\n",
|
||
|
|
"\n",
|
||
|
|
"trainer.fit(\n",
|
||
|
|
" df_train,\n",
|
||
|
|
" feature_cols=feature_cols,\n",
|
||
|
|
" target_col='label',\n",
|
||
|
|
" weight_col='weight'\n",
|
||
|
|
")\n",
|
||
|
|
"\n",
|
||
|
|
"print(\"\\nTraining complete!\")"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"# Feature importance\n",
|
||
|
|
"importance = trainer.get_feature_importance()\n",
|
||
|
|
"print(\"\\nTop 10 Features:\")\n",
|
||
|
|
"print(importance.head(10))\n",
|
||
|
|
"\n",
|
||
|
|
"# Plot\n",
|
||
|
|
"fig, ax = plt.subplots(figsize=(10, 6))\n",
|
||
|
|
"importance.head(20).plot(kind='barh', ax=ax)\n",
|
||
|
|
"ax.set_title('Top 20 Feature Importance')\n",
|
||
|
|
"plt.tight_layout()\n",
|
||
|
|
"plt.show()"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 4. Generate Predictions"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"print(\"Generating predictions on test set...\")\n",
|
||
|
|
"df_signal = trainer.predict(df_test)\n",
|
||
|
|
"\n",
|
||
|
|
"print(f\"\\nSignal statistics:\")\n",
|
||
|
|
"print(df_signal.describe())\n",
|
||
|
|
"\n",
|
||
|
|
"# Plot signal distribution\n",
|
||
|
|
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
|
||
|
|
"\n",
|
||
|
|
"df_signal.hist(bins=100, ax=axes[0], edgecolor='black')\n",
|
||
|
|
"axes[0].set_title('Signal Distribution')\n",
|
||
|
|
"axes[0].axvline(x=0, color='red', linestyle='--')\n",
|
||
|
|
"\n",
|
||
|
|
"signal_by_date = df_signal.groupby(level=0).mean()\n",
|
||
|
|
"axes[1].plot(signal_by_date.index, signal_by_date.values)\n",
|
||
|
|
"axes[1].set_title('Mean Signal by Date')\n",
|
||
|
|
"axes[1].axhline(y=0, color='red', linestyle='--')\n",
|
||
|
|
"\n",
|
||
|
|
"plt.tight_layout()\n",
|
||
|
|
"plt.show()"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 5. Evaluate with Backtest"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"print(\"Running backtest...\")\n",
|
||
|
|
"\n",
|
||
|
|
"returns = df_test['return'] if 'return' in df_test.columns else df_test['label']\n",
|
||
|
|
"\n",
|
||
|
|
"backtester = CTABacktester(**CONFIG['backtest_params'])\n",
|
||
|
|
"results = backtester.run(returns, df_signal)\n",
|
||
|
|
"\n",
|
||
|
|
"summary = backtester.summary()\n",
|
||
|
|
"print(\"\\nBacktest Summary:\")\n",
|
||
|
|
"for key, value in summary.items():\n",
|
||
|
|
" if isinstance(value, float):\n",
|
||
|
|
" print(f\" {key}: {value:.4f}\")\n",
|
||
|
|
" else:\n",
|
||
|
|
" print(f\" {key}: {value}\")"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"# IC Analysis\n",
|
||
|
|
"ic_by_date = results.groupby(results.index.get_level_values(0))['ic'].mean()\n",
|
||
|
|
"\n",
|
||
|
|
"fig = plot_ic_series(ic_by_date, title=\"IC Over Time (Test Set)\")\n",
|
||
|
|
"plt.show()\n",
|
||
|
|
"\n",
|
||
|
|
"print(f\"\\nIC Statistics:\")\n",
|
||
|
|
"print(f\" Mean: {ic_by_date.mean():.4f}\")\n",
|
||
|
|
"print(f\" Std: {ic_by_date.std():.4f}\")\n",
|
||
|
|
"print(f\" IR: {ic_by_date.mean() / ic_by_date.std():.4f}\")"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"# Cumulative returns\n",
|
||
|
|
"daily_returns = results.groupby(results.index.get_level_values(0))['pos_return'].mean()\n",
|
||
|
|
"\n",
|
||
|
|
"fig = plot_cumulative_returns(daily_returns, title=\"Cumulative Strategy Returns\")\n",
|
||
|
|
"plt.show()\n",
|
||
|
|
"\n",
|
||
|
|
"total_return = (1 + daily_returns).prod() - 1\n",
|
||
|
|
"annual_return = (1 + total_return) ** (252 / len(daily_returns)) - 1\n",
|
||
|
|
"sharpe = daily_returns.mean() / daily_returns.std() * np.sqrt(252)\n",
|
||
|
|
"\n",
|
||
|
|
"print(f\"\\nReturn Statistics:\")\n",
|
||
|
|
"print(f\" Total Return: {total_return:.2%}\")\n",
|
||
|
|
"print(f\" Annual Return: {annual_return:.2%}\")\n",
|
||
|
|
"print(f\" Sharpe Ratio: {sharpe:.2f}\")"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 6. Save Results\n",
|
||
|
|
"\n",
|
||
|
|
"Save model, predictions, and metrics for later analysis."
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"if CONFIG['save_results']:\n",
|
||
|
|
" # Create output directory\n",
|
||
|
|
" output_dir = create_experiment_dir('cta_1d', CONFIG['experiment_name'])\n",
|
||
|
|
" print(f\"Saving results to: {output_dir}\")\n",
|
||
|
|
" \n",
|
||
|
|
" # Save config\n",
|
||
|
|
" with open(output_dir / 'config.json', 'w') as f:\n",
|
||
|
|
" json.dump(CONFIG, f, indent=2, default=str)\n",
|
||
|
|
" \n",
|
||
|
|
" # Save model\n",
|
||
|
|
" trainer.save_model(str(output_dir / 'model.pkl'))\n",
|
||
|
|
" \n",
|
||
|
|
" # Save feature importance\n",
|
||
|
|
" importance.to_csv(output_dir / 'feature_importance.csv')\n",
|
||
|
|
" \n",
|
||
|
|
" # Save predictions\n",
|
||
|
|
" df_signal.to_csv(output_dir / 'predictions.csv')\n",
|
||
|
|
" \n",
|
||
|
|
" # Save backtest results\n",
|
||
|
|
" results.to_csv(output_dir / 'backtest_results.csv')\n",
|
||
|
|
" \n",
|
||
|
|
" # Save summary\n",
|
||
|
|
" with open(output_dir / 'summary.json', 'w') as f:\n",
|
||
|
|
" json.dump(summary, f, indent=2, default=str)\n",
|
||
|
|
" \n",
|
||
|
|
" print(\"\\nFiles saved:\")\n",
|
||
|
|
" for f in output_dir.iterdir():\n",
|
||
|
|
" print(f\" - {f.name}\")\n",
|
||
|
|
"else:\n",
|
||
|
|
" print(\"Results not saved (save_results=False)\")"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"metadata": {
|
||
|
|
"kernelspec": {
|
||
|
|
"display_name": "Python 3",
|
||
|
|
"language": "python",
|
||
|
|
"name": "python3"
|
||
|
|
},
|
||
|
|
"language_info": {
|
||
|
|
"name": "python",
|
||
|
|
"version": "3.8.0"
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"nbformat": 4,
|
||
|
|
"nbformat_minor": 4
|
||
|
|
}
|