{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Stock 15m Data Exploration\n", "\n", "Load and explore 15-minute return prediction data.\n", "\n", "**Purpose**: Understand data structure, check data quality, and visualize key statistics." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import polars as pl\n", "import matplotlib.pyplot as plt\n", "\n", "from qshare.data.polars.ret15m import load_dataset, calculate_weights\n", "from qshare.io.polars import load_from_pq\n", "\n", "import sys\n", "sys.path.insert(0, '../')\n", "from common.plotting import setup_plot_style\n", "\n", "setup_plot_style()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Configuration\n", "\n", "Define data paths and parameters." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "CONFIG = {\n", " # Data paths (adjust as needed)\n", " 'path_a158': '/data/parquet/stock_1min_alpha158',\n", " 'path_kline': '/data/parquet/stock_1min',\n", " 'path_kline_daily': '/data/parquet/stock_1day',\n", " 'path_industry': '/data/parquet/industry_idx',\n", " \n", " # Date range\n", " 'dt_range': ['2022-01-01', '2024-12-31'],\n", " \n", " # Normalization mode\n", " 'normalization_mode': 'dual', # 'industry', 'cs_zscore', or 'dual'\n", " \n", " # Sample weights\n", " 'positive_factor': 1.0,\n", " 'negative_factor': 2.0,\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Load Raw Data\n", "\n", "Load data as Polars lazy frames first." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load data sources\n", "print(\"Loading data sources...\")\n", "\n", "pl_ldf_a158 = load_from_pq(\n", " path=CONFIG['path_a158'],\n", " table_alias=\"a158\",\n", " start_time=CONFIG['dt_range'][0],\n", " as_struct=True\n", ")\n", "\n", "pl_ldf_kline = load_from_pq(\n", " path=CONFIG['path_kline'],\n", " table_alias=\"kline_1min\",\n", " start_time=CONFIG['dt_range'][0],\n", " as_struct=True\n", ")\n", "\n", "pl_ldf_kline_daily = load_from_pq(\n", " path=CONFIG['path_kline_daily'],\n", " table_alias=\"kline_1day\",\n", " start_time=CONFIG['dt_range'][0],\n", ")\n", "\n", "pl_ldf_industry = load_from_pq(\n", " path=CONFIG['path_industry'],\n", " table_alias=\"indus_idx\",\n", " start_time=CONFIG['dt_range'][0],\n", ")\n", "\n", "print(\"Data sources loaded as lazy frames\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Check schemas\n", "print(\"Alpha158 schema:\")\n", "print(pl_ldf_a158.schema)\n", "\n", "print(\"\\nKline 1min schema:\")\n", "print(pl_ldf_kline.schema)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Load Training Dataset\n", "\n", "Use qshare's load_dataset to construct the full training data." 
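, "\n", "\n", "The configured normalization_mode of 'dual' presumably combines the other two modes: de-mean within industry groups, then z-score across the cross-section at each timestamp. A minimal, hypothetical sketch of that idea is below (this is not qshare's actual implementation; the column names 'datetime' and 'industry' are assumptions)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Hypothetical sketch only -- the real normalization lives inside qshare.\n", "# Assumes a pandas DataFrame with 'datetime' and 'industry' columns.\n", "def dual_normalize(df, col):\n", " # Step 1: industry neutralization -- de-mean within (timestamp, industry)\n", " demeaned = df[col] - df.groupby(['datetime', 'industry'])[col].transform('mean')\n", " # Step 2: cross-sectional z-score within each timestamp\n", " grouped = demeaned.groupby(df['datetime'])\n", " return (demeaned - grouped.transform('mean')) / grouped.transform('std')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now build the dataset with the real loader."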
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Loading training dataset...\")\n", "print(f\" Date range: {CONFIG['dt_range']}\")\n", "print(f\" Normalization: {CONFIG['normalization_mode']}\")\n", "\n", "pl_df_train = load_dataset(\n", " pl_ldf_a158_1min=pl_ldf_a158,\n", " pl_ldf_kline_1min=pl_ldf_kline,\n", " pl_ldf_kline_1day=pl_ldf_kline_daily,\n", " pl_ldf_indus_idx=pl_ldf_industry,\n", " dt_range=CONFIG['dt_range'],\n", " normalization_mode=CONFIG['normalization_mode'],\n", " negative_factor=CONFIG['negative_factor'],\n", " positive_factor=CONFIG['positive_factor'],\n", ")\n", "\n", "# Convert to pandas for easier exploration.\n", "# to_pandas() returns a flat RangeIndex frame, while the cells below expect a\n", "# (datetime, instrument) MultiIndex -- set and sort it here (column names assumed from qshare).\n", "df_train = pl_df_train.to_pandas().set_index(['datetime', 'instrument']).sort_index()\n", "\n", "print(f\"\\nDataset shape: {df_train.shape}\")\n", "print(f\"Columns: {len(df_train.columns)}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Check column types\n", "feature_cols = [c for c in df_train.columns if c.startswith('alpha158_')]\n", "print(f\"\\nAlpha158 features: {len(feature_cols)}\")\n", "print(f\" Example: {feature_cols[:5]}\")\n", "\n", "print(f\"\\nTarget column: {[c for c in df_train.columns if 'return' in c.lower()]}\")\n", "print(f\"Weight column: {[c for c in df_train.columns if 'weight' in c.lower()]}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Data Quality Check" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Missing values\n", "missing = df_train.isnull().sum()\n", "missing_pct = missing / len(df_train) * 100\n", "\n", "print(\"Missing values:\")\n", "print(f\" Columns with missing: {(missing > 0).sum()}\")\n", "if (missing > 0).sum() > 0:\n", " print(\"\\nTop columns by missing %:\")\n", " print(missing_pct[missing_pct > 0].sort_values(ascending=False).head(10))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Data coverage: unique instruments per trading day\n", "# (group intraday timestamps by calendar date via the index levels)\n", "dates = df_train.index.get_level_values('datetime').normalize()\n", "instruments = df_train.index.get_level_values('instrument')\n", "daily_counts = pd.Series(instruments, index=dates).groupby(level=0).nunique()\n", "\n", "fig, ax = plt.subplots(figsize=(14, 4))\n", "daily_counts.plot(ax=ax)\n", "ax.set_title('Number of Instruments per Day')\n", "ax.set_xlabel('Date')\n", "ax.set_ylabel('Instrument Count')\n", "plt.tight_layout()\n", "plt.show()\n", "\n", "print(f\"\\nInstruments per day: {daily_counts.mean():.0f} avg, {daily_counts.min()}-{daily_counts.max()} range\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Target Analysis\n", "\n", "Analyze the 15-minute return target distribution."
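, "\n", "\n", "For orientation, the target is presumably a 15-minute forward return computed on 1-minute bars; a hypothetical sketch of that definition follows (the actual construction lives inside load_dataset and may differ)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Hypothetical sketch: a plain 15-minute forward return on 1-minute bars.\n", "# The real target built by load_dataset may use a different price or window.\n", "def forward_return_15m(close):\n", " # close: 1-minute close prices for one instrument, indexed by datetime\n", " return close.shift(-15) / close - 1.0" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now look at the actual target column produced by the loader."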
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Identify target column\n", "target_col = [c for c in df_train.columns if 'return' in c.lower()][0]\n", "print(f\"Target column: {target_col}\")\n", "\n", "# Target statistics\n", "print(\"\\nTarget statistics:\")\n", "print(df_train[target_col].describe())\n", "\n", "print(f\"\\nSkewness: {df_train[target_col].skew():.3f}\")\n", "print(f\"Excess kurtosis: {df_train[target_col].kurtosis():.3f}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Target distribution\n", "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n", "\n", "# Histogram\n", "df_train[target_col].hist(bins=100, ax=axes[0], edgecolor='black', alpha=0.7)\n", "axes[0].set_title(f'{target_col} Distribution')\n", "axes[0].axvline(x=0, color='red', linestyle='--')\n", "axes[0].set_xlim(-0.05, 0.05) # Focus on main distribution\n", "\n", "# Time series of daily mean target (grouped by calendar date, not by 15m timestamp)\n", "dates = df_train.index.get_level_values('datetime').normalize()\n", "daily_mean_target = df_train.groupby(dates)[target_col].mean()\n", "axes[1].plot(daily_mean_target.index, daily_mean_target.values)\n", "axes[1].set_title('Daily Mean Target')\n", "axes[1].axhline(y=0, color='red', linestyle='--')\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Feature Analysis" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Feature statistics\n", "feature_stats = df_train[feature_cols].describe().T\n", "\n", "print(\"Feature statistics summary:\")\n", "print(f\" Mean range: [{feature_stats['mean'].min():.4f}, {feature_stats['mean'].max():.4f}]\")\n", "print(f\" Std range: [{feature_stats['std'].min():.4f}, {feature_stats['std'].max():.4f}]\")\n", "\n", "# Check for features with extreme values\n", "extreme_features = feature_stats[\n", " (feature_stats['mean'].abs() > 10) | (feature_stats['std'] > 100)\n", "]\n", "if len(extreme_features) > 0:\n", " print(f\"\\nFeatures with extreme values: {len(extreme_features)}\")\n", " print(extreme_features.head())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sample a few features for visualization\n", "sample_features = feature_cols[:4]\n", "\n", "fig, axes = plt.subplots(2, 2, figsize=(12, 8))\n", "axes = axes.flatten()\n", "\n", "for i, feat in enumerate(sample_features):\n", " df_train[feat].hist(bins=100, ax=axes[i], edgecolor='black', alpha=0.7)\n", " axes[i].set_title(feat)\n", " axes[i].axvline(x=0, color='red', linestyle='--')\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7. Sample Weights Analysis" ] },
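{ "cell_type": "markdown", "metadata": {}, "source": [ "The positive_factor / negative_factor pair implies an asymmetric weighting scheme; a hypothetical sketch of the idea is below (qshare's calculate_weights, imported above, is the authoritative version)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Hypothetical sketch of asymmetric sample weights; not qshare's implementation.\n", "def sketch_weights(target, positive_factor=1.0, negative_factor=2.0):\n", " # Up-weight negative-return samples so drawdowns get more attention in training\n", " return np.where(target < 0, negative_factor, positive_factor)" ] },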
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Check weights if available\n", "weight_cols = [c for c in df_train.columns if 'weight' in c.lower()]\n", "if weight_cols:\n", " weight_col = weight_cols[0]\n", " print(f\"Weight column: {weight_col}\")\n", " print(\"\\nWeight statistics:\")\n", " print(df_train[weight_col].describe())\n", " \n", " # Plot weight distribution by target sign\n", " fig, ax = plt.subplots(figsize=(10, 4))\n", " \n", " positive_mask = df_train[target_col] > 0\n", " df_train.loc[positive_mask, weight_col].hist(\n", " bins=50, alpha=0.5, label='Positive target', ax=ax\n", " )\n", " df_train.loc[~positive_mask, weight_col].hist(\n", " bins=50, alpha=0.5, label='Negative target', ax=ax\n", " )\n", " ax.set_title('Weight Distribution by Target Sign')\n", " ax.legend()\n", " plt.tight_layout()\n", " plt.show()\n", "else:\n", " print(\"No weight column found\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.8.0" } }, "nbformat": 4, "nbformat_minor": 4 }
{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Stock 15m Baseline Model\n", "\n", "Train and evaluate a baseline XGBoost model for 15-minute return prediction.\n", "\n", "**Purpose**: Establish baseline performance with standard configuration." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import xgboost as xgb\n", "from sklearn.metrics import r2_score\n", "\n", "from qshare.data.polars.ret15m import load_dataset\n", "from qshare.io.polars import load_from_pq\n", "\n", "import sys\n", "sys.path.insert(0, '../')\n", "from common.plotting import setup_plot_style\n", "from common.paths import create_experiment_dir\n", "\n", "setup_plot_style()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Configuration" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "CONFIG = {\n", " # Experiment\n", " 'experiment_name': 'baseline_xgb',\n", " 'save_results': True,\n", " \n", " # Data paths\n", " 'path_a158': '/data/parquet/stock_1min_alpha158',\n", " 'path_kline': '/data/parquet/stock_1min',\n", " 'path_kline_daily': '/data/parquet/stock_1day',\n", " 'path_industry': '/data/parquet/industry_idx',\n", " \n", " # Date ranges\n", " 'dt_range': ['2022-01-01', '2024-12-31'],\n", " 'train_range': ['2022-01-01', '2023-12-31'],\n", " 'test_range': ['2024-01-01', '2024-12-31'],\n", " \n", " # Data config\n", " 'normalization_mode': 'dual',\n", " 'positive_factor': 1.0,\n", " 'negative_factor': 2.0,\n", " \n", " # Model\n", " 'model_params': {\n", " 'objective': 'reg:squarederror',\n", " 'eval_metric': 'rmse',\n", " 'max_depth': 6,\n", " 'learning_rate': 0.1,\n", " 'n_estimators': 100,\n", " 'subsample': 0.8,\n", " 'colsample_bytree': 0.8,\n", " 'random_state': 42,\n", " },\n", "}\n", "\n", "print(\"Configuration:\")\n", "for key, value in CONFIG.items():\n", " if not isinstance(value, dict):\n", " print(f\" {key}: {value}\")\n", "print(f\"\\nModel params: {CONFIG['model_params']}\")" ] },
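{ "cell_type": "markdown", "metadata": {}, "source": [ "Before loading anything, a quick added sanity check that the configured train and test windows do not overlap (ISO date strings compare chronologically):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Guard against leakage from overlapping train/test windows\n", "assert CONFIG['train_range'][1] < CONFIG['test_range'][0], 'train/test ranges overlap'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Load Data" ] },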
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Loading data sources...\")\n", "\n", "pl_ldf_a158 = load_from_pq(\n", " path=CONFIG['path_a158'],\n", " table_alias=\"a158\",\n", " start_time=CONFIG['dt_range'][0],\n", " as_struct=True\n", ")\n", "\n", "pl_ldf_kline = load_from_pq(\n", " path=CONFIG['path_kline'],\n", " table_alias=\"kline_1min\",\n", " start_time=CONFIG['dt_range'][0],\n", " as_struct=True\n", ")\n", "\n", "pl_ldf_kline_daily = load_from_pq(\n", " path=CONFIG['path_kline_daily'],\n", " table_alias=\"kline_1day\",\n", " start_time=CONFIG['dt_range'][0],\n", ")\n", "\n", "pl_ldf_industry = load_from_pq(\n", " path=CONFIG['path_industry'],\n", " table_alias=\"indus_idx\",\n", " start_time=CONFIG['dt_range'][0],\n", ")\n", "\n", "print(\"Loading dataset...\")\n", "pl_df = load_dataset(\n", " pl_ldf_a158_1min=pl_ldf_a158,\n", " pl_ldf_kline_1min=pl_ldf_kline,\n", " pl_ldf_kline_1day=pl_ldf_kline_daily,\n", " pl_ldf_indus_idx=pl_ldf_industry,\n", " dt_range=CONFIG['dt_range'],\n", " normalization_mode=CONFIG['normalization_mode'],\n", " negative_factor=CONFIG['negative_factor'],\n", " positive_factor=CONFIG['positive_factor'],\n", ")\n", "\n", "# Convert to pandas; set the assumed (datetime, instrument) MultiIndex and sort\n", "# so the date-based .loc slices below work (column names assumed from qshare)\n", "df_full = pl_df.to_pandas().set_index(['datetime', 'instrument']).sort_index()\n", "print(f\"\\nFull dataset shape: {df_full.shape}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Prepare Train/Test Split" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Identify columns\n", "feature_cols = [c for c in df_full.columns if c.startswith('alpha158_')]\n", "target_cols = [c for c in df_full.columns if 'return' in c.lower()]\n", "weight_cols = [c for c in df_full.columns if 'weight' in c.lower()]\n", "\n", "print(f\"Features: {len(feature_cols)}\")\n", "print(f\"Targets: {target_cols}\")\n", "print(f\"Weights: {weight_cols}\")\n", "\n", "# Select target\n", "target_col = target_cols[0]\n", "weight_col = weight_cols[0] if weight_cols else None\n", "\n", "# Split by date (partial string slicing on the sorted datetime index level)\n", "df_train = df_full.loc[CONFIG['train_range'][0]:CONFIG['train_range'][1]]\n", "df_test = df_full.loc[CONFIG['test_range'][0]:CONFIG['test_range'][1]]\n", "\n", "print(f\"\\nTrain: {df_train.shape}\")\n", "print(f\"Test: {df_test.shape}\")" ] },
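{ "cell_type": "markdown", "metadata": {}, "source": [ "Because the target is a 15-minute forward return, labels from the last bars of the train window can overlap test-period prices. A purge gap at the boundary avoids that leakage; the sketch below is an optional addition (with an assumed one-day gap) and is left commented out so the baseline is unchanged." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional purged split: drop the tail of the train window so no label window\n", "# straddles the train/test boundary. A one-day gap is conservative for a 15m label.\n", "def purged_split(df, train_end, test_start, gap='1D'):\n", " dt = df.index.get_level_values('datetime')\n", " train = df[dt <= pd.Timestamp(train_end) - pd.Timedelta(gap)]\n", " test = df[dt >= pd.Timestamp(test_start)]\n", " return train, test\n", "\n", "# df_train, df_test = purged_split(df_full, CONFIG['train_range'][1], CONFIG['test_range'][0])" ] },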
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Train Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Prepare data\n", "X_train = df_train[feature_cols]\n", "y_train = df_train[target_col]\n", "w_train = df_train[weight_col] if weight_col else None\n", "\n", "X_test = df_test[feature_cols]\n", "y_test = df_test[target_col]\n", "\n", "# Impute missing values with the train median -- computed before filling and\n", "# reused on the test set to avoid leakage\n", "train_median = X_train.median()\n", "X_train = X_train.fillna(train_median)\n", "X_test = X_test.fillna(train_median)\n", "\n", "print(\"Training XGBoost model...\")\n", "print(f\" X shape: {X_train.shape}\")\n", "print(f\" y mean: {y_train.mean():.6f}, std: {y_train.std():.6f}\")\n", "\n", "model = xgb.XGBRegressor(**CONFIG['model_params'])\n", "\n", "model.fit(\n", " X_train, y_train,\n", " sample_weight=w_train,\n", " eval_set=[(X_test, y_test)],\n", " verbose=False\n", ")\n", "\n", "print(\"Training complete!\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Feature importance\n", "importance = pd.DataFrame({\n", " 'feature': feature_cols,\n", " 'importance': model.feature_importances_\n", "}).sort_values('importance', ascending=False)\n", "\n", "print(\"\\nTop 10 Features:\")\n", "print(importance.head(10))\n", "\n", "# Plot (reversed so the largest bar lands at the top of the barh chart)\n", "fig, ax = plt.subplots(figsize=(10, 6))\n", "importance.head(20).iloc[::-1].plot(x='feature', y='importance', kind='barh', ax=ax)\n", "ax.set_title('Top 20 Feature Importance')\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Evaluate" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Generate predictions\n", "y_pred_train = model.predict(X_train)\n", "y_pred_test = model.predict(X_test)\n", "\n", "# Calculate metrics\n", "train_r2 = r2_score(y_train, y_pred_train)\n", "test_r2 = r2_score(y_test, y_pred_test)\n", "\n", "# IC (Information Coefficient): Pearson correlation of prediction and target\n", "train_ic = np.corrcoef(y_train, y_pred_train)[0, 1]\n", "test_ic = np.corrcoef(y_test, y_pred_test)[0, 1]\n", "\n", "print(\"Performance Metrics:\")\n", "print(f\" Train R2: {train_r2:.4f}\")\n", "print(f\" Test R2: {test_r2:.4f}\")\n", "print(f\" Train IC: {train_ic:.4f}\")\n", "print(f\" Test IC: {test_ic:.4f}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Daily IC analysis\n", "df_test_eval = df_test.copy()\n", "df_test_eval['pred'] = y_pred_test\n", "df_test_eval['target'] = y_test\n", "\n", "# Group by calendar date -- the index level holds intraday 15m timestamps\n", "dates = df_test_eval.index.get_level_values('datetime').normalize()\n", "daily_ic = df_test_eval.groupby(dates).apply(\n", " lambda x: x['target'].corr(x['pred'])\n", ")\n", "\n", "print(\"\\nDaily IC Statistics:\")\n", "print(f\" Mean: {daily_ic.mean():.4f}\")\n", "print(f\" Std: {daily_ic.std():.4f}\")\n", "print(f\" IR: {daily_ic.mean() / daily_ic.std():.4f}\")\n", "print(f\" >0: {(daily_ic > 0).mean():.1%}\")\n", "\n", "# Plot\n", "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n", "\n", "# IC distribution\n", "daily_ic.hist(bins=50, ax=axes[0], edgecolor='black')\n", "axes[0].axvline(x=0, color='red', linestyle='--')\n", "axes[0].axvline(x=daily_ic.mean(), color='green', linestyle='--', label=f'Mean: {daily_ic.mean():.3f}')\n", "axes[0].set_title('Daily IC Distribution')\n", "axes[0].legend()\n", "\n", "# IC time series\n", "daily_ic.rolling(20, min_periods=5).mean().plot(ax=axes[1])\n", "axes[1].axhline(y=0, color='red', linestyle='--')\n", "axes[1].set_title('Rolling IC (20-day)')\n", "\n", "plt.tight_layout()\n",
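"\n", "# Added robustness check: Spearman rank IC alongside the Pearson daily IC above.\n", "# It is less outlier-sensitive; assumes scipy is available in the environment.\n", "from scipy.stats import spearmanr\n", "daily_rank_ic = df_test_eval.groupby(dates).apply(\n", " lambda x: spearmanr(x['target'], x['pred'], nan_policy='omit')[0]\n", ")\n", "print(f\"\\nDaily Rank IC: mean={daily_rank_ic.mean():.4f}, IR={daily_rank_ic.mean() / daily_rank_ic.std():.4f}\")\n",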
"plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Prediction vs Actual scatter\n", "fig, ax = plt.subplots(figsize=(8, 8))\n", "\n", "# Sample for plotting\n", "sample_idx = np.random.choice(len(y_test), size=min(10000, len(y_test)), replace=False)\n", "ax.scatter(y_test.iloc[sample_idx], y_pred_test[sample_idx], alpha=0.3, s=1)\n", "\n", "# Perfect prediction line\n", "lims = [min(y_test.min(), y_pred_test.min()), max(y_test.max(), y_pred_test.max())]\n", "ax.plot(lims, lims, 'r--', alpha=0.5)\n", "\n", "ax.set_xlabel('Actual')\n", "ax.set_ylabel('Predicted')\n", "ax.set_title(f'Prediction vs Actual (IC={test_ic:.3f})')\n", "ax.grid(True, alpha=0.3)\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Save Results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if CONFIG['save_results']:\n", " import pickle\n", " import json\n", " \n", " output_dir = create_experiment_dir('stock_15m', CONFIG['experiment_name'])\n", " print(f\"Saving results to: {output_dir}\")\n", " \n", " # Save config\n", " with open(output_dir / 'config.json', 'w') as f:\n", " json.dump(CONFIG, f, indent=2, default=str)\n", " \n", " # Save model\n", " with open(output_dir / 'model.pkl', 'wb') as f:\n", " pickle.dump(model, f)\n", " \n", " # Save importance\n", " importance.to_csv(output_dir / 'feature_importance.csv', index=False)\n", " \n", " # Save predictions\n", " predictions = pd.DataFrame({\n", " 'actual': y_test,\n", " 'predicted': y_pred_test\n", " }, index=df_test.index)\n", " predictions.to_csv(output_dir / 'predictions.csv')\n", " \n", " # Save metrics\n", " metrics = {\n", " 'train_r2': float(train_r2),\n", " 'test_r2': float(test_r2),\n", " 'train_ic': float(train_ic),\n", " 'test_ic': float(test_ic),\n", " 'daily_ic_mean': float(daily_ic.mean()),\n", " 'daily_ic_std': float(daily_ic.std()),\n", " 'daily_ir': float(daily_ic.mean() / daily_ic.std()),\n", " }\n", " with open(output_dir / 'metrics.json', 'w') as f:\n", " json.dump(metrics, f, indent=2)\n", " \n", " print(\"\\nFiles saved:\")\n", " for f in output_dir.iterdir():\n", " print(f\" - {f.name}\")\n", "else:\n", " print(\"Results not saved\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.8.0" } }, "nbformat": 4, "nbformat_minor": 4 }