#!/usr/bin/env python
"""
Compare 0_7 vs 0_7_beta predictions.

This script:
1. Loads original 0_7 predictions (from DDB)
2. Loads 0_7_beta predictions (from new embeddings)
3. Calculates correlation between predictions
4. Compares metrics (IC, RankIC, etc.) if actual returns available
"""

import os
from typing import Dict, Optional

import numpy as np
import pandas as pd
import polars as pl
from scipy.stats import spearmanr

# File paths (relative to this script's working directory)
PRED_0_7_FILE = "../data/original_predictions_0_7.parquet"
PRED_0_7_BETA_FILE = "../data/predictions_beta_embedding.parquet"
ACTUAL_RETURNS_FILE = "../data/actual_returns.parquet"
def load_and_align_predictions():
    """Load both prediction files and align them by datetime and instrument.

    Reads the 0_7 and 0_7_beta prediction parquet files, reports their
    coverage, normalizes the join-key dtypes, renames each 'prediction'
    column to a side-specific name, and inner-joins the two frames.

    Returns:
        pl.DataFrame with columns 'datetime', 'instrument', 'pred_0_7'
        and 'pred_beta' (only rows present in both inputs).
    """
    print("Loading predictions...")

    def _load(path, label, pred_name):
        # Read one prediction file, report coverage, normalize key dtypes,
        # and rename the prediction column so the two sides can be joined.
        frame = pl.read_parquet(path)
        print(f"{label} predictions: {frame.shape}")
        print(f" Date range: {frame['datetime'].min()} to {frame['datetime'].max()}")
        print(f" Unique instruments: {frame['instrument'].n_unique()}")
        frame = frame.with_columns([
            pl.col('datetime').cast(pl.Int64),
            pl.col('instrument').cast(pl.Int64),
        ])
        return frame.rename({'prediction': pred_name})

    left = _load(PRED_0_7_FILE, "0_7", 'pred_0_7')
    right = _load(PRED_0_7_BETA_FILE, "\n0_7_beta", 'pred_beta')

    # Keep only the (datetime, instrument) pairs present on both sides.
    aligned = left.join(
        right,
        on=['datetime', 'instrument'],
        how='inner',
    )

    print(f"\nJoined predictions: {aligned.shape}")
    print(f" Overlapping dates: {aligned['datetime'].n_unique()}")
    print(f" Overlapping instruments: {aligned['instrument'].n_unique()}")

    return aligned
def calculate_correlation(df: pl.DataFrame) -> Dict[str, float]:
    """Calculate correlation between 0_7 and 0_7_beta predictions.

    Computes the overall Pearson and Spearman correlation across all rows,
    plus the mean/std of the per-date cross-sectional Pearson correlation.

    Args:
        df: Joined predictions with 'datetime', 'pred_0_7' and 'pred_beta'
            columns (output of load_and_align_predictions).

    Returns:
        Dict with keys 'pearson_corr', 'spearman_corr', 'daily_corr_mean'
        and 'daily_corr_std'. The daily statistics are NaN when no date
        has at least two rows.
    """
    df_pd = df.to_pandas()

    # Overall correlation across the full panel
    pearson_corr = df_pd['pred_0_7'].corr(df_pd['pred_beta'])
    spearman_corr, _ = spearmanr(df_pd['pred_0_7'], df_pd['pred_beta'])

    # Cross-sectional correlation per date; correlation needs >= 2 rows
    daily_corrs = []
    for _, group in df_pd.groupby('datetime'):
        if len(group) >= 2:
            daily_corrs.append(group['pred_0_7'].corr(group['pred_beta']))

    # FIX: np.mean([]) / np.std([]) emit a RuntimeWarning and return nan
    # when no date qualifies — make the empty case explicit instead.
    if daily_corrs:
        daily_corr_mean = float(np.mean(daily_corrs))
        daily_corr_std = float(np.std(daily_corrs))
    else:
        daily_corr_mean = float('nan')
        daily_corr_std = float('nan')

    return {
        'pearson_corr': pearson_corr,
        'spearman_corr': spearman_corr,
        'daily_corr_mean': daily_corr_mean,
        'daily_corr_std': daily_corr_std,
    }
def calculate_ic_metrics(df: pl.DataFrame, actual_returns: pl.DataFrame) -> Dict:
    """Calculate IC metrics for both prediction sets.

    Inner-joins predictions with actual returns, then for each date with at
    least 5 samples computes IC (Pearson) and RankIC (Spearman) for both
    prediction columns, and aggregates mean / std / information ratio.

    Args:
        df: Joined predictions with 'pred_0_7' and 'pred_beta' columns.
        actual_returns: Frame keyed by 'datetime'/'instrument' containing a
            return column (first match among 'v2v_5d', 'return',
            'actual_return', 'ret' is used).

    Returns:
        {'0_7': {...}, '0_7_beta': {...}} with ic_mean/ic_std/ic_ir and
        rankic_mean/rankic_std/rankic_ir, or {} when no return column is
        found or no date has enough samples.
    """
    # Join with actual returns
    df_joined = df.join(
        actual_returns,
        on=['datetime', 'instrument'],
        how='inner'
    )

    print(f"\nJoined with returns: {df_joined.shape}")

    df_pd = df_joined.to_pandas()

    # Find the return column (first match wins)
    return_col = None
    for col in ['v2v_5d', 'return', 'actual_return', 'ret']:
        if col in df_pd.columns:
            return_col = col
            break

    if return_col is None:
        print("No return column found!")
        return {}

    print(f"Using return column: {return_col}")

    # Calculate daily IC for both predictions
    results_0_7 = []
    results_beta = []

    for date, group in df_pd.groupby('datetime'):
        if len(group) < 5:  # Need enough samples
            continue

        # IC (Pearson)
        ic_0_7 = group['pred_0_7'].corr(group[return_col])
        ic_beta = group['pred_beta'].corr(group[return_col])

        # RankIC (Spearman)
        rankic_0_7, _ = spearmanr(group['pred_0_7'], group[return_col])
        rankic_beta, _ = spearmanr(group['pred_beta'], group[return_col])

        results_0_7.append({'date': date, 'ic': ic_0_7, 'rankic': rankic_0_7})
        results_beta.append({'date': date, 'ic': ic_beta, 'rankic': rankic_beta})

    # BUG FIX: with no qualifying dates, pd.DataFrame([]) has no 'ic'/'rankic'
    # columns and the aggregation below raised KeyError in the original.
    if not results_0_7:
        print("No dates with enough samples for IC calculation!")
        return {}

    def _summarize(ic_frame: pd.DataFrame) -> Dict[str, float]:
        # Aggregate a per-date IC frame into mean/std/IR for each metric.
        summary: Dict[str, float] = {}
        for metric in ('ic', 'rankic'):
            mean = ic_frame[metric].mean()
            std = ic_frame[metric].std()
            summary[f'{metric}_mean'] = mean
            summary[f'{metric}_std'] = std
            # IR falls back to 0 when std is 0 or NaN (e.g. a single date)
            summary[f'{metric}_ir'] = mean / std if std > 0 else 0
        return summary

    return {
        '0_7': _summarize(pd.DataFrame(results_0_7)),
        '0_7_beta': _summarize(pd.DataFrame(results_beta)),
    }
def calculate_top_tier_return(df: pl.DataFrame, actual_returns: pl.DataFrame, top_pct: float = 0.1) -> Dict:
    """Calculate top-tier returns for both predictions.

    For each date with at least 10 rows, selects the top `top_pct` fraction
    of instruments by each prediction column and averages their realized
    returns, then reports the mean/std of those daily averages.

    Args:
        df: Joined predictions with 'pred_0_7' and 'pred_beta' columns.
        actual_returns: Frame keyed by 'datetime'/'instrument' containing a
            return column (first match among 'v2v_5d', 'return',
            'actual_return', 'ret' is used).
        top_pct: Fraction of each date's cross-section to keep (default 10%).

    Returns:
        {'0_7': {...}, '0_7_beta': {...}} with top_tier_return/top_tier_std,
        or {} when no return column is found or no date has enough rows.
    """
    # Join with actual returns
    df_joined = df.join(
        actual_returns,
        on=['datetime', 'instrument'],
        how='inner'
    )

    df_pd = df_joined.to_pandas()

    # Find the return column (first match wins)
    return_col = None
    for col in ['v2v_5d', 'return', 'actual_return', 'ret']:
        if col in df_pd.columns:
            return_col = col
            break

    if return_col is None:
        return {}

    # Average realized return of the top bucket, per date
    top_returns_0_7 = []
    top_returns_beta = []

    for date, group in df_pd.groupby('datetime'):
        if len(group) < 10:
            continue

        # At least one name is always selected, even for tiny cross-sections
        n_top = max(1, int(len(group) * top_pct))

        top_returns_0_7.append(group.nlargest(n_top, 'pred_0_7')[return_col].mean())
        top_returns_beta.append(group.nlargest(n_top, 'pred_beta')[return_col].mean())

    # BUG FIX: np.mean([]) warns and yields nan when no date qualifies;
    # return {} instead (main() already guards with `if top_tier:`, and this
    # matches the missing-return-column path above).
    if not top_returns_0_7:
        return {}

    return {
        '0_7': {
            'top_tier_return': np.mean(top_returns_0_7),
            'top_tier_std': np.std(top_returns_0_7)
        },
        '0_7_beta': {
            'top_tier_return': np.mean(top_returns_beta),
            'top_tier_std': np.std(top_returns_beta)
        }
    }
def main():
    """Main comparison function.

    Loads and aligns both prediction sets, reports their agreement and
    distribution, and — when the actual-returns file exists — compares IC
    metrics and top-tier returns between the two models.
    """

    def _section(title):
        # Section banner in the script's standard dashed format.
        print("\n" + "-" * 70)
        print(title)
        print("-" * 70)

    print("=" * 70)
    print("COMPARISON: Alpha158 0_7 vs 0_7_beta Predictions")
    print("=" * 70)

    # Load and align predictions
    df_joined = load_and_align_predictions()
    if len(df_joined) == 0:
        print("\nERROR: No overlapping predictions found!")
        return

    # Agreement between the two prediction sets
    _section("PREDICTION CORRELATION")
    corr_metrics = calculate_correlation(df_joined)
    for label, key in [
        ("Overall Pearson correlation", 'pearson_corr'),
        ("Overall Spearman correlation", 'spearman_corr'),
        ("Daily correlation mean", 'daily_corr_mean'),
        ("Daily correlation std", 'daily_corr_std'),
    ]:
        print(f"{label}: {corr_metrics[key]:.4f}")

    # Marginal distribution of each prediction column
    _section("PREDICTION STATISTICS")
    df_pd = df_joined.to_pandas()
    for header, column in [("0_7 predictions:", 'pred_0_7'),
                           ("\n0_7_beta predictions:", 'pred_beta')]:
        series = df_pd[column]
        print(header)
        print(f" Mean: {series.mean():.6f}")
        print(f" Std: {series.std():.6f}")
        print(f" Min: {series.min():.6f}")
        print(f" Max: {series.max():.6f}")

    # IC metrics and top-tier returns need realized returns
    if os.path.exists(ACTUAL_RETURNS_FILE):
        _section("IC METRICS (with actual returns)")
        actual_returns = pl.read_parquet(ACTUAL_RETURNS_FILE)
        print(f"Loaded actual returns: {actual_returns.shape}")

        ic_metrics = calculate_ic_metrics(df_joined, actual_returns)
        if ic_metrics:
            print(f"\n{'Metric':<20} {'0_7':<12} {'0_7_beta':<12} {'Diff':<12}")
            print("-" * 56)
            for metric in ['ic_mean', 'ic_std', 'ic_ir',
                           'rankic_mean', 'rankic_std', 'rankic_ir']:
                v0 = ic_metrics['0_7'][metric]
                v1 = ic_metrics['0_7_beta'][metric]
                print(f"{metric:<20} {v0:>11.4f} {v1:>11.4f} {v1 - v0:>+11.4f}")

        _section("TOP-TIER RETURNS (top 10%)")
        top_tier = calculate_top_tier_return(df_joined, actual_returns, top_pct=0.1)
        if top_tier:
            print(f"{'':<20} {'0_7':<12} {'0_7_beta':<12} {'Diff':<12}")
            print("-" * 56)
            t0 = top_tier['0_7']['top_tier_return']
            t1 = top_tier['0_7_beta']['top_tier_return']
            print(f"{'Top-tier return':<20} {t0:>11.4f} {t1:>11.4f} {t1 - t0:>+11.4f}")
    else:
        print(f"\nActual returns file not found: {ACTUAL_RETURNS_FILE}")
        print("Skipping IC metrics calculation.")

    print("\n" + "=" * 70)
    print("Comparison complete!")
    print("=" * 70)
# Entry point: run the full comparison when executed as a script.
if __name__ == "__main__":
    main()