#!/usr/bin/env python
|
||
|
|
"""
|
||
|
|
Main pipeline orchestration script for Alpha158 0_7 vs 0_7_beta comparison.
|
||
|
|
|
||
|
|
This script orchestrates the full workflow:
|
||
|
|
1. Generate beta embeddings from alpha158_0_7_beta factors
|
||
|
|
2. Fetch original 0_7 predictions from DolphinDB
|
||
|
|
3. Generate predictions using beta embeddings
|
||
|
|
4. Generate actual returns from kline data
|
||
|
|
5. Compare predictions (IC, RankIC, correlation, etc.)
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python pipeline.py --start-date 2019-01-01 --end-date 2020-11-30 --skip-embeddings --skip-fetch
|
||
|
|
|
||
|
|
Arguments:
|
||
|
|
--start-date: Start date for data loading (default: 2019-01-01)
|
||
|
|
--end-date: End date for data loading (default: 2020-11-30)
|
||
|
|
--skip-embeddings: Skip embeddings generation (use existing)
|
||
|
|
--skip-fetch: Skip fetching original predictions (use existing)
|
||
|
|
--skip-returns: Skip returns generation (use existing)
|
||
|
|
--skip-comparison: Skip final comparison
|
||
|
|
"""
|
||
|
|
|
||
|
|
import os
|
||
|
|
import sys
|
||
|
|
import argparse
|
||
|
|
from datetime import datetime
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
# Add scripts directory to path
|
||
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'scripts'))
|
||
|
|
|
||
|
|
|
||
|
|
def step_generate_embeddings(start_date: str, end_date: str, data_dir: str) -> bool:
    """Step 1: Generate beta embeddings.

    Writes ``embedding_0_7_beta.parquet`` into *data_dir*. If the file
    already exists, the user is asked interactively whether to regenerate.

    Args:
        start_date: Start of the data range (YYYY-MM-DD).
        end_date: End of the data range (YYYY-MM-DD).
        data_dir: Directory where output parquet files are stored.

    Returns:
        True on success (or when generation was skipped), False on error.
    """
    print("\n" + "=" * 70)
    print("STEP 1: Generate Beta Embeddings")
    print("=" * 70)

    embedding_file = os.path.join(data_dir, "embedding_0_7_beta.parquet")

    if os.path.exists(embedding_file):
        print(f"Embeddings file already exists: {embedding_file}")
        try:
            response = input("Regenerate? (y/N): ").strip().lower()
        except EOFError:
            # Non-interactive run (closed/piped stdin): keep the existing
            # file instead of crashing the pipeline.
            response = 'n'
        if response != 'y':
            print("Skipping embeddings generation.")
            return True

    try:
        from generate_beta_embedding import generate_embeddings

        df = generate_embeddings(
            start_date=start_date,
            end_date=end_date,
            output_file=embedding_file,
            use_vae=True
        )

        print(f"\nGenerated {len(df)} embeddings")
        return True

    except Exception as e:
        # Broad catch is deliberate: a failed step is reported and the
        # pipeline decides what to skip downstream.
        print(f"Error generating embeddings: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||
|
|
|
||
|
|
|
||
|
|
def step_fetch_predictions(start_date: str, end_date: str, data_dir: str) -> bool:
    """Step 2: Fetch original predictions from DolphinDB.

    Writes ``original_predictions_0_7.parquet`` into *data_dir*. If the
    file already exists, the user is asked interactively whether to refetch.

    Args:
        start_date: Start of the data range (YYYY-MM-DD).
        end_date: End of the data range (YYYY-MM-DD).
        data_dir: Directory where output parquet files are stored.

    Returns:
        True on success (or when fetching was skipped), False on error.
    """
    print("\n" + "=" * 70)
    print("STEP 2: Fetch Original Predictions from DolphinDB")
    print("=" * 70)

    predictions_file = os.path.join(data_dir, "original_predictions_0_7.parquet")

    if os.path.exists(predictions_file):
        print(f"Predictions file already exists: {predictions_file}")
        try:
            response = input("Refetch? (y/N): ").strip().lower()
        except EOFError:
            # Non-interactive run (closed/piped stdin): keep the existing
            # file instead of crashing the pipeline.
            response = 'n'
        if response != 'y':
            print("Skipping fetch.")
            return True

    try:
        from fetch_predictions import fetch_original_predictions

        df = fetch_original_predictions(
            start_date=start_date,
            end_date=end_date,
            output_file=predictions_file
        )

        print(f"\nFetched {len(df)} predictions")
        return True

    except Exception as e:
        # Broad catch is deliberate: report the failure and let the
        # pipeline skip dependent steps.
        print(f"Error fetching predictions: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||
|
|
|
||
|
|
|
||
|
|
def step_generate_beta_predictions(data_dir: str) -> bool:
    """Step 3: Generate predictions using beta embeddings.

    Reads ``embedding_0_7_beta.parquet`` (produced by step 1) and writes
    ``predictions_beta_embedding.parquet`` into *data_dir*. If the output
    already exists, the user is asked interactively whether to regenerate.

    Args:
        data_dir: Directory where input/output parquet files are stored.

    Returns:
        True on success (or when generation was skipped), False if the
        embeddings file is missing or prediction generation failed.
    """
    print("\n" + "=" * 70)
    print("STEP 3: Generate Predictions with Beta Embeddings")
    print("=" * 70)

    embedding_file = os.path.join(data_dir, "embedding_0_7_beta.parquet")
    predictions_file = os.path.join(data_dir, "predictions_beta_embedding.parquet")

    # Hard prerequisite: embeddings must exist before predicting.
    if not os.path.exists(embedding_file):
        print(f"Embeddings file not found: {embedding_file}")
        print("Run step 1 first.")
        return False

    if os.path.exists(predictions_file):
        print(f"Beta predictions file already exists: {predictions_file}")
        try:
            response = input("Regenerate? (y/N): ").strip().lower()
        except EOFError:
            # Non-interactive run (closed/piped stdin): keep the existing
            # file instead of crashing the pipeline.
            response = 'n'
        if response != 'y':
            print("Skipping prediction generation.")
            return True

    try:
        from predict_with_embedding import generate_predictions

        df = generate_predictions(
            embedding_file=embedding_file,
            output_file=predictions_file,
            seq_len=40,
            batch_size=1000
        )

        print(f"\nGenerated {len(df)} predictions")
        return True

    except Exception as e:
        # Broad catch is deliberate: report the failure and let the
        # pipeline skip dependent steps.
        print(f"Error generating predictions: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||
|
|
|
||
|
|
|
||
|
|
def step_generate_returns(data_dir: str) -> bool:
    """Step 4: Generate actual returns from kline data.

    Writes ``actual_returns.parquet`` into *data_dir*. If the original
    predictions file exists, it is passed along so the returns generator
    can match its date range. If the output already exists, the user is
    asked interactively whether to regenerate.

    Args:
        data_dir: Directory where input/output parquet files are stored.

    Returns:
        True on success (or when generation was skipped), False on error
        or when the generator returned None.
    """
    print("\n" + "=" * 70)
    print("STEP 4: Generate Actual Returns")
    print("=" * 70)

    predictions_file = os.path.join(data_dir, "original_predictions_0_7.parquet")
    returns_file = os.path.join(data_dir, "actual_returns.parquet")

    if os.path.exists(returns_file):
        print(f"Returns file already exists: {returns_file}")
        try:
            response = input("Regenerate? (y/N): ").strip().lower()
        except EOFError:
            # Non-interactive run (closed/piped stdin): keep the existing
            # file instead of crashing the pipeline.
            response = 'n'
        if response != 'y':
            print("Skipping returns generation.")
            return True

    try:
        from generate_returns import generate_real_returns_from_kline

        # Use prediction file to determine date range if available
        prediction_file = predictions_file if os.path.exists(predictions_file) else None

        df = generate_real_returns_from_kline(
            input_kline_path="/data/parquet/dataset/stg_1day_wind_kline_adjusted_1D/",
            prediction_file=prediction_file,
            output_file=returns_file,
            return_days=5
        )

        # The generator signals failure by returning None rather than raising.
        if df is not None:
            print(f"\nGenerated {len(df)} returns")
            return True
        else:
            print("\nFailed to generate returns")
            return False

    except Exception as e:
        # Broad catch is deliberate: report the failure and let the
        # pipeline skip dependent steps.
        print(f"Error generating returns: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||
|
|
|
||
|
|
|
||
|
|
def step_compare_predictions(data_dir: str) -> bool:
    """Step 5: Compare 0_7 vs 0_7_beta predictions.

    Verifies that both prediction files are present in *data_dir*, then
    delegates to the comparison script's main entry point.

    Args:
        data_dir: Directory holding the two prediction parquet files.

    Returns:
        True on success, False if a required file is missing or the
        comparison raised an exception.
    """
    print("\n" + "=" * 70)
    print("STEP 5: Compare Predictions")
    print("=" * 70)

    # Both prediction sets must exist before a comparison makes sense.
    for filename in ("original_predictions_0_7.parquet",
                     "predictions_beta_embedding.parquet"):
        path = os.path.join(data_dir, filename)
        if not os.path.exists(path):
            print(f"Required file not found: {path}")
            return False

    try:
        # Import and run comparison
        from compare_predictions import main as compare_main
        compare_main()
        return True
    except Exception as e:
        print(f"Error comparing predictions: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Main pipeline orchestration.

    Parses command-line flags, runs the five pipeline steps while
    honouring the --skip-* flags and inter-step dependencies, prints a
    per-step summary, and exits with status 1 if any executed step failed.
    """
    parser = argparse.ArgumentParser(
        description="Alpha158 0_7 vs 0_7_beta Comparison Pipeline"
    )
    parser.add_argument("--start-date", type=str, default="2019-01-01",
                        help="Start date (YYYY-MM-DD)")
    parser.add_argument("--end-date", type=str, default="2020-11-30",
                        help="End date (YYYY-MM-DD)")
    parser.add_argument("--skip-embeddings", action="store_true",
                        help="Skip embeddings generation")
    parser.add_argument("--skip-fetch", action="store_true",
                        help="Skip fetching original predictions")
    parser.add_argument("--skip-returns", action="store_true",
                        help="Skip returns generation")
    parser.add_argument("--skip-comparison", action="store_true",
                        help="Skip final comparison")
    parser.add_argument("--data-dir", type=str, default=None,
                        help="Data directory (default: ./data)")
    args = parser.parse_args()

    # Resolve the working data directory (defaults to ./data beside this file).
    script_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = args.data_dir or os.path.join(script_dir, "data")

    print("=" * 70)
    print("Alpha158 0_7 vs 0_7_beta Comparison Pipeline")
    print("=" * 70)
    print(f"Date range: {args.start_date} to {args.end_date}")
    print(f"Data directory: {data_dir}")

    os.makedirs(data_dir, exist_ok=True)

    # Per-step success flags, keyed by step name (insertion order = run order).
    results = {}

    # Step 1: beta embeddings.
    if args.skip_embeddings:
        print("\nSkipping embeddings generation (as requested)")
        results['embeddings'] = True
    else:
        results['embeddings'] = step_generate_embeddings(
            args.start_date, args.end_date, data_dir
        )

    # Step 2: original predictions.
    if args.skip_fetch:
        print("\nSkipping fetch (as requested)")
        results['fetch'] = True
    else:
        results['fetch'] = step_fetch_predictions(
            args.start_date, args.end_date, data_dir
        )

    # Step 3: beta predictions depend on step 1 having succeeded.
    if results.get('embeddings', True):
        results['beta_predictions'] = step_generate_beta_predictions(data_dir)
    else:
        print("\nSkipping beta predictions (embeddings generation failed)")
        results['beta_predictions'] = False

    # Step 4: actual returns.
    if args.skip_returns:
        print("\nSkipping returns generation (as requested)")
        results['returns'] = True
    else:
        results['returns'] = step_generate_returns(data_dir)

    # Step 5: comparison needs both prediction sets to be available.
    if args.skip_comparison:
        print("\nSkipping comparison (as requested)")
        results['comparison'] = True
    elif results.get('fetch', True) and results.get('beta_predictions', True):
        results['comparison'] = step_compare_predictions(data_dir)
    else:
        print("\nSkipping comparison (previous steps failed)")
        results['comparison'] = False

    # Summary table.
    print("\n" + "=" * 70)
    print("PIPELINE SUMMARY")
    print("=" * 70)

    for step, success in results.items():
        status = "✓ PASSED" if success else "✗ FAILED"
        print(f" {step:20s}: {status}")

    print("=" * 70)
    if all(results.values()):
        print("Pipeline completed successfully!")
    else:
        print("Pipeline completed with errors.")
        sys.exit(1)
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
|
||
|
|
main()
|