#!/usr/bin/env python
"""
Script to generate and dump transformed features from the alpha158_beta pipeline.

This script provides fine-grained control over the feature generation and
dumping process:
- Select which feature groups to dump (alpha158, market_ext, market_flag,
  merged, vae_input)
- Choose output format (parquet, pickle, numpy)
- Control date range and universe filtering
- Save intermediate pipeline outputs
- Enable streaming mode for large datasets (>1 year)

Usage:
    # Dump all features to parquet
    python dump_features.py --start-date 2025-01-01 --end-date 2025-01-31

    # Dump only alpha158 features to pickle
    python dump_features.py --groups alpha158 --format pickle

    # Dump with custom output path
    python dump_features.py --output /path/to/output.parquet

    # Dump merged features with all columns
    python dump_features.py --groups merged --verbose

    # Use streaming mode for large date ranges (>1 year)
    python dump_features.py --start-date 2020-01-01 --end-date 2023-12-31 --streaming
"""

import argparse
import os
import sys
from pathlib import Path
from typing import List, Optional

# Add src to path for imports (must run before the `processors` import below)
SCRIPT_DIR = Path(__file__).parent
sys.path.insert(0, str(SCRIPT_DIR.parent / 'src'))

from processors import (
    FeaturePipeline,
    FeatureGroups,
    VAE_INPUT_DIM,
    ALPHA158_COLS,
    MARKET_EXT_BASE_COLS,
    COLUMNS_TO_REMOVE,
    get_groups,
    dump_to_parquet,
    dump_to_pickle,
    dump_to_numpy,
)

# Default output directory
DEFAULT_OUTPUT_DIR = SCRIPT_DIR.parent / "data"


def _dump_groups(outputs, output_path, dump_fn, verbose):
    """Write each selected feature group to disk using ``dump_fn``.

    Routing rules (identical for pickle and parquet output):
    - If 'merged' is among the outputs, it is written directly to
      ``output_path``.
    - Otherwise each group is written to ``{stem}_{group}{suffix}`` beside
      ``output_path`` so multiple groups never clobber one file.

    Args:
        outputs: Mapping of group name -> DataFrame (from ``get_groups``).
        output_path: Base output file path.
        dump_fn: Writer callable, e.g. ``dump_to_pickle`` or ``dump_to_parquet``.
        verbose: Forwarded to ``dump_fn``.
    """
    if 'merged' in outputs:
        dump_fn(outputs['merged'], output_path, verbose=verbose)
        return
    base_path = Path(output_path)
    for key, df_out in outputs.items():
        dump_path = str(base_path.with_name(f"{base_path.stem}_{key}{base_path.suffix}"))
        dump_fn(df_out, dump_path, verbose=verbose)


def generate_and_dump(
    start_date: str,
    end_date: str,
    output_path: str,
    output_format: str = 'parquet',
    groups: Optional[List[str]] = None,
    universe: str = 'csiallx',
    filter_universe: bool = True,
    robust_zscore_params_path: Optional[str] = None,
    verbose: bool = True,
    pack_struct: bool = False,
    streaming: bool = False,
) -> None:
    """
    Generate features and dump to file.

    Args:
        start_date: Start date in YYYY-MM-DD format
        end_date: End date in YYYY-MM-DD format
        output_path: Output file path
        output_format: Output format ('parquet', 'pickle', 'numpy')
        groups: Feature groups to dump (default: ['merged'])
        universe: Stock universe name
        filter_universe: Whether to filter to stock universe
        robust_zscore_params_path: Path to robust zscore parameters
        verbose: Whether to print progress
        pack_struct: If True, pack each feature group into struct columns
            (features_alpha158, features_market_ext, features_market_flag)
        streaming: If True, use Polars streaming mode for large datasets (>1 year)
    """
    if groups is None:
        groups = ['merged']

    print("=" * 60)
    print("Feature Dump Script")
    print("=" * 60)
    print(f"Date range: {start_date} to {end_date}")
    print(f"Output format: {output_format}")
    print(f"Feature groups: {groups}")
    print(f"Universe: {universe} (filter: {filter_universe})")
    print(f"Pack struct: {pack_struct}")
    print(f"Output path: {output_path}")
    print("=" * 60)

    # Initialize pipeline
    pipeline = FeaturePipeline(
        robust_zscore_params_path=robust_zscore_params_path
    )

    # Load data
    feature_groups = pipeline.load_data(
        start_date, end_date,
        filter_universe=filter_universe,
        universe_name=universe,
        streaming=streaming
    )

    # Apply transformations - get merged DataFrame (pipeline always returns
    # merged DataFrame now)
    df_transformed = pipeline.transform(feature_groups, pack_struct=pack_struct)

    # Select feature groups from merged DataFrame
    outputs = get_groups(df_transformed, groups, verbose, use_struct=False)

    # Ensure output directory exists
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Dump to file(s)
    if output_format == 'numpy':
        # NOTE(review): this branch dumps the raw loaded `feature_groups`,
        # not the transformed `outputs`/`df_transformed`, even though the
        # original comment said "we save the merged features" — confirm
        # against dump_to_numpy's expected input.
        dump_to_numpy(feature_groups, output_path, include_metadata=True, verbose=verbose)
    elif output_format == 'pickle':
        _dump_groups(outputs, output_path, dump_to_pickle, verbose)
    else:  # parquet
        _dump_groups(outputs, output_path, dump_to_parquet, verbose)

    print("=" * 60)
    print("Feature dump complete!")
    print("=" * 60)


def main() -> None:
    """Parse CLI arguments and run the feature generation/dump."""
    parser = argparse.ArgumentParser(
        description="Generate and dump transformed features from alpha158_beta pipeline"
    )

    # Date range
    parser.add_argument(
        "--start-date",
        type=str,
        required=True,
        help="Start date in YYYY-MM-DD format"
    )
    parser.add_argument(
        "--end-date",
        type=str,
        required=True,
        help="End date in YYYY-MM-DD format"
    )

    # Output settings
    parser.add_argument(
        "--output", "-o",
        type=str,
        default=None,
        help=f"Output file path (default: {DEFAULT_OUTPUT_DIR}/features.parquet)"
    )
    parser.add_argument(
        "--format", "-f",
        type=str,
        default='parquet',
        choices=['parquet', 'pickle', 'numpy'],
        help="Output format (default: parquet)"
    )

    # Feature groups
    parser.add_argument(
        "--groups", "-g",
        type=str,
        nargs='+',
        default=['merged'],
        choices=['merged', 'alpha158', 'market_ext', 'market_flag', 'vae_input'],
        help="Feature groups to dump (default: merged)"
    )

    # Universe settings
    parser.add_argument(
        "--universe",
        type=str,
        default='csiallx',
        help="Stock universe name (default: csiallx)"
    )
    parser.add_argument(
        "--no-filter-universe",
        action="store_true",
        help="Disable stock universe filtering"
    )

    # Robust zscore parameters
    parser.add_argument(
        "--robust-zscore-params",
        type=str,
        default=None,
        help="Path to robust zscore parameters directory"
    )

    # Verbose mode: --verbose defaults True; --quiet overrides it below
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        default=True,
        help="Enable verbose output (default: True)"
    )
    parser.add_argument(
        "--quiet", "-q",
        action="store_true",
        help="Disable verbose output"
    )

    # Struct option
    parser.add_argument(
        "--pack-struct", "-s",
        action="store_true",
        help="Pack each feature group into separate struct columns (features_alpha158, features_market_ext, features_market_flag)"
    )

    # Streaming option
    parser.add_argument(
        "--streaming",
        action="store_true",
        help="Use Polars streaming mode for large datasets (recommended for date ranges > 1 year)"
    )

    args = parser.parse_args()

    # Handle verbose/quiet flags (--quiet wins over --verbose)
    verbose = args.verbose and not args.quiet

    # Set default output path
    if args.output is None:
        # Build default output path: {data_dir}/features_{group}.parquet
        # Note: generate_and_dump will add group suffix, so use base name "features"
        output_path = str(DEFAULT_OUTPUT_DIR / "features.parquet")
    else:
        output_path = args.output

    # Generate and dump
    generate_and_dump(
        start_date=args.start_date,
        end_date=args.end_date,
        output_path=output_path,
        output_format=args.format,
        groups=args.groups,
        universe=args.universe,
        filter_universe=not args.no_filter_universe,
        robust_zscore_params_path=args.robust_zscore_params,
        verbose=verbose,
        pack_struct=args.pack_struct,
        streaming=args.streaming,
    )


if __name__ == "__main__":
    main()