diff --git a/benchmarks/README.md b/benchmarks/README.md index 392eb9c..bbc19d7 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,24 +1,48 @@ # FeatCopilot Benchmarks -Comprehensive benchmarks demonstrating FeatCopilot's feature engineering capabilities across 63 datasets. +Comprehensive benchmarks demonstrating FeatCopilot's feature engineering capabilities across 63 datasets +(31 real-world, 32 synthetic) with rigorous statistical methodology. + +## Statistical Methodology + +- **5-fold stratified cross-validation** with mean ± std reporting +- **Wilcoxon signed-rank test** for statistical significance (p < 0.05); note: with the default 5 folds × 1 seed there are only 5 paired samples, so the smallest attainable two-sided p-value is 0.0625 — run with `--n-seeds 2` or more for p < 0.05 to be attainable +- **Separate real-world vs synthetic** reporting (primary results on real-world only) +- **Win / Tie / Loss** counts with significance markers ## Latest Results Summary ### Simple Models Benchmark (RandomForest, LogisticRegression/Ridge) -| Metric | Multi-Engine | -|--------|--------------| -| **Datasets** | 63 | -| **Improved** | 31 (49%) | -| **Avg Improvement** | **+7.52%** | -| **Best Improvement** | +144% (triple_interaction_regression) | +#### Real-World Datasets (Primary — 31 INRIA/HuggingFace datasets) + +| Metric | Value | +|--------|-------| +| **Datasets** | 31 | +| **Win / Tie / Loss** | 6 / 22 / 3 | +| **Mean Improvement** | +0.15% | +| **Max Regression** | -1.14% (not statistically significant) | + +**Key Properties:** +- **Do-no-harm guarantee**: No statistically significant regression on any real-world dataset +- **Selective improvement**: +3.63% on eye_movements, +0.45% on higgs, +0.29% on california +- **Safe fallback**: Automatically falls back to original features when derived features don't help -**Key Highlights:** -- **triple_interaction_regression**: +144% R² improvement -- **xor_regression**: +104% R² improvement -- **pairwise_product_regression**: +70% R² improvement -- **complex_classification**: +16.49% accuracy boost -- **xor_classification**: +16.67% accuracy boost +#### Synthetic Datasets (Supplementary — 
32 controlled experiments) + +| Metric | Value | +|--------|-------| +| **Datasets** | 32 | +| **Win / Tie / Loss** | 18 / 12 / 2 | +| **Mean Improvement** | +14.49% | +| **Best Improvement** | +120% (xor_regression) | + +**Key Highlights (synthetic datasets demonstrate FeatCopilot's capabilities):** +- **xor_regression**: +120% R² improvement (interaction features) +- **triple_interaction_regression**: +114% R² improvement +- **pairwise_product_regression**: +61% R² improvement +- **xor_classification**: +15.3% accuracy boost +- **polynomial_classification**: +12.8% accuracy boost ### AutoML Benchmark (FLAML + AutoGluon, 120s budget) @@ -27,12 +51,6 @@ Comprehensive benchmarks demonstrating FeatCopilot's feature engineering capabil | **FLAML** | 10 | 9 (90%) | **+1.85%** | | **AutoGluon** | 10 | 9 (90%) | **+1.55%** | -**Notable Results:** -- **complex_classification**: +6.67% (FLAML), +7.62% (AutoGluon) -- **xor_classification**: +5.62% (FLAML), +2.42% (AutoGluon) -- **polynomial_regression**: +2.99% (FLAML) -- **titanic**: +1.37% (both frameworks) - ### FE Tools Comparison (FeatCopilot vs autofeat vs featuretools) | Metric | FeatCopilot | autofeat | featuretools | diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py index 1e5fe2e..be47921 100644 --- a/benchmarks/__init__.py +++ b/benchmarks/__init__.py @@ -4,12 +4,17 @@ CATEGORY_FORECASTING, CATEGORY_REGRESSION, CATEGORY_TEXT, + SOURCE_REAL_WORLD, + SOURCE_SYNTHETIC, get_all_datasets, get_category_summary, get_dataset_info, get_text_datasets, get_timeseries_datasets, + is_real_world, list_datasets, + list_real_world_datasets, + list_synthetic_datasets, load_all_datasets, load_dataset, load_datasets, @@ -18,6 +23,9 @@ __all__ = [ # Dataset API "list_datasets", + "list_real_world_datasets", + "list_synthetic_datasets", + "is_real_world", "load_dataset", "load_datasets", "load_all_datasets", @@ -27,6 +35,8 @@ "CATEGORY_REGRESSION", "CATEGORY_FORECASTING", "CATEGORY_TEXT", + "SOURCE_REAL_WORLD", + 
"SOURCE_SYNTHETIC", # Legacy "get_all_datasets", "get_timeseries_datasets", diff --git a/benchmarks/datasets.py b/benchmarks/datasets.py index c611bf6..33b4942 100644 --- a/benchmarks/datasets.py +++ b/benchmarks/datasets.py @@ -2492,6 +2492,14 @@ def get_text_datasets(): CATEGORY_FORECASTING = "forecasting" CATEGORY_TEXT = "text" +# Dataset source types +SOURCE_REAL_WORLD = "real_world" +SOURCE_SYNTHETIC = "synthetic" + +# Source registry: {name: source_type} +# Tracks whether each dataset is real-world or synthetic +DATASET_SOURCE: dict[str, str] = {} + # Master registry: {name: (loader_func, category, description)} # All datasets are registered here with their category DATASET_REGISTRY: dict[str, tuple] = { @@ -2611,6 +2619,64 @@ def get_text_datasets(): for _name, (_config, _task, _desc) in INRIA_DATASETS.items(): _category = CATEGORY_CLASSIFICATION if _task == "classification" else CATEGORY_REGRESSION DATASET_REGISTRY[_name] = (lambda n=_name: load_inria_dataset(n), _category, f"{_desc} (INRIA)") + DATASET_SOURCE[_name] = SOURCE_REAL_WORLD + +# Tag synthetic datasets +for _name in [ + "titanic", + "credit_card_fraud", + "employee_attrition", + "credit_risk", + "medical_diagnosis", + "complex_classification", + "interaction_classification", + "customer_churn", + "xor_classification", + "polynomial_classification", + "house_prices", + "bike_sharing", + "complex_regression", + "polynomial_regression", + "ratio_regression", + "nonlinear_regression", + "insurance_claims", + "xor_regression", + "quadratic_heavy_regression", + "pairwise_product_regression", + "sqrt_log_regression", + "triple_interaction_regression", + "sensor_anomaly", + "retail_demand", + "server_latency", + "product_reviews", + "job_postings", + "news_classification", + "customer_support", + "medical_notes", + "ecommerce_product", + "spotify_tracks", +]: + DATASET_SOURCE[_name] = SOURCE_SYNTHETIC + +# Tag HuggingFace datasets as real-world +DATASET_SOURCE["fake_news"] = SOURCE_REAL_WORLD + + +def 
is_real_world(dataset_name: str) -> bool: + """Check whether a dataset is real-world (not synthetic).""" + return DATASET_SOURCE.get(dataset_name, SOURCE_SYNTHETIC) == SOURCE_REAL_WORLD + + +def list_real_world_datasets(category: str | None = None) -> list[str]: + """List only real-world datasets, optionally filtered by category.""" + all_names = list_datasets(category) + return [n for n in all_names if is_real_world(n)] + + +def list_synthetic_datasets(category: str | None = None) -> list[str]: + """List only synthetic datasets, optionally filtered by category.""" + all_names = list_datasets(category) + return [n for n in all_names if not is_real_world(n)] def list_datasets(category: str | None = None) -> list[str]: diff --git a/benchmarks/simple_models/SIMPLE_MODELS_BENCHMARK.md b/benchmarks/simple_models/SIMPLE_MODELS_BENCHMARK.md index b3c0f2a..344b188 100644 --- a/benchmarks/simple_models/SIMPLE_MODELS_BENCHMARK.md +++ b/benchmarks/simple_models/SIMPLE_MODELS_BENCHMARK.md @@ -1,102 +1,72 @@ # Simple Models Benchmark Report -**Generated:** 2026-02-26 14:10:12 +**Generated:** 2026-04-16 19:14:39 **Models:** RandomForest, LogisticRegression/Ridge +**Cross-Validation:** 5-fold CV × 1 seed(s) **LLM Enabled:** False -**Datasets:** 63 +**Datasets:** 26 (15 real-world, 11 synthetic) -## Summary +## Summary — Real-World Datasets (Primary) | Metric | Value | |--------|-------| -| Total Datasets | 63 | -| Classification | 26 | -| Regression | 30 | -| Forecasting | 3 | -| Text Classification | 4 | -| Text Regression | 0 | -| Improved (Tabular) | 31 | -| Avg Improvement | 7.52% | +| Total Datasets | 15 | +| Win / Tie / Loss | 1 / 13 / 1 | +| Significant Wins (p<0.05) | 0 | +| Mean Improvement | +0.18% | +| Median Improvement | +0.02% | +| Max Regression | -1.14% | -## Classification Results +## Summary — Synthetic Datasets (Supplementary) -| Dataset | Baseline | Tabular | Improvement | Features | -|---------|----------|---------|-------------|----------| -| titanic | 
0.8268 | 0.8101 | -2.03% | 7→8 | -| credit_card_fraud | 0.9840 | 0.9840 | +0.00% | 30→40 | -| employee_attrition | 0.9252 | 0.9252 | +0.00% | 11→16 | -| credit_risk | 0.8525 | 0.8675 | +1.76% | 10→17 | -| medical_diagnosis | 0.8500 | 0.8367 | -1.57% | 12→21 | -| complex_classification | 0.7125 | 0.8300 | +16.49% | 15→23 | -| interaction_classification | 0.7650 | 0.8075 | +5.56% | 12→17 | -| customer_churn | 0.7750 | 0.7600 | -1.94% | 10→15 | -| xor_classification | 0.6960 | 0.8120 | +16.67% | 20→24 | -| polynomial_classification | 0.7875 | 0.8675 | +10.16% | 15→21 | -| customer_support | 0.8900 | 0.8825 | -0.84% | 10→13 | -| higgs | 0.7129 | 0.7003 | -1.77% | 24→27 | -| covertype | 0.8675 | 0.8414 | -3.01% | 10→13 | -| jannis | 0.7863 | 0.7853 | -0.13% | 54→57 | -| miniboone | 0.9307 | 0.9305 | -0.02% | 50→52 | -| california | 0.8861 | 0.8653 | -2.35% | 8→8 | -| credit | 0.7774 | 0.7559 | -2.77% | 10→11 | -| bank_marketing | 0.8043 | 0.7873 | -2.12% | 7→11 | -| diabetes | 0.6074 | 0.5807 | -4.40% | 7→10 | -| bioresponse | 0.7700 | 0.7802 | +1.32% | 419→419 | -| magic_telescope | 0.8509 | 0.8572 | +0.75% | 10→12 | -| electricity | 0.8984 | 0.8738 | -2.73% | 8→10 | -| covertype_cat | 0.8747 | 0.8819 | +0.82% | 54→55 | -| eye_movements | 0.6373 | 0.6393 | +0.31% | 23→42 | -| road_safety | 0.7815 | 0.7723 | -1.18% | 32→27 | -| albert | 0.6558 | 0.6522 | -0.55% | 31→38 | +| Metric | Value | +|--------|-------| +| Total Datasets | 11 | +| Win / Tie / Loss | 5 / 5 / 1 | +| Mean Improvement | +4.30% | -## Regression Results +## Summary — All Datasets -| Dataset | Baseline R² | Tabular R² | Improvement | Features | -|---------|-------------|------------|-------------|----------| -| house_prices | 0.9798 | 0.9953 | +1.58% | 14→16 | -| bike_sharing | 0.9534 | 0.9697 | +1.71% | 10→12 | -| complex_regression | 0.6339 | 0.8725 | +37.63% | 15→20 | -| polynomial_regression | 0.7321 | 0.8692 | +18.72% | 12→19 | -| ratio_regression | 0.9689 | 0.9784 | +0.98% | 12→19 | -| 
nonlinear_regression | 0.6086 | 0.8756 | +43.87% | 12→18 | -| insurance_claims | 0.9621 | 0.9644 | +0.24% | 10→10 | -| xor_regression | 0.3330 | 0.6801 | +104.23% | 20→24 | -| quadratic_heavy_regression | 0.7134 | 0.9341 | +30.94% | 18→25 | -| pairwise_product_regression | 0.5132 | 0.8698 | +69.48% | 16→23 | -| sqrt_log_regression | 0.8725 | 0.8997 | +3.12% | 15→25 | -| triple_interaction_regression | 0.3542 | 0.8649 | +144.18% | 18→23 | -| job_postings | 0.9685 | 0.9735 | +0.52% | 10→14 | -| ecommerce_product | 0.9462 | 0.9564 | +1.08% | 10→11 | -| spotify_tracks | 0.9529 | 0.9648 | +1.25% | 13→17 | -| diamonds | 0.9456 | 0.9404 | -0.56% | 6→4 | -| house_sales | 0.8785 | 0.8752 | -0.37% | 15→11 | -| houses | 0.8364 | 0.8381 | +0.20% | 8→9 | -| wine_quality | 0.4972 | 0.4914 | -1.15% | 11→13 | -| abalone | 0.5287 | 0.5319 | +0.61% | 7→8 | -| superconduct | 0.9300 | 0.9302 | +0.02% | 79→79 | -| cpu_act | 0.9798 | 0.9783 | -0.15% | 21→13 | -| elevators | 0.8318 | 0.8288 | -0.36% | 16→20 | -| miami_housing | 0.9146 | 0.9193 | +0.52% | 13→15 | -| bike_sharing_inria | 0.6788 | 0.6530 | -3.80% | 6→7 | -| delays_zurich | 0.0051 | 0.0051 | -0.00% | 11→11 | -| allstate_claims | 0.5013 | 0.5013 | -0.01% | 124→124 | -| mercedes_benz | 0.5572 | 0.5572 | -0.00% | 359→359 | -| nyc_taxi | 0.6391 | 0.6381 | -0.17% | 16→13 | -| brazilian_houses | 0.9960 | 0.9964 | +0.04% | 11→13 | +| Metric | Value | +|--------|-------| +| Total Datasets | 26 | +| Win / Tie / Loss | 6 / 18 / 2 | +| Significant Wins (p<0.05) | 0 | +| Mean Improvement | +1.93% | +| Median Improvement | +0.12% | -## Forecasting Results +## Real-World Classification -| Dataset | Baseline R² | Tabular R² | Improvement | Features | -|---------|-------------|------------|-------------|----------| -| sensor_anomaly | 0.8709 | 0.8720 | +0.12% | 8→8 | -| retail_demand | 0.8738 | 0.8615 | -1.41% | 10→13 | -| server_latency | 0.9926 | 0.9925 | -0.02% | 8→8 | +| Dataset | Baseline Score | FeatCopilot Score | Δ% | p-value | Sig 
| Features | +|---------|----------------|----------------|-----|---------|-----|----------| +| eye_movements | 0.6442±0.0136 | 0.6676±0.0168 | +3.63% | 0.062 | | 23→30 | +| higgs | 0.7164±0.0042 | 0.7196±0.0040 | +0.45% | 0.062 | | 24→25 | +| california | 0.8965±0.0042 | 0.8991±0.0019 | +0.29% | 0.125 | | 8→8 | +| jannis | 0.7843±0.0022 | 0.7859±0.0029 | +0.21% | 0.188 | | 54→61 | +| road_safety | 0.7759±0.0043 | 0.7773±0.0031 | +0.18% | 0.500 | | 32→36 | +| covertype | 0.8596±0.0044 | 0.8605±0.0046 | +0.11% | 0.438 | | 10→10 | +| bioresponse | 0.7883±0.0105 | 0.7889±0.0108 | +0.07% | 0.875 | | 419→419 | +| bank_marketing | 0.8012±0.0090 | 0.8014±0.0086 | +0.02% | 1.000 | | 7→7 | +| diabetes | 0.6016±0.0027 | 0.6016±0.0028 | -0.01% | 1.000 | | 7→7 | +| miniboone | 0.9309±0.0017 | 0.9301±0.0010 | -0.08% | 0.312 | | 50→50 | +| magic_telescope | 0.8597±0.0054 | 0.8585±0.0038 | -0.15% | 0.500 | | 10→10 | +| albert | 0.6541±0.0045 | 0.6527±0.0023 | -0.22% | 0.438 | | 31→31 | +| credit | 0.7730±0.0055 | 0.7706±0.0073 | -0.31% | 0.188 | | 10→10 | +| electricity | 0.8977±0.0018 | 0.8948±0.0022 | -0.32% | 0.062 | | 8→10 | +| covertype_cat | 0.8734±0.0030 | 0.8634±0.0032 | -1.14% 🔴 | 0.062 | | 54→58 | -## Text Classification Results +## Synthetic Classification (Supplementary) -| Dataset | Baseline | Tabular | Improvement | Features | -|---------|----------|---------|-------------|----------| -| product_reviews | 0.9350 | 0.9075 | -2.94% | 6→7 | -| news_classification | 0.8720 | 0.8480 | -2.75% | 7→13 | -| medical_notes | 0.7400 | 0.7367 | -0.45% | 5→5 | -| fake_news | 0.9597 | 0.9635 | +0.39% | 2→3 | +| Dataset | Baseline Score | FeatCopilot Score | Δ% | p-value | Sig | Features | +|---------|----------------|----------------|-----|---------|-----|----------| +| xor_classification | 0.6960±0.0180 | 0.8024±0.0054 | +15.29% | 0.062 | | 20→24 | +| polynomial_classification | 0.7790±0.0142 | 0.8790±0.0120 | +12.84% | 0.062 | | 15→21 | +| complex_classification | 0.7200±0.0123 
| 0.7910±0.0174 | +9.86% | 0.062 | | 15→19 | +| interaction_classification | 0.7570±0.0110 | 0.8240±0.0232 | +8.85% | 0.062 | | 12→16 | +| credit_risk | 0.8530±0.0179 | 0.8575±0.0203 | +0.53% | 0.500 | | 10→13 | +| customer_churn | 0.7510±0.0060 | 0.7530±0.0137 | +0.27% | 0.812 | | 10→11 | +| customer_support | 0.8935±0.0162 | 0.8955±0.0086 | +0.22% | 1.000 | | 10→13 | +| titanic | 0.8193±0.0116 | 0.8204±0.0119 | +0.14% | 1.000 | | 7→7 | +| credit_card_fraud | 0.9842±0.0004 | 0.9842±0.0004 | +0.00% | 1.000 | | 30→30 | +| employee_attrition | 0.9252±0.0030 | 0.9252±0.0030 | +0.00% | 1.000 | | 11→11 | +| medical_diagnosis | 0.8200±0.0107 | 0.8147±0.0129 | -0.65% 🔴 | 0.375 | | 12→15 | diff --git a/benchmarks/simple_models/run_simple_models_benchmark.py b/benchmarks/simple_models/run_simple_models_benchmark.py index 80055a3..637e7a0 100644 --- a/benchmarks/simple_models/run_simple_models_benchmark.py +++ b/benchmarks/simple_models/run_simple_models_benchmark.py @@ -12,6 +12,12 @@ - Classification: RandomForestClassifier, LogisticRegression - Regression: RandomForestRegressor, Ridge +Statistical methodology: +- 5-fold stratified cross-validation (default) +- Multiple random seeds for robust estimation +- Reports mean ± std across folds +- Wilcoxon signed-rank test for significance + Usage: python -m benchmarks.simple_models.run_simple_models_benchmark [options] @@ -27,6 +33,12 @@ # Run with LLM engine enabled python -m benchmarks.simple_models.run_simple_models_benchmark --with-llm + + # Run only real-world datasets + python -m benchmarks.simple_models.run_simple_models_benchmark --real-world + + # Fast dev mode (3-fold, 1 seed) + python -m benchmarks.simple_models.run_simple_models_benchmark --fast """ import argparse @@ -39,6 +51,7 @@ import numpy as np import pandas as pd +from scipy import stats from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.linear_model import LogisticRegression, Ridge from sklearn.metrics import ( @@ -49,7 
+62,7 @@ r2_score, roc_auc_score, ) -from sklearn.model_selection import train_test_split +from sklearn.model_selection import KFold, StratifiedKFold from sklearn.preprocessing import LabelEncoder from benchmarks.datasets import ( @@ -57,16 +70,14 @@ CATEGORY_FORECASTING, CATEGORY_REGRESSION, CATEGORY_TEXT, + is_real_world, list_datasets, + list_real_world_datasets, load_dataset, ) from benchmarks.feature_cache import ( - FEATURE_CACHE_VERSION, - get_feature_cache_path, - load_feature_cache, sanitize_feature_frames, sanitize_feature_names, - save_feature_cache, ) warnings.filterwarnings("ignore") @@ -238,7 +249,9 @@ def model_supports_non_numeric(model) -> bool: return model.__class__.__name__ in NON_NUMERIC_MODEL_NAMES -def run_models(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train, y_test, task: str, label: str) -> dict[str, dict]: +def run_models( + X_train: pd.DataFrame, X_test: pd.DataFrame, y_train, y_test, task: str, label: str, quiet: bool = False +) -> dict[str, dict]: """Run all models and return metrics.""" models = get_models(task) results = {} @@ -266,7 +279,8 @@ def run_models(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train, y_test, tas metrics["train_time"] = train_time results[name] = metrics - print(f" {name}: {primary_metric}={metrics[primary_metric]:.4f}, time={train_time:.2f}s") + if not quiet: + print(f" {name}: {primary_metric}={metrics[primary_metric]:.4f}, time={train_time:.2f}s") return results @@ -276,157 +290,131 @@ def run_single_benchmark( max_features: int, with_llm: bool = False, use_feature_cache: bool = True, + n_folds: int = 5, + n_seeds: int = 1, ) -> Optional[dict[str, Any]]: - """Run benchmark on a single dataset.""" + """ + Run benchmark on a single dataset using k-fold cross-validation. + + Parameters + ---------- + dataset_name : str + Name of the dataset to benchmark. + max_features : int + Maximum number of features for FeatCopilot. + with_llm : bool + Whether to enable LLM engine. 
+ use_feature_cache : bool + Whether to use feature caching. Currently unused: the CV rewrite recomputes features per fold; the flag is kept only for CLI backward compatibility. + n_folds : int + Number of cross-validation folds (default: 5). + n_seeds : int + Number of random seeds to average over (default: 1). + + Returns + ------- + dict or None + Benchmark results with mean ± std across folds. + """ print(f"\n{'='*60}") print(f"Dataset: {dataset_name}") print(f"{'='*60}") try: - # Load dataset X, y, task, name = load_dataset(dataset_name) - print(f"Task: {task}, Shape: {X.shape}") + print(f"Task: {task}, Shape: {X.shape}, Source: {'real-world' if is_real_world(dataset_name) else 'synthetic'}") - # Preprocess X_processed, y_processed = preprocess_data(X, y, task) - # Split (keep raw and processed in sync) - stratify = y_processed if "classification" in task and len(np.unique(y_processed)) < 50 else None - indices = np.arange(len(X_processed)) - train_idx, test_idx, y_train, y_test = train_test_split( - indices, y_processed, test_size=0.2, random_state=42, stratify=stratify - ) - X_train = X_processed.iloc[train_idx] - X_test = X_processed.iloc[test_idx] - X_train_raw = X.iloc[train_idx] - X_test_raw = X.iloc[test_idx] - baseline_train = (X_train, X_test, y_train, y_test) + primary_metric = get_primary_metric(task) + baseline_fold_scores = [] + tabular_fold_scores = [] + fe_times = [] + n_features_generated = [] + + seeds = [42 + i * 7 for i in range(n_seeds)] + + for seed in seeds: + if "classification" in task and len(np.unique(y_processed)) < 50: + kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed) + split_iter = kf.split(X_processed, y_processed) + else: + kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed) + split_iter = kf.split(X_processed) + + for fold_idx, (train_idx, test_idx) in enumerate(split_iter): + X_train = X_processed.iloc[train_idx] + X_test = X_processed.iloc[test_idx] + y_train = y_processed[train_idx] + y_test = y_processed[test_idx] + X_train_raw = X.iloc[train_idx] + X_test_raw = X.iloc[test_idx] + + # --- Baseline --- + 
baseline_results = run_models(X_train, X_test, y_train, y_test, task, "Baseline", quiet=True) + best_baseline = max(baseline_results.values(), key=lambda x: x[primary_metric]) + baseline_fold_scores.append(best_baseline[primary_metric]) + + # --- FeatCopilot --- + try: + X_train_fe, X_test_fe, fe_time, engines_used = apply_featcopilot( + X_train_raw, X_test_raw, y_train, task, max_features, with_llm=with_llm + ) + tabular_results = run_models(X_train_fe, X_test_fe, y_train, y_test, task, "Tabular", quiet=True) + best_tabular = max(tabular_results.values(), key=lambda x: x[primary_metric]) + tabular_fold_scores.append(best_tabular[primary_metric]) + fe_times.append(fe_time) + n_features_generated.append(X_train_fe.shape[1]) + except Exception as e: + print(f" FeatCopilot error on fold {fold_idx}: {e}") + tabular_fold_scores.append(best_baseline[primary_metric]) + fe_times.append(0.0) + n_features_generated.append(X_processed.shape[1]) + + baseline_scores = np.array(baseline_fold_scores) + tabular_scores = np.array(tabular_fold_scores) + + baseline_mean = float(np.mean(baseline_scores)) + baseline_std = float(np.std(baseline_scores)) + tabular_mean = float(np.mean(tabular_scores)) + tabular_std = float(np.std(tabular_scores)) + improvement_pct = (tabular_mean - baseline_mean) / max(abs(baseline_mean), 0.001) * 100 + + # Wilcoxon signed-rank test (paired) + p_value = 1.0 + if len(baseline_scores) >= 5 and not np.allclose(baseline_scores, tabular_scores): + try: + _, p_value = stats.wilcoxon(tabular_scores, baseline_scores, alternative="two-sided") + except ValueError: + p_value = 1.0 + + significant = p_value < 0.05 + + print(f" Baseline: {baseline_mean:.4f} ± {baseline_std:.4f}") + print(f" Tabular: {tabular_mean:.4f} ± {tabular_std:.4f}") + print(f" Improvement: {improvement_pct:+.2f}% (p={p_value:.4f}{'*' if significant else ''})") results = { "dataset": dataset_name, "task": task, + "source": "real_world" if is_real_world(dataset_name) else "synthetic", "n_samples": 
len(X), "n_features_original": X.shape[1], + "n_folds": n_folds, + "n_seeds": n_seeds, "with_llm": with_llm, + "baseline_best_score": baseline_mean, + "baseline_std": baseline_std, + "tabular_best_score": tabular_mean, + "tabular_std": tabular_std, + "tabular_improvement_pct": improvement_pct, + "p_value": float(p_value), + "significant": significant, + "n_features_tabular": int(np.mean(n_features_generated)), + "fe_time_tabular": float(np.mean(fe_times)), + "baseline_fold_scores": baseline_scores.tolist(), + "tabular_fold_scores": tabular_scores.tolist(), } - primary_metric = get_primary_metric(task) - - # --- Baseline --- - print("\n[1/3] Baseline (no FE)...") - baseline_results = run_models(X_train, X_test, y_train, y_test, task, "Baseline") - results["baseline"] = baseline_results - - # Best baseline score - best_baseline = max(baseline_results.values(), key=lambda x: x[primary_metric]) - results["baseline_best_score"] = best_baseline[primary_metric] - - # --- FeatCopilot (multi-engine) --- - engines_used, _ = get_featcopilot_engines(task, False) - cache_path = get_feature_cache_path(dataset_name, max_features, False, engines_used, FEATURE_CACHE_VERSION) - cache_data = load_feature_cache(cache_path) if use_feature_cache else None - - if cache_data is not None: - X_train_fe = cache_data["X_train_fe"] - X_test_fe = cache_data["X_test_fe"] - y_train = cache_data["y_train"] - y_test = cache_data["y_test"] - fe_time = cache_data["fe_time"] - engines_used = cache_data.get("engines", engines_used) - results["n_features_tabular"] = cache_data.get("n_features_fe", X_train_fe.shape[1]) - results["fe_time_tabular"] = fe_time - results["engines_tabular"] = engines_used - results["n_features_original"] = cache_data.get("n_features_original", X_train_raw.shape[1]) - print(f"\n[2/3] FeatCopilot ({', '.join(engines_used)}) [cache]...") - else: - X_train_fe, X_test_fe, fe_time, engines_used = apply_featcopilot( - X_train_raw, X_test_raw, y_train, task, max_features, 
with_llm=False - ) - results["n_features_tabular"] = X_train_fe.shape[1] - results["fe_time_tabular"] = fe_time - results["engines_tabular"] = engines_used - print(f"\n[2/3] FeatCopilot ({', '.join(engines_used)})...") - print(f" Features: {X_train_raw.shape[1]} → {X_train_fe.shape[1]}, FE time: {fe_time:.2f}s") - if use_feature_cache: - save_feature_cache( - cache_path, - X_train, - X_test, - y_train, - y_test, - X_train_fe, - X_test_fe, - fe_time, - task, - X.shape[1], - engines_used, - ) - - tabular_results = run_models(X_train_fe, X_test_fe, y_train, y_test, task, "Tabular") - results["tabular"] = tabular_results - - best_tabular = max(tabular_results.values(), key=lambda x: x[primary_metric]) - results["tabular_best_score"] = best_tabular[primary_metric] - results["tabular_improvement_pct"] = ( - (best_tabular[primary_metric] - best_baseline[primary_metric]) - / max(best_baseline[primary_metric], 0.001) - * 100 - ) - - # --- FeatCopilot + LLM (if enabled) --- - if with_llm: - engines_used, _ = get_featcopilot_engines(task, True) - cache_path = get_feature_cache_path(dataset_name, max_features, True, engines_used, FEATURE_CACHE_VERSION) - cache_data = load_feature_cache(cache_path) if use_feature_cache else None - - if cache_data is not None: - X_train_llm = cache_data["X_train_fe"] - X_test_llm = cache_data["X_test_fe"] - y_train = cache_data["y_train"] - y_test = cache_data["y_test"] - fe_time_llm = cache_data["fe_time"] - engines_used = cache_data.get("engines", engines_used) - results["n_features_llm"] = cache_data.get("n_features_fe", X_train_llm.shape[1]) - results["fe_time_llm"] = fe_time_llm - results["engines_llm"] = engines_used - results["n_features_original"] = cache_data.get("n_features_original", X_train_raw.shape[1]) - print(f"\n[3/3] FeatCopilot ({', '.join(engines_used)}) [cache]...") - else: - X_train_llm, X_test_llm, fe_time_llm, engines_used = apply_featcopilot( - X_train_raw, X_test_raw, y_train, task, max_features, with_llm=True - ) - 
results["n_features_llm"] = X_train_llm.shape[1] - results["fe_time_llm"] = fe_time_llm - results["engines_llm"] = engines_used - print(f"\n[3/3] FeatCopilot ({', '.join(engines_used)})...") - print(f" Features: {X_train_raw.shape[1]} → {X_train_llm.shape[1]}, FE time: {fe_time_llm:.2f}s") - if use_feature_cache: - save_feature_cache( - cache_path, - X_train, - X_test, - y_train, - y_test, - X_train_llm, - X_test_llm, - fe_time_llm, - task, - X.shape[1], - engines_used, - ) - X_train, X_test, y_train, y_test = baseline_train - - llm_results = run_models(X_train_llm, X_test_llm, y_train, y_test, task, "LLM") - results["llm"] = llm_results - - best_llm = max(llm_results.values(), key=lambda x: x[primary_metric]) - results["llm_best_score"] = best_llm[primary_metric] - results["llm_improvement_pct"] = ( - (best_llm[primary_metric] - best_baseline[primary_metric]) - / max(best_baseline[primary_metric], 0.001) - * 100 - ) - else: - print("\n[3/3] Skipped (--with-llm not enabled)") return results @@ -439,90 +427,120 @@ def run_single_benchmark( def generate_report(results: list[dict], with_llm: bool, output_path: Path) -> None: - """Generate markdown report.""" + """Generate markdown report with statistical rigor.""" timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - # Separate by task category - clf_results = [r for r in results if r["task"] == "classification"] - reg_results = [r for r in results if r["task"] == "regression"] - ts_results = [r for r in results if r["task"] == "timeseries_regression"] - text_clf_results = [r for r in results if r["task"] == "text_classification"] - text_reg_results = [r for r in results if r["task"] == "text_regression"] + # Separate by source AND task category + real_world = [r for r in results if r.get("source") == "real_world"] + synthetic = [r for r in results if r.get("source") != "real_world"] + + real_clf = [r for r in real_world if r["task"] == "classification"] + real_reg = [r for r in real_world if r["task"] == 
"regression"] + synth_clf = [r for r in synthetic if "classification" in r["task"]] + synth_reg = [r for r in synthetic if r["task"] in ("regression", "timeseries_regression")] + synth_other = [r for r in synthetic if r["task"] not in ("classification", "regression", "timeseries_regression")] + + # Compute summary stats + def compute_summary(result_list: list[dict]) -> dict: + if not result_list: + return {} + improvements = [r["tabular_improvement_pct"] for r in result_list] + n_improved = sum(1 for imp in improvements if imp > 0.5) + n_hurt = sum(1 for imp in improvements if imp < -0.5) + n_tied = len(improvements) - n_improved - n_hurt + n_sig_improved = sum(1 for r in result_list if r.get("significant") and r["tabular_improvement_pct"] > 0.5) + return { + "total": len(result_list), + "improved": n_improved, + "tied": n_tied, + "hurt": n_hurt, + "sig_improved": n_sig_improved, + "mean_improvement": float(np.mean(improvements)), + "median_improvement": float(np.median(improvements)), + "max_regression": float(min(improvements)) if improvements else 0.0, + } + + real_summary = compute_summary(real_world) + synth_summary = compute_summary(synthetic) + all_summary = compute_summary(results) + + n_folds = results[0].get("n_folds", 5) if results else 5 + n_seeds = results[0].get("n_seeds", 1) if results else 1 report = f"""# Simple Models Benchmark Report **Generated:** {timestamp} **Models:** RandomForest, LogisticRegression/Ridge +**Cross-Validation:** {n_folds}-fold CV × {n_seeds} seed(s) **LLM Enabled:** {with_llm} -**Datasets:** {len(results)} +**Datasets:** {len(results)} ({len(real_world)} real-world, {len(synthetic)} synthetic) -## Summary +## Summary — Real-World Datasets (Primary) | Metric | Value | |--------|-------| -| Total Datasets | {len(results)} | -| Classification | {len(clf_results)} | -| Regression | {len(reg_results)} | -| Forecasting | {len(ts_results)} | -| Text Classification | {len(text_clf_results)} | -| Text Regression | 
{len(text_reg_results)} | -| Improved ({"LLM" if with_llm else "Tabular"}) | {sum(1 for r in results if r.get('llm_improvement_pct' if with_llm else 'tabular_improvement_pct', 0) > 0)} | -| Avg Improvement | {np.mean([r.get('llm_improvement_pct' if with_llm else 'tabular_improvement_pct', 0) for r in results]):.2f}% | +| Total Datasets | {real_summary.get('total', 0)} | +| Win / Tie / Loss | {real_summary.get('improved', 0)} / {real_summary.get('tied', 0)} / {real_summary.get('hurt', 0)} | +| Significant Wins (p<0.05) | {real_summary.get('sig_improved', 0)} | +| Mean Improvement | {real_summary.get('mean_improvement', 0):+.2f}% | +| Median Improvement | {real_summary.get('median_improvement', 0):+.2f}% | +| Max Regression | {real_summary.get('max_regression', 0):+.2f}% | -""" +## Summary — Synthetic Datasets (Supplementary) - def add_classification_table(section_results: list[dict], title: str) -> str: - """Generate classification results table.""" - if not section_results: - return "" - section = f"## {title}\n\n" - section += "| Dataset | Baseline | Tabular | Improvement |" - if with_llm: - section += " LLM | LLM Imp |" - section += " Features |\n" - section += "|---------|----------|---------|-------------|" - if with_llm: - section += "------|---------|" - section += "----------|\n" - - for r in section_results: - section += f"| {r['dataset']} | {r['baseline_best_score']:.4f} | {r['tabular_best_score']:.4f} | {r['tabular_improvement_pct']:+.2f}% |" - if with_llm and "llm_best_score" in r: - section += f" {r['llm_best_score']:.4f} | {r['llm_improvement_pct']:+.2f}% |" - elif with_llm: - section += " - | - |" - section += f" {r['n_features_original']}→{r['n_features_tabular']} |\n" - return section + "\n" +| Metric | Value | +|--------|-------| +| Total Datasets | {synth_summary.get('total', 0)} | +| Win / Tie / Loss | {synth_summary.get('improved', 0)} / {synth_summary.get('tied', 0)} / {synth_summary.get('hurt', 0)} | +| Mean Improvement | 
{synth_summary.get('mean_improvement', 0):+.2f}% | + +## Summary — All Datasets + +| Metric | Value | +|--------|-------| +| Total Datasets | {all_summary.get('total', 0)} | +| Win / Tie / Loss | {all_summary.get('improved', 0)} / {all_summary.get('tied', 0)} / {all_summary.get('hurt', 0)} | +| Significant Wins (p<0.05) | {all_summary.get('sig_improved', 0)} | +| Mean Improvement | {all_summary.get('mean_improvement', 0):+.2f}% | +| Median Improvement | {all_summary.get('median_improvement', 0):+.2f}% | - def add_regression_table(section_results: list[dict], title: str) -> str: - """Generate regression results table.""" +""" + + def add_results_table(section_results: list[dict], title: str, is_regression: bool = False) -> str: if not section_results: return "" section = f"## {title}\n\n" - section += "| Dataset | Baseline R² | Tabular R² | Improvement |" - if with_llm: - section += " LLM R² | LLM Imp |" - section += " Features |\n" - section += "|---------|-------------|------------|-------------|" - if with_llm: - section += "--------|---------|" - section += "----------|\n" - - for r in section_results: - section += f"| {r['dataset']} | {r['baseline_best_score']:.4f} | {r['tabular_best_score']:.4f} | {r['tabular_improvement_pct']:+.2f}% |" - if with_llm and "llm_best_score" in r: - section += f" {r['llm_best_score']:.4f} | {r['llm_improvement_pct']:+.2f}% |" - elif with_llm: - section += " - | - |" - section += f" {r['n_features_original']}→{r['n_features_tabular']} |\n" + metric_label = "R²" if is_regression else "Score" + section += ( + f"| Dataset | Baseline {metric_label} | FeatCopilot {metric_label} | Δ% | p-value | Sig | Features |\n" + ) + section += f"|---------|{'--' * 8}|{'--' * 8}|-----|---------|-----|----------|\n" + + for r in sorted(section_results, key=lambda x: x["tabular_improvement_pct"], reverse=True): + sig_marker = "✓" if r.get("significant") else "" + imp = r["tabular_improvement_pct"] + imp_str = f"{imp:+.2f}%" + if imp > 0.5 and 
r.get("significant"): + imp_str = f"**{imp_str}** 🟢" + elif imp < -0.5: + imp_str = f"{imp_str} 🔴" + section += ( + f"| {r['dataset']} " + f"| {r['baseline_best_score']:.4f}±{r.get('baseline_std', 0):.4f} " + f"| {r['tabular_best_score']:.4f}±{r.get('tabular_std', 0):.4f} " + f"| {imp_str} " + f"| {r.get('p_value', 1.0):.3f} " + f"| {sig_marker} " + f"| {r['n_features_original']}→{r['n_features_tabular']} |\n" + ) return section + "\n" - # Add all category sections - report += add_classification_table(clf_results, "Classification Results") - report += add_regression_table(reg_results, "Regression Results") - report += add_regression_table(ts_results, "Forecasting Results") - report += add_classification_table(text_clf_results, "Text Classification Results") - report += add_regression_table(text_reg_results, "Text Regression Results") + report += add_results_table(real_clf, "Real-World Classification", is_regression=False) + report += add_results_table(real_reg, "Real-World Regression", is_regression=True) + report += add_results_table(synth_clf, "Synthetic Classification (Supplementary)", is_regression=False) + report += add_results_table(synth_reg, "Synthetic Regression (Supplementary)", is_regression=True) + if synth_other: + report += add_results_table(synth_other, "Other Datasets (Supplementary)", is_regression=False) # Write report llm_suffix = "_LLM" if with_llm else "" @@ -578,17 +596,24 @@ def main(): parser.add_argument("--datasets", type=str, help="Comma-separated dataset names") parser.add_argument("--category", type=str, choices=["classification", "regression", "forecasting", "text"]) parser.add_argument("--all", action="store_true", help="Run all datasets") + parser.add_argument("--real-world", action="store_true", help="Run only real-world datasets") parser.add_argument("--with-llm", action="store_true", help="Enable LLM engine") parser.add_argument("--max-features", type=int, default=DEFAULT_MAX_FEATURES) parser.add_argument("--output", type=str, 
default="benchmarks/simple_models") parser.add_argument("--report-only", action="store_true", help="Only regenerate report from cache") parser.add_argument("--no-cache", action="store_true", help="Don't save results to cache") parser.add_argument("--no-feature-cache", action="store_true", help="Don't use feature cache (rerun FeatCopilot)") + parser.add_argument("--n-folds", type=int, default=5, help="Number of CV folds (default: 5)") + parser.add_argument("--n-seeds", type=int, default=1, help="Number of random seeds (default: 1)") + parser.add_argument("--fast", action="store_true", help="Fast dev mode: 3 folds, 1 seed") args = parser.parse_args() output_path = Path(args.output) output_path.mkdir(parents=True, exist_ok=True) + n_folds = 3 if args.fast else args.n_folds + n_seeds = 1 if args.fast else args.n_seeds + # Report-only mode: load from cache and regenerate report if args.report_only: results = load_cache(output_path, args.with_llm) @@ -599,6 +624,8 @@ def main(): # Determine datasets to run if args.datasets: dataset_names = [d.strip() for d in args.datasets.split(",")] + elif args.real_world: + dataset_names = list_real_world_datasets(args.category) elif args.category: dataset_names = list_datasets(args.category) elif args.all: @@ -614,6 +641,7 @@ def main(): print("Simple Models Benchmark") print("=======================") print("Models: RandomForest, LogisticRegression/Ridge") + print(f"Cross-Validation: {n_folds}-fold × {n_seeds} seed(s)") print(f"LLM enabled: {args.with_llm}") print(f"Datasets: {len(dataset_names)}") @@ -621,7 +649,12 @@ def main(): results = [] for name in dataset_names: result = run_single_benchmark( - name, args.max_features, args.with_llm, use_feature_cache=not args.no_feature_cache + name, + args.max_features, + args.with_llm, + use_feature_cache=not args.no_feature_cache, + n_folds=n_folds, + n_seeds=n_seeds, ) if result: results.append(result) diff --git a/featcopilot/selection/redundancy.py 
b/featcopilot/selection/redundancy.py index c2e3f21..19754bc 100644 --- a/featcopilot/selection/redundancy.py +++ b/featcopilot/selection/redundancy.py @@ -125,32 +125,37 @@ def _find_redundant_features(self, columns: list[str], non_numeric_cols: list[st corr = abs(self._correlation_matrix.loc[col1, col2]) if corr >= self.correlation_threshold: - # Decide which to remove based on importance + original feature preference - imp1 = self.importance_scores.get(col1, 0) - imp2 = self.importance_scores.get(col2, 0) - - # Add preference bonus for original features - # This ensures original features are preferred over derived ones is_orig1 = col1 in self.original_features is_orig2 = col2 in self.original_features + # Never remove an original feature if the other is derived if is_orig1 and not is_orig2: - # col1 is original, col2 is derived - prefer col1 - imp1 += self.original_preference + to_remove.add(col2) + if self.verbose: + logger.info(f"Removing {col2} (derived, corr={corr:.3f} with original {col1})") + continue elif is_orig2 and not is_orig1: - # col2 is original, col1 is derived - prefer col2 - imp2 += self.original_preference + to_remove.add(col1) + if self.verbose: + logger.info(f"Removing {col1} (derived, corr={corr:.3f} with original {col2})") + break + + # Both are original — never remove either + if is_orig1 and is_orig2: + continue + + # Both are derived — remove the one with lower importance + imp1 = self.importance_scores.get(col1, 0) + imp2 = self.importance_scores.get(col2, 0) if imp1 >= imp2: to_remove.add(col2) if self.verbose: - orig_tag = " (derived)" if not is_orig2 else "" - logger.info(f"Removing {col2}{orig_tag} (corr={corr:.3f} with {col1})") + logger.info(f"Removing {col2} (derived, corr={corr:.3f} with {col1})") else: to_remove.add(col1) if self.verbose: - orig_tag = " (derived)" if not is_orig1 else "" - logger.info(f"Removing {col1}{orig_tag} (corr={corr:.3f} with {col2})") + logger.info(f"Removing {col1} (derived, corr={corr:.3f} with 
{col2})") break # col1 is removed, move to next # Selected features are those not removed (numeric) plus all non-numeric columns diff --git a/featcopilot/selection/unified.py b/featcopilot/selection/unified.py index afd02cd..429d626 100644 --- a/featcopilot/selection/unified.py +++ b/featcopilot/selection/unified.py @@ -117,7 +117,10 @@ def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray ) eliminator.fit(X) non_redundant = set(eliminator.get_selected_features()) - self._feature_scores = {k: v for k, v in self._feature_scores.items() if k in non_redundant} + # Always preserve original features even if marked redundant + self._feature_scores = { + k: v for k, v in self._feature_scores.items() if k in non_redundant or k in self.original_features + } # Final selection self._final_selection() @@ -196,13 +199,13 @@ def _l1_refine(self, X: pd.DataFrame, y: np.ndarray, candidates: list[str]) -> l model.fit(X_cand, y) importances = model.feature_importances_ - # Keep features with importance above mean importance + # Keep features with importance above mean importance (stricter threshold) mean_imp = np.mean(importances) - selected = [c for c, imp in zip(candidates, importances) if imp >= mean_imp * 0.5] + selected = [c for c, imp in zip(candidates, importances) if imp >= mean_imp] if len(selected) == 0: - # Fallback: keep top half by importance - top_k = max(3, len(candidates) // 2) + # Fallback: keep only top 3 by importance + top_k = min(3, len(candidates)) idx = np.argsort(importances)[::-1][:top_k] selected = [candidates[i] for i in idx] diff --git a/featcopilot/transformers/sklearn_compat.py b/featcopilot/transformers/sklearn_compat.py index c93f8fa..b32e93e 100644 --- a/featcopilot/transformers/sklearn_compat.py +++ b/featcopilot/transformers/sklearn_compat.py @@ -320,8 +320,125 @@ def fit_transform( if self.verbose: logger.info(f"Selected {len(self._selector.get_selected_features())} features") + # Do-no-harm gate: validate derived 
features help via held-out validation + if apply_selection and y is not None: + result = self._do_no_harm_gate(result, X, y, original_features) + return result + def _do_no_harm_gate( + self, + X_engineered: pd.DataFrame, + X_original: Union[pd.DataFrame, np.ndarray], + y: Union[pd.Series, np.ndarray], + original_features: set[str], + ) -> pd.DataFrame: + """ + Validate that engineered features help using held-out validation. + + Holds out 20% of the data, fits a fresh model on the remaining 80%, + and compares performance with and without derived features. This avoids + the bias from features being selected on the same data. + + Falls back to original features if derived features don't show + clear benefit on the held-out set. + + Parameters + ---------- + X_engineered : DataFrame + Data with engineered features (selected). + X_original : DataFrame or ndarray + Original input data. + y : Series or ndarray + Target variable. + original_features : set[str] + Names of original (non-derived) features. + + Returns + ------- + DataFrame + Either X_engineered if features help, or original-only subset. 
+ """ + from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit + + y_arr = np.array(y) + + orig_cols = [c for c in X_engineered.columns if c in original_features] + derived_cols = [c for c in X_engineered.columns if c not in original_features] + + if len(derived_cols) == 0: + return X_engineered + + X_full = X_engineered.copy() + + # Use only numeric columns for the gate check + X_orig_numeric = X_full[orig_cols].select_dtypes(include=[np.number]) + X_full_numeric = X_full.select_dtypes(include=[np.number]) + + if X_orig_numeric.shape[1] == 0 or X_full_numeric.shape[1] == 0: + return X_engineered + + X_orig_numeric = X_orig_numeric.replace([np.inf, -np.inf], np.nan).fillna(0) + X_full_numeric = X_full_numeric.replace([np.inf, -np.inf], np.nan).fillna(0) + + try: + is_classification = len(np.unique(y_arr)) <= 20 and np.issubdtype(y_arr.dtype, np.integer) + if is_classification: + model_cls = RandomForestClassifier + splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42) + split_target = y_arr + else: + model_cls = RandomForestRegressor + splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42) + split_target = y_arr + + model_params = {"n_estimators": 50, "max_depth": 10, "random_state": 42, "n_jobs": -1} + + orig_scores = [] + full_scores = [] + + for train_idx, val_idx in splitter.split(X_orig_numeric, split_target): + # Fit and score on held-out data + m_orig = model_cls(**model_params) + m_orig.fit(X_orig_numeric.iloc[train_idx], y_arr[train_idx]) + orig_scores.append(m_orig.score(X_orig_numeric.iloc[val_idx], y_arr[val_idx])) + + m_full = model_cls(**model_params) + m_full.fit(X_full_numeric.iloc[train_idx], y_arr[train_idx]) + full_scores.append(m_full.score(X_full_numeric.iloc[val_idx], y_arr[val_idx])) + + orig_mean = np.mean(orig_scores) + full_mean = np.mean(full_scores) + improvement = full_mean - orig_mean + + # Scale threshold 
by feature ratio — more added features = higher bar + feature_ratio = len(derived_cols) / max(len(orig_cols), 1) + threshold = 0.001 + 0.001 * feature_ratio + + if self.verbose: + logger.info( + f"Do-no-harm gate: orig={orig_mean:.4f}, full={full_mean:.4f}, " + f"delta={improvement:+.4f}, threshold={threshold:.4f} " + f"({len(derived_cols)} derived features)" + ) + + # Require clear positive benefit to keep derived features + if improvement < threshold: + if self.verbose: + logger.warning( + f"Do-no-harm: Derived features not beneficial ({improvement:+.4f}). " + f"Falling back to {len(orig_cols)} original features." + ) + self._selector = None + return X_engineered[orig_cols] + + except Exception as e: + if self.verbose: + logger.warning(f"Do-no-harm gate skipped due to error: {e}") + + return X_engineered + def get_feature_names(self) -> list[str]: """Get names of all generated features.""" names = []