Statistical Modeling with Statsmodels
statsmodels-statistical
  • __pycache__
  • data
  • examples
  • notebooks
  • .gitignore (458 B)
  • CHANGELOG.md (4 KB)
  • FEATURES.md (6.3 KB)
  • LICENSE (1.2 KB)
  • PROJECT_INFO.md (2.2 KB)
  • PROJECT_SUMMARY.md (4.2 KB)
  • README.md (7.4 KB)
  • RELEASE_NOTES_v1.0.0.md (6.5 KB)
  • UNIQUE_FEATURES.md (5.3 KB)
  • advanced_time_series.py (9.8 KB)
  • automated_reporting.py (8.3 KB)
  • bayesian_statistics.py (7.5 KB)
  • data_preprocessing.py (8.2 KB)
  • econometric_modeling.py (9.8 KB)
  • hypothesis_testing.py (12.5 KB)
  • index.html (10.8 KB)
  • model_evaluation.py (9.1 KB)
  • model_persistence.py (6.5 KB)
  • model_selection.py (9.7 KB)
  • panel_data_analysis.py (7.3 KB)
  • performance_benchmarking.py (7.3 KB)
  • regression_analysis.py (9 KB)
  • requirements.txt (361 B)
  • statistical_diagnostics.py (13.8 KB)
  • statsmodels-statistical.png (284 B)
  • time_series_analysis.py (10.3 KB)
  • visualization_utils.py (8.9 KB)
model_selection.py
"""
Model Selection and Comparison Utilities

Author: RSK World
Website: https://rskworld.in
Email: help@rskworld.in
Phone: +91 93305 39277
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.filterwarnings('ignore')


class ModelSelection:
    """
    Model Selection and Comparison Tools
    
    Author: RSK World
    Website: https://rskworld.in
    Email: help@rskworld.in
    Phone: +91 93305 39277
    """
    
    def __init__(self):
        self.models = {}
        self.results = {}
    
    def compare_models(self, X, y, models_dict):
        """
        Compare multiple regression models
        
        Parameters:
        -----------
        X : array-like
            Independent variables
        y : array-like
            Dependent variable
        models_dict : dict
            Dictionary of model names and feature indices
        """
        comparison = []
        
        for name, features in models_dict.items():
            X_subset = X[:, features] if isinstance(features, (list, np.ndarray)) else X
            X_with_const = add_constant(X_subset)
            
            model = OLS(y, X_with_const).fit()
            
            comparison.append({
                'Model': name,
                'AIC': model.aic,
                'BIC': model.bic,
                'R-squared': model.rsquared,
                'Adj R-squared': model.rsquared_adj,
                'F-statistic': model.fvalue,
                'F p-value': model.f_pvalue,
                'Log Likelihood': model.llf,
                'Num Features': X_subset.shape[1] if len(X_subset.shape) > 1 else 1
            })
        
        comparison_df = pd.DataFrame(comparison)
        comparison_df = comparison_df.sort_values('AIC')
        
        print("Model Comparison:")
        print("=" * 80)
        print(comparison_df.to_string(index=False))
        
        return comparison_df
    
    def stepwise_selection(self, X, y, initial_features=None, threshold_in=0.01, 
                          threshold_out=0.05, verbose=True):
        """
        Stepwise feature selection
        
        Parameters:
        -----------
        X : array-like
            Independent variables
        y : array-like
            Dependent variable
        initial_features : list
            Initial features to include
        threshold_in : float
            p-value threshold for adding features
        threshold_out : float
            p-value threshold for removing features
        verbose : bool
            Print progress
        """
        included = list(initial_features) if initial_features else []
        
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        n_features = X.shape[1]
        
        while True:
            changed = False
            
            # Forward step: try each excluded feature. The candidate column
            # is appended last, so its p-value is the last entry of pvalues.
            excluded = [i for i in range(n_features) if i not in included]
            new_pval = pd.Series(index=excluded, dtype=float)
            for new_column in excluded:
                model = OLS(y, add_constant(X[:, included + [new_column]])).fit()
                new_pval[new_column] = np.asarray(model.pvalues)[-1]
            
            if len(excluded) > 0:
                best_pval = new_pval.min()
                if best_pval < threshold_in:
                    best_feature = new_pval.idxmin()
                    included.append(best_feature)
                    changed = True
                    if verbose:
                        print(f'Add {best_feature} with p-value {best_pval:.6f}')
            
            # Backward step: drop the included feature with the largest
            # p-value (position 0 is the intercept, so skip it).
            if included:
                model = OLS(y, add_constant(X[:, included])).fit()
                pvalues = np.asarray(model.pvalues)[1:]
                worst_pval = pvalues.max()
                
                if worst_pval > threshold_out:
                    worst_feature = included[int(np.argmax(pvalues))]
                    included.remove(worst_feature)
                    changed = True
                    if verbose:
                        print(f'Remove {worst_feature} with p-value {worst_pval:.6f}')
            
            if not changed:
                break
        
        final_model = OLS(y, add_constant(X[:, included])).fit()
        
        if verbose:
            print(f'\nFinal model includes features: {included}')
            print(f'R-squared: {final_model.rsquared:.4f}')
            print(f'AIC: {final_model.aic:.4f}')
        
        return included, final_model
    
    def calculate_ic(self, model_results, ic_type='aic'):
        """
        Calculate information criteria
        
        Parameters:
        -----------
        model_results : RegressionResults
            Fitted model results
        ic_type : str
            'aic', 'bic', or 'hqic'
        """
        if ic_type.lower() == 'aic':
            return model_results.aic
        elif ic_type.lower() == 'bic':
            return model_results.bic
        elif ic_type.lower() == 'hqic':
            return model_results.hqic
        else:
            raise ValueError("ic_type must be 'aic', 'bic', or 'hqic'")
    
    def plot_ic_comparison(self, models_dict, ic_type='aic'):
        """
        Plot information criteria comparison
        
        Parameters:
        -----------
        models_dict : dict
            Dictionary of model names and results
        ic_type : str
            Information criterion type
        """
        ics = []
        names = []
        
        for name, results in models_dict.items():
            ics.append(self.calculate_ic(results, ic_type))
            names.append(name)
        
        plt.figure(figsize=(10, 6))
        plt.bar(names, ics, color='steelblue', alpha=0.7)
        plt.xlabel('Model')
        plt.ylabel(f'{ic_type.upper()}')
        plt.title(f'Model Comparison: {ic_type.upper()}')
        plt.xticks(rotation=45, ha='right')
        plt.grid(True, alpha=0.3, axis='y')
        plt.tight_layout()
        plt.show()
        
        return pd.DataFrame({'Model': names, ic_type.upper(): ics})


class FeatureSelection:
    """
    Feature Selection Utilities
    
    Author: RSK World
    Website: https://rskworld.in
    Email: help@rskworld.in
    Phone: +91 93305 39277
    """
    
    @staticmethod
    def remove_multicollinear_features(X, threshold=10):
        """
        Remove multicollinear features based on VIF
        
        Parameters:
        -----------
        X : array-like
            Feature matrix
        threshold : float
            VIF threshold
        """
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        
        X_df = pd.DataFrame(X)
        X_with_const = add_constant(X_df)
        
        vif_data = pd.DataFrame()
        vif_data["Variable"] = X_with_const.columns
        vif_data["VIF"] = [variance_inflation_factor(X_with_const.values, i) 
                          for i in range(X_with_const.shape[1])]
        
        # Drop the added constant from the results; its VIF is not meaningful
        vif_data = vif_data[vif_data['Variable'] != 'const']
        
        high_vif = vif_data[vif_data['VIF'] > threshold]
        
        if len(high_vif) > 0:
            # Note: drops all high-VIF columns at once; recomputing VIF after
            # each single removal is a more conservative variant.
            features_to_remove = high_vif['Variable'].tolist()
            print(f"Features with VIF > {threshold}: {features_to_remove}")
            return [i for i in range(X.shape[1]) if i not in [int(f) for f in features_to_remove]]
        
        return list(range(X.shape[1]))
    
    @staticmethod
    def correlation_filter(X, threshold=0.95):
        """
        Remove highly correlated features
        
        Parameters:
        -----------
        X : array-like
            Feature matrix
        threshold : float
            Correlation threshold
        """
        if len(X.shape) == 1:
            X = X.reshape(-1, 1)
        
        X_df = pd.DataFrame(X)
        corr_matrix = X_df.corr().abs()
        
        upper_triangle = corr_matrix.where(
            np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        )
        
        to_drop = [column for column in upper_triangle.columns 
                  if any(upper_triangle[column] > threshold)]
        
        if to_drop:
            print(f"Features to drop due to high correlation: {to_drop}")
            return [i for i in range(X.shape[1]) if i not in [X_df.columns.get_loc(c) for c in to_drop]]
        
        return list(range(X.shape[1]))


if __name__ == "__main__":
    # Example usage
    print("Model Selection Example")
    print("=" * 70)
    
    # Generate sample data
    np.random.seed(42)
    n = 100
    X = np.random.randn(n, 5)
    y = 2 + 1.5 * X[:, 0] + 0.8 * X[:, 1] - 0.5 * X[:, 2] + np.random.randn(n) * 0.5
    
    # Model selection
    selector = ModelSelection()
    
    # Compare models
    models_dict = {
        'Model 1': [0],
        'Model 2': [0, 1],
        'Model 3': [0, 1, 2],
        'Full Model': [0, 1, 2, 3, 4]
    }
    
    comparison = selector.compare_models(X, y, models_dict)
    
    # Stepwise selection
    print("\n" + "=" * 70)
    print("Stepwise Selection:")
    print("=" * 70)
    features, model = selector.stepwise_selection(X, y, threshold_in=0.05, threshold_out=0.1)

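`FeatureSelection.correlation_filter` keeps only the upper triangle of the absolute correlation matrix so each feature pair is checked once. A minimal standalone sketch of that idea (synthetic columns of my choosing):

```python
import numpy as np
import pandas as pd

# Two nearly identical columns plus one independent column.
rng = np.random.default_rng(0)
x0 = rng.normal(size=200)
X = pd.DataFrame({
    "x0": x0,
    "x1": x0 + rng.normal(scale=0.01, size=200),  # near-duplicate of x0
    "x2": rng.normal(size=200),
})

# Mask the lower triangle and diagonal so each pair appears once,
# then drop the second member of any pair with |corr| > 0.95.
corr = X.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [c for c in upper.columns if (upper[c] > 0.95).any()]
print(to_drop)  # → ['x1']
```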