RSK World - Statsmodels Statistical Modeling - Project Files | RSK World

"""
Statistical Diagnostics and Model Validation

Author: RSK World
Website: https://rskworld.in
Email: help@rskworld.in
Phone: +91 93305 39277
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.diagnostic import (
    het_breuschpagan, het_white, acorr_ljungbox,
    linear_harvey_collier, linear_rainbow
)
from statsmodels.stats.outliers_influence import (
    variance_inflation_factor, influence_plot
)
from statsmodels.stats.stattools import durbin_watson
from scipy import stats
import warnings
warnings.filterwarnings('ignore')


class ModelDiagnostics:
    """
    Comprehensive Model Diagnostics
    
    Author: RSK World
    Website: https://rskworld.in
    Email: help@rskworld.in
    Phone: +91 93305 39277
    """
    
    def __init__(self, model_results):
        """
        Initialize diagnostics with model results
        
        Parameters:
        -----------
        model_results : RegressionResults
            Fitted model results from statsmodels
        """
        self.results = model_results
        self.residuals = model_results.resid
        self.fitted_values = model_results.fittedvalues
    
    def check_linearity(self):
        """Check linearity assumption using Rainbow test"""
        try:
            f_stat, p_value = linear_rainbow(self.results)
            print("Rainbow Test for Linearity:")
            print(f"F-statistic: {f_stat:.4f}")
            print(f"p-value: {p_value:.4f}")
            
            if p_value < 0.05:
                print("Warning: Non-linearity detected (p < 0.05)")
            else:
                print("Linearity assumption appears valid")
            
            return {'f_statistic': f_stat, 'p_value': p_value}
        except Exception as e:
            print(f"Error in linearity test: {e}")
            return None
    
    def check_heteroscedasticity(self, test='both'):
        """
        Check for heteroscedasticity
        
        Parameters:
        -----------
        test : str
            'breusch-pagan', 'white', or 'both'
        """
        results = {}
        
        if test in ['breusch-pagan', 'both']:
            try:
                lm, lm_pvalue, fvalue, f_pvalue = het_breuschpagan(
                    self.residuals, self.results.model.exog
                )
                print("\nBreusch-Pagan Test for Heteroscedasticity:")
                print(f"LM Statistic: {lm:.4f}")
                print(f"LM p-value: {lm_pvalue:.4f}")
                print(f"F Statistic: {fvalue:.4f}")
                print(f"F p-value: {f_pvalue:.4f}")
                
                if f_pvalue < 0.05:
                    print("Warning: Heteroscedasticity detected (p < 0.05)")
                else:
                    print("No significant heteroscedasticity detected")
                
                results['breusch_pagan'] = {
                    'lm': lm, 'lm_pvalue': lm_pvalue,
                    'fvalue': fvalue, 'f_pvalue': f_pvalue
                }
            except Exception as e:
                print(f"Error in Breusch-Pagan test: {e}")
        
        if test in ['white', 'both']:
            try:
                lm, lm_pvalue, fvalue, f_pvalue = het_white(
                    self.residuals, self.results.model.exog
                )
                print("\nWhite Test for Heteroscedasticity:")
                print(f"LM Statistic: {lm:.4f}")
                print(f"LM p-value: {lm_pvalue:.4f}")
                print(f"F Statistic: {fvalue:.4f}")
                print(f"F p-value: {f_pvalue:.4f}")
                
                if f_pvalue < 0.05:
                    print("Warning: Heteroscedasticity detected (p < 0.05)")
                else:
                    print("No significant heteroscedasticity detected")
                
                results['white'] = {
                    'lm': lm, 'lm_pvalue': lm_pvalue,
                    'fvalue': fvalue, 'f_pvalue': f_pvalue
                }
            except Exception as e:
                print(f"Error in White test: {e}")
        
        return results
    
    def check_autocorrelation(self, lags=10):
        """
        Check for autocorrelation
        
        Parameters:
        -----------
        lags : int
            Number of lags to test
        """
        # Durbin-Watson test
        dw = durbin_watson(self.residuals)
        print("\nDurbin-Watson Test for Autocorrelation:")
        print(f"Durbin-Watson Statistic: {dw:.4f}")
        
        if dw < 1.5:
            print("Warning: Positive autocorrelation detected")
        elif dw > 2.5:
            print("Warning: Negative autocorrelation detected")
        else:
            print("No significant autocorrelation detected")
        
        # Ljung-Box test
        lb_test = acorr_ljungbox(self.residuals, lags=lags, return_df=True)
        print(f"\nLjung-Box Test (lags={lags}):")
        print(lb_test)
        
        if lb_test['lb_pvalue'].iloc[-1] < 0.05:
            print("Warning: Residual autocorrelation detected (p < 0.05)")
        else:
            print("No significant residual autocorrelation detected")
        
        return {'durbin_watson': dw, 'ljung_box': lb_test}
    
    def check_multicollinearity(self):
        """Check for multicollinearity using VIF"""
        try:
            exog = self.results.model.exog
            vif_data = pd.DataFrame()
            vif_data["Variable"] = range(exog.shape[1])
            vif_data["VIF"] = [variance_inflation_factor(exog, i) 
                              for i in range(exog.shape[1])]
            
            print("\nVariance Inflation Factor (VIF):")
            print(vif_data)
            print("\nVIF > 10 indicates multicollinearity")
            
            high_vif = vif_data[vif_data['VIF'] > 10]
            if len(high_vif) > 0:
                print(f"\nWarning: {len(high_vif)} variable(s) with VIF > 10")
            else:
                print("\nNo significant multicollinearity detected")
            
            return vif_data
        except Exception as e:
            print(f"Error in VIF calculation: {e}")
            return None
    
    def check_normality(self):
        """Check normality of residuals"""
        print("\nNormality Tests for Residuals:")
        print("=" * 50)
        
        # Shapiro-Wilk test
        shapiro_stat, shapiro_p = stats.shapiro(self.residuals)
        print(f"\nShapiro-Wilk Test:")
        print(f"Statistic: {shapiro_stat:.4f}")
        print(f"p-value: {shapiro_p:.4f}")
        
        # Jarque-Bera test
        from statsmodels.stats.diagnostic import jarque_bera
        jb_stat, jb_p, _, _ = jarque_bera(self.residuals)
        print(f"\nJarque-Bera Test:")
        print(f"Statistic: {jb_stat:.4f}")
        print(f"p-value: {jb_p:.4f}")
        
        alpha = 0.05
        is_normal = shapiro_p > alpha and jb_p > alpha
        
        if not is_normal:
            print("\nWarning: Residuals may not be normally distributed")
        else:
            print("\nResiduals appear to be normally distributed")
        
        return {
            'shapiro': {'statistic': shapiro_stat, 'p_value': shapiro_p},
            'jarque_bera': {'statistic': jb_stat, 'p_value': jb_p},
            'is_normal': is_normal
        }
    
    def check_influential_points(self, threshold=2):
        """
        Check for influential observations
        
        Parameters:
        -----------
        threshold : float
            Threshold for Cook's distance
        """
        try:
            influence = self.results.get_influence()
            cooks_d = influence.cooks_distance[0]
            
            print(f"\nInfluential Points Analysis (Cook's Distance > {threshold}):")
            influential = np.where(cooks_d > threshold)[0]
            
            if len(influential) > 0:
                print(f"Found {len(influential)} influential observation(s):")
                print(f"Indices: {influential}")
                print(f"Cook's D values: {cooks_d[influential]}")
            else:
                print("No influential observations detected")
            
            # Plot influence
            fig, axes = plt.subplots(1, 2, figsize=(12, 5))
            
            axes[0].stem(range(len(cooks_d)), cooks_d)
            axes[0].axhline(y=threshold, color='r', linestyle='--', 
                          label=f'Threshold ({threshold})')
            axes[0].set_xlabel('Observation')
            axes[0].set_ylabel("Cook's Distance")
            axes[0].set_title("Cook's Distance")
            axes[0].legend()
            axes[0].grid(True, alpha=0.3)
            
            # Leverage plot
            leverage = influence.hat_matrix_diag
            axes[1].scatter(leverage, cooks_d, alpha=0.6)
            axes[1].axhline(y=threshold, color='r', linestyle='--')
            axes[1].set_xlabel('Leverage')
            axes[1].set_ylabel("Cook's Distance")
            axes[1].set_title('Influence Plot')
            axes[1].grid(True, alpha=0.3)
            
            plt.tight_layout()
            plt.show()
            
            return {
                'cooks_distance': cooks_d,
                'influential_indices': influential,
                'leverage': leverage
            }
        except Exception as e:
            print(f"Error in influential points analysis: {e}")
            return None
    
    def comprehensive_diagnostics(self):
        """Run all diagnostic tests"""
        print("=" * 70)
        print("COMPREHENSIVE MODEL DIAGNOSTICS")
        print("=" * 70)
        
        diagnostics = {}
        
        # Linearity
        print("\n" + "-" * 70)
        diagnostics['linearity'] = self.check_linearity()
        
        # Heteroscedasticity
        print("\n" + "-" * 70)
        diagnostics['heteroscedasticity'] = self.check_heteroscedasticity(test='both')
        
        # Autocorrelation
        print("\n" + "-" * 70)
        diagnostics['autocorrelation'] = self.check_autocorrelation()
        
        # Multicollinearity
        print("\n" + "-" * 70)
        diagnostics['multicollinearity'] = self.check_multicollinearity()
        
        # Normality
        print("\n" + "-" * 70)
        diagnostics['normality'] = self.check_normality()
        
        # Influential points
        print("\n" + "-" * 70)
        diagnostics['influential_points'] = self.check_influential_points()
        
        return diagnostics
    
    def plot_diagnostics(self):
        """Create comprehensive diagnostic plots"""
        fig = plt.figure(figsize=(15, 10))
        
        # Residuals vs Fitted
        ax1 = plt.subplot(2, 3, 1)
        ax1.scatter(self.fitted_values, self.residuals, alpha=0.6)
        ax1.axhline(y=0, color='r', linestyle='--')
        ax1.set_xlabel('Fitted Values')
        ax1.set_ylabel('Residuals')
        ax1.set_title('Residuals vs Fitted')
        ax1.grid(True, alpha=0.3)
        
        # Q-Q Plot
        ax2 = plt.subplot(2, 3, 2)
        stats.probplot(self.residuals, dist="norm", plot=ax2)
        ax2.set_title('Q-Q Plot')
        ax2.grid(True, alpha=0.3)
        
        # Scale-Location Plot
        ax3 = plt.subplot(2, 3, 3)
        sqrt_abs_residuals = np.sqrt(np.abs(self.residuals))
        ax3.scatter(self.fitted_values, sqrt_abs_residuals, alpha=0.6)
        ax3.set_xlabel('Fitted Values')
        ax3.set_ylabel('√|Standardized Residuals|')
        ax3.set_title('Scale-Location Plot')
        ax3.grid(True, alpha=0.3)
        
        # Residuals Histogram
        ax4 = plt.subplot(2, 3, 4)
        ax4.hist(self.residuals, bins=30, edgecolor='black', alpha=0.7)
        ax4.set_xlabel('Residuals')
        ax4.set_ylabel('Frequency')
        ax4.set_title('Residuals Distribution')
        ax4.grid(True, alpha=0.3)
        
        # Leverage Plot
        ax5 = plt.subplot(2, 3, 5)
        try:
            influence = self.results.get_influence()
            leverage = influence.hat_matrix_diag
            cooks_d = influence.cooks_distance[0]
            ax5.scatter(leverage, cooks_d, alpha=0.6)
            ax5.set_xlabel('Leverage')
            ax5.set_ylabel("Cook's Distance")
            ax5.set_title('Influence Plot')
            ax5.grid(True, alpha=0.3)
        except:
            ax5.text(0.5, 0.5, 'Leverage data not available', 
                    ha='center', va='center')
        
        # ACF of Residuals
        ax6 = plt.subplot(2, 3, 6)
        from statsmodels.tsa.stattools import acf
        acf_values = acf(self.residuals, nlags=20)
        ax6.stem(range(len(acf_values)), acf_values)
        ax6.axhline(y=1.96/np.sqrt(len(self.residuals)), 
              color='r', linestyle='--', alpha=0.5)
        ax6.axhline(y=-1.96/np.sqrt(len(self.residuals)), 
                   color='r', linestyle='--', alpha=0.5)
        ax6.set_xlabel('Lag')
        ax6.set_ylabel('ACF')
        ax6.set_title('ACF of Residuals')
        ax6.grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.show()


if __name__ == "__main__":
    # Example usage
    print("Statistical Diagnostics Example")
    print("=" * 50)
    
    from regression_analysis import LinearRegressionModel
    
    # Generate sample data
    np.random.seed(42)
    n = 100
    X = np.random.randn(n, 3)
    y = 2 + 1.5 * X[:, 0] + 0.8 * X[:, 1] - 0.5 * X[:, 2] + np.random.randn(n) * 0.5
    
    # Fit model
    model = LinearRegressionModel()
    model.fit(X, y)
    
    # Create diagnostics
    diagnostics = ModelDiagnostics(model.results)
    
    # Run comprehensive diagnostics
    diagnostics.comprehensive_diagnostics()
    
    # Plot diagnostics
    diagnostics.plot_diagnostics()

401 lines•13.8 KB

python

Theme Settings

Color Scheme

Display Options

Font Size