# RSK World - Statsmodels Statistical Modeling - Project Files | RSK World

"""
Hypothesis Testing and Statistical Tests using Statsmodels

Author: RSK World
Website: https://rskworld.in
Email: help@rskworld.in
Phone: +91 93305 39277
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from statsmodels.stats.weightstats import ttest_ind, ztest
from statsmodels.stats.proportion import proportions_ztest, proportion_confint
from statsmodels.stats.diagnostic import lilliefors, jarque_bera
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import ols
import warnings
warnings.filterwarnings('ignore')


class StatisticalTests:
    """
    Statistical Hypothesis Testing
    
    Author: RSK World
    Website: https://rskworld.in
    Email: help@rskworld.in
    Phone: +91 93305 39277
    """
    
    def __init__(self):
        pass
    
    def t_test(self, sample1, sample2=None, alternative='two-sided', value=0):
        """
        Perform t-test
        
        Parameters:
        -----------
        sample1 : array-like
            First sample or single sample
        sample2 : array-like, optional
            Second sample for two-sample test
        alternative : str
            'two-sided', 'larger', or 'smaller'
        value : float
            Hypothesized value for one-sample test
        """
        if sample2 is None:
            # One-sample t-test
            t_stat, p_value = stats.ttest_1samp(sample1, value)
            print("One-Sample t-test:")
            print(f"t-statistic: {t_stat:.4f}")
            print(f"p-value: {p_value:.4f}")
            print(f"Hypothesized mean: {value}")
            print(f"Sample mean: {np.mean(sample1):.4f}")
            print(f"Sample std: {np.std(sample1, ddof=1):.4f}")
        else:
            # Two-sample t-test
            t_stat, p_value, df = ttest_ind(sample1, sample2, alternative=alternative)
            print("Two-Sample t-test:")
            print(f"t-statistic: {t_stat:.4f}")
            print(f"p-value: {p_value:.4f}")
            print(f"Degrees of freedom: {df:.4f}")
            print(f"Sample 1 mean: {np.mean(sample1):.4f}")
            print(f"Sample 2 mean: {np.mean(sample2):.4f}")
        
        alpha = 0.05
        if p_value < alpha:
            print(f"\nResult: Reject null hypothesis (p < {alpha})")
        else:
            print(f"\nResult: Fail to reject null hypothesis (p >= {alpha})")
        
        return {'t_statistic': t_stat, 'p_value': p_value}
    
    def z_test(self, sample1, sample2=None, value=0, sigma1=None, sigma2=None):
        """
        Perform z-test
        
        Parameters:
        -----------
        sample1 : array-like
            First sample
        sample2 : array-like, optional
            Second sample
        value : float
            Hypothesized value
        sigma1 : float, optional
            Standard deviation of sample1
        sigma2 : float, optional
            Standard deviation of sample2
        """
        if sample2 is None:
            # One-sample z-test
            if sigma1 is None:
                sigma1 = np.std(sample1, ddof=1)
            z_stat = (np.mean(sample1) - value) / (sigma1 / np.sqrt(len(sample1)))
            p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
            print("One-Sample z-test:")
        else:
            # Two-sample z-test
            if sigma1 is None:
                sigma1 = np.std(sample1, ddof=1)
            if sigma2 is None:
                sigma2 = np.std(sample2, ddof=1)
            z_stat, p_value = ztest(sample1, sample2, value=value, 
                                   alternative='two-sided', usevar='pooled')
            print("Two-Sample z-test:")
        
        print(f"z-statistic: {z_stat:.4f}")
        print(f"p-value: {p_value:.4f}")
        
        alpha = 0.05
        if p_value < alpha:
            print(f"\nResult: Reject null hypothesis (p < {alpha})")
        else:
            print(f"\nResult: Fail to reject null hypothesis (p >= {alpha})")
        
        return {'z_statistic': z_stat, 'p_value': p_value}
    
    def chi_square_test(self, observed, expected=None):
        """
        Perform chi-square test
        
        Parameters:
        -----------
        observed : array-like
            Observed frequencies
        expected : array-like, optional
            Expected frequencies
        """
        if expected is None:
            # Goodness of fit test
            chi2, p_value = stats.chisquare(observed)
            print("Chi-Square Goodness of Fit Test:")
        else:
            # Test of independence
            chi2, p_value = stats.chisquare(observed, expected)
            print("Chi-Square Test of Independence:")
        
        print(f"Chi-square statistic: {chi2:.4f}")
        print(f"p-value: {p_value:.4f}")
        
        alpha = 0.05
        if p_value < alpha:
            print(f"\nResult: Reject null hypothesis (p < {alpha})")
        else:
            print(f"\nResult: Fail to reject null hypothesis (p >= {alpha})")
        
        return {'chi2': chi2, 'p_value': p_value}
    
    def proportion_test(self, count, nobs, value=None):
        """
        Test one proportion against a value, or two proportions against
        each other, with a two-sided z-test, and print a report

        Parameters:
        -----------
        count : int or array-like
            Number of successes; more than one entry selects the
            two-sample test
        nobs : int or array-like
            Number of trials; a single value is reused for every sample
        value : float, optional
            Hypothesized proportion (one sample, defaults to 0.5) or
            hypothesized difference of proportions (two samples, left to
            the proportions_ztest default when None)

        Returns:
        --------
        dict
            {'z_statistic': ..., 'p_value': ...}
        """
        # Normalise to 1-D arrays so lists, tuples, Series and scalars are
        # all routed by their number of entries rather than by their type.
        counts = np.atleast_1d(count)
        trials = np.atleast_1d(nobs)

        if counts.size > 1:
            # Two-sample proportion test
            if trials.size == 1:
                trials = np.full(counts.shape, trials[0])
            z_stat, p_value = proportions_ztest(counts, trials, value=value,
                                                alternative='two-sided')
            print("Two-Sample Proportion Test:")
            print(f"Proportions: {counts[0]/trials[0]:.4f} vs {counts[1]/trials[1]:.4f}")
        else:
            # One-sample proportion test
            if value is None:
                value = 0.5
            successes = counts[0]
            n_trials = trials[0]
            z_stat, p_value = proportions_ztest(successes, n_trials, value=value)
            print("One-Sample Proportion Test:")
            print(f"Sample proportion: {successes/n_trials:.4f}")
            print(f"Hypothesized proportion: {value:.4f}")

        print(f"z-statistic: {z_stat:.4f}")
        print(f"p-value: {p_value:.4f}")

        alpha = 0.05
        if p_value < alpha:
            print(f"\nResult: Reject null hypothesis (p < {alpha})")
        else:
            print(f"\nResult: Fail to reject null hypothesis (p >= {alpha})")

        return {'z_statistic': z_stat, 'p_value': p_value}
    
    def normality_test(self, data):
        """
        Test for normality using multiple tests

        Runs the Shapiro-Wilk, Jarque-Bera and Lilliefors tests, prints
        each statistic and p-value, and shows a figure with a Q-Q plot next
        to a density histogram overlaid with a fitted normal curve.

        Parameters:
        -----------
        data : array-like
            Data to test (list, tuple, ndarray or Series)

        Returns:
        --------
        dict
            One entry per test ('shapiro', 'jarque_bera', 'lilliefors'),
            each holding 'statistic', 'p_value' and 'is_normal'
            (p-value above 0.05)
        """
        # Plain lists have no .min()/.max(); convert once so every
        # array-like input is supported by the plotting code below.
        data = np.asarray(data, dtype=float)

        print("Normality Tests:")
        print("=" * 50)

        # Shapiro-Wilk test
        shapiro_stat, shapiro_p = stats.shapiro(data)
        print("\nShapiro-Wilk Test:")
        print(f"Statistic: {shapiro_stat:.4f}")
        print(f"p-value: {shapiro_p:.4f}")

        # Jarque-Bera test (skewness and kurtosis are returned but unused)
        jb_stat, jb_p, _, _ = jarque_bera(data)
        print("\nJarque-Bera Test:")
        print(f"Statistic: {jb_stat:.4f}")
        print(f"p-value: {jb_p:.4f}")

        # Lilliefors test
        lf_stat, lf_p = lilliefors(data, dist='norm')
        print("\nLilliefors Test:")
        print(f"Statistic: {lf_stat:.4f}")
        print(f"p-value: {lf_p:.4f}")

        # Q-Q plot
        plt.figure(figsize=(10, 4))
        plt.subplot(1, 2, 1)
        stats.probplot(data, dist="norm", plot=plt)
        plt.title('Q-Q Plot')
        plt.grid(True, alpha=0.3)

        # Histogram with the normal density fitted to the sample
        plt.subplot(1, 2, 2)
        plt.hist(data, bins=30, edgecolor='black', alpha=0.7, density=True)
        x = np.linspace(data.min(), data.max(), 100)
        plt.plot(x, stats.norm.pdf(x, np.mean(data), np.std(data)),
                'r-', label='Normal Distribution')
        plt.xlabel('Value')
        plt.ylabel('Density')
        plt.title('Histogram with Normal Overlay')
        plt.legend()
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        alpha = 0.05
        results = {
            'shapiro': {'statistic': shapiro_stat, 'p_value': shapiro_p,
                        'is_normal': shapiro_p > alpha},
            'jarque_bera': {'statistic': jb_stat, 'p_value': jb_p,
                           'is_normal': jb_p > alpha},
            'lilliefors': {'statistic': lf_stat, 'p_value': lf_p,
                          'is_normal': lf_p > alpha}
        }

        return results
    
    def anova_test(self, data_dict):
        """
        Perform ANOVA test
        
        Parameters:
        -----------
        data_dict : dict
            Dictionary with group names as keys and data arrays as values
        """
        groups = list(data_dict.keys())
        data_list = [data_dict[group] for group in groups]
        
        f_stat, p_value = stats.f_oneway(*data_list)
        
        print("One-Way ANOVA Test:")
        print(f"F-statistic: {f_stat:.4f}")
        print(f"p-value: {p_value:.4f}")
        print("\nGroup Statistics:")
        for group, data in data_dict.items():
            print(f"{group}: mean={np.mean(data):.4f}, std={np.std(data, ddof=1):.4f}, n={len(data)}")
        
        alpha = 0.05
        if p_value < alpha:
            print(f"\nResult: Reject null hypothesis - means are significantly different (p < {alpha})")
        else:
            print(f"\nResult: Fail to reject null hypothesis - no significant difference in means (p >= {alpha})")
        
        return {'f_statistic': f_stat, 'p_value': p_value}
    
    def mann_whitney_test(self, sample1, sample2, alternative='two-sided'):
        """
        Perform Mann-Whitney U test (non-parametric)
        
        Parameters:
        -----------
        sample1 : array-like
            First sample
        sample2 : array-like
            Second sample
        alternative : str
            'two-sided', 'less', or 'greater'
        """
        u_stat, p_value = stats.mannwhitneyu(sample1, sample2, alternative=alternative)
        
        print("Mann-Whitney U Test (Non-parametric):")
        print(f"U-statistic: {u_stat:.4f}")
        print(f"p-value: {p_value:.4f}")
        print(f"Sample 1 median: {np.median(sample1):.4f}")
        print(f"Sample 2 median: {np.median(sample2):.4f}")
        
        alpha = 0.05
        if p_value < alpha:
            print(f"\nResult: Reject null hypothesis (p < {alpha})")
        else:
            print(f"\nResult: Fail to reject null hypothesis (p >= {alpha})")
        
        return {'u_statistic': u_stat, 'p_value': p_value}
    
    def kruskal_wallis_test(self, *samples):
        """
        Perform Kruskal-Wallis test (non-parametric ANOVA)
        
        Parameters:
        -----------
        *samples : array-like
            Two or more samples
        """
        h_stat, p_value = stats.kruskal(*samples)
        
        print("Kruskal-Wallis Test (Non-parametric ANOVA):")
        print(f"H-statistic: {h_stat:.4f}")
        print(f"p-value: {p_value:.4f}")
        print(f"Number of groups: {len(samples)}")
        
        alpha = 0.05
        if p_value < alpha:
            print(f"\nResult: Reject null hypothesis - distributions differ (p < {alpha})")
        else:
            print(f"\nResult: Fail to reject null hypothesis (p >= {alpha})")
        
        return {'h_statistic': h_stat, 'p_value': p_value}


if __name__ == "__main__":
    # Demo: draw three synthetic groups with a fixed seed and run the
    # t-test, the normality checks and the one-way ANOVA on them.
    divider = "=" * 50
    print("Hypothesis Testing Example")
    print(divider)

    # Seeded so every run draws the same data
    np.random.seed(42)
    group_one = np.random.normal(100, 15, 30)
    group_two = np.random.normal(105, 15, 30)

    runner = StatisticalTests()

    # Two-sample t-test on the first two groups
    print("\n" + divider)
    runner.t_test(group_one, group_two)

    # Normality checks on the first group
    print("\n" + divider)
    runner.normality_test(group_one)

    # One-way ANOVA; the third group is drawn only now, as in the
    # original ordering of random draws
    print("\n" + divider)
    group_three = np.random.normal(110, 15, 30)
    anova_groups = {
        'Group 1': group_one,
        'Group 2': group_two,
        'Group 3': group_three
    }
    runner.anova_test(anova_groups)

# 371 lines•12.5 KB

# python

# Theme Settings

# Color Scheme

# Display Options

# Font Size