RSK World - SciPy Scientific Computing - Project Files | RSK World

src/statistics.py

"""
Statistical Functions with SciPy
Author: RSK World
Website: https://rskworld.in
Email: help@rskworld.in
Phone: +91 93305 39277
"""

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm, t, chi2, f, uniform, expon, poisson, binom

# Example 1: Probability Distributions
def example_probability_distributions():
    """
    Plot the density functions of three continuous distributions.

    Evaluates a Normal(0, 1), a Student's t with 5 degrees of freedom and a
    chi-square with 3 degrees of freedom on fixed grids, prints the
    parameters that were used, and writes a three-panel figure to
    'statistics_distributions.png' in the current working directory.
    """
    banner = "=" * 60
    print(banner)
    print("Example 1: Probability Distributions")
    print(banner)

    # Distribution parameters
    mu, sigma = 0, 1
    t_dof = 5
    chi2_dof = 3

    # Evaluation grids: symmetric for normal/t, non-negative for chi-square
    grid_sym = np.linspace(-4, 4, 100)
    grid_pos = np.linspace(0, 10, 100)

    pdf_normal = norm.pdf(grid_sym, mu, sigma)
    pdf_t = t.pdf(grid_sym, t_dof)
    pdf_chi2 = chi2.pdf(grid_pos, chi2_dof)

    print(f"Normal distribution: μ={mu}, σ={sigma}")
    print(f"t-distribution: df={t_dof}")
    print(f"Chi-square distribution: df={chi2_dof}")

    # One panel per distribution
    fig, (ax_normal, ax_t, ax_chi2) = plt.subplots(1, 3, figsize=(15, 5))

    ax_normal.plot(grid_sym, pdf_normal, 'b-', linewidth=2, label=f'Normal(μ={mu}, σ={sigma})')
    ax_normal.fill_between(grid_sym, 0, pdf_normal, alpha=0.3)

    # The normal curve is overlaid on the t panel for visual comparison
    ax_t.plot(grid_sym, pdf_t, 'r-', linewidth=2, label=f't-distribution (df={t_dof})')
    ax_t.plot(grid_sym, pdf_normal, 'b--', linewidth=1.5, alpha=0.7, label='Normal')
    ax_t.fill_between(grid_sym, 0, pdf_t, alpha=0.3, color='red')

    ax_chi2.plot(grid_pos, pdf_chi2, 'g-', linewidth=2, label=f'Chi² (df={chi2_dof})')
    ax_chi2.fill_between(grid_pos, 0, pdf_chi2, alpha=0.3)

    # Axis decoration shared by all three panels
    panel_titles = ('Normal Distribution', 't-Distribution', 'Chi-Square Distribution')
    for ax, title in zip((ax_normal, ax_t, ax_chi2), panel_titles):
        ax.set_xlabel('x', fontsize=12)
        ax.set_ylabel('PDF', fontsize=12)
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('statistics_distributions.png', dpi=300, bbox_inches='tight')
    print("\nPlot saved as 'statistics_distributions.png'")
    plt.close()


# Example 2: Descriptive Statistics
def example_descriptive_statistics():
    """
    Summarise a simulated sample with basic descriptive statistics.

    Draws 1000 values from Normal(100, 15) with a fixed seed, prints the
    sample size, mean, median, standard deviation, variance, skewness and
    kurtosis, and writes a histogram / box-plot figure to
    'statistics_descriptive.png' in the current working directory.

    np.std, np.var, stats.skew and stats.kurtosis are all called with their
    library defaults.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 2: Descriptive Statistics")
    print(rule)

    # Generate sample data (seeded so the printed numbers are reproducible)
    np.random.seed(42)
    data = np.random.normal(100, 15, 1000)

    # mean, median and std are reused by the plot below, so keep them named
    mean = np.mean(data)
    median = np.median(data)
    std = np.std(data)
    summary = (
        ("Mean", mean),
        ("Median", median),
        ("Standard deviation", std),
        ("Variance", np.var(data)),
        ("Skewness", stats.skew(data)),
        ("Kurtosis", stats.kurtosis(data)),
    )

    print(f"Sample size: {len(data)}")
    for label, value in summary:
        print(f"{label}: {value:.4f}")

    # Visualize
    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(14, 5))

    # Histogram with a normal curve built from the sample mean and std
    ax_hist.hist(data, bins=30, density=True, alpha=0.7, color='blue', edgecolor='black')
    x_fit = np.linspace(data.min(), data.max(), 100)
    ax_hist.plot(x_fit, norm.pdf(x_fit, mean, std), 'r-', linewidth=2, label='Normal fit')
    ax_hist.axvline(mean, color='red', linestyle='--', linewidth=2, label=f'Mean={mean:.2f}')
    ax_hist.axvline(median, color='green', linestyle='--', linewidth=2, label=f'Median={median:.2f}')
    ax_hist.set_xlabel('Value', fontsize=12)
    ax_hist.set_ylabel('Density', fontsize=12)
    ax_hist.set_title('Data Distribution', fontsize=12, fontweight='bold')
    ax_hist.legend()
    ax_hist.grid(True, alpha=0.3)

    # Box plot. Vertical orientation is the default, so the `vert` keyword
    # (deprecated in recent Matplotlib in favour of `orientation`) is omitted.
    ax_box.boxplot(data, patch_artist=True)
    ax_box.set_ylabel('Value', fontsize=12)
    ax_box.set_title('Box Plot', fontsize=12, fontweight='bold')
    ax_box.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('statistics_descriptive.png', dpi=300, bbox_inches='tight')
    print("\nPlot saved as 'statistics_descriptive.png'")
    plt.close()


# Example 3: Hypothesis Testing
def example_hypothesis_testing():
    """
    Run one-sample, two-sample and paired t-tests on simulated data.

    Two samples of 100 values each are drawn with a fixed seed from
    Normal(100, 15) and Normal(105, 15). For each test the t-statistic,
    the p-value and the decision at the 5% significance level are printed.
    A histogram / box-plot comparison is written to
    'statistics_hypothesis.png' in the current working directory.
    """
    significance = 0.05  # threshold used for every reject / fail-to-reject decision

    rule = "=" * 60
    print("\n" + rule)
    print("Example 3: Hypothesis Testing")
    print(rule)

    # Generate two samples
    np.random.seed(42)
    sample1 = np.random.normal(100, 15, 100)
    sample2 = np.random.normal(105, 15, 100)

    # NOTE: sample1 and sample2 are independent draws, so the paired test is
    # included only to show the API; real paired data would come from matched
    # observations.
    tests = (
        ("One-sample t-test (H0: μ = 100):", stats.ttest_1samp(sample1, 100)),
        ("\nTwo-sample t-test (H0: μ1 = μ2):", stats.ttest_ind(sample1, sample2)),
        ("\nPaired t-test:", stats.ttest_rel(sample1, sample2)),
    )
    for heading, (t_stat, p_value) in tests:
        print(heading)
        print(f"  t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
        print(f"  Result: {'Reject H0' if p_value < significance else 'Fail to reject H0'}")

    # Visualize
    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

    ax_hist.hist(sample1, bins=20, alpha=0.7, label=f'Sample 1 (μ={np.mean(sample1):.2f})', color='blue')
    ax_hist.hist(sample2, bins=20, alpha=0.7, label=f'Sample 2 (μ={np.mean(sample2):.2f})', color='red')
    ax_hist.set_xlabel('Value', fontsize=12)
    ax_hist.set_ylabel('Frequency', fontsize=12)
    ax_hist.set_title('Sample Distributions', fontsize=12, fontweight='bold')
    ax_hist.legend()
    ax_hist.grid(True, alpha=0.3)

    # Tick labels are set on the axes rather than through boxplot's `labels`
    # keyword, which Matplotlib 3.9 deprecated in favour of `tick_labels`;
    # set_xticklabels works on both older and newer releases.
    ax_box.boxplot([sample1, sample2])
    ax_box.set_xticklabels(['Sample 1', 'Sample 2'])
    ax_box.set_ylabel('Value', fontsize=12)
    ax_box.set_title('Box Plot Comparison', fontsize=12, fontweight='bold')
    ax_box.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig('statistics_hypothesis.png', dpi=300, bbox_inches='tight')
    print("\nPlot saved as 'statistics_hypothesis.png'")
    plt.close(fig)


# Example 4: Confidence Intervals
def example_confidence_intervals():
    """
    Compute and plot a t-based confidence interval for a sample mean.

    Draws 100 values from Normal(100, 15) with a fixed seed, builds a 95%
    confidence interval for the mean by hand (critical t value times the
    standard error) and again with stats.t.interval, prints both, and writes
    a histogram with the interval shaded to 'statistics_confidence.png' in
    the current working directory.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 4: Confidence Intervals")
    print(rule)

    # Generate sample data
    np.random.seed(42)
    data = np.random.normal(100, 15, 100)

    # Calculate confidence interval for mean
    confidence_level = 0.95
    alpha = 1 - confidence_level
    n = len(data)
    mean = np.mean(data)
    std = np.std(data, ddof=1)  # Sample standard deviation
    se = std / np.sqrt(n)  # Standard error

    # Two-sided interval: alpha is split evenly between the two tails
    t_critical = t.ppf(1 - alpha / 2, df=n - 1)
    ci_lower = mean - t_critical * se
    ci_upper = mean + t_critical * se

    print(f"Sample mean: {mean:.4f}")
    print(f"Standard error: {se:.4f}")
    print(f"{confidence_level*100}% Confidence Interval: [{ci_lower:.4f}, {ci_upper:.4f}]")

    # Same interval computed directly by scipy
    ci_scipy = stats.t.interval(confidence_level, df=n - 1, loc=mean, scale=se)
    print(f"Using scipy.stats: [{ci_scipy[0]:.4f}, {ci_scipy[1]:.4f}]")

    # Visualize
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(data, bins=20, density=True, alpha=0.7, color='blue', edgecolor='black')
    ax.axvline(mean, color='red', linestyle='-', linewidth=2, label=f'Mean={mean:.2f}')
    ax.axvline(ci_lower, color='green', linestyle='--', linewidth=2, label=f'CI Lower={ci_lower:.2f}')
    ax.axvline(ci_upper, color='green', linestyle='--', linewidth=2, label=f'CI Upper={ci_upper:.2f}')
    # axvspan covers the full axes height whatever the final y-limits are;
    # filling up to a y-limit read at this point would fall short once
    # autoscaling adds its margin. The label follows confidence_level instead
    # of being hard-coded.
    ax.axvspan(ci_lower, ci_upper, alpha=0.2, color='green', label=f'{confidence_level:.0%} CI')
    ax.set_xlabel('Value', fontsize=12)
    ax.set_ylabel('Density', fontsize=12)
    ax.set_title(f'{confidence_level*100}% Confidence Interval for Mean', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig('statistics_confidence.png', dpi=300, bbox_inches='tight')
    print("\nPlot saved as 'statistics_confidence.png'")
    plt.close(fig)


# Example 5: Correlation and Regression
def example_correlation_regression():
    """
    Measure correlation and fit a straight line to noisy linear data.

    Builds 50 points on y = 2x + 3 for x in [0, 10], adds Normal(0, 2) noise
    with a fixed seed, then prints the Pearson correlation and the
    stats.linregress fit (slope, intercept, R², p-value, standard error).
    A scatter plot with the true and fitted lines is written to
    'statistics_regression.png' in the current working directory.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Example 5: Correlation and Regression")
    print(rule)

    # Generate correlated data
    np.random.seed(42)
    x = np.linspace(0, 10, 50)
    y_true = 2 * x + 3
    y = y_true + np.random.normal(0, 2, len(x))

    # Calculate correlation
    correlation, p_value = stats.pearsonr(x, y)

    # Linear regression; fields are read by name rather than by position
    fit = stats.linregress(x, y)
    r_squared = fit.rvalue ** 2

    # Predictions
    y_pred = fit.slope * x + fit.intercept

    # p-values are printed in scientific notation: with a relationship this
    # strong they are far below 1e-4 and a fixed-point format shows "0.0000".
    print(f"Correlation coefficient: {correlation:.4f}")
    print(f"P-value (correlation): {p_value:.4e}")
    print("\nLinear regression:")
    print(f"  Slope: {fit.slope:.4f}")
    print(f"  Intercept: {fit.intercept:.4f}")
    print(f"  R²: {r_squared:.4f}")
    print(f"  P-value: {fit.pvalue:.4e}")
    print(f"  Standard error: {fit.stderr:.4f}")

    # Visualize
    plt.figure(figsize=(10, 6))
    plt.scatter(x, y, alpha=0.6, color='blue', label='Data points')
    plt.plot(x, y_true, 'k--', linewidth=2, alpha=0.7, label='True relationship')
    plt.plot(x, y_pred, 'r-', linewidth=2, label=f'Regression line (R²={r_squared:.3f})')
    plt.xlabel('x', fontsize=12)
    plt.ylabel('y', fontsize=12)
    plt.title('Linear Regression', fontsize=14, fontweight='bold')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('statistics_regression.png', dpi=300, bbox_inches='tight')
    print("\nPlot saved as 'statistics_regression.png'")
    plt.close()


# Example 6: Non-parametric Tests
def example_nonparametric_tests():
    """
    Run three rank-based tests on two simulated exponential samples.

    Draws 50 values each from exponential distributions with scale 2 and
    scale 2.5 (fixed seed), then prints the statistic, p-value and decision
    at the 5% significance level for the Mann-Whitney U, Wilcoxon
    signed-rank and Kruskal-Wallis tests. A histogram / box-plot comparison
    is written to 'statistics_nonparametric.png' in the current working
    directory.
    """
    significance = 0.05  # threshold used for every reject / fail-to-reject decision

    rule = "=" * 60
    print("\n" + rule)
    print("Example 6: Non-parametric Tests")
    print(rule)

    # Generate two samples (not necessarily normal)
    np.random.seed(42)
    sample1 = np.random.exponential(2, 50)
    sample2 = np.random.exponential(2.5, 50)

    # Mann-Whitney U test (non-parametric alternative to t-test)
    u_stat, p_value_mw = stats.mannwhitneyu(sample1, sample2, alternative='two-sided')

    # Wilcoxon signed-rank test (non-parametric alternative to paired t-test).
    # Both inputs are truncated to a common length so they pair up.
    # NOTE: the samples are independent draws, so the pairing is only
    # illustrative.
    n_pairs = min(len(sample1), len(sample2))
    w_stat, p_value_w = stats.wilcoxon(sample1[:n_pairs], sample2[:n_pairs])

    # Kruskal-Wallis test (non-parametric alternative to ANOVA)
    h_stat, p_value_kw = stats.kruskal(sample1, sample2)

    print("Mann-Whitney U test:")
    print(f"  U-statistic: {u_stat:.4f}, p-value: {p_value_mw:.4f}")
    print(f"  Result: {'Reject H0' if p_value_mw < significance else 'Fail to reject H0'}")

    print("\nWilcoxon signed-rank test:")
    print(f"  W-statistic: {w_stat:.4f}, p-value: {p_value_w:.4f}")
    print(f"  Result: {'Reject H0' if p_value_w < significance else 'Fail to reject H0'}")

    print("\nKruskal-Wallis test:")
    print(f"  H-statistic: {h_stat:.4f}, p-value: {p_value_kw:.4f}")
    print(f"  Result: {'Reject H0' if p_value_kw < significance else 'Fail to reject H0'}")

    # Visualize
    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

    ax_hist.hist(sample1, bins=15, alpha=0.7, label='Sample 1', color='blue')
    ax_hist.hist(sample2, bins=15, alpha=0.7, label='Sample 2', color='red')
    ax_hist.set_xlabel('Value', fontsize=12)
    ax_hist.set_ylabel('Frequency', fontsize=12)
    ax_hist.set_title('Sample Distributions', fontsize=12, fontweight='bold')
    ax_hist.legend()
    ax_hist.grid(True, alpha=0.3)

    # Tick labels are set on the axes rather than through boxplot's `labels`
    # keyword, which Matplotlib 3.9 deprecated in favour of `tick_labels`;
    # set_xticklabels works on both older and newer releases.
    ax_box.boxplot([sample1, sample2])
    ax_box.set_xticklabels(['Sample 1', 'Sample 2'])
    ax_box.set_ylabel('Value', fontsize=12)
    ax_box.set_title('Box Plot Comparison', fontsize=12, fontweight='bold')
    ax_box.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig('statistics_nonparametric.png', dpi=300, bbox_inches='tight')
    print("\nPlot saved as 'statistics_nonparametric.png'")
    plt.close(fig)


def main():
    """
    Run every statistics example in order.

    Prints an opening banner, calls the six example functions one after
    another, and prints a closing banner.
    """
    rule = "=" * 60

    print("\n" + rule)
    print("SciPy Statistical Functions Examples")
    print("Author: RSK World - https://rskworld.in")
    print(rule + "\n")

    # Examples are executed in their numbered order
    examples = (
        example_probability_distributions,
        example_descriptive_statistics,
        example_hypothesis_testing,
        example_confidence_intervals,
        example_correlation_regression,
        example_nonparametric_tests,
    )
    for run_example in examples:
        run_example()

    print("\n" + rule)
    print("All statistics examples completed!")
    print(rule)


# Run all examples only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

375 lines•13.1 KB

python

Theme Settings

Color Scheme

Display Options

Font Size