RSK World - Statsmodels Statistical Modeling - Project Files | RSK World

notebooks/01_linear_regression.ipynb

{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Linear Regression Analysis with Statsmodels\n",
        "\n",
        "<!--\n",
        "Author: RSK World\n",
        "Website: https://rskworld.in\n",
        "Email: help@rskworld.in\n",
        "Phone: +91 93305 39277\n",
        "Description: Linear and generalized linear regression analysis using Statsmodels\n",
        "-->\n",
        "\n",
        "This notebook demonstrates linear regression analysis using the Statsmodels library.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Import necessary libraries\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "import seaborn as sns\n",
        "import sys\n",
        "import os\n",
        "\n",
        "# Make the project root importable.\n",
        "# `__file__` is not defined inside a notebook kernel, so the root is resolved\n",
        "# relative to the working directory instead. This assumes the kernel runs in\n",
        "# the `notebooks/` folder, the same assumption the relative '../data' path\n",
        "# used later in this notebook relies on.\n",
        "PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))\n",
        "if PROJECT_ROOT not in sys.path:  # avoid duplicate entries when the cell is re-run\n",
        "    sys.path.append(PROJECT_ROOT)\n",
        "\n",
        "from regression_analysis import LinearRegressionModel, GLMModel\n",
        "from statistical_diagnostics import ModelDiagnostics\n",
        "\n",
        "# Set style\n",
        "plt.style.use('seaborn-v0_8')\n",
        "sns.set_palette(\"husl\")\n",
        "%matplotlib inline\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Generate Sample Data\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Build a reproducible synthetic dataset\n",
        "np.random.seed(42)\n",
        "n = 200\n",
        "\n",
        "# Three independent Gaussian predictors, drawn in the order X1, X2, X3\n",
        "X1 = 2 * np.random.randn(n)\n",
        "X2 = 1.5 * np.random.randn(n)\n",
        "X3 = 1 * np.random.randn(n)\n",
        "\n",
        "# Response: linear in the predictors plus Gaussian noise (drawn last)\n",
        "y = (\n",
        "    5\n",
        "    + 2.5 * X1\n",
        "    + 1.8 * X2\n",
        "    - 0.9 * X3\n",
        "    + 0.8 * np.random.randn(n)\n",
        ")\n",
        "\n",
        "# Feature matrix with one column per predictor\n",
        "X = np.column_stack((X1, X2, X3))\n",
        "\n",
        "# Same data as a labelled DataFrame for inspection and plotting\n",
        "df = pd.DataFrame(dict(X1=X1, X2=X2, X3=X3, y=y))\n",
        "\n",
        "# Preview the first rows and report the overall shape\n",
        "print(\"Sample Data:\", df.head(), f\"\\nData Shape: {df.shape}\", sep=\"\\n\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Fit Linear Regression Model\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Instantiate the project's LinearRegressionModel and fit it to the\n",
        "# generated feature matrix X and response y\n",
        "model = LinearRegressionModel()\n",
        "results = model.fit(X, y)\n",
        "\n",
        "# Show the summary of the fitted model\n",
        "model.summary()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Model Diagnostics\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Visual inspection of the fitted model's residuals\n",
        "model.plot_residuals()\n",
        "\n",
        "# Assumption checks, run in order: multicollinearity,\n",
        "# heteroscedasticity, then autocorrelation\n",
        "model.check_multicollinearity()\n",
        "model.check_heteroscedasticity()\n",
        "model.check_autocorrelation()\n"
      ]
    },
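    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Comprehensive Diagnostics\n",
        "\n",
        "The comprehensive diagnostics are run in the final code cell of this notebook, after the Visualization section.\n"
      ]
    },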
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Load Data from File\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Load data from CSV file.\n",
        "# `os` and `pd` are already imported in the first code cell of this notebook.\n",
        "# The relative path assumes the kernel's working directory is `notebooks/`.\n",
        "data_path = os.path.join('..', 'data', 'sample_data.csv')\n",
        "if os.path.isfile(data_path):\n",
        "    df_loaded = pd.read_csv(data_path)\n",
        "    print(\"Loaded Data from CSV:\")\n",
        "    print(df_loaded.head())\n",
        "    print(f\"\\nData Shape: {df_loaded.shape}\")\n",
        "    \n",
        "    # Use loaded data for analysis\n",
        "    X_loaded = df_loaded[['X1', 'X2', 'X3']].to_numpy()\n",
        "    y_loaded = df_loaded['y'].to_numpy()\n",
        "else:\n",
        "    print(\"Data file not found. Using generated data.\")\n",
        "    # Actually fall back to the generated data, so df_loaded, X_loaded and\n",
        "    # y_loaded are defined whether or not the CSV exists.\n",
        "    df_loaded, X_loaded, y_loaded = df, X, y\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Model Selection\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Compare candidate models that add one predictor at a time\n",
        "from model_selection import ModelSelection\n",
        "\n",
        "# Display label -> list passed to the selector for that candidate\n",
        "# (presumably column indices into X; confirm against ModelSelection.compare_models)\n",
        "models_dict = {\n",
        "    'Model 1 (X1 only)': [0],\n",
        "    'Model 2 (X1, X2)': [0, 1],\n",
        "    'Model 3 (X1, X2, X3)': [0, 1, 2],\n",
        "}\n",
        "\n",
        "# Run the comparison on the generated data\n",
        "selector = ModelSelection()\n",
        "comparison = selector.compare_models(X, y, models_dict)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Model Evaluation\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Cross-validated and in-sample evaluation of the linear model\n",
        "from model_evaluation import ModelEvaluation\n",
        "\n",
        "evaluator = ModelEvaluation()\n",
        "\n",
        "\n",
        "def create_model(X_train, y_train):\n",
        "    \"\"\"Fit a fresh LinearRegressionModel on the given training data and return it.\"\"\"\n",
        "    fold_model = LinearRegressionModel()\n",
        "    fold_model.fit(X_train, y_train)\n",
        "    return fold_model\n",
        "\n",
        "\n",
        "# 5-fold cross-validation; create_model is handed to the evaluator as the model factory\n",
        "cv_results = evaluator.cross_validate(\n",
        "    X,\n",
        "    y,\n",
        "    create_model,\n",
        "    cv_folds=5,\n",
        "    scoring='mse',\n",
        ")\n",
        "\n",
        "# Metrics on predictions for the same X the model was fitted on earlier\n",
        "y_pred = model.predict(X)\n",
        "metrics = evaluator.calculate_metrics(y, y_pred)\n",
        "\n",
        "# Compare observed and predicted values graphically\n",
        "evaluator.plot_prediction_comparison(y, y_pred)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Visualization\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Plots built with the project's visualization helpers\n",
        "from visualization_utils import StatisticalVisualizations\n",
        "\n",
        "viz = StatisticalVisualizations()\n",
        "\n",
        "# Correlations between the columns of the generated DataFrame\n",
        "viz.plot_correlation_matrix(df)\n",
        "\n",
        "# Distribution of the response variable\n",
        "viz.plot_distribution(y, title=\"Distribution of Dependent Variable\")\n",
        "\n",
        "# Coefficient plot, only when the fitted results expose `params`\n",
        "if hasattr(model.results, 'params'):\n",
        "    # Drop the first entry (assumed to be the intercept; confirm how\n",
        "    # LinearRegressionModel orders its parameters)\n",
        "    coefs = model.results.params[1:]\n",
        "    viz.plot_feature_importance(\n",
        "        coefs,\n",
        "        feature_names=['X1', 'X2', 'X3'],\n",
        "    )\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Comprehensive diagnostics on the fitted results: report first, then plots\n",
        "diagnostics = ModelDiagnostics(model.results)\n",
        "diagnostics.comprehensive_diagnostics()\n",
        "diagnostics.plot_diagnostics()\n"
      ]
    }
  ],
  "metadata": {
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}

280 lines•7.6 KB

json

Theme Settings

Color Scheme

Display Options

Font Size