RSK World - Text Classification Dataset - Project Files Browser | RSK World

data_augmentation.py model_explainability.py

scripts/data_augmentation.py

"""
================================================================================
Text Classification Dataset - Advanced Data Augmentation Module
================================================================================
Project: Text Classification Dataset
Category: Text Data / NLP

Author: Molla Samser
Designer & Tester: Rima Khatun
Website: https://rskworld.in
Email: help@rskworld.in | support@rskworld.in
Phone: +91 93305 39277

Copyright (c) 2026 RSK World - All Rights Reserved
Content used for educational purposes only.

Features:
- Synonym Replacement
- Random Insertion
- Random Swap
- Random Deletion
- Back Translation (simulated)
- Contextual Word Embeddings Augmentation

Created: December 2026
================================================================================
"""

import random
import re
from typing import List, Tuple, Optional
from collections import defaultdict

# Project metadata. __author__ and __website__ are interpolated into the console
# banners printed by augment_csv_dataset() and by the __main__ demo below.
__author__ = "Molla Samser"
__website__ = "https://rskworld.in"
__email__ = "help@rskworld.in"


class TextAugmenter:
    """
    Advanced text augmentation for NLP tasks.

    Techniques:
    1. Synonym Replacement (SR)
    2. Random Insertion (RI)
    3. Random Swap (RS)
    4. Random Deletion (RD)
    5. Character-level augmentation (adjacent swaps, keyboard typos)

    All randomness comes from the module-level ``random`` generator, which
    ``__init__`` seeds when ``random_state`` is given.

    Author: Molla Samser | RSK World (https://rskworld.in)
    """

    # Simple synonym dictionary for demonstration
    SYNONYMS = {
        'good': ['great', 'excellent', 'wonderful', 'fantastic', 'superb'],
        'bad': ['poor', 'terrible', 'awful', 'horrible', 'dreadful'],
        'big': ['large', 'huge', 'enormous', 'massive', 'giant'],
        'small': ['tiny', 'little', 'miniature', 'compact', 'petite'],
        'new': ['novel', 'fresh', 'recent', 'modern', 'innovative'],
        'old': ['ancient', 'aged', 'vintage', 'classic', 'traditional'],
        'fast': ['quick', 'rapid', 'swift', 'speedy', 'hasty'],
        'slow': ['gradual', 'leisurely', 'unhurried', 'sluggish', 'delayed'],
        'important': ['significant', 'crucial', 'vital', 'essential', 'critical'],
        'announces': ['reveals', 'declares', 'states', 'proclaims', 'discloses'],
        'launches': ['introduces', 'releases', 'unveils', 'debuts', 'presents'],
        'develops': ['creates', 'builds', 'designs', 'produces', 'constructs'],
        'discovers': ['finds', 'uncovers', 'detects', 'identifies', 'locates'],
        'achieves': ['accomplishes', 'attains', 'reaches', 'gains', 'secures'],
        'wins': ['triumphs', 'conquers', 'prevails', 'succeeds', 'captures'],
        'breaks': ['shatters', 'surpasses', 'exceeds', 'beats', 'tops'],
        'shows': ['demonstrates', 'displays', 'exhibits', 'reveals', 'indicates'],
        'says': ['states', 'declares', 'mentions', 'reports', 'claims'],
        'technology': ['tech', 'innovation', 'advancement', 'development'],
        'company': ['firm', 'corporation', 'enterprise', 'business', 'organization'],
        'market': ['industry', 'sector', 'field', 'arena', 'domain'],
        'research': ['study', 'investigation', 'analysis', 'examination', 'inquiry'],
        'scientist': ['researcher', 'expert', 'specialist', 'scholar', 'analyst'],
        'government': ['administration', 'authorities', 'regime', 'state', 'officials'],
        'economy': ['market', 'finances', 'commerce', 'trade', 'business'],
        'revolutionary': ['groundbreaking', 'innovative', 'pioneering', 'transformative'],
        'historic': ['landmark', 'significant', 'momentous', 'unprecedented', 'notable'],
    }

    # Neighbouring keys per lowercase letter, used by keyboard_augment().
    _KEYBOARD_NEIGHBORS = {
        'a': 'sqwz', 'b': 'vghn', 'c': 'xdfv', 'd': 'serfcx',
        'e': 'wrsdf', 'f': 'drtgvc', 'g': 'ftyhbv', 'h': 'gyujnb',
        'i': 'ujklo', 'j': 'huiknm', 'k': 'jiolm', 'l': 'kop',
        'm': 'njk', 'n': 'bhjm', 'o': 'iklp', 'p': 'ol',
        'q': 'wa', 'r': 'edft', 's': 'awedxz', 't': 'rfgy',
        'u': 'yhjki', 'v': 'cfgb', 'w': 'qase', 'x': 'zsdc',
        'y': 'tghu', 'z': 'asx'
    }

    def __init__(
        self,
        alpha_sr: float = 0.1,
        alpha_ri: float = 0.1,
        alpha_rs: float = 0.1,
        alpha_rd: float = 0.1,
        num_aug: int = 4,
        random_state: Optional[int] = None
    ):
        """
        Initialize the TextAugmenter.

        Args:
            alpha_sr: Fraction of words to replace with synonyms
            alpha_ri: Fraction of the word count to insert as synonyms
            alpha_rs: Fraction of the word count to perform as swaps
            alpha_rd: Per-word probability for random deletion
            num_aug: Number of augmentation attempts per original
            random_state: Random seed for reproducibility; when given, it
                seeds the global ``random`` module
        """
        self.alpha_sr = alpha_sr
        self.alpha_ri = alpha_ri
        self.alpha_rs = alpha_rs
        self.alpha_rd = alpha_rd
        self.num_aug = num_aug

        if random_state is not None:
            random.seed(random_state)

        # Build a bidirectional lookup: headword -> synonyms and synonym -> headword.
        # extend() rather than assignment, so a headword that already received
        # reverse links (because it is a synonym of an earlier headword) keeps them.
        self.word_to_synonyms = defaultdict(list)
        for word, syns in self.SYNONYMS.items():
            headword = word.lower()
            self.word_to_synonyms[headword].extend(s.lower() for s in syns)
            for syn in syns:
                self.word_to_synonyms[syn.lower()].append(headword)

    def get_synonyms(self, word: str) -> List[str]:
        """Get synonyms for a word (case-insensitive); empty list if unknown."""
        return self.word_to_synonyms.get(word.lower(), [])

    def synonym_replacement(self, words: List[str], n: int) -> List[str]:
        """
        Replace up to n distinct words with a randomly chosen synonym.

        Every occurrence of a chosen word (compared case-insensitively) is
        replaced with the same lowercase synonym.

        Args:
            words: List of words
            n: Number of distinct words to replace

        Returns:
            Augmented word list (the input list is not modified)
        """
        new_words = words.copy()

        # dict.fromkeys de-duplicates while keeping first-seen order. A set would
        # iterate in hash order, which varies between interpreter runs and makes
        # the shuffle below irreproducible even with a fixed seed. Lowercasing
        # makes "Good" and "good" count as one candidate.
        candidates = list(dict.fromkeys(
            w.lower() for w in words if self.get_synonyms(w)
        ))
        random.shuffle(candidates)

        num_replaced = 0
        for candidate in candidates:
            if num_replaced >= n:
                break
            synonym = random.choice(self.get_synonyms(candidate))
            new_words = [synonym if w.lower() == candidate else w for w in new_words]
            num_replaced += 1

        return new_words

    def random_insertion(self, words: List[str], n: int) -> List[str]:
        """
        Randomly insert up to n synonyms into the sentence.

        Args:
            words: List of words
            n: Number of insertion attempts

        Returns:
            Augmented word list (the input list is not modified)
        """
        new_words = words.copy()

        for _ in range(n):
            self._add_word(new_words)

        return new_words

    def _add_word(self, words: List[str]):
        """
        Insert a synonym of a random word at a random position, in place.

        Gives up without inserting after 10 randomly picked words had no
        synonyms, so sentences with no known words terminate.
        """
        if not words:
            return

        synonyms = []
        counter = 0
        while not synonyms:
            random_word = words[random.randint(0, len(words) - 1)]
            synonyms = self.get_synonyms(random_word)
            counter += 1
            if counter >= 10:
                return

        random_synonym = random.choice(synonyms)
        random_idx = random.randint(0, len(words) - 1)
        words.insert(random_idx, random_synonym)

    def random_swap(self, words: List[str], n: int) -> List[str]:
        """
        Randomly swap n pairs of words.

        Args:
            words: List of words
            n: Number of swaps

        Returns:
            Augmented word list (the input list is not modified)
        """
        new_words = words.copy()

        for _ in range(n):
            new_words = self._swap_word(new_words)

        return new_words

    def _swap_word(self, words: List[str]) -> List[str]:
        """Return a copy with two distinct random positions exchanged."""
        if len(words) < 2:
            return words

        new_words = words.copy()
        idx1, idx2 = random.sample(range(len(new_words)), 2)
        new_words[idx1], new_words[idx2] = new_words[idx2], new_words[idx1]

        return new_words

    def random_deletion(self, words: List[str], p: float) -> List[str]:
        """
        Randomly delete words with probability p.

        Args:
            words: List of words
            p: Probability of deleting each word

        Returns:
            Augmented word list; never empty unless the input was empty
        """
        # Nothing to delete from an empty or single-word input. The empty case
        # must return here: random.choice() below raises IndexError on [].
        if len(words) <= 1:
            return words

        new_words = [w for w in words if random.random() > p]

        # If everything was deleted, keep one random word.
        if not new_words:
            return [random.choice(words)]

        return new_words

    def character_swap(self, text: str, p: float = 0.01) -> str:
        """
        Randomly swap adjacent characters (simulates typos).

        Only pairs where both characters are alphabetic are swapped.

        Args:
            text: Input text
            p: Probability of swap per character

        Returns:
            Augmented text
        """
        chars = list(text)

        for i in range(len(chars) - 1):
            if random.random() < p and chars[i].isalpha() and chars[i + 1].isalpha():
                chars[i], chars[i + 1] = chars[i + 1], chars[i]

        return ''.join(chars)

    def keyboard_augment(self, text: str, p: float = 0.01) -> str:
        """
        Simulate keyboard typing errors.

        Letters listed in the neighbour table are replaced, with probability p,
        by a neighbouring key. All other characters, and the case of every
        character, are left untouched.

        Args:
            text: Input text
            p: Probability of error per character

        Returns:
            Augmented text
        """
        chars = list(text)

        for i, char in enumerate(chars):
            neighbors = self._KEYBOARD_NEIGHBORS.get(char.lower())
            if neighbors and random.random() < p:
                typo = random.choice(neighbors)
                # Keep the original letter's case so only the key changes.
                chars[i] = typo.upper() if char.isupper() else typo

        return ''.join(chars)

    def augment(self, text: str) -> List[str]:
        """
        Generate augmented versions of a text using randomly chosen techniques.

        Makes ``self.num_aug`` attempts, each with one technique picked at
        random. Attempts that yield an empty string or the unchanged input are
        dropped, so fewer than ``num_aug`` results may be returned.

        Args:
            text: Input text

        Returns:
            List of augmented texts
        """
        words = text.split()
        num_words = len(words)

        augmented_texts = []

        for _ in range(self.num_aug):
            aug_text = None

            # Randomly choose augmentation technique
            technique = random.choice(['sr', 'ri', 'rs', 'rd', 'char', 'kb'])

            if technique == 'sr':
                n = max(1, int(self.alpha_sr * num_words))
                aug_text = ' '.join(self.synonym_replacement(words, n))

            elif technique == 'ri':
                n = max(1, int(self.alpha_ri * num_words))
                aug_text = ' '.join(self.random_insertion(words, n))

            elif technique == 'rs':
                n = max(1, int(self.alpha_rs * num_words))
                aug_text = ' '.join(self.random_swap(words, n))

            elif technique == 'rd':
                aug_text = ' '.join(self.random_deletion(words, self.alpha_rd))

            elif technique == 'char':
                aug_text = self.character_swap(text)

            elif technique == 'kb':
                aug_text = self.keyboard_augment(text)

            if aug_text and aug_text != text:
                augmented_texts.append(aug_text)

        return augmented_texts

    def augment_dataset(
        self,
        texts: List[str],
        labels: List[int],
        augment_per_sample: int = 2
    ) -> Tuple[List[str], List[int]]:
        """
        Augment an entire dataset.

        The originals come first in the output, followed by the augmented
        samples; each augmented sample inherits the label of its source text.

        Args:
            texts: List of original texts
            labels: List of labels, aligned with ``texts``
            augment_per_sample: Augmentation attempts per sample

        Returns:
            Tuple of (augmented_texts, augmented_labels)

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length
        """
        all_texts = list(texts)
        all_labels = list(labels)
        if len(all_texts) != len(all_labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(all_texts)} and {len(all_labels)})"
            )

        # Override num_aug only for this call; restore the configured value after.
        configured_num_aug = self.num_aug
        self.num_aug = augment_per_sample
        try:
            for text, label in zip(texts, labels):
                aug_texts = self.augment(text)
                all_texts.extend(aug_texts)
                all_labels.extend([label] * len(aug_texts))
        finally:
            self.num_aug = configured_num_aug

        return all_texts, all_labels


class BackTranslator:
    """
    Simulated back-translation augmentation.

    No translation service is involved: selected common words are swapped for
    one of a few fixed alternatives to imitate the wording drift of a
    round-trip translation.

    Author: Molla Samser | RSK World (https://rskworld.in)
    """

    VARIATIONS = {
        'the': ['a', 'this', 'that'],
        'is': ['was', 'becomes', 'remains'],
        'are': ['were', 'become', 'remain'],
        'has': ['had', 'possesses', 'holds'],
        'have': ['had', 'possess', 'hold'],
        'will': ['would', 'shall', 'might'],
        'can': ['could', 'may', 'might'],
        'very': ['extremely', 'highly', 'quite'],
        'said': ['stated', 'mentioned', 'declared'],
        'made': ['created', 'produced', 'developed'],
    }

    def __init__(self, variation_prob: float = 0.3):
        """
        Args:
            variation_prob: Chance that a word found in VARIATIONS is replaced
        """
        self.variation_prob = variation_prob

    def back_translate(self, text: str) -> str:
        """
        Simulate back-translation by applying word variations.

        The text is split on whitespace and re-joined with single spaces.
        Tokens are matched against VARIATIONS case-insensitively; a token with
        attached punctuation does not match and is kept as-is.

        Args:
            text: Input text

        Returns:
            Simulated back-translated text
        """
        rewritten = []

        for token in text.split():
            options = self.VARIATIONS.get(token.lower())

            # Draw from the RNG only for tokens that actually have variations.
            if options is None or not (random.random() < self.variation_prob):
                rewritten.append(token)
                continue

            substitute = random.choice(options)
            # Carry over a leading capital from the replaced token.
            if token[0].isupper():
                substitute = substitute.capitalize()
            rewritten.append(substitute)

        return ' '.join(rewritten)


class MixupAugmenter:
    """
    Text mixup augmentation for soft label training.

    Author: Molla Samser | RSK World (https://rskworld.in)
    """

    def __init__(self, alpha: float = 0.2):
        """
        Initialize mixup augmenter.

        Args:
            alpha: Beta distribution parameter for mixing; the coefficient is
                drawn from Beta(alpha, alpha)
        """
        self.alpha = alpha

    def mixup_texts(
        self,
        text1: str,
        text2: str,
        label1: int,
        label2: int,
        num_classes: int
    ) -> Tuple[str, List[float]]:
        """
        Mix two texts and their labels.

        A coefficient ``lam`` is drawn from Beta(alpha, alpha). For each word
        position, the word is taken from ``text1`` with probability ``lam`` and
        from ``text2`` otherwise; a position past the end of the chosen text
        contributes nothing. If no word is selected, ``text1`` is returned
        unchanged.

        Args:
            text1: First text
            text2: Second text
            label1: First label (index into the soft-label vector)
            label2: Second label (index into the soft-label vector)
            num_classes: Total number of classes

        Returns:
            Tuple of (mixed_text, soft_labels), where soft_labels has
            ``num_classes`` entries summing to 1
        """
        # Get mixing coefficient
        lam = random.betavariate(self.alpha, self.alpha)

        # Mix texts by choosing, per position, which text supplies the word
        words1 = text1.split()
        words2 = text2.split()

        mixed_words = []
        max_len = max(len(words1), len(words2))

        for i in range(max_len):
            source = words1 if random.random() < lam else words2
            if i < len(source):
                mixed_words.append(source[i])

        mixed_text = ' '.join(mixed_words) if mixed_words else text1

        # Create soft labels. Accumulate rather than assign: when both labels
        # are the same class, assignment would leave that class with only
        # (1 - lam) instead of the full mass of 1.
        soft_labels = [0.0] * num_classes
        soft_labels[label1] += lam
        soft_labels[label2] += 1 - lam

        return mixed_text, soft_labels


def augment_csv_dataset(
    input_path: str,
    output_path: str,
    augment_per_sample: int = 2,
    random_state: int = 42
):
    """
    Augment a CSV dataset and save the result.

    The input CSV is read with ``#`` as the comment character and must provide
    ``text``, ``label`` and ``category`` columns. The output CSV contains the
    original rows followed by the augmented ones, with a fresh 1-based ``id``
    column. Each row's category is the category of the first input row that
    carries the same label.

    Args:
        input_path: Path to input CSV
        output_path: Path to output CSV
        augment_per_sample: Number of augmentations per sample
        random_state: Random seed

    Author: Molla Samser | RSK World (https://rskworld.in)
    """
    import pandas as pd

    print(f"\n{'='*60}")
    print("Text Data Augmentation - RSK World")
    print(f"Author: {__author__} | Website: {__website__}")
    print(f"{'='*60}\n")

    # Load dataset
    df = pd.read_csv(input_path, comment='#')
    original_size = len(df)
    print(f"Original dataset size: {original_size}")

    # Initialize augmenter
    augmenter = TextAugmenter(
        num_aug=augment_per_sample,
        random_state=random_state
    )

    # Augment
    aug_texts, aug_labels = augmenter.augment_dataset(
        df['text'].tolist(),
        df['label'].tolist(),
        augment_per_sample
    )

    # Resolve label -> category once (first row per label wins) instead of
    # re-filtering the whole frame for every output row.
    first_rows = df.drop_duplicates(subset='label')
    label_to_category = dict(
        zip(first_rows['label'].tolist(), first_rows['category'].tolist())
    )

    # Create augmented dataframe
    aug_df = pd.DataFrame({
        'id': range(1, len(aug_texts) + 1),
        'text': aug_texts,
        'category': [label_to_category[label] for label in aug_labels],
        'label': aug_labels
    })

    # Save
    aug_df.to_csv(output_path, index=False)

    added = len(aug_df) - original_size
    # An empty input has no meaningful growth ratio; report 0% instead of
    # dividing by zero after the file has already been written.
    growth_pct = ((len(aug_df) / original_size) - 1) * 100 if original_size else 0.0

    print(f"Augmented dataset size: {len(aug_df)}")
    print(f"Increase: {added} samples ({growth_pct:.1f}%)")
    print(f"Saved to: {output_path}")


if __name__ == "__main__":
    # Demo: augment one sample headline with a fixed seed and list the variants.
    print(f"\n{'='*60}")
    print("Text Augmentation Demo - RSK World")
    print(f"Author: {__author__} | Website: {__website__}")
    print(f"{'='*60}\n")

    sample_text = "Apple announces revolutionary new iPhone featuring advanced AI capabilities."
    augmenter = TextAugmenter(num_aug=5, random_state=42)

    print(f"Original: {sample_text}\n")
    print("Augmented versions:")
    print("-" * 50)

    # augment() may return fewer than num_aug items, so number what came back.
    variants = augmenter.augment(sample_text)
    for i, aug_text in enumerate(variants, start=1):
        print(f"{i}. {aug_text}")

    print(f"\n{'='*60}")
    print("Augmentation Demo Complete!")

563 lines • 18.2 KB

python

scripts/model_explainability.py

Raw Download

"""
================================================================================
Text Classification Dataset - Model Explainability Module
================================================================================
Project: Text Classification Dataset
Category: Text Data / NLP

Author: Molla Samser
Designer & Tester: Rima Khatun
Website: https://rskworld.in
Email: help@rskworld.in | support@rskworld.in
Phone: +91 93305 39277

Copyright (c) 2026 RSK World - All Rights Reserved
Content used for educational purposes only.

Features:
- LIME Text Explainer
- Feature Attribution
- Word Importance Highlighting
- Prediction Confidence Analysis
- Attention Visualization (for transformers)

Created: December 2026
================================================================================
"""

import re
import string
from typing import List, Dict, Tuple, Optional, Callable
import numpy as np

# Project metadata. __author__ and __website__ are interpolated into the console
# banner printed by explain_prediction().
__author__ = "Molla Samser"
__website__ = "https://rskworld.in"
__email__ = "help@rskworld.in"

# Category mapping: integer label -> display name. Labels are assigned in
# listing order starting at 0; this is the default source of class names for
# TextExplainer and PredictionAnalyzer.
CATEGORIES = dict(enumerate((
    'Technology',
    'Sports',
    'Politics',
    'Entertainment',
    'Business',
    'Science',
)))


class TextExplainer:
    """
    Text classification model explainer using LIME-like approach.

    Explains predictions by analyzing word importance through
    perturbation-based feature attribution: words are randomly removed, the
    classifier is queried on each variant, and a weighted ridge regression
    from "word kept" indicators to the class probability yields one
    coefficient per word.

    Author: Molla Samser | RSK World (https://rskworld.in)
    """

    def __init__(
        self,
        classifier_fn: Callable,
        class_names: List[str] = None,
        num_samples: int = 1000,
        random_state: int = 42
    ):
        """
        Initialize the explainer.

        Args:
            classifier_fn: Function that takes list of texts and returns
                probabilities, one row per text and one column per class
            class_names: List of class names (defaults to CATEGORIES values)
            num_samples: Number of perturbations to generate
            random_state: Random seed; seeds NumPy's global generator
        """
        self.classifier_fn = classifier_fn
        self.class_names = class_names or list(CATEGORIES.values())
        self.num_samples = num_samples
        np.random.seed(random_state)

    def _tokenize(self, text: str) -> List[str]:
        """Lowercase, strip punctuation, and split on whitespace."""
        text = re.sub(r'[^\w\s]', '', text.lower())
        return text.split()

    def _perturb_text(
        self,
        words: List[str],
        num_samples: int
    ) -> Tuple[List[str], np.ndarray]:
        """
        Generate perturbed versions of text by randomly removing words.

        Args:
            words: List of words in original text
            num_samples: Number of perturbations

        Returns:
            Tuple of (perturbed_texts, perturbation_matrix). The matrix has
            shape (num_samples, len(words)); 1 = word kept, 0 = word removed.
            Row 0 is always the unperturbed text.
        """
        num_words = len(words)

        # Each row is a perturbation; every word is kept with probability 0.5
        perturbation_matrix = np.random.binomial(1, 0.5, size=(num_samples, num_words))

        # Always include original text
        perturbation_matrix[0] = np.ones(num_words)

        perturbed_texts = []
        for row in perturbation_matrix:
            perturbed_words = [w for w, keep in zip(words, row) if keep]
            # Never send an empty string to the classifier: fall back to the
            # first word when every word was masked out.
            perturbed_texts.append(' '.join(perturbed_words) if perturbed_words else words[0])

        return perturbed_texts, perturbation_matrix

    def _compute_weights(
        self,
        perturbation_matrix: np.ndarray,
        kernel_width: float = 25.0
    ) -> np.ndarray:
        """
        Compute weights for each perturbation based on distance from original.

        Args:
            perturbation_matrix: Binary matrix of perturbations
            kernel_width: Width of exponential kernel

        Returns:
            Array of weights, 1.0 for the unperturbed row and decreasing as
            more words are removed
        """
        # Distance is number of removed words
        distances = np.sum(perturbation_matrix == 0, axis=1)
        weights = np.exp(-distances ** 2 / kernel_width ** 2)
        return weights

    def _fit_linear_model(
        self,
        perturbation_matrix: np.ndarray,
        predictions: np.ndarray,
        weights: np.ndarray,
        target_class: int
    ) -> np.ndarray:
        """
        Fit weighted linear model to get feature importance.

        Args:
            perturbation_matrix: Binary feature matrix
            predictions: Model predictions for perturbations
            weights: Sample weights
            target_class: Class to explain

        Returns:
            Feature importance scores (one ridge coefficient per word position)
        """
        from sklearn.linear_model import Ridge

        # Get predictions for target class
        y = predictions[:, target_class]

        # Fit weighted ridge regression
        model = Ridge(alpha=1.0)
        model.fit(perturbation_matrix, y, sample_weight=weights)

        return model.coef_

    def explain(
        self,
        text: str,
        num_features: int = 10,
        target_class: Optional[int] = None
    ) -> Dict:
        """
        Explain a prediction for the given text.

        Args:
            text: Text to explain
            num_features: Number of top features to return; zero or a negative
                value returns no features
            target_class: Class to explain (None = predicted class)

        Returns:
            Dictionary with explanation results, or ``{'error': ...}`` when the
            text contains no tokens
        """
        # Tokenize
        words = self._tokenize(text)

        if len(words) == 0:
            return {'error': 'Empty text after tokenization'}

        # Generate perturbations
        perturbed_texts, perturbation_matrix = self._perturb_text(
            words, self.num_samples
        )

        # Get predictions for all perturbations. asarray lets classifiers that
        # return nested lists work with the column slicing in the fit step.
        predictions = np.asarray(self.classifier_fn(perturbed_texts))

        # Row 0 is the unperturbed text, so it carries the original prediction
        original_probs = predictions[0]
        predicted_class = int(np.argmax(original_probs))

        if target_class is None:
            target_class = predicted_class

        # Compute weights
        weights = self._compute_weights(perturbation_matrix)

        # Fit linear model
        importances = self._fit_linear_model(
            perturbation_matrix, predictions, weights, target_class
        )

        # Rank word positions by |importance|, strongest first. Slicing the
        # ranked array from the front avoids the "[-0:] selects everything"
        # trap when num_features is 0.
        ranked = np.argsort(np.abs(importances))[::-1]
        top_indices = ranked[:max(num_features, 0)]

        word_importance = [
            {
                'word': words[i],
                'importance': float(importances[i]),
                'direction': 'positive' if importances[i] > 0 else 'negative'
            }
            for i in top_indices
        ]

        return {
            'text': text,
            'predicted_class': int(predicted_class),
            'predicted_category': self.class_names[predicted_class],
            'explained_class': int(target_class),
            'explained_category': self.class_names[target_class],
            'confidence': float(original_probs[predicted_class]),
            'probabilities': {
                self.class_names[i]: float(p)
                for i, p in enumerate(original_probs)
            },
            'word_importance': word_importance,
            'top_positive_words': [
                w for w in word_importance if w['direction'] == 'positive'
            ][:5],
            'top_negative_words': [
                w for w in word_importance if w['direction'] == 'negative'
            ][:5]
        }

    def explain_with_html(
        self,
        text: str,
        num_features: int = 10,
        target_class: Optional[int] = None
    ) -> str:
        """
        Generate HTML visualization of explanation.

        Args:
            text: Text to explain
            num_features: Number of features
            target_class: Class to explain

        Returns:
            HTML string with highlighted text
        """
        explanation = self.explain(text, num_features, target_class)

        if 'error' in explanation:
            return f"<p>Error: {explanation['error']}</p>"

        return self._render_html(text, explanation)

    def _render_html(self, text: str, explanation: Dict) -> str:
        """
        Render an explanation dict as highlighted HTML.

        Words with a positive score get a green background and the others a
        red one, with opacity scaled by |score|. All text taken from the input
        is HTML-escaped before being embedded.

        Args:
            text: The original (untokenized) text
            explanation: Result of explain(); reads 'word_importance',
                'predicted_category' and 'confidence'

        Returns:
            HTML string with highlighted text
        """
        from html import escape

        # word_importance is ordered strongest-first, so setdefault keeps the
        # strongest score when the same word occurs more than once.
        word_scores = {}
        for item in explanation['word_importance']:
            word_scores.setdefault(item['word'].lower(), item['importance'])

        # Build HTML
        html_parts = ['<div style="font-family: Arial, sans-serif; line-height: 1.8;">']

        for word in text.split():
            # Normalise like _tokenize so "Great!" matches the score for "great"
            clean_word = re.sub(r'[^\w]', '', word.lower())
            safe_word = escape(word)

            if clean_word in word_scores:
                score = word_scores[clean_word]
                rgb = '34, 197, 94' if score > 0 else '239, 68, 68'  # Green / Red
                color = f'rgba({rgb}, {min(1, abs(score) * 3)})'

                html_parts.append(
                    f'<span style="background-color: {color}; padding: 2px 4px; '
                    f'border-radius: 3px; margin: 0 2px;">{safe_word}</span>'
                )
            else:
                html_parts.append(f' {safe_word}')

        html_parts.append('</div>')

        # Add prediction info
        html_parts.append('<div style="margin-top: 20px; padding: 15px; '
                         'background: #1a1333; border-radius: 8px; color: #f8fafc;">')
        html_parts.append(f'<strong>Prediction:</strong> '
                         f'{escape(str(explanation["predicted_category"]))} '
                         f'({explanation["confidence"]:.1%} confidence)')
        html_parts.append('</div>')

        return '\n'.join(html_parts)


class AttentionVisualizer:
    """
    Visualize attention weights from transformer models.

    Author: Molla Samser | RSK World (https://rskworld.in)
    """

    def __init__(self, model, tokenizer):
        """
        Initialize with transformer model and tokenizer.

        Args:
            model: Transformer model; called with the tokenizer output plus
                ``output_attentions=True`` and expected to expose ``.attentions``
            tokenizer: Tokenizer; must be callable and provide
                ``convert_ids_to_tokens``
        """
        self.model = model
        self.tokenizer = tokenizer

    def get_attention_weights(self, text: str) -> Dict:
        """
        Extract attention weights for input text.

        Each token's score is the attention it receives, averaged over all
        layers, batch entries, heads and query positions.

        Args:
            text: Input text

        Returns:
            Dictionary with 'tokens', 'attention_scores' (one float per token)
            and 'top_attended' (up to 10 (token, score) pairs, highest first)
        """
        import torch

        # Tokenize
        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512
        )

        # Get attention
        with torch.no_grad():
            outputs = self.model(**inputs, output_attentions=True)

        # One tensor per layer; assumed (batch, heads, seq, seq) per layer as
        # noted in the original code - TODO confirm for the model in use.
        attentions = outputs.attentions

        # Stacked shape is (layers, batch, heads, query, key). Averaging over
        # the first four axes leaves one score per key position. Averaging only
        # three axes would leave a (seq, seq) matrix whose rows cannot be used
        # as sort keys.
        avg_attention = torch.stack(attentions).mean(dim=(0, 1, 2, 3))

        # Get tokens
        tokens = self.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

        # Drop the first and last positions, presumed to be the [CLS] and [SEP]
        # special tokens - verify for tokenizers with a different layout.
        token_scores = avg_attention[1:-1].tolist()
        tokens = tokens[1:-1]

        ranked = sorted(
            zip(tokens, token_scores),
            key=lambda pair: pair[1],
            reverse=True
        )

        return {
            'tokens': tokens,
            'attention_scores': token_scores,
            'top_attended': ranked[:10]
        }


class PredictionAnalyzer:
    """
    Analyze prediction confidence and uncertainty.

    Author: Molla Samser | RSK World (https://rskworld.in)
    """

    def __init__(self, classifier_fn: Callable, class_names: List[str] = None):
        """
        Initialize analyzer.

        Args:
            classifier_fn: Function that takes a list of texts and returns
                one row of class probabilities per text
            class_names: List of class names (defaults to CATEGORIES values)
        """
        self.classifier_fn = classifier_fn
        self.class_names = class_names or list(CATEGORIES.values())

    def analyze(self, text: str) -> Dict:
        """
        Comprehensive prediction analysis.

        Args:
            text: Input text

        Returns:
            Analysis results built from native Python types: the prediction,
            all class probabilities (highest first), uncertainty measures
            (entropy, entropy normalised by log(num_classes), and the margin
            between the two most likely classes), and the ranked predictions
        """
        # asarray lets classifiers that return plain lists work with the
        # vectorised arithmetic below.
        probs = np.asarray(self.classifier_fn([text])[0], dtype=float)

        predicted_class = int(np.argmax(probs))
        sorted_indices = [int(i) for i in np.argsort(probs)[::-1]]

        # Entropy (uncertainty measure); the epsilon keeps log(0) finite
        entropy = -np.sum(probs * np.log(probs + 1e-10))
        max_entropy = np.log(len(probs))
        # A single-class output has log(1) == 0 maximum entropy; report zero
        # uncertainty instead of dividing by zero.
        normalized_entropy = entropy / max_entropy if max_entropy > 0 else 0.0

        # Margin (difference between top 2 predictions); with only one class
        # there is no runner-up, so it counts as probability 0.
        runner_up = probs[sorted_indices[1]] if len(sorted_indices) > 1 else 0.0
        margin = probs[sorted_indices[0]] - runner_up

        return {
            # Long inputs are previewed as their first 200 characters
            'text': text[:200] + '...' if len(text) > 200 else text,
            'prediction': {
                'class': predicted_class,
                'category': self.class_names[predicted_class],
                'confidence': float(probs[predicted_class])
            },
            'all_probabilities': {
                self.class_names[i]: float(probs[i])
                for i in sorted_indices
            },
            'uncertainty': {
                'entropy': float(entropy),
                'normalized_entropy': float(normalized_entropy),
                'margin': float(margin),
                # bool() converts numpy.bool_ so the result stays JSON-serialisable
                'is_confident': bool(margin > 0.3),
                'is_uncertain': bool(normalized_entropy > 0.7)
            },
            'ranked_predictions': [
                {
                    'rank': rank + 1,
                    'category': self.class_names[idx],
                    'probability': float(probs[idx])
                }
                for rank, idx in enumerate(sorted_indices)
            ]
        }


def explain_prediction(
    text: str,
    model_path: str,
    output_html: str = 'explanation.html'
):
    """
    Explain a single prediction on the console and as an HTML page.

    Loads the saved model, wraps its pipeline in a ``TextExplainer``,
    prints the predicted category with up to ten word importances, and
    writes a standalone HTML document containing the highlighted text.

    Args:
        text: Text to explain
        model_path: Path to a joblib file holding a mapping with a
            'pipeline' entry
        output_html: Path of the HTML file to write

    Author: Molla Samser | RSK World (https://rskworld.in)
    """
    import joblib

    print(f"\n{'='*60}")
    print("Model Explainability - RSK World")
    print(f"Author: {__author__} | Website: {__website__}")
    print(f"{'='*60}\n")

    # NOTE: joblib files are pickle-based; only load models you trust.
    pipeline = joblib.load(model_path)['pipeline']

    def predict_proba_fn(texts):
        # Probability source handed to the explainer.
        return pipeline.predict_proba(texts)

    explainer = TextExplainer(predict_proba_fn)

    # Console report
    print(f"Analyzing text: {text[:50]}...")
    explanation = explainer.explain(text)

    print(f"\nPredicted: {explanation['predicted_category']} "
          f"({explanation['confidence']:.1%} confidence)")
    print("\nTop important words:")
    top_words = explanation['word_importance'][:10]
    for entry in top_words:
        if entry['direction'] == 'positive':
            arrow = "↑"
        else:
            arrow = "↓"
        print(f"  {arrow} {entry['word']}: {entry['importance']:.4f}")

    # HTML fragment produced by the explainer, embedded in the page below
    html = explainer.explain_with_html(text)

    document = f"""
<!DOCTYPE html>
<html>
<head>
    <title>Prediction Explanation - RSK World</title>
    <meta charset="UTF-8">
    <style>
        body {{
            font-family: 'Segoe UI', Arial, sans-serif;
            background: #0f0a1f;
            color: #f8fafc;
            padding: 40px;
            max-width: 800px;
            margin: 0 auto;
        }}
        h1 {{ color: #dc2626; }}
        .container {{
            background: #1a1333;
            padding: 30px;
            border-radius: 12px;
            border: 1px solid #352d54;
        }}
        .footer {{
            margin-top: 30px;
            text-align: center;
            color: #6b6882;
            font-size: 14px;
        }}
        .legend {{
            margin-top: 20px;
            padding: 15px;
            background: #231d3a;
            border-radius: 8px;
        }}
        .legend span {{
            display: inline-block;
            margin-right: 20px;
        }}
        .positive {{ color: #22c55e; }}
        .negative {{ color: #ef4444; }}
    </style>
</head>
<body>
    <h1>🔍 Prediction Explanation</h1>
    <div class="container">
        {html}
        <div class="legend">
            <strong>Legend:</strong>
            <span class="positive">■ Supports prediction</span>
            <span class="negative">■ Against prediction</span>
        </div>
    </div>
    <div class="footer">
        <p>Generated by RSK World Text Classification</p>
        <p>Author: {__author__} | Website: <a href="{__website__}" style="color: #dc2626;">{__website__}</a></p>
    </div>
</body>
</html>
"""

    with open(output_html, 'w', encoding='utf-8') as report_file:
        report_file.write(document)

    print(f"\nExplanation saved to: {output_html}")


if __name__ == "__main__":
    import sys

    # Command line: <text> <model path>; anything after those is ignored.
    cli_args = sys.argv[1:]

    if len(cli_args) >= 2:
        explain_prediction(cli_args[0], cli_args[1])
    else:
        # Too few arguments: show usage, then the demo banner (no model
        # is loaded in this mode).
        for line in (
            "Usage: python model_explainability.py 'Your text here' model.joblib",
            "\nDemo mode:",
            f"\n{'='*60}",
            "Explainability Module Demo - RSK World",
            f"Author: {__author__} | Website: {__website__}",
            f"{'='*60}",
        ):
            print(line)

575 lines•17.9 KB

python

Theme Settings

Color Scheme

Display Options

Font Size