RSK World - Text Classification Dataset - Project Files Browser | RSK World

scripts/deep_learning.py

"""
================================================================================
Text Classification Dataset - Deep Learning Training Module
================================================================================
Project: Text Classification Dataset
Category: Text Data / NLP

Author: Molla Samser
Designer & Tester: Rima Khatun
Website: https://rskworld.in
Email: help@rskworld.in | support@rskworld.in
Phone: +91 93305 39277

Copyright (c) 2026 RSK World - All Rights Reserved
Content used for educational purposes only.

Features:
- PyTorch Neural Network Classifier
- LSTM/GRU Text Classification
- CNN for Text Classification
- Transformer-based Models
- Training with Early Stopping
- Learning Rate Scheduling
- Model Checkpointing
- TensorBoard Logging

Created: December 2026
================================================================================
"""

import os
import re
import string
import json
from typing import List, Dict, Tuple, Optional
from datetime import datetime
from collections import Counter

import numpy as np
import pandas as pd

# Project information (module metadata; author and website are echoed in the
# console banners and stored in the metadata of saved checkpoints)
__author__ = "Molla Samser"
__website__ = "https://rskworld.in"
__email__ = "help@rskworld.in"

# Category mapping: integer class label -> human-readable category name
CATEGORIES = {
    0: 'Technology', 1: 'Sports', 2: 'Politics',
    3: 'Entertainment', 4: 'Business', 5: 'Science'
}

# Check for PyTorch.
# The dependency is optional at import time: when the import fails,
# TORCH_AVAILABLE is set to False and an install hint is printed instead of
# letting the ImportError propagate. The model classes further down are only
# defined when the flag is True, and DeepLearningTrainer raises ImportError
# from its constructor when it is False.
try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from torch.utils.data import Dataset, DataLoader
    from torch.optim import Adam, AdamW
    from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingLR
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
    print("PyTorch not available. Install with: pip install torch")


class Vocabulary:
    """
    Word-level vocabulary that maps tokens to integer indices and back.

    Index 0 is reserved for the padding token and index 1 for the
    unknown-word token. Every other word receives the next free index, in
    descending order of corpus frequency.

    Args:
        max_vocab_size: Upper bound on the vocabulary size, including the
            two special tokens.
        min_freq: Minimum number of occurrences a word needs to be indexed.
        pad_token: Token used to pad short sequences (index 0).
        unk_token: Token substituted for out-of-vocabulary words (index 1).

    Author: Molla Samser | RSK World (https://rskworld.in)
    """

    # Everything that is neither a word character nor whitespace is stripped
    # before splitting; compiled once because tokenization runs per document.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(
        self,
        max_vocab_size: int = 50000,
        min_freq: int = 2,
        pad_token: str = '<PAD>',
        unk_token: str = '<UNK>'
    ):
        self.max_vocab_size = max_vocab_size
        self.min_freq = min_freq
        self.pad_token = pad_token
        self.unk_token = unk_token

        self.word2idx = {pad_token: 0, unk_token: 1}
        self.idx2word = {0: pad_token, 1: unk_token}
        self.word_counts = Counter()

    def build(self, texts: List[str]):
        """
        Count the words of ``texts`` and index the frequent ones.

        The method may be called repeatedly: counts accumulate across calls,
        words that already have an index keep it, and only new words are
        appended. The vocabulary never grows beyond ``max_vocab_size``.

        Args:
            texts: Documents to tokenize and count.
        """
        for text in texts:
            self.word_counts.update(self._tokenize(text))

        # Candidates are the most frequent words, leaving room for the two
        # special tokens.
        candidates = self.word_counts.most_common(max(self.max_vocab_size - 2, 0))

        for word, count in candidates:
            if count < self.min_freq:
                # most_common() is sorted by count, so the rest is rarer still.
                break
            if word in self.word2idx:
                # Already indexed (earlier build() call or a special token).
                # Re-assigning would leave len(word2idx) unchanged and hand
                # the same index to the following words as well.
                continue
            if len(self.word2idx) >= self.max_vocab_size:
                break
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word

    def _tokenize(self, text: str) -> List[str]:
        """Lower-case ``text``, drop punctuation and split on whitespace."""
        return self._PUNCTUATION.sub('', text.lower()).split()

    def encode(self, text: str, max_length: int = 256) -> List[int]:
        """
        Convert ``text`` into exactly ``max_length`` token indices.

        Longer texts are truncated, shorter ones are right-padded with the
        padding index, and unknown words map to the unknown-token index.

        Args:
            text: Text to encode.
            max_length: Length of the returned index list.

        Returns:
            List of ``max_length`` integer indices.
        """
        unk_idx = self.word2idx[self.unk_token]
        tokens = self._tokenize(text)[:max_length]
        indices = [self.word2idx.get(token, unk_idx) for token in tokens]

        missing = max_length - len(indices)
        if missing > 0:
            indices += [self.word2idx[self.pad_token]] * missing
        return indices[:max_length]

    def decode(self, indices: List[int]) -> str:
        """
        Convert indices back into a space-separated string.

        Padding tokens are dropped; indices that are not in the vocabulary
        are rendered as the unknown token.
        """
        tokens = [self.idx2word.get(idx, self.unk_token) for idx in indices]
        return ' '.join(token for token in tokens if token != self.pad_token)

    def __len__(self):
        """Return the number of indexed tokens, special tokens included."""
        return len(self.word2idx)


if TORCH_AVAILABLE:
    
    class TextDataset(Dataset):
        """
        Map-style PyTorch dataset that encodes raw texts on access.

        Each item is a dict with two ``torch.long`` tensors: ``'input_ids'``
        (the text encoded by the vocabulary to ``max_length`` indices) and
        ``'label'`` (the class label of that text).

        Author: Molla Samser | RSK World (https://rskworld.in)
        """

        def __init__(
            self,
            texts: List[str],
            labels: List[int],
            vocab: Vocabulary,
            max_length: int = 256
        ):
            # Raw inputs are stored as given; encoding is deferred to
            # __getitem__.
            self.texts, self.labels = texts, labels
            self.vocab, self.max_length = vocab, max_length

        def __len__(self):
            """Return the number of texts."""
            return len(self.texts)

        def __getitem__(self, idx):
            """Return the encoded text and label stored at position ``idx``."""
            raw_text, target = self.texts[idx], self.labels[idx]
            token_ids = self.vocab.encode(raw_text, self.max_length)

            sample = {}
            sample['input_ids'] = torch.tensor(token_ids, dtype=torch.long)
            sample['label'] = torch.tensor(target, dtype=torch.long)
            return sample
    
    
    class LSTMClassifier(nn.Module):
        """
        Recurrent text classifier: embedding -> (bi)LSTM -> attention pooling
        over time steps -> two-layer MLP head producing class logits.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each token embedding.
            hidden_dim: LSTM hidden size per direction.
            num_layers: Number of stacked LSTM layers.
            num_classes: Number of output logits.
            dropout: Dropout probability used in the head and, when there is
                more than one LSTM layer, between LSTM layers.
            bidirectional: Whether the LSTM reads the sequence both ways.

        Author: Molla Samser | RSK World (https://rskworld.in)
        """

        def __init__(
            self,
            vocab_size: int,
            embedding_dim: int = 128,
            hidden_dim: int = 256,
            num_layers: int = 2,
            num_classes: int = 6,
            dropout: float = 0.3,
            bidirectional: bool = True
        ):
            super().__init__()

            # Index 0 is the padding token.
            self.embedding = nn.Embedding(
                num_embeddings=vocab_size,
                embedding_dim=embedding_dim,
                padding_idx=0
            )

            # Inter-layer dropout is only passed when there are stacked layers.
            recurrent_dropout = dropout if num_layers > 1 else 0
            self.lstm = nn.LSTM(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=num_layers,
                batch_first=True,
                dropout=recurrent_dropout,
                bidirectional=bidirectional
            )

            # A bidirectional LSTM concatenates both directions.
            directions = 2 if bidirectional else 1
            feature_dim = hidden_dim * directions

            # Scores every time step with a single scalar.
            self.attention = nn.Sequential(
                nn.Linear(feature_dim, 64),
                nn.Tanh(),
                nn.Linear(64, 1)
            )

            self.classifier = nn.Sequential(
                nn.Dropout(dropout),
                nn.Linear(feature_dim, 128),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(128, num_classes)
            )

        def forward(self, input_ids):
            """
            Compute class logits for a batch of token-index sequences.

            Args:
                input_ids: Integer tensor of token indices, (batch, seq).

            Returns:
                Logits tensor of shape (batch, num_classes).
            """
            states, _ = self.lstm(self.embedding(input_ids))  # (batch, seq, feat)

            # Normalise the per-step scores over the sequence dimension.
            scores = self.attention(states)                   # (batch, seq, 1)
            weights = torch.softmax(scores, dim=1)

            # Attention-weighted sum of the LSTM states.
            summary = (weights * states).sum(dim=1)           # (batch, feat)

            return self.classifier(summary)
    
    
    class CNNClassifier(nn.Module):
        """
        CNN-based text classifier with multiple filter sizes.

        Token embeddings are convolved with one 1-D convolution per filter
        size, each feature map is max-pooled over time, and the pooled
        features are concatenated and fed to a two-layer MLP head.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each token embedding.
            num_filters: Output channels of every convolution.
            filter_sizes: Kernel widths, one convolution per entry.
                Defaults to ``[2, 3, 4, 5]``.
            num_classes: Number of output logits.
            dropout: Dropout probability used in the head.

        Author: Molla Samser | RSK World (https://rskworld.in)
        """

        def __init__(
            self,
            vocab_size: int,
            embedding_dim: int = 128,
            num_filters: int = 100,
            filter_sizes: Optional[List[int]] = None,
            num_classes: int = 6,
            dropout: float = 0.5
        ):
            super().__init__()

            # A None sentinel avoids sharing one mutable default list across
            # all instances.
            if filter_sizes is None:
                filter_sizes = [2, 3, 4, 5]

            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

            self.convs = nn.ModuleList([
                nn.Conv1d(embedding_dim, num_filters, fs)
                for fs in filter_sizes
            ])

            self.classifier = nn.Sequential(
                nn.Dropout(dropout),
                nn.Linear(num_filters * len(filter_sizes), 128),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(128, num_classes)
            )

        def forward(self, input_ids):
            """
            Compute class logits for a batch of token-index sequences.

            Sequences shorter than the widest convolution kernel are
            right-padded with the padding index (0), because Conv1d raises
            when its input is shorter than its kernel.

            Args:
                input_ids: Integer tensor of token indices, (batch, seq).

            Returns:
                Logits tensor of shape (batch, num_classes).
            """
            widest_kernel = max(conv.kernel_size[0] for conv in self.convs)
            shortfall = widest_kernel - input_ids.size(1)
            if shortfall > 0:
                input_ids = F.pad(input_ids, (0, shortfall), value=0)

            # Embedding
            embedded = self.embedding(input_ids)  # (batch, seq, embed)
            embedded = embedded.permute(0, 2, 1)  # (batch, embed, seq)

            # Convolution + max-over-time pooling per filter size
            conv_outputs = []
            for conv in self.convs:
                conv_out = F.relu(conv(embedded))  # (batch, filters, seq-fs+1)
                pooled = F.max_pool1d(conv_out, conv_out.size(2)).squeeze(2)
                conv_outputs.append(pooled)

            # Concatenate
            concat = torch.cat(conv_outputs, dim=1)  # (batch, filters*len(filter_sizes))

            # Classification
            return self.classifier(concat)
    
    
    class TransformerClassifier(nn.Module):
        """
        Transformer-encoder text classifier.

        Token and learned position embeddings are summed, passed through a
        stack of encoder layers with padding positions masked out, averaged
        over the non-padding positions and classified by a two-layer MLP.

        Args:
            vocab_size: Number of rows in the token embedding table.
            embedding_dim: Model width (token and position embedding size).
            num_heads: Attention heads per encoder layer.
            num_layers: Number of encoder layers.
            ff_dim: Width of the feed-forward sub-layer.
            num_classes: Number of output logits.
            max_length: Number of positions in the position embedding table.
            dropout: Dropout probability for encoder layers and head.

        Author: Molla Samser | RSK World (https://rskworld.in)
        """

        def __init__(
            self,
            vocab_size: int,
            embedding_dim: int = 128,
            num_heads: int = 4,
            num_layers: int = 2,
            ff_dim: int = 512,
            num_classes: int = 6,
            max_length: int = 256,
            dropout: float = 0.1
        ):
            super().__init__()

            # Index 0 is the padding token.
            self.embedding = nn.Embedding(
                num_embeddings=vocab_size,
                embedding_dim=embedding_dim,
                padding_idx=0
            )
            self.pos_embedding = nn.Embedding(
                num_embeddings=max_length,
                embedding_dim=embedding_dim
            )

            layer_template = nn.TransformerEncoderLayer(
                d_model=embedding_dim,
                nhead=num_heads,
                dim_feedforward=ff_dim,
                dropout=dropout,
                batch_first=True
            )
            self.transformer = nn.TransformerEncoder(
                layer_template,
                num_layers=num_layers
            )

            self.classifier = nn.Sequential(
                nn.Dropout(dropout),
                nn.Linear(embedding_dim, 128),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(128, num_classes)
            )

        def forward(self, input_ids):
            """
            Compute class logits for a batch of token-index sequences.

            Args:
                input_ids: Integer tensor of token indices, (batch, seq);
                    index 0 marks padding.

            Returns:
                Logits tensor of shape (batch, num_classes).
            """
            _, seq_len = input_ids.shape

            # Sum of token embeddings and position embeddings 0..seq_len-1.
            position_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
            hidden = self.embedding(input_ids) + self.pos_embedding(position_ids)

            # True where the token is padding.
            is_pad = (input_ids == 0)
            encoded = self.transformer(hidden, src_key_padding_mask=is_pad)

            # Mean over real tokens only; the clamp keeps the divisor at
            # least 1 for rows that contain nothing but padding.
            keep = (~is_pad).unsqueeze(-1).float()
            token_sum = (encoded * keep).sum(dim=1)
            token_count = keep.sum(dim=1).clamp(min=1)
            sentence_vec = token_sum / token_count

            return self.classifier(sentence_vec)


class DeepLearningTrainer:
    """
    Trainer for deep learning text classifiers.

    Builds a vocabulary from the training texts, trains one of the LSTM,
    CNN or Transformer classifiers of this module with AdamW, gradient
    clipping, learning-rate reduction on plateau and early stopping, and
    offers prediction plus checkpoint save/load.

    Args:
        model_type: One of ``'lstm'``, ``'cnn'`` or ``'transformer'``.
        embedding_dim: Token embedding size.
        hidden_dim: LSTM hidden size (used by the LSTM model only).
        num_layers: Number of LSTM layers (used by the LSTM model only).
        num_classes: Number of target classes.
        max_length: Number of tokens every text is padded/truncated to.
        learning_rate: Initial AdamW learning rate.
        batch_size: Mini-batch size for training, evaluation and inference.
        epochs: Maximum number of training epochs.
        device: ``'auto'`` picks CUDA when available, otherwise CPU; any
            other value is passed to ``torch.device``.
        early_stopping_patience: Number of consecutive epochs without a
            validation-loss improvement after which training stops.
        verbose: Whether progress is printed.

    Raises:
        ImportError: If PyTorch is not installed.

    Author: Molla Samser | RSK World (https://rskworld.in)
    """

    # Hyper-parameters that determine the network architecture. save()
    # writes them to the checkpoint and load() restores them so the rebuilt
    # network matches the stored weights.
    _CONFIG_KEYS = (
        'model_type', 'embedding_dim', 'hidden_dim',
        'num_layers', 'num_classes', 'max_length'
    )

    def __init__(
        self,
        model_type: str = 'lstm',
        embedding_dim: int = 128,
        hidden_dim: int = 256,
        num_layers: int = 2,
        num_classes: int = 6,
        max_length: int = 256,
        learning_rate: float = 1e-3,
        batch_size: int = 32,
        epochs: int = 10,
        device: str = 'auto',
        early_stopping_patience: int = 3,
        verbose: bool = True
    ):
        if not TORCH_AVAILABLE:
            raise ImportError("PyTorch is required. Install with: pip install torch")

        self.model_type = model_type
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.max_length = max_length
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs
        self.early_stopping_patience = early_stopping_patience
        self.verbose = verbose

        # Device
        if device == 'auto':
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = torch.device(device)

        self.vocab = None
        self.model = None
        self.history = self._empty_history()

    @staticmethod
    def _empty_history() -> Dict[str, List[float]]:
        """Return a fresh per-epoch metrics container."""
        return {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    # The return annotation is a string on purpose: this class is defined
    # even when PyTorch is missing, and an unquoted ``nn.Module`` would raise
    # NameError while the module is being imported.
    def _create_model(self, vocab_size: int) -> "nn.Module":
        """
        Instantiate the network selected by ``self.model_type``.

        Raises:
            ValueError: If ``self.model_type`` is not a known model type.
        """
        if self.model_type == 'lstm':
            return LSTMClassifier(
                vocab_size=vocab_size,
                embedding_dim=self.embedding_dim,
                hidden_dim=self.hidden_dim,
                num_layers=self.num_layers,
                num_classes=self.num_classes
            )
        elif self.model_type == 'cnn':
            return CNNClassifier(
                vocab_size=vocab_size,
                embedding_dim=self.embedding_dim,
                num_classes=self.num_classes
            )
        elif self.model_type == 'transformer':
            return TransformerClassifier(
                vocab_size=vocab_size,
                embedding_dim=self.embedding_dim,
                num_classes=self.num_classes,
                max_length=self.max_length
            )
        else:
            raise ValueError(f"Unknown model type: {self.model_type}")

    def fit(
        self,
        train_texts: List[str],
        train_labels: List[int],
        val_texts: Optional[List[str]] = None,
        val_labels: Optional[List[int]] = None
    ):
        """
        Train the model.

        A new vocabulary and a new network are created on every call, so the
        metric history is reset as well. Validation, learning-rate scheduling
        and early stopping only run when both validation arguments are given
        and the validation set is not empty.

        Args:
            train_texts: Training texts
            train_labels: Training labels
            val_texts: Validation texts (optional)
            val_labels: Validation labels (optional)

        Raises:
            ValueError: If the training set is empty or texts and labels
                differ in length.
        """
        if len(train_texts) == 0:
            raise ValueError("train_texts is empty; at least one training example is required.")
        if len(train_texts) != len(train_labels):
            raise ValueError(
                f"train_texts and train_labels differ in length "
                f"({len(train_texts)} vs {len(train_labels)})."
            )

        if self.verbose:
            print(f"\n{'='*60}")
            print("Deep Learning Text Classification Training")
            print(f"Author: {__author__} | Website: {__website__}")
            print(f"{'='*60}\n")
            print(f"Model Type: {self.model_type.upper()}")
            print(f"Device: {self.device}")
            print(f"Batch Size: {self.batch_size}")
            print(f"Learning Rate: {self.learning_rate}")
            print(f"Epochs: {self.epochs}")
            print("-" * 40)

        # Metrics of a previously trained model do not describe the new one.
        self.history = self._empty_history()

        # Build vocabulary
        self.vocab = Vocabulary()
        self.vocab.build(train_texts)

        if self.verbose:
            print(f"Vocabulary Size: {len(self.vocab)}")

        # Create datasets
        train_dataset = TextDataset(train_texts, train_labels, self.vocab, self.max_length)
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

        val_loader = None
        if val_texts is not None and val_labels is not None:
            val_dataset = TextDataset(val_texts, val_labels, self.vocab, self.max_length)
            val_loader = DataLoader(val_dataset, batch_size=self.batch_size)
        # An empty validation loader is treated like no validation at all.
        run_validation = val_loader is not None and len(val_loader) > 0

        # Create model
        self.model = self._create_model(len(self.vocab))
        self.model.to(self.device)

        # Count parameters
        total_params = sum(p.numel() for p in self.model.parameters())
        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)

        if self.verbose:
            print(f"Total Parameters: {total_params:,}")
            print(f"Trainable Parameters: {trainable_params:,}")
            print("-" * 40)

        # Optimizer and scheduler
        optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=0.01)
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
        criterion = nn.CrossEntropyLoss()

        # Training loop
        best_val_loss = float('inf')
        patience_counter = 0

        for epoch in range(self.epochs):
            # Training
            self.model.train()
            train_loss = 0
            train_correct = 0
            train_total = 0

            for batch in train_loader:
                input_ids = batch['input_ids'].to(self.device)
                labels = batch['label'].to(self.device)

                optimizer.zero_grad()
                logits = self.model(input_ids)
                loss = criterion(logits, labels)
                loss.backward()

                # Clip the global gradient norm to 1.0 before the update.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()

                train_loss += loss.item()
                _, predicted = torch.max(logits, 1)
                train_correct += (predicted == labels).sum().item()
                train_total += labels.size(0)

            avg_train_loss = train_loss / len(train_loader)
            train_acc = train_correct / train_total

            self.history['train_loss'].append(avg_train_loss)
            self.history['train_acc'].append(train_acc)

            # Validation
            if run_validation:
                val_loss, val_acc = self._evaluate(val_loader, criterion)
                self.history['val_loss'].append(val_loss)
                self.history['val_acc'].append(val_acc)

                scheduler.step(val_loss)

                if self.verbose:
                    print(f"Epoch {epoch+1}/{self.epochs} - "
                          f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f} - "
                          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

                # Early stopping
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    patience_counter = 0
                else:
                    patience_counter += 1
                    if patience_counter >= self.early_stopping_patience:
                        if self.verbose:
                            print(f"Early stopping at epoch {epoch+1}")
                        break
            else:
                if self.verbose:
                    print(f"Epoch {epoch+1}/{self.epochs} - "
                          f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f}")

        if self.verbose:
            print(f"\n{'='*60}")
            print("Training Complete!")
            # Without validation there is no best loss to report.
            if run_validation:
                print(f"Best Validation Loss: {best_val_loss:.4f}")

    def _evaluate(self, loader, criterion) -> Tuple[float, float]:
        """
        Evaluate the model on ``loader`` without gradient tracking.

        Returns:
            Tuple of (mean per-batch loss, accuracy over all samples).
        """
        self.model.eval()
        total_loss = 0
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in loader:
                input_ids = batch['input_ids'].to(self.device)
                labels = batch['label'].to(self.device)

                logits = self.model(input_ids)
                loss = criterion(logits, labels)

                total_loss += loss.item()
                _, predicted = torch.max(logits, 1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

        return total_loss / len(loader), correct / total

    def _predict_logits(self, texts: List[str]) -> "torch.Tensor":
        """
        Encode ``texts`` and run them through the model in mini-batches.

        Shared by predict() and predict_proba().

        Returns:
            Logits tensor of shape (len(texts), num_classes).

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.model is None:
            raise ValueError("Model not trained. Call fit() first.")

        self.model.eval()
        texts = list(texts)
        if not texts:
            return torch.empty((0, self.num_classes), device=self.device)

        batches = []
        with torch.no_grad():
            for start in range(0, len(texts), self.batch_size):
                chunk = texts[start:start + self.batch_size]
                encoded = [self.vocab.encode(text, self.max_length) for text in chunk]
                input_ids = torch.tensor(encoded, dtype=torch.long).to(self.device)
                batches.append(self.model(input_ids))
        return torch.cat(batches, dim=0)

    def predict(self, texts: List[str]) -> np.ndarray:
        """
        Predict labels for texts.

        Returns:
            1-D integer array with one class index per text.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        logits = self._predict_logits(texts)
        _, predicted = torch.max(logits, 1)
        return predicted.cpu().numpy()

    def predict_proba(self, texts: List[str]) -> np.ndarray:
        """
        Predict probabilities for texts.

        Returns:
            Array of shape (len(texts), num_classes) with softmax
            probabilities.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        logits = self._predict_logits(texts)
        return F.softmax(logits, dim=1).cpu().numpy()

    def save(self, path: str):
        """
        Save model weights, vocabulary, architecture config, training
        history and metadata to ``path``.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.model is None or self.vocab is None:
            raise ValueError("Model not trained. Call fit() first.")

        state = {
            'model_state': self.model.state_dict(),
            'vocab': self.vocab,
            'config': {key: getattr(self, key) for key in self._CONFIG_KEYS},
            'history': self.history,
            'metadata': {
                'author': __author__,
                'website': __website__,
                'created_at': datetime.now().isoformat()
            }
        }
        torch.save(state, path)
        if self.verbose:
            print(f"Model saved to: {path}")

    def load(self, path: str):
        """
        Load model and vocabulary from a checkpoint written by save().

        The architecture hyper-parameters stored in the checkpoint replace
        the ones this trainer was constructed with, so the rebuilt network
        matches the stored weights.
        """
        # SECURITY: the checkpoint contains a pickled Vocabulary object, so
        # it has to be fully un-pickled (weights_only=False). Un-pickling can
        # execute arbitrary code; only load checkpoints from trusted sources.
        try:
            state = torch.load(path, map_location=self.device, weights_only=False)
        except TypeError:
            # Older PyTorch releases do not know the weights_only keyword.
            state = torch.load(path, map_location=self.device)

        self.vocab = state['vocab']
        config = state['config']

        for key in self._CONFIG_KEYS:
            if key in config:
                setattr(self, key, config[key])

        self.model = self._create_model(len(self.vocab))
        self.model.load_state_dict(state['model_state'])
        self.model.to(self.device)
        self.model.eval()

        # Fall back to an empty, well-formed history when none was stored.
        self.history = state.get('history') or self._empty_history()

        if self.verbose:
            print(f"Model loaded from: {path}")


if __name__ == "__main__":
    # Demo: train an LSTM classifier on the project CSV files, classify two
    # sample headlines and save the trained model to 'deep_model.pt'.
    print(f"\n{'='*60}")
    print("Deep Learning Demo - RSK World")
    print(f"Author: {__author__} | Website: {__website__}")
    print(f"{'='*60}\n")

    if not TORCH_AVAILABLE:
        print("PyTorch not installed. Install with: pip install torch")
        # exit() is a helper injected by the site module and is not
        # guaranteed to exist; SystemExit always is.
        raise SystemExit(1)

    # Load data. Only the reads are guarded, so a FileNotFoundError raised
    # later (for example while saving the model) is not misreported as a
    # missing dataset.
    try:
        train_df = pd.read_csv('../data/csv/train.csv', comment='#')
        val_df = pd.read_csv('../data/csv/validation.csv', comment='#')
    except FileNotFoundError:
        print("Dataset not found. Please ensure CSV files exist in ../data/csv/")
    else:
        # Train model
        trainer = DeepLearningTrainer(
            model_type='lstm',
            embedding_dim=128,
            hidden_dim=256,
            batch_size=16,
            epochs=5,
            verbose=True
        )

        trainer.fit(
            train_df['text'].tolist(),
            train_df['label'].tolist(),
            val_df['text'].tolist(),
            val_df['label'].tolist()
        )

        # Test prediction
        test_texts = [
            "Apple unveils new iPhone with AI features",
            "Manchester United wins the championship"
        ]

        predictions = trainer.predict(test_texts)
        probabilities = trainer.predict_proba(test_texts)

        print("\nTest Predictions:")
        for text, pred, prob in zip(test_texts, predictions, probabilities):
            print(f"Text: {text}")
            print(f"Predicted: {CATEGORIES[pred]} ({prob[pred]*100:.1f}%)")
            print()

        # Save model
        trainer.save('deep_model.pt')

        print(f"\n{'='*60}")
        print("Deep Learning Demo Complete!")
        print("Copyright (c) 2026 RSK World - All Rights Reserved")

717 lines•24.2 KB

python

scripts/visualizations.py

Raw Download

"""
================================================================================
Text Classification Dataset - Advanced Visualization Module
================================================================================
Project: Text Classification Dataset
Category: Text Data / NLP

Author: Molla Samser
Designer & Tester: Rima Khatun
Website: https://rskworld.in
Email: help@rskworld.in | support@rskworld.in
Phone: +91 93305 39277

Copyright (c) 2026 RSK World - All Rights Reserved
Content used for educational purposes only.

Features:
- Word Cloud Generation
- Category Distribution Charts
- Text Length Analysis
- Confusion Matrix Heatmaps
- Training History Plots
- Feature Importance Visualization
- t-SNE Embeddings Visualization

Created: December 2026
================================================================================
"""

import os
import re
import string
from typing import List, Dict, Optional, Tuple
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Project information (module metadata)
__author__ = "Molla Samser"
__website__ = "https://rskworld.in"
__email__ = "help@rskworld.in"

# Category configuration: integer class label -> human-readable category name.
# The plotting helpers map the label column through this dict.
CATEGORIES = {
    0: 'Technology',
    1: 'Sports',
    2: 'Politics',
    3: 'Entertainment',
    4: 'Business',
    5: 'Science'
}

# Hex colour per category name, used to colour bars, pie wedges and
# histograms consistently across charts. Keys must match the values of
# CATEGORIES.
CATEGORY_COLORS = {
    'Technology': '#3b82f6',
    'Sports': '#22c55e',
    'Politics': '#8b5cf6',
    'Entertainment': '#ec4899',
    'Business': '#f59e0b',
    'Science': '#06b6d4'
}


def set_style():
    """Apply the shared dark theme to matplotlib's global settings.

    Activates the 'seaborn-v0_8-darkgrid' style sheet and then overrides the
    figure, axes, text, tick, grid and legend colours as well as the font
    family through ``plt.rcParams``.
    """
    plt.style.use('seaborn-v0_8-darkgrid')

    theme_overrides = {
        'figure.facecolor': '#0f0a1f',
        'axes.facecolor': '#1a1333',
        'axes.edgecolor': '#352d54',
        'axes.labelcolor': '#f8fafc',
        'text.color': '#f8fafc',
        'xtick.color': '#a5a3b8',
        'ytick.color': '#a5a3b8',
        'grid.color': '#352d54',
        'legend.facecolor': '#231d3a',
        'legend.edgecolor': '#352d54',
        'font.family': 'sans-serif',
    }
    plt.rcParams.update(theme_overrides)


def generate_wordcloud(
    texts: List[str],
    output_path: str = 'wordcloud.png',
    title: str = 'Word Cloud',
    width: int = 1200,
    height: int = 600,
    background_color: str = '#0f0a1f',
    colormap: str = 'Reds'
):
    """
    Generate word cloud from texts.

    The texts are joined, lower-cased and stripped of punctuation before the
    cloud is built. If the optional ``wordcloud`` package is missing, or no
    cloud can be generated from the input (for example because no words
    remain after stop-word removal), a message is printed and nothing is
    written.

    Args:
        texts: List of text documents
        output_path: Path to save image
        title: Chart title
        width: Image width
        height: Image height
        background_color: Background color
        colormap: Matplotlib colormap

    Author: Molla Samser | RSK World (https://rskworld.in)
    """
    try:
        from wordcloud import WordCloud, STOPWORDS
    except ImportError:
        print("Please install wordcloud: pip install wordcloud")
        return

    # Combine and clean all texts
    combined_text = re.sub(r'[^\w\s]', '', ' '.join(texts).lower())

    # Generate word cloud
    wordcloud = WordCloud(
        width=width,
        height=height,
        background_color=background_color,
        colormap=colormap,
        stopwords=STOPWORDS,
        max_words=200,
        max_font_size=150,
        random_state=42
    )
    try:
        wordcloud.generate(combined_text)
    except ValueError as err:
        # WordCloud raises ValueError when there is nothing to draw; report
        # it and skip the image instead of aborting the caller's loop.
        print(f"Word cloud not generated for '{title}': {err}")
        return

    # Plot
    set_style()
    fig, ax = plt.subplots(figsize=(15, 8))
    try:
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(title, fontsize=20, fontweight='bold', color='#f8fafc', pad=20)

        # Add watermark
        fig.text(0.99, 0.01, 'RSK World | rskworld.in', fontsize=10, color='#6b6882',
                 ha='right', va='bottom', style='italic')

        fig.tight_layout()
        fig.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='#0f0a1f')
    finally:
        # Close this specific figure even when saving fails, so figures do
        # not pile up in pyplot's global registry.
        plt.close(fig)

    print(f"Word cloud saved to: {output_path}")


def generate_wordclouds_by_category(
    df: pd.DataFrame,
    text_column: str = 'text',
    label_column: str = 'label',
    output_dir: str = 'wordclouds'
):
    """
    Write one word-cloud image per category into ``output_dir``.

    The directory is created when missing. Categories without any rows in
    ``df`` are skipped. Files are named ``wordcloud_<category>.png`` using
    the lower-cased category name.

    Args:
        df: DataFrame with texts and labels
        text_column: Column name for text
        label_column: Column name for labels
        output_dir: Output directory

    Author: Molla Samser | RSK World (https://rskworld.in)
    """
    # Colormap per category; anything not listed falls back to 'BuGn'.
    colormap_by_category = {
        'Technology': 'Blues',
        'Sports': 'Greens',
        'Politics': 'Purples',
        'Entertainment': 'RdPu',
        'Business': 'YlOrBr',
    }

    os.makedirs(output_dir, exist_ok=True)

    for label, category in CATEGORIES.items():
        in_category = df[label_column] == label
        category_texts = df[in_category][text_column].tolist()
        if not category_texts:
            continue

        image_name = f'wordcloud_{category.lower()}.png'
        generate_wordcloud(
            category_texts,
            output_path=os.path.join(output_dir, image_name),
            title=f'{category} - Word Cloud',
            colormap=colormap_by_category.get(category, 'BuGn')
        )


def plot_category_distribution(
    df: pd.DataFrame,
    label_column: str = 'label',
    output_path: str = 'category_distribution.png'
):
    """
    Save a two-panel figure of the category distribution.

    The left panel is a bar chart of document counts per category with the
    count written above each bar; the right panel is a pie chart of the same
    counts with percentage labels.

    Args:
        df: DataFrame with labels
        label_column: Column name for labels
        output_path: Path to save image

    Author: Molla Samser | RSK World (https://rskworld.in)
    """
    set_style()

    # Documents per category name, most frequent first
    category_counts = df[label_column].map(CATEGORIES).value_counts()
    names = category_counts.index
    totals = category_counts.values
    palette = [CATEGORY_COLORS[name] for name in names]

    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(16, 7))

    # Left panel: bar chart
    rects = bar_ax.bar(names, totals, color=palette, edgecolor='white', linewidth=1.5)
    bar_ax.set_xlabel('Category', fontsize=12)
    bar_ax.set_ylabel('Number of Documents', fontsize=12)
    bar_ax.set_title('Category Distribution', fontsize=16, fontweight='bold')
    bar_ax.tick_params(axis='x', rotation=45)

    # Count label centred slightly above each bar
    for rect, total in zip(rects, totals):
        x_center = rect.get_x() + rect.get_width()/2
        y_above = rect.get_height() + 2
        bar_ax.text(x_center, y_above, str(total),
                    ha='center', va='bottom', fontsize=11, fontweight='bold')

    # Right panel: pie chart
    _, _, percent_labels = pie_ax.pie(
        totals,
        labels=names,
        colors=palette,
        autopct='%1.1f%%',
        startangle=90,
        explode=[0.02] * len(category_counts),
        shadow=True
    )
    pie_ax.set_title('Category Proportions', fontsize=16, fontweight='bold')

    for percent_label in percent_labels:
        percent_label.set_fontsize(10)
        percent_label.set_fontweight('bold')

    plt.suptitle('Text Classification Dataset - Category Analysis\nRSK World | rskworld.in',
                 fontsize=18, fontweight='bold', y=1.02)

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='#0f0a1f')
    plt.close()

    print(f"Category distribution saved to: {output_path}")


def plot_text_length_distribution(
    df: pd.DataFrame,
    text_column: str = 'text',
    label_column: str = 'label',
    output_path: str = 'text_length_distribution.png'
):
    """
    Plot text length distribution by category.

    Produces a 2x2 figure: overlaid word-count and character-count
    histograms (top row), a word-count box plot and a character-count
    violin plot (bottom row), then saves it to ``output_path``.

    Rows whose label is not a key of ``CATEGORIES`` do not appear in any
    panel. Categories without any rows are left out of the violin plot.

    Args:
        df: DataFrame with texts (not modified; a copy is used internally)
        text_column: Column name for text
        label_column: Column name for labels
        output_path: Path to save image

    Author: Molla Samser | RSK World (https://rskworld.in)
    """
    set_style()

    # Calculate lengths on a copy so the caller's frame gains no helper columns
    df = df.copy()
    df['word_count'] = df[text_column].str.split().str.len()
    df['char_count'] = df[text_column].str.len()
    df['category'] = df[label_column].map(CATEGORIES)

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # Top row: one overlaid histogram per category for each length measure
    histogram_panels = (
        (axes[0, 0], 'word_count', 'Word Count', 'Word Count Distribution by Category'),
        (axes[0, 1], 'char_count', 'Character Count', 'Character Count Distribution by Category'),
    )
    for ax, column, axis_label, panel_title in histogram_panels:
        for cat in CATEGORIES.values():
            data = df.loc[df['category'] == cat, column]
            ax.hist(data, bins=30, alpha=0.6, label=cat, color=CATEGORY_COLORS[cat])
        ax.set_xlabel(axis_label, fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)
        ax.set_title(panel_title, fontsize=14)
        ax.legend(loc='upper right')

    # Box plot - word count
    # DataFrame.boxplot(by=...) draws one box per group in sorted group-key
    # order, so the colours must follow that order rather than the
    # insertion order of CATEGORIES.
    box_categories = sorted(df['category'].dropna().unique())
    if box_categories:
        bp1 = df.boxplot(column='word_count', by='category', ax=axes[1, 0],
                         patch_artist=True, return_type='dict')
        for patch, cat in zip(bp1['word_count']['boxes'], box_categories):
            patch.set_facecolor(CATEGORY_COLORS[cat])
            patch.set_alpha(0.7)
    axes[1, 0].set_xlabel('Category', fontsize=12)
    axes[1, 0].set_ylabel('Word Count', fontsize=12)
    axes[1, 0].set_title('Word Count Box Plot', fontsize=14)
    # Clear the figure-level title that the grouped box plot adds
    plt.suptitle('')

    # Violin plot - character count
    # Only categories that actually have data are drawn: a violin cannot be
    # computed for an empty group, and missing lengths (NaN) are dropped.
    violin_categories = []
    violin_data = []
    for cat in CATEGORIES.values():
        values = df.loc[df['category'] == cat, 'char_count'].dropna().values
        if len(values):
            violin_categories.append(cat)
            violin_data.append(values)
    if violin_data:
        parts = axes[1, 1].violinplot(violin_data, positions=range(len(violin_data)))
        for body, cat in zip(parts['bodies'], violin_categories):
            body.set_facecolor(CATEGORY_COLORS[cat])
            body.set_alpha(0.7)
    axes[1, 1].set_xticks(range(len(violin_categories)))
    axes[1, 1].set_xticklabels(violin_categories, rotation=45)
    axes[1, 1].set_xlabel('Category', fontsize=12)
    axes[1, 1].set_ylabel('Character Count', fontsize=12)
    axes[1, 1].set_title('Character Count Violin Plot', fontsize=14)

    plt.suptitle('Text Length Analysis - RSK World | rskworld.in',
                 fontsize=18, fontweight='bold', y=1.02)

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='#0f0a1f')
    plt.close()

    print(f"Text length distribution saved to: {output_path}")


def plot_confusion_matrix(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    output_path: str = 'confusion_matrix.png',
    title: str = 'Confusion Matrix',
    normalize: bool = True
):
    """
    Plot confusion matrix heatmap.

    The matrix always has one row and one column per entry of
    ``CATEGORIES`` (in key order), so the tick labels line up even when a
    class is missing from ``y_true`` or ``y_pred``. Labels that are not
    keys of ``CATEGORIES`` are ignored by the matrix computation.

    Args:
        y_true: True labels
        y_pred: Predicted labels
        output_path: Path to save image
        title: Chart title
        normalize: Whether to normalize each row to proportions of the
            true class; rows with no true samples are shown as zeros

    Author: Molla Samser | RSK World (https://rskworld.in)
    """
    from sklearn.metrics import confusion_matrix as cm_func

    set_style()

    class_ids = list(CATEGORIES.keys())
    class_names = list(CATEGORIES.values())

    # Pin the label set so the matrix shape matches the tick labels
    cm = cm_func(y_true, y_pred, labels=class_ids)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where a row has samples; empty rows stay 0 instead of NaN
        cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0
        )

    fig, ax = plt.subplots(figsize=(12, 10))

    sns.heatmap(
        cm,
        annot=True,
        fmt='.2%' if normalize else 'd',
        cmap='Reds',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
        linewidths=0.5,
        linecolor='#352d54',
        cbar_kws={'label': 'Proportion' if normalize else 'Count'}
    )

    ax.set_xlabel('Predicted Label', fontsize=14)
    ax.set_ylabel('True Label', fontsize=14)
    ax.set_title(f'{title}\nRSK World | rskworld.in', fontsize=16, fontweight='bold')

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='#0f0a1f')
    plt.close()

    print(f"Confusion matrix saved to: {output_path}")


def plot_training_history(
    history: Dict[str, List[float]],
    output_path: str = 'training_history.png'
):
    """
    Plot training history (loss and accuracy).

    Each series that is present and non-empty is drawn against its own
    1-based epoch index, so a history that lacks ``'loss'`` or whose
    series differ in length can still be plotted. Missing series are
    skipped.

    Args:
        history: Dictionary with any of 'loss', 'val_loss', 'accuracy',
            'val_accuracy' mapped to per-epoch values
        output_path: Path to save image

    Author: Molla Samser | RSK World (https://rskworld.in)
    """
    set_style()

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # (axis, y-label, panel title, ((history key, line format, colour, legend label), ...))
    panels = (
        (axes[0], 'Loss', 'Training and Validation Loss', (
            ('loss', 'o-', '#dc2626', 'Training Loss'),
            ('val_loss', 's--', '#f59e0b', 'Validation Loss'),
        )),
        (axes[1], 'Accuracy', 'Training and Validation Accuracy', (
            ('accuracy', 'o-', '#22c55e', 'Training Accuracy'),
            ('val_accuracy', 's--', '#3b82f6', 'Validation Accuracy'),
        )),
    )

    for ax, y_label, panel_title, series_specs in panels:
        for key, line_format, color, legend_label in series_specs:
            values = history.get(key)
            if values is None or len(values) == 0:
                continue
            # Epochs are numbered from 1 and sized to this series
            epochs = range(1, len(values) + 1)
            ax.plot(epochs, values, line_format, color=color,
                    label=legend_label, linewidth=2, markersize=6)
        ax.set_xlabel('Epoch', fontsize=12)
        ax.set_ylabel(y_label, fontsize=12)
        ax.set_title(panel_title, fontsize=14, fontweight='bold')
        # Only build a legend when something was drawn on this panel
        if ax.get_legend_handles_labels()[0]:
            ax.legend()
        ax.grid(True, alpha=0.3)

    # The accuracy axis is fixed to the 0-1 range
    axes[1].set_ylim(0, 1)

    plt.suptitle('Model Training History - RSK World | rskworld.in',
                 fontsize=18, fontweight='bold', y=1.02)

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='#0f0a1f')
    plt.close()

    print(f"Training history saved to: {output_path}")


def plot_feature_importance(
    feature_names: List[str],
    importances: np.ndarray,
    top_n: int = 20,
    output_path: str = 'feature_importance.png',
    title: str = 'Top Features'
):
    """
    Plot feature importance bar chart.

    Draws a horizontal bar chart of the ``top_n`` highest-scoring
    features, with the highest score at the top, and saves it to
    ``output_path``.

    Args:
        feature_names: List of feature names
        importances: Feature importance scores, indexed like
            ``feature_names`` (any array-like is accepted)
        top_n: Number of top features to show (must be at least 1)
        output_path: Path to save image
        title: Chart title

    Raises:
        ValueError: If ``top_n`` is smaller than 1

    Author: Molla Samser | RSK World (https://rskworld.in)
    """
    # A zero top_n would make the [-top_n:] slice below select every
    # feature, and a negative one an unrelated slice, so reject both.
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    set_style()

    # Accept lists as well as arrays; index arrays need an ndarray
    importances = np.asarray(importances)

    # Indices of the top features in ascending score order, which is the
    # bottom-to-top drawing order of the horizontal bars
    ascending_indices = np.argsort(importances)[-top_n:]
    bar_labels = [feature_names[i] for i in ascending_indices]
    bar_values = importances[ascending_indices]

    fig, ax = plt.subplots(figsize=(12, 8))

    # Shades run from 0.9 at the bottom bar to 0.4 at the top bar
    colors = plt.cm.Reds(np.linspace(0.4, 0.9, len(bar_labels)))[::-1]
    bars = ax.barh(range(len(bar_labels)), bar_values, color=colors)

    ax.set_yticks(range(len(bar_labels)))
    ax.set_yticklabels(bar_labels)
    ax.set_xlabel('Importance Score', fontsize=12)
    ax.set_title(f'{title}\nRSK World | rskworld.in', fontsize=16, fontweight='bold')

    # Add value labels just past the end of each bar
    for bar, val in zip(bars, bar_values):
        ax.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
                f'{val:.4f}', va='center', fontsize=9)

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='#0f0a1f')
    plt.close()

    print(f"Feature importance saved to: {output_path}")


def plot_tsne_embeddings(
    embeddings: np.ndarray,
    labels: np.ndarray,
    output_path: str = 'tsne_embeddings.png',
    perplexity: int = 30
):
    """
    Plot t-SNE visualization of text embeddings.

    Projects the embeddings to two dimensions with t-SNE and draws one
    scatter series per entry of ``CATEGORIES``, then saves the figure to
    ``output_path``.

    Args:
        embeddings: Document embeddings (n_samples, n_features)
        labels: Category labels, one per row of ``embeddings``
        output_path: Path to save image
        perplexity: t-SNE perplexity parameter; lowered to
            ``n_samples - 1`` when the dataset is too small for it

    Raises:
        ValueError: If fewer than two samples are provided

    Author: Molla Samser | RSK World (https://rskworld.in)
    """
    embeddings = np.asarray(embeddings)
    # An ndarray is needed so that `labels == label` yields a boolean mask
    labels = np.asarray(labels)

    n_samples = embeddings.shape[0]
    if n_samples < 2:
        raise ValueError("t-SNE needs at least two samples to project")

    # scikit-learn requires perplexity to be smaller than n_samples
    effective_perplexity = min(perplexity, n_samples - 1)
    if effective_perplexity != perplexity:
        print(f"Perplexity reduced to {effective_perplexity} for {n_samples} samples")

    set_style()

    # Reduce dimensions with t-SNE
    print("Computing t-SNE embeddings...")
    # The iteration count is left at the library default (1000): the keyword
    # for it was renamed from n_iter to max_iter in newer scikit-learn, so
    # passing either name breaks on some versions.
    tsne = TSNE(n_components=2, perplexity=effective_perplexity, random_state=42)
    embeddings_2d = tsne.fit_transform(embeddings)

    fig, ax = plt.subplots(figsize=(14, 10))

    for label, category in CATEGORIES.items():
        mask = labels == label
        ax.scatter(
            embeddings_2d[mask, 0],
            embeddings_2d[mask, 1],
            c=CATEGORY_COLORS[category],
            label=category,
            alpha=0.7,
            s=50,
            edgecolors='white',
            linewidth=0.5
        )

    ax.set_xlabel('t-SNE Dimension 1', fontsize=12)
    ax.set_ylabel('t-SNE Dimension 2', fontsize=12)
    ax.set_title('t-SNE Visualization of Document Embeddings\nRSK World | rskworld.in',
                 fontsize=16, fontweight='bold')
    ax.legend(loc='best', framealpha=0.9)

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches='tight', facecolor='#0f0a1f')
    plt.close()

    print(f"t-SNE visualization saved to: {output_path}")


def generate_all_visualizations(data_dir: str, output_dir: str = 'visualizations'):
    """
    Build the dataset-level charts and word clouds in one pass.

    Reads ``<data_dir>/csv/train.csv`` and writes the category
    distribution, text length analysis, overall word cloud and
    per-category word clouds under ``output_dir``, which is created when
    it does not exist yet.

    Args:
        data_dir: Directory containing the ``csv/train.csv`` file
        output_dir: Directory that receives the generated images

    Author: Molla Samser | RSK World (https://rskworld.in)
    """
    os.makedirs(output_dir, exist_ok=True)

    def in_output(name: str) -> str:
        # Location of an artefact inside the output directory
        return os.path.join(output_dir, name)

    # Banner
    print(f"\n{'='*60}")
    print("Generating Visualizations - RSK World")
    print(f"Author: {__author__} | Website: {__website__}")
    print(f"{'='*60}\n")

    # Training split; lines starting with '#' are treated as comments
    train_csv = os.path.join(data_dir, 'csv', 'train.csv')
    train_df = pd.read_csv(train_csv, comment='#')

    # Step 1: class balance
    print("1. Generating category distribution...")
    plot_category_distribution(
        train_df,
        output_path=in_output('category_distribution.png')
    )

    # Step 2: document lengths
    print("2. Generating text length analysis...")
    plot_text_length_distribution(
        train_df,
        output_path=in_output('text_length_distribution.png')
    )

    # Step 3: word clouds, first for the whole corpus and then per category
    print("3. Generating word clouds...")
    all_texts = train_df['text'].tolist()
    generate_wordcloud(
        all_texts,
        output_path=in_output('wordcloud_all.png'),
        title='Text Classification Dataset - All Categories'
    )

    generate_wordclouds_by_category(
        train_df,
        output_dir=in_output('wordclouds_by_category')
    )

    # Summary
    print(f"\n{'='*60}")
    print("All visualizations generated successfully!")
    print(f"Output directory: {output_dir}")


if __name__ == "__main__":
    import sys

    # The first command-line argument, when given, overrides the default
    # data directory.
    cli_args = sys.argv[1:]
    data_dir = cli_args[0] if cli_args else '../data'

    generate_all_visualizations(data_dir)

594 lines•19 KB

python

Theme Settings

Color Scheme

Display Options

Font Size