RSK World - Text Classification Dataset - Project Files Browser | RSK World

scripts/data_augmentation.py

"""
================================================================================
Text Classification Dataset - Advanced Data Augmentation Module
================================================================================
Project: Text Classification Dataset
Category: Text Data / NLP

Author: Molla Samser
Designer & Tester: Rima Khatun
Website: https://rskworld.in
Email: help@rskworld.in | support@rskworld.in
Phone: +91 93305 39277

Copyright (c) 2026 RSK World - All Rights Reserved
Content used for educational purposes only.

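Features:
- Synonym Replacement
- Random Insertion
- Random Swap
- Random Deletion
- Character-level noise (adjacent-character swaps, keyboard typos)
- Back Translation (simulated with a small word-variation table)
- Mixup (word-level mixing of two texts with soft labels)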

Created: December 2026
================================================================================
"""

import random
import re
from typing import List, Tuple, Optional
from collections import defaultdict

# Project metadata. __author__ and __website__ are interpolated into the
# banners printed by augment_csv_dataset() and by the __main__ demo.
__author__ = "Molla Samser"
__website__ = "https://rskworld.in"
__email__ = "help@rskworld.in"


class TextAugmenter:
    """
    Rule-based text augmentation for NLP tasks.

    Word-level techniques:
    1. Synonym Replacement (SR)
    2. Random Insertion (RI)
    3. Random Swap (RS)
    4. Random Deletion (RD)

    Character-level techniques:
    5. Adjacent character swap (typo simulation)
    6. Keyboard-neighbour substitution (typo simulation)

    All randomness is drawn from a private ``random.Random`` instance, so
    passing ``random_state`` gives reproducible output and constructing an
    augmenter never re-seeds the global ``random`` module.

    Author: Molla Samser | RSK World (https://rskworld.in)
    """

    # Simple synonym dictionary for demonstration
    SYNONYMS = {
        'good': ['great', 'excellent', 'wonderful', 'fantastic', 'superb'],
        'bad': ['poor', 'terrible', 'awful', 'horrible', 'dreadful'],
        'big': ['large', 'huge', 'enormous', 'massive', 'giant'],
        'small': ['tiny', 'little', 'miniature', 'compact', 'petite'],
        'new': ['novel', 'fresh', 'recent', 'modern', 'innovative'],
        'old': ['ancient', 'aged', 'vintage', 'classic', 'traditional'],
        'fast': ['quick', 'rapid', 'swift', 'speedy', 'hasty'],
        'slow': ['gradual', 'leisurely', 'unhurried', 'sluggish', 'delayed'],
        'important': ['significant', 'crucial', 'vital', 'essential', 'critical'],
        'announces': ['reveals', 'declares', 'states', 'proclaims', 'discloses'],
        'launches': ['introduces', 'releases', 'unveils', 'debuts', 'presents'],
        'develops': ['creates', 'builds', 'designs', 'produces', 'constructs'],
        'discovers': ['finds', 'uncovers', 'detects', 'identifies', 'locates'],
        'achieves': ['accomplishes', 'attains', 'reaches', 'gains', 'secures'],
        'wins': ['triumphs', 'conquers', 'prevails', 'succeeds', 'captures'],
        'breaks': ['shatters', 'surpasses', 'exceeds', 'beats', 'tops'],
        'shows': ['demonstrates', 'displays', 'exhibits', 'reveals', 'indicates'],
        'says': ['states', 'declares', 'mentions', 'reports', 'claims'],
        'technology': ['tech', 'innovation', 'advancement', 'development'],
        'company': ['firm', 'corporation', 'enterprise', 'business', 'organization'],
        'market': ['industry', 'sector', 'field', 'arena', 'domain'],
        'research': ['study', 'investigation', 'analysis', 'examination', 'inquiry'],
        'scientist': ['researcher', 'expert', 'specialist', 'scholar', 'analyst'],
        'government': ['administration', 'authorities', 'regime', 'state', 'officials'],
        'economy': ['market', 'finances', 'commerce', 'trade', 'business'],
        'revolutionary': ['groundbreaking', 'innovative', 'pioneering', 'transformative'],
        'historic': ['landmark', 'significant', 'momentous', 'unprecedented', 'notable'],
    }

    # Adjacent keys used by keyboard_augment(); built once instead of on
    # every call.
    KEYBOARD_NEIGHBORS = {
        'a': 'sqwz', 'b': 'vghn', 'c': 'xdfv', 'd': 'serfcx',
        'e': 'wrsdf', 'f': 'drtgvc', 'g': 'ftyhbv', 'h': 'gyujnb',
        'i': 'ujklo', 'j': 'huiknm', 'k': 'jiolm', 'l': 'kop',
        'm': 'njk', 'n': 'bhjm', 'o': 'iklp', 'p': 'ol',
        'q': 'wa', 'r': 'edft', 's': 'awedxz', 't': 'rfgy',
        'u': 'yhjki', 'v': 'cfgb', 'w': 'qase', 'x': 'zsdc',
        'y': 'tghu', 'z': 'asx'
    }

    # Technique codes that augment() samples from uniformly.
    _TECHNIQUES = ('sr', 'ri', 'rs', 'rd', 'char', 'kb')

    # How many random words _add_word() tries before giving up.
    _MAX_INSERT_ATTEMPTS = 10

    def __init__(
        self,
        alpha_sr: float = 0.1,
        alpha_ri: float = 0.1,
        alpha_rs: float = 0.1,
        alpha_rd: float = 0.1,
        num_aug: int = 4,
        random_state: Optional[int] = None
    ):
        """
        Initialize the TextAugmenter.

        Args:
            alpha_sr: Fraction of the words to replace with synonyms
                (at least one word is always attempted)
            alpha_ri: Fraction of the word count to insert as synonyms
                (at least one insertion is always attempted)
            alpha_rs: Fraction of the word count used as number of swaps
                (at least one swap is always attempted)
            alpha_rd: Per-word probability of deletion
            num_aug: Number of augmentation attempts per original text
            random_state: Random seed for reproducibility; ``None`` seeds
                the private generator from the operating system
        """
        self.alpha_sr = alpha_sr
        self.alpha_ri = alpha_ri
        self.alpha_rs = alpha_rs
        self.alpha_rd = alpha_rd
        self.num_aug = num_aug

        # Private generator: keeps results reproducible per instance and
        # leaves the global ``random`` state alone.
        self._rng = random.Random(random_state)

        # Build a bidirectional synonym lookup. Entries are accumulated
        # (never overwritten), so the result does not depend on the order
        # in which SYNONYMS lists its keys.
        self.word_to_synonyms = defaultdict(list)
        for word, syns in self.SYNONYMS.items():
            key = word.lower()
            for syn in syns:
                syn_key = syn.lower()
                self._register_synonym(key, syn_key)
                self._register_synonym(syn_key, key)

    def _register_synonym(self, word: str, synonym: str) -> None:
        """Record ``synonym`` for ``word`` once, skipping self-references."""
        bucket = self.word_to_synonyms[word]
        if synonym != word and synonym not in bucket:
            bucket.append(synonym)

    def get_synonyms(self, word: str) -> List[str]:
        """
        Get synonyms for a word (case-insensitive).

        Returns:
            A new list of lower-case synonyms; empty when none are known
        """
        # Copy so callers cannot mutate the lookup table by accident.
        return list(self.word_to_synonyms.get(word.lower(), ()))

    def synonym_replacement(self, words: List[str], n: int) -> List[str]:
        """
        Replace up to n distinct words with one of their synonyms.

        Every occurrence of a chosen word (compared case-insensitively) is
        replaced by the same lower-case synonym.

        Args:
            words: List of words
            n: Maximum number of distinct words to replace

        Returns:
            Augmented word list (the input list is not modified)
        """
        if n <= 0:
            return list(words)

        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would make the shuffle input depend on string hash randomisation.
        candidates = list(dict.fromkeys(
            w.lower() for w in words if self.get_synonyms(w)
        ))
        self._rng.shuffle(candidates)

        # Decide all replacements first and apply them in a single pass so
        # a freshly inserted synonym is never itself replaced again.
        replacements = {
            target: self._rng.choice(self.get_synonyms(target))
            for target in candidates[:n]
        }
        return [replacements.get(w.lower(), w) for w in words]

    def random_insertion(self, words: List[str], n: int) -> List[str]:
        """
        Randomly insert up to n synonyms into the sentence.

        Args:
            words: List of words
            n: Number of insertion attempts

        Returns:
            Augmented word list (the input list is not modified)
        """
        new_words = list(words)

        for _ in range(n):
            self._add_word(new_words)

        return new_words

    def _add_word(self, words: List[str]) -> None:
        """
        Insert a synonym of a random word at a random position, in place.

        Does nothing when ``words`` is empty or when no sampled word has a
        known synonym within ``_MAX_INSERT_ATTEMPTS`` tries.
        """
        if not words:
            return

        for _ in range(self._MAX_INSERT_ATTEMPTS):
            synonyms = self.get_synonyms(self._rng.choice(words))
            if synonyms:
                break
        else:
            return

        random_synonym = self._rng.choice(synonyms)
        # Upper bound len(words) (inclusive) so appending at the end is a
        # possible position too.
        random_idx = self._rng.randint(0, len(words))
        words.insert(random_idx, random_synonym)

    def random_swap(self, words: List[str], n: int) -> List[str]:
        """
        Randomly swap n pairs of words.

        Args:
            words: List of words
            n: Number of swaps

        Returns:
            Augmented word list (the input list is not modified)
        """
        new_words = list(words)

        for _ in range(n):
            new_words = self._swap_word(new_words)

        return new_words

    def _swap_word(self, words: List[str]) -> List[str]:
        """Return a copy of ``words`` with two distinct positions swapped."""
        new_words = list(words)
        if len(new_words) < 2:
            return new_words

        idx1, idx2 = self._rng.sample(range(len(new_words)), 2)
        new_words[idx1], new_words[idx2] = new_words[idx2], new_words[idx1]

        return new_words

    def random_deletion(self, words: List[str], p: float) -> List[str]:
        """
        Randomly delete words with probability p.

        At least one word is always kept when the input is non-empty.

        Args:
            words: List of words
            p: Probability of deletion per word

        Returns:
            Augmented word list (always a new list)
        """
        # Nothing sensible to delete from empty or single-word input.
        if len(words) <= 1:
            return list(words)

        new_words = [w for w in words if self._rng.random() > p]

        # Everything was deleted: fall back to one randomly chosen word.
        if not new_words:
            return [self._rng.choice(words)]

        return new_words

    def character_swap(self, text: str, p: float = 0.01) -> str:
        """
        Randomly swap adjacent alphabetic characters (simulates typos).

        Args:
            text: Input text
            p: Probability of swapping each eligible adjacent pair

        Returns:
            Augmented text
        """
        chars = list(text)
        last = len(chars) - 1

        i = 0
        while i < last:
            if (chars[i].isalpha() and chars[i + 1].isalpha()
                    and self._rng.random() < p):
                chars[i], chars[i + 1] = chars[i + 1], chars[i]
                # Skip the pair just swapped; otherwise the same character
                # could be carried several positions to the right.
                i += 2
            else:
                i += 1

        return ''.join(chars)

    def keyboard_augment(self, text: str, p: float = 0.01) -> str:
        """
        Simulate keyboard typing errors.

        Letters are replaced by a neighbouring key with probability p.
        Characters that are not replaced are returned unchanged, and a
        replacement keeps the case of the character it replaces.

        Args:
            text: Input text
            p: Probability of error per character

        Returns:
            Augmented text
        """
        chars = list(text)

        for i, char in enumerate(chars):
            neighbors = self.KEYBOARD_NEIGHBORS.get(char.lower())
            if neighbors and self._rng.random() < p:
                typo = self._rng.choice(neighbors)
                chars[i] = typo.upper() if char.isupper() else typo

        return ''.join(chars)

    def _apply_technique(self, technique: str, text: str, words: List[str]) -> str:
        """Run one technique code from ``_TECHNIQUES`` and return its text."""
        if technique == 'char':
            return self.character_swap(text)
        if technique == 'kb':
            return self.keyboard_augment(text)
        if technique == 'rd':
            return ' '.join(self.random_deletion(words, self.alpha_rd))

        alpha, operation = {
            'sr': (self.alpha_sr, self.synonym_replacement),
            'ri': (self.alpha_ri, self.random_insertion),
            'rs': (self.alpha_rs, self.random_swap),
        }[technique]
        # Always attempt at least one edit, even on very short texts.
        n = max(1, int(alpha * len(words)))
        return ' '.join(operation(words, n))

    def augment(self, text: str) -> List[str]:
        """
        Generate augmented versions of a text.

        Makes ``self.num_aug`` attempts, each with one randomly chosen
        technique. Attempts that leave the text unchanged (ignoring
        whitespace normalisation) or produce an empty string are dropped,
        so fewer than ``self.num_aug`` texts may be returned.

        Args:
            text: Input text

        Returns:
            List of augmented texts
        """
        words = text.split()
        # Word-level techniques re-join with single spaces; that alone must
        # not count as an augmentation.
        normalized = ' '.join(words)

        augmented_texts = []

        for _ in range(self.num_aug):
            technique = self._rng.choice(self._TECHNIQUES)
            aug_text = self._apply_technique(technique, text, words)

            if aug_text and aug_text not in (text, normalized):
                augmented_texts.append(aug_text)

        return augmented_texts

    def augment_dataset(
        self,
        texts: List[str],
        labels: List[int],
        augment_per_sample: int = 2
    ) -> Tuple[List[str], List[int]]:
        """
        Augment an entire dataset.

        The originals come first in the result, followed by the augmented
        texts of each sample with that sample's label.

        Args:
            texts: List of original texts
            labels: List of labels, one per text
            augment_per_sample: Augmentation attempts per sample

        Returns:
            Tuple of (augmented_texts, augmented_labels)

        Raises:
            ValueError: If texts and labels differ in length
        """
        all_texts = list(texts)
        all_labels = list(labels)

        if len(all_texts) != len(all_labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(all_texts)} and {len(all_labels)})"
            )

        originals = list(zip(all_texts, all_labels))

        # Apply the per-call setting temporarily so the instance's own
        # num_aug is unchanged once this method returns.
        previous_num_aug = self.num_aug
        self.num_aug = augment_per_sample
        try:
            for text, label in originals:
                aug_texts = self.augment(text)
                all_texts.extend(aug_texts)
                all_labels.extend([label] * len(aug_texts))
        finally:
            self.num_aug = previous_num_aug

        return all_texts, all_labels


class BackTranslator:
    """
    Simulated back-translation augmentation.

    No translation service is involved: a few common function words are
    swapped for entries from a fixed variation table, which loosely mimics
    the wording drift of a translate-and-translate-back round trip.

    Author: Molla Samser | RSK World (https://rskworld.in)
    """

    VARIATIONS = {
        'the': ['a', 'this', 'that'],
        'is': ['was', 'becomes', 'remains'],
        'are': ['were', 'become', 'remain'],
        'has': ['had', 'possesses', 'holds'],
        'have': ['had', 'possess', 'hold'],
        'will': ['would', 'shall', 'might'],
        'can': ['could', 'may', 'might'],
        'very': ['extremely', 'highly', 'quite'],
        'said': ['stated', 'mentioned', 'declared'],
        'made': ['created', 'produced', 'developed'],
    }

    def __init__(self, variation_prob: float = 0.3):
        """
        Args:
            variation_prob: Chance that a word found in VARIATIONS is replaced
        """
        self.variation_prob = variation_prob

    def _vary(self, token: str) -> str:
        """Return ``token`` itself or a randomly chosen variation of it."""
        options = self.VARIATIONS.get(token.lower())
        # The random draw happens only for words present in the table.
        if options is not None and random.random() < self.variation_prob:
            replacement = random.choice(options)
            # Carry a leading capital over to the replacement.
            return replacement.capitalize() if token[0].isupper() else replacement
        return token

    def back_translate(self, text: str) -> str:
        """
        Simulate back-translation by applying word variations.

        The text is split on whitespace and re-joined with single spaces.

        Args:
            text: Input text

        Returns:
            Simulated back-translated text
        """
        return ' '.join(self._vary(token) for token in text.split())


class MixupAugmenter:
    """
    Text mixup augmentation for soft label training.

    Two texts are blended word by word and their labels are combined into
    a soft-label vector that sums to 1.

    Author: Molla Samser | RSK World (https://rskworld.in)
    """

    def __init__(self, alpha: float = 0.2):
        """
        Initialize mixup augmenter.

        Args:
            alpha: Beta distribution parameter for mixing; the mixing
                coefficient is drawn from Beta(alpha, alpha)
        """
        self.alpha = alpha

    def mixup_texts(
        self,
        text1: str,
        text2: str,
        label1: int,
        label2: int,
        num_classes: int
    ) -> Tuple[str, List[float]]:
        """
        Mix two texts and their labels.

        For each word position, the word is taken from text1 with
        probability ``lam`` and from text2 otherwise; a position is skipped
        when the chosen text is too short to have a word there.

        Args:
            text1: First text
            text2: Second text
            label1: First label (index in ``range(num_classes)``)
            label2: Second label (index in ``range(num_classes)``)
            num_classes: Total number of classes

        Returns:
            Tuple of (mixed_text, soft_labels); mixed_text falls back to
            text1 when no word was selected

        Raises:
            IndexError: If a label is outside ``range(num_classes)``
        """
        # Reject negative labels too: list indexing would silently wrap
        # them to the end of the soft-label vector.
        for label in (label1, label2):
            if not 0 <= label < num_classes:
                raise IndexError(
                    f"label {label} is out of range for {num_classes} classes"
                )

        # Get mixing coefficient
        lam = random.betavariate(self.alpha, self.alpha)

        # Mix texts by picking, per position, a word from one of the two
        words1 = text1.split()
        words2 = text2.split()

        mixed_words = []
        for i in range(max(len(words1), len(words2))):
            source = words1 if random.random() < lam else words2
            if i < len(source):
                mixed_words.append(source[i])

        mixed_text = ' '.join(mixed_words) if mixed_words else text1

        # Create soft labels. When both samples share a class, that class
        # gets the full weight; assigning lam and then 1 - lam to the same
        # slot would overwrite the first value and leave the vector
        # summing to 1 - lam.
        soft_labels = [0.0] * num_classes
        if label1 == label2:
            soft_labels[label1] = 1.0
        else:
            soft_labels[label1] = lam
            soft_labels[label2] = 1 - lam

        return mixed_text, soft_labels


def augment_csv_dataset(
    input_path: str,
    output_path: str,
    augment_per_sample: int = 2,
    random_state: int = 42
):
    """
    Augment a CSV dataset and save the result.

    The input CSV must provide ``text``, ``label`` and ``category``
    columns; lines starting with ``#`` are treated as comments. The output
    contains the original rows followed by the augmented ones, with a
    freshly numbered ``id`` column starting at 1. Each row's category is
    the one of the first input row carrying the same label.

    Args:
        input_path: Path to input CSV
        output_path: Path to output CSV
        augment_per_sample: Number of augmentations per sample
        random_state: Random seed

    Raises:
        KeyError: If a required column is missing from the input CSV

    Author: Molla Samser | RSK World (https://rskworld.in)
    """
    import pandas as pd

    print(f"\n{'='*60}")
    print("Text Data Augmentation - RSK World")
    print(f"Author: {__author__} | Website: {__website__}")
    print(f"{'='*60}\n")

    # Load dataset
    df = pd.read_csv(input_path, comment='#')

    # Fail before doing any augmentation work if the schema is wrong.
    missing = [col for col in ('text', 'label', 'category') if col not in df.columns]
    if missing:
        raise KeyError(f"Input CSV is missing required column(s): {missing}")

    original_size = len(df)
    print(f"Original dataset size: {original_size}")

    # Initialize augmenter
    augmenter = TextAugmenter(
        num_aug=augment_per_sample,
        random_state=random_state
    )

    # Augment
    aug_texts, aug_labels = augmenter.augment_dataset(
        df['text'].tolist(),
        df['label'].tolist(),
        augment_per_sample
    )

    # Map each label to the category of its first occurrence, built in a
    # single pass instead of filtering the whole frame per output row.
    label_to_category = {}
    for label, category in zip(df['label'].tolist(), df['category'].tolist()):
        label_to_category.setdefault(label, category)

    # Create augmented dataframe
    aug_df = pd.DataFrame({
        'id': range(1, len(aug_texts) + 1),
        'text': aug_texts,
        'category': [label_to_category[label] for label in aug_labels],
        'label': aug_labels
    })

    # Save
    aug_df.to_csv(output_path, index=False)

    added = len(aug_df) - original_size
    # Guard against an empty input file, which would divide by zero.
    increase_pct = (added / original_size * 100) if original_size else 0.0

    print(f"Augmented dataset size: {len(aug_df)}")
    print(f"Increase: {added} samples ({increase_pct:.1f}%)")
    print(f"Saved to: {output_path}")


if __name__ == "__main__":
    # Demo: print a banner, augment one sample headline with a fixed seed,
    # and list whatever variants were produced.
    rule = "=" * 60

    print(f"\n{rule}")
    print("Text Augmentation Demo - RSK World")
    print(f"Author: {__author__} | Website: {__website__}")
    print(f"{rule}\n")

    augmenter = TextAugmenter(num_aug=5, random_state=42)
    sample_text = "Apple announces revolutionary new iPhone featuring advanced AI capabilities."

    print(f"Original: {sample_text}\n")
    print("Augmented versions:")
    print("-" * 50)

    # augment() may return fewer than num_aug items; number what we got.
    variants = augmenter.augment(sample_text)
    for number, variant in enumerate(variants, start=1):
        print(f"{number}. {variant}")

    print(f"\n{rule}")
    print("Augmentation Demo Complete!")

563 lines•18.2 KB

python

Theme Settings

Color Scheme

Display Options

Font Size