RSK World - Speech Recognition Dataset - Project Files Browser | RSK World

scripts/augmentation.py

"""
============================================================================
Speech Recognition Dataset - Data Augmentation Script
============================================================================

Project: Speech Recognition Dataset
Description: Audio speech recognition dataset with labeled speech samples 
             for training speech-to-text and voice recognition models.

============================================================================
DEVELOPER INFORMATION
============================================================================
Website: https://rskworld.in
Founded by: Molla Samser
Designer & Tester: Rima Khatun
Email: help@rskworld.in
Support: support@rskworld.in
Phone: +91 93305 39277
Address: Nutanhat, Mongolkote, Purba Burdwan, West Bengal, India, 713147

============================================================================
COPYRIGHT NOTICE
============================================================================
© 2026 RSK World. All rights reserved.
This dataset is provided for educational and research purposes.

============================================================================

This script provides various audio augmentation techniques to expand
the training dataset and improve model robustness.
"""

import numpy as np
import librosa
import soundfile as sf
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import random


class AudioAugmentation:
    """
    Audio augmentation class for speech recognition datasets.

    Waveform-level techniques (operate on an array of audio samples):
    - Time stretching
    - Pitch shifting
    - Adding noise (Gaussian, or mixed in from a background recording)
    - Time shifting
    - Volume variation
    - Speed perturbation

    Spectrogram-level techniques:
    - SpecAugment (frequency/time masking)

    Random parameters are drawn from NumPy's global RNG (``np.random``);
    ``random_augment`` additionally uses the standard-library ``random``
    module to choose which augmentations to apply, so both generators must
    be seeded to obtain reproducible output.
    """

    # Names understood by random_augment; each one is also the name of the
    # method that implements it.
    _WAVEFORM_AUGMENTATIONS = (
        'time_stretch', 'pitch_shift', 'add_noise',
        'time_shift', 'change_volume', 'speed_perturbation',
    )

    def __init__(self, sr=16000):
        """
        Initialize the augmentation class.

        Args:
            sr: Sample rate for audio processing
        """
        self.sr = sr

    def time_stretch(self, audio, rate=None):
        """
        Apply time stretching to audio via ``librosa.effects.time_stretch``.

        Args:
            audio: Audio signal
            rate: Stretch rate (default: random between 0.8 and 1.2)

        Returns:
            Time-stretched audio
        """
        if rate is None:
            rate = np.random.uniform(0.8, 1.2)
        return librosa.effects.time_stretch(audio, rate=rate)

    def pitch_shift(self, audio, n_steps=None):
        """
        Apply pitch shifting to audio via ``librosa.effects.pitch_shift``.

        Args:
            audio: Audio signal
            n_steps: Number of semitones to shift (default: random between -4 and 4)

        Returns:
            Pitch-shifted audio
        """
        if n_steps is None:
            n_steps = np.random.uniform(-4, 4)
        return librosa.effects.pitch_shift(audio, sr=self.sr, n_steps=n_steps)

    def add_noise(self, audio, noise_level=None):
        """
        Add Gaussian noise to audio.

        Args:
            audio: Audio signal (any shape; noise is generated per element)
            noise_level: Noise amplitude (default: random between 0.001 and 0.01)

        Returns:
            Noisy audio
        """
        if noise_level is None:
            noise_level = np.random.uniform(0.001, 0.01)
        # Match the full shape of the input rather than only its first
        # dimension, so multi-channel arrays get one noise value per sample.
        noise = np.random.randn(*np.shape(audio)) * noise_level
        return audio + noise

    def time_shift(self, audio, shift_max=None):
        """
        Apply a random circular time shift to audio.

        Samples rolled off one end re-enter at the other (``np.roll``).

        Args:
            audio: Audio signal
            shift_max: Maximum shift in samples, in either direction
                       (default: 10% of audio length)

        Returns:
            Time-shifted audio (a new array; the input is not modified)

        Raises:
            ValueError: If shift_max is negative
        """
        if shift_max is None:
            shift_max = int(len(audio) * 0.1)
        if shift_max < 0:
            raise ValueError(f"shift_max must be non-negative, got {shift_max}")
        if shift_max == 0:
            # Clips shorter than 10 samples (or an explicit 0) leave no room
            # to shift; randint(0, 0) would raise, so return an unshifted copy.
            return np.array(audio, copy=True)
        # randint's upper bound is exclusive; +1 makes the range symmetric.
        shift = np.random.randint(-shift_max, shift_max + 1)
        return np.roll(audio, shift)

    def change_volume(self, audio, gain=None):
        """
        Change audio volume.

        Args:
            audio: Audio signal
            gain: Volume multiplier (default: random between 0.5 and 1.5)

        Returns:
            Volume-adjusted audio
        """
        if gain is None:
            gain = np.random.uniform(0.5, 1.5)
        return audio * gain

    def speed_perturbation(self, audio, speed_factor=None):
        """
        Apply speed perturbation (changes both speed and pitch).

        The signal is resampled by picking the nearest original sample at
        every ``speed_factor``-th position; no interpolation or filtering
        is applied.

        Args:
            audio: Audio signal
            speed_factor: Speed multiplier (default: random between 0.9 and 1.1)

        Returns:
            Speed-perturbed audio

        Raises:
            ValueError: If speed_factor is not positive
        """
        if speed_factor is None:
            speed_factor = np.random.uniform(0.9, 1.1)
        if speed_factor <= 0:
            raise ValueError(f"speed_factor must be positive, got {speed_factor}")

        # Resample to change speed
        num_samples = len(audio)
        positions = np.round(np.arange(0, num_samples, speed_factor))
        # Rounding can push the last position up to len(audio); drop it.
        positions = positions[positions < num_samples].astype(int)
        return audio[positions]

    def add_background_noise(self, audio, noise_audio, snr_db=None):
        """
        Add background noise from another audio file.

        The noise is repeated or truncated to the length of ``audio`` and
        scaled so that the mix has the requested signal-to-noise ratio.

        Args:
            audio: Original audio signal
            noise_audio: Background noise audio
            snr_db: Signal-to-noise ratio in dB (default: random between 5 and 20)

        Returns:
            Audio with background noise, or ``audio`` itself when either
            input is empty or the noise is silent
        """
        if snr_db is None:
            snr_db = np.random.uniform(5, 20)

        # Nothing to mix; also avoids dividing by len(noise_audio) == 0.
        if len(audio) == 0 or len(noise_audio) == 0:
            return audio

        # Adjust noise length
        if len(noise_audio) < len(audio):
            repeats = int(np.ceil(len(audio) / len(noise_audio)))
            noise_audio = np.tile(noise_audio, repeats)
        noise_audio = noise_audio[:len(audio)]

        # Calculate scaling factor for desired SNR
        signal_power = np.mean(audio ** 2)
        noise_power = np.mean(noise_audio ** 2)

        if noise_power > 0:
            scale = np.sqrt(signal_power / (noise_power * 10 ** (snr_db / 10)))
            return audio + scale * noise_audio
        return audio

    @staticmethod
    def _apply_masks(spectrogram, axis, num_masks, mask_factor):
        """Zero out ``num_masks`` random bands along ``axis`` (0 or 1) of a copy."""
        spec = spectrogram.copy()
        size = spec.shape[axis]
        # A band can never be wider than the axis itself; clamping keeps the
        # start-position draw below from receiving an empty/negative range.
        width_limit = min(mask_factor, size)
        if width_limit <= 0:
            return spec

        for _ in range(num_masks):
            width = np.random.randint(0, width_limit)
            start = np.random.randint(0, size - width)
            if axis == 0:
                spec[start:start + width, :] = 0
            else:
                spec[:, start:start + width] = 0

        return spec

    def frequency_mask(self, spectrogram, num_masks=1, mask_factor=27):
        """
        Apply frequency masking (SpecAugment).

        Args:
            spectrogram: Mel spectrogram, indexed as [frequency, time]
            num_masks: Number of frequency masks
            mask_factor: Maximum mask width (exclusive); clamped to the
                         number of frequency bins

        Returns:
            Masked copy of the spectrogram
        """
        return self._apply_masks(spectrogram, 0, num_masks, mask_factor)

    def time_mask(self, spectrogram, num_masks=1, mask_factor=100):
        """
        Apply time masking (SpecAugment).

        Args:
            spectrogram: Mel spectrogram, indexed as [frequency, time]
            num_masks: Number of time masks
            mask_factor: Maximum mask width (exclusive); clamped to the
                         number of frames

        Returns:
            Masked copy of the spectrogram
        """
        return self._apply_masks(spectrogram, 1, num_masks, mask_factor)

    def spec_augment(self, spectrogram, num_freq_masks=2, num_time_masks=2):
        """
        Apply SpecAugment (combined frequency and time masking).

        Frequency masks are applied first, then time masks, each with the
        default maximum width of the corresponding method.

        Args:
            spectrogram: Mel spectrogram
            num_freq_masks: Number of frequency masks
            num_time_masks: Number of time masks

        Returns:
            Augmented spectrogram
        """
        spec = self.frequency_mask(spectrogram, num_freq_masks)
        spec = self.time_mask(spec, num_time_masks)
        return spec

    def random_augment(self, audio, augmentations=None):
        """
        Apply random augmentations to audio.

        Each named augmentation is applied in order with its own randomly
        drawn default parameters.

        Args:
            audio: Audio signal
            augmentations: List of augmentation names to apply
                          (default: random selection of 1 to 3 names).
                          Unrecognized names are ignored.

        Returns:
            Augmented audio
        """
        if augmentations is None:
            augmentations = random.sample(
                list(self._WAVEFORM_AUGMENTATIONS),
                k=random.randint(1, 3)
            )

        augmented = audio.copy()

        for aug in augmentations:
            # Dispatch by name, but only to the known augmentation methods.
            if aug in self._WAVEFORM_AUGMENTATIONS:
                augmented = getattr(self, aug)(augmented)

        return augmented


class DatasetAugmenter:
    """
    Augment entire dataset with various audio transformations.

    Reads a metadata CSV, writes augmented ``.wav`` files plus an
    ``augmented_metadata.csv`` into ``output_dir``.
    """

    def __init__(self, audio_dir, output_dir, sr=16000):
        """
        Initialize the dataset augmenter.

        Creates ``output_dir`` (including parents) if it does not exist.

        Args:
            audio_dir: Directory containing original audio files
            output_dir: Directory to save augmented files
            sr: Sample rate
        """
        self.audio_dir = Path(audio_dir)
        self.output_dir = Path(output_dir)
        self.sr = sr
        self.augmenter = AudioAugmentation(sr)

        self.output_dir.mkdir(parents=True, exist_ok=True)

    def augment_file(self, audio_path, num_augmentations=3):
        """
        Generate multiple augmented versions of a single audio file.

        Version ``i`` uses the ``i``-th entry of a fixed list of six
        augmentation recipes, cycling when more than six are requested.

        Args:
            audio_path: Path to audio file
            num_augmentations: Number of augmented versions to create

        Returns:
            List of (augmented_audio, augmentation_info) tuples, where
            augmentation_info is the recipe's names joined with '_'
        """
        # Load audio (resampled to self.sr by librosa)
        audio, _ = librosa.load(str(audio_path), sr=self.sr)

        augmentation_types = [
            ['time_stretch'],
            ['pitch_shift'],
            ['add_noise'],
            ['time_stretch', 'add_noise'],
            ['pitch_shift', 'change_volume'],
            ['speed_perturbation', 'add_noise']
        ]

        augmented_versions = []
        for i in range(num_augmentations):
            augs = augmentation_types[i % len(augmentation_types)]
            augmented = self.augmenter.random_augment(audio, augs)
            augmented_versions.append((augmented, '_'.join(augs)))

        return augmented_versions

    def augment_dataset(self, metadata_path, num_augmentations=2):
        """
        Augment entire dataset.

        Files listed in the metadata but absent from ``audio_dir`` are
        skipped, and files that fail to augment are reported and skipped;
        both are counted in the printed summary.

        Args:
            metadata_path: Path to metadata CSV; must have a 'file_name'
                           column. An 'id' column is used when present,
                           otherwise the file stem stands in for it.
            num_augmentations: Number of augmented versions per file

        Returns:
            DataFrame with one row per augmented file (empty if none
            were created)

        Raises:
            KeyError: If the metadata has no 'file_name' column
        """
        # Load metadata
        metadata = pd.read_csv(metadata_path)
        if 'file_name' not in metadata.columns:
            # Fail once, up front, with a message that names the problem.
            raise KeyError("metadata CSV must contain a 'file_name' column")

        new_rows = []
        missing_count = 0
        failed_count = 0

        print(f"Augmenting dataset with {num_augmentations} versions per file...")

        for _, row in tqdm(metadata.iterrows(), total=len(metadata)):
            # str() keeps a blank (NaN) cell from raising TypeError here and
            # aborting the whole run; it simply resolves to a missing file.
            audio_path = self.audio_dir / str(row['file_name'])

            if not audio_path.exists():
                missing_count += 1
                continue

            try:
                # Generate augmented versions
                augmented_versions = self.augment_file(
                    audio_path,
                    num_augmentations
                )

                stem = audio_path.stem
                # Resolve the id before any file is written so a metadata
                # file without an 'id' column cannot fail after the write.
                base_id = row['id'] if 'id' in row.index else stem

                # Save augmented files
                for i, (aug_audio, aug_type) in enumerate(augmented_versions):
                    # Generate new filename
                    new_filename = f"{stem}_aug{i}_{aug_type}.wav"
                    output_path = self.output_dir / new_filename

                    # Save audio
                    sf.write(str(output_path), aug_audio, self.sr)

                    # Create new metadata row
                    new_row = row.copy()
                    new_row['id'] = f"{base_id}_aug{i}"
                    new_row['file_name'] = new_filename
                    new_row['augmentation'] = aug_type
                    new_rows.append(new_row)

            except Exception as e:
                # Best-effort: one bad file must not stop the whole run.
                failed_count += 1
                print(f"Error augmenting {audio_path}: {str(e)}")
                continue

        if missing_count:
            print(f"Skipped (file not found): {missing_count}")
        if failed_count:
            print(f"Failed to augment: {failed_count}")

        if not new_rows:
            print("No augmented files were created.")
            return pd.DataFrame()

        # Create augmented metadata
        augmented_df = pd.DataFrame(new_rows)

        # Save augmented metadata
        augmented_df.to_csv(
            self.output_dir / 'augmented_metadata.csv',
            index=False
        )

        print("\nAugmentation complete!")
        print(f"Original files: {len(metadata)}")
        print(f"Augmented files: {len(new_rows)}")
        print(f"Total files: {len(metadata) + len(new_rows)}")

        return augmented_df


def main(audio_dir='data/audio', output_dir='data/augmented',
         metadata_path='data/metadata.csv', sr=16000, num_augmentations=2):
    """
    Main function to run data augmentation.

    Args:
        audio_dir: Directory containing original audio files
        output_dir: Directory to save augmented files
        metadata_path: Path to metadata CSV
        sr: Sample rate
        num_augmentations: Number of augmented versions per file

    Returns:
        DataFrame of augmented metadata (empty if nothing was augmented)
    """
    # Initialize augmenter
    augmenter = DatasetAugmenter(
        audio_dir=audio_dir,
        output_dir=output_dir,
        sr=sr
    )

    # Augment dataset
    augmented_metadata = augmenter.augment_dataset(
        metadata_path=metadata_path,
        num_augmentations=num_augmentations
    )

    # Only claim success when at least one augmented file was produced.
    if augmented_metadata.empty:
        print("\nData augmentation produced no files; "
              "check the audio directory and metadata.")
    else:
        print("\nData augmentation completed successfully!")
        print(f"Augmented files saved to: {augmenter.output_dir.as_posix()}/")

    return augmented_metadata

# Run the augmentation pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

432 lines•13.9 KB

python

Theme Settings

Color Scheme

Display Options

Font Size