# RSK World - Speech Recognition Dataset - Project Files Browser | RSK World
#
# scripts/transformer_model.py

"""
============================================================================
Speech Recognition Dataset - Transformer Model
============================================================================

Project: Speech Recognition Dataset
Description: Audio speech recognition dataset with labeled speech samples 
             for training speech-to-text and voice recognition models.

============================================================================
DEVELOPER INFORMATION
============================================================================
Website: https://rskworld.in
Founded by: Molla Samser
Designer & Tester: Rima Khatun
Email: help@rskworld.in
Support: support@rskworld.in
Phone: +91 93305 39277
Address: Nutanhat, Mongolkote, Purba Burdwan, West Bengal, India, 713147

============================================================================
COPYRIGHT NOTICE
============================================================================
© 2026 RSK World. All rights reserved.
This dataset is provided for educational and research purposes.

============================================================================

This script implements a Transformer-based model (with an optional Conformer
variant) for speech recognition, built with TensorFlow/Keras.
"""

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle


class PositionalEncoding(layers.Layer):
    """
    Sinusoidal positional encoding layer for Transformer.

    Adds a fixed (non-trainable) sine/cosine position signal to the input
    embeddings. The table is precomputed once in the constructor for
    ``max_len`` positions and sliced to the actual sequence length on call.

    Args:
        max_len: Number of positions the encoding table is precomputed for.
            Inputs with more time steps than this cannot be added to the table.
        d_model: Size of the last (feature) axis of the inputs.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, max_len=5000, d_model=256, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.d_model = d_model

        # Create positional encoding matrix
        position = np.arange(max_len)[:, np.newaxis]
        div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        angles = position * div_term

        pe = np.zeros((max_len, d_model))
        pe[:, 0::2] = np.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the angles to the number of odd-indexed columns.
        pe[:, 1::2] = np.cos(angles[:, :d_model // 2])

        # Leading axis of size 1 lets the table broadcast over the batch axis.
        self.pe = tf.constant(pe[np.newaxis, :, :], dtype=tf.float32)

    def call(self, x):
        # Axis 1 of x is treated as the sequence (time) axis.
        seq_len = tf.shape(x)[1]
        # Cast so the addition also works when x is not float32.
        return x + tf.cast(self.pe[:, :seq_len, :], x.dtype)

    def get_config(self):
        """Return the constructor arguments so Keras can save/reload the layer."""
        config = super().get_config()
        config.update({
            'max_len': self.max_len,
            'd_model': self.d_model,
        })
        return config


class TransformerBlock(layers.Layer):
    """
    Transformer encoder block with multi-head attention.

    Applies self-attention and then a two-layer feed-forward network. Each
    sub-layer output goes through dropout, is added back to its input
    (residual connection) and is layer-normalized.

    Args:
        d_model: Width of the block's output (second Dense layer of the FFN).
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        dropout_rate: Dropout rate applied to both sub-layer outputs.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, d_model, num_heads, ff_dim, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the hyperparameters so get_config() can report them.
        self.d_model = d_model
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate

        # NOTE: key_dim is the per-head size in Keras, so every head is
        # d_model wide here. Kept as-is so existing weights stay compatible.
        self.att = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model
        )
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation='gelu'),
            layers.Dense(d_model),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, inputs, training=False):
        # Multi-head self-attention: inputs serve as both query and value
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        # Feed-forward network
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so Keras can save/reload the layer."""
        config = super().get_config()
        config.update({
            'd_model': self.d_model,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'dropout_rate': self.dropout_rate,
        })
        return config


class ConformerBlock(layers.Layer):
    """
    Conformer block combining convolution and self-attention.

    Order of operations (each with a residual connection):
    half-weighted feed-forward -> self-attention -> convolution module ->
    half-weighted feed-forward, followed by a final layer normalization.

    Args:
        d_model: Width of the block's inputs and outputs.
        num_heads: Number of attention heads; each head is
            ``d_model // num_heads`` wide.
        conv_kernel_size: Kernel size of the grouped convolution.
        dropout_rate: Dropout rate used throughout the block.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, d_model, num_heads, conv_kernel_size=31, dropout_rate=0.1,
                 **kwargs):
        super().__init__(**kwargs)
        # Keep the hyperparameters so get_config() can report them.
        self.d_model = d_model
        self.num_heads = num_heads
        self.conv_kernel_size = conv_kernel_size
        self.dropout_rate = dropout_rate

        # Feed Forward Module 1
        self.ff1 = keras.Sequential([
            layers.LayerNormalization(epsilon=1e-6),
            layers.Dense(d_model * 4, activation='swish'),
            layers.Dropout(dropout_rate),
            layers.Dense(d_model),
            layers.Dropout(dropout_rate),
        ])

        # Multi-Head Self-Attention Module
        self.att = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model // num_heads
        )
        self.att_norm = layers.LayerNormalization(epsilon=1e-6)
        self.att_dropout = layers.Dropout(dropout_rate)

        # Convolution Module
        self.conv_norm = layers.LayerNormalization(epsilon=1e-6)
        self.conv = keras.Sequential([
            layers.Conv1D(d_model * 2, 1),  # Pointwise expansion to 2 * d_model
            layers.Activation('gelu'),
            # Grouped convolution: d_model groups over 2 * d_model channels,
            # i.e. two channels per group.
            layers.Conv1D(d_model * 2, conv_kernel_size, padding='same', groups=d_model),
            layers.BatchNormalization(),
            layers.Activation('swish'),
            layers.Conv1D(d_model, 1),  # Pointwise projection back to d_model
            layers.Dropout(dropout_rate),
        ])

        # Feed Forward Module 2
        self.ff2 = keras.Sequential([
            layers.LayerNormalization(epsilon=1e-6),
            layers.Dense(d_model * 4, activation='swish'),
            layers.Dropout(dropout_rate),
            layers.Dense(d_model),
            layers.Dropout(dropout_rate),
        ])

        self.final_norm = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, training=False):
        # Feed Forward 1 (half-weighted residual)
        x = inputs + 0.5 * self.ff1(inputs, training=training)

        # Self-Attention on the normalized activations (with residual)
        attn_out = self.att_norm(x)
        attn_out = self.att(attn_out, attn_out, training=training)
        attn_out = self.att_dropout(attn_out, training=training)
        x = x + attn_out

        # Convolution (with residual); training flag drives BatchNorm/Dropout
        conv_out = self.conv_norm(x)
        conv_out = self.conv(conv_out, training=training)
        x = x + conv_out

        # Feed Forward 2 (half-weighted residual)
        x = x + 0.5 * self.ff2(x, training=training)

        return self.final_norm(x)

    def get_config(self):
        """Return the constructor arguments so Keras can save/reload the layer."""
        config = super().get_config()
        config.update({
            'd_model': self.d_model,
            'num_heads': self.num_heads,
            'conv_kernel_size': self.conv_kernel_size,
            'dropout_rate': self.dropout_rate,
        })
        return config


class SpeechTransformer:
    """
    Transformer-based speech recognition model.

    Bundles model construction, feature loading, sequence padding, training
    and single-sample prediction. Class labels are read from the ``speaker``
    column of the feature metadata.
    """

    def __init__(self,
                 input_dim=13,
                 d_model=256,
                 num_heads=8,
                 num_layers=4,
                 ff_dim=1024,
                 max_len=500,
                 dropout_rate=0.1,
                 use_conformer=False):
        """
        Initialize the Speech Transformer model.

        Only stores the hyperparameters; the Keras model itself is created
        later by ``train`` (via ``build_model``).

        Args:
            input_dim: Input feature dimension (e.g., 13 for MFCC)
            d_model: Model dimension
            num_heads: Number of attention heads
            num_layers: Number of transformer/conformer blocks
            ff_dim: Feed-forward dimension (Transformer blocks only)
            max_len: Maximum sequence length
            dropout_rate: Dropout rate
            use_conformer: Use Conformer blocks instead of Transformer
        """
        self.input_dim = input_dim
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.ff_dim = ff_dim
        self.max_len = max_len
        self.dropout_rate = dropout_rate
        self.use_conformer = use_conformer

        # Set by train(); predict() refuses to run while this is None.
        self.model = None
        self.label_encoder = LabelEncoder()

    def build_model(self, num_classes):
        """
        Build the Transformer model.

        Args:
            num_classes: Number of output classes

        Returns:
            Compiled Keras model
        """
        # Time axis is left as None so sequences of any length are accepted.
        inputs = layers.Input(shape=(None, self.input_dim))

        # Project input to model dimension
        x = layers.Dense(self.d_model)(inputs)

        # Add positional encoding
        x = PositionalEncoding(self.max_len, self.d_model)(x)
        x = layers.Dropout(self.dropout_rate)(x)

        # Transformer/Conformer blocks
        for _ in range(self.num_layers):
            if self.use_conformer:
                x = ConformerBlock(
                    self.d_model,
                    self.num_heads,
                    dropout_rate=self.dropout_rate
                )(x)
            else:
                x = TransformerBlock(
                    self.d_model,
                    self.num_heads,
                    self.ff_dim,
                    self.dropout_rate
                )(x)

        # Global average pooling over the time axis
        x = layers.GlobalAveragePooling1D()(x)

        # Classification head
        x = layers.Dense(256, activation='gelu')(x)
        x = layers.Dropout(self.dropout_rate)(x)
        x = layers.Dense(128, activation='gelu')(x)
        x = layers.Dropout(self.dropout_rate)(x)
        outputs = layers.Dense(num_classes, activation='softmax')(x)

        model = keras.Model(inputs, outputs)

        # Compile with label smoothing; the loss expects one-hot targets
        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=1e-4),
            loss=keras.losses.CategoricalCrossentropy(label_smoothing=0.1),
            metrics=['accuracy']
        )

        return model

    def load_features(self, feature_dir, feature_name='mfcc'):
        """
        Load features from the dataset.

        Reads ``features_metadata.csv`` from ``feature_dir`` and, for every
        row, loads ``<id>_<feature_name>.npy`` from the same directory. Rows
        whose feature file does not exist are skipped and counted in a
        warning message.

        Args:
            feature_dir: Directory containing the metadata CSV and features
            feature_name: Name of feature to load

        Returns:
            X: list of loaded feature arrays
            y: NumPy array with the ``speaker`` value of each loaded file
            metadata: the full metadata DataFrame (skipped rows included)
        """
        feature_dir = Path(feature_dir)
        metadata_path = feature_dir / 'features_metadata.csv'
        metadata = pd.read_csv(metadata_path)

        X = []
        y = []
        missing = 0

        print(f"Loading {feature_name} features...")
        # Walk the two needed columns directly rather than using iterrows():
        # iterrows() upcasts rows with mixed int/float columns to float, which
        # would turn an integer id 7 into "7.0" and miss the file on disk.
        for file_id, speaker in zip(metadata['id'], metadata['speaker']):
            feature_path = feature_dir / f"{file_id}_{feature_name}.npy"

            if not feature_path.exists():
                missing += 1
                continue

            X.append(np.load(feature_path))
            y.append(speaker)

        if missing:
            print(f"Warning: {missing} of {len(metadata)} feature files "
                  f"were not found and have been skipped.")

        return X, np.array(y), metadata

    def pad_sequences(self, sequences, max_length=None):
        """
        Pad sequences to same length.

        Shorter sequences are zero-padded at the end of axis 0; longer ones
        are truncated. Each sequence must be 2-D.

        Args:
            sequences: Sequence of 2-D arrays
            max_length: Target length; defaults to the longest sequence,
                capped at ``self.max_len``

        Returns:
            NumPy array stacking the padded/truncated sequences

        Raises:
            ValueError: If ``sequences`` is empty and ``max_length`` is None
        """
        if max_length is None:
            if len(sequences) == 0:
                raise ValueError(
                    "Cannot infer a padding length from an empty list of sequences"
                )
            max_length = min(max(len(seq) for seq in sequences), self.max_len)

        padded = []
        for seq in sequences:
            if len(seq) < max_length:
                pad_width = max_length - len(seq)
                # Pad only the end of axis 0; axis 1 is left untouched.
                padded_seq = np.pad(seq, ((0, pad_width), (0, 0)), mode='constant')
            else:
                padded_seq = seq[:max_length]
            padded.append(padded_seq)

        return np.array(padded)

    def train(self, X, y, epochs=50, batch_size=32, test_size=0.2, model_dir='models'):
        """
        Train the model.

        Encodes the labels, pads the features, splits off a stratified test
        set and a 10% validation set, fits the model and writes the model
        files, TensorBoard logs and pickled label encoder to ``model_dir``.

        Args:
            X: Feature arrays
            y: Labels
            epochs: Training epochs
            batch_size: Batch size
            test_size: Test set proportion
            model_dir: Directory to save model

        Returns:
            Keras History object returned by ``model.fit``
        """
        model_dir = Path(model_dir)
        model_dir.mkdir(parents=True, exist_ok=True)

        # Encode labels (string/int labels -> integer ids -> one-hot)
        y_encoded = self.label_encoder.fit_transform(y)
        num_classes = len(self.label_encoder.classes_)
        y_categorical = keras.utils.to_categorical(y_encoded, num_classes)

        # Pad sequences
        print("Padding sequences...")
        X_padded = self.pad_sequences(X)

        # Split data: stratified hold-out test set first
        X_train, X_test, y_train, y_test = train_test_split(
            X_padded, y_categorical, test_size=test_size,
            random_state=42, stratify=y_encoded
        )

        # Then carve 10% of the remaining training data out for validation
        X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train, test_size=0.1, random_state=42
        )

        # Build model
        print("\nBuilding Transformer model...")
        self.model = self.build_model(num_classes)

        print("\nModel Architecture:")
        self.model.summary()

        # Callbacks
        callbacks = [
            keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=10,
                restore_best_weights=True
            ),
            keras.callbacks.ModelCheckpoint(
                str(model_dir / 'transformer_best.h5'),
                monitor='val_accuracy',
                save_best_only=True
            ),
            keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=5,
                min_lr=1e-7
            ),
            keras.callbacks.TensorBoard(
                log_dir=str(model_dir / 'logs'),
                histogram_freq=1
            )
        ]

        # Train
        print("\nTraining model...")
        history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks
        )

        # Evaluate
        print("\nEvaluating on test set...")
        test_loss, test_accuracy = self.model.evaluate(X_test, y_test, verbose=0)
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(f"Test Loss: {test_loss:.4f}")

        # Save model and encoder (the encoder is needed to decode predictions)
        self.model.save(str(model_dir / 'transformer_final.h5'))
        with open(model_dir / 'transformer_label_encoder.pkl', 'wb') as f:
            pickle.dump(self.label_encoder, f)

        print(f"\nModel saved to: {model_dir}")
        return history

    def predict(self, features):
        """
        Make predictions on new features.

        Args:
            features: 2-D feature array of a single sample

        Returns:
            pred_class: Predicted label, decoded with the fitted label encoder
            probs: Class probabilities returned by the model for this sample

        Raises:
            RuntimeError: If no model has been trained yet
        """
        if self.model is None:
            raise RuntimeError(
                "No model available: call train() before predict()."
            )

        # Wrap the single sample in a list so it is batched as one sequence.
        X = self.pad_sequences([features])
        probs = self.model.predict(X)[0]
        pred_idx = np.argmax(probs)
        pred_class = self.label_encoder.inverse_transform([pred_idx])[0]
        return pred_class, probs


def main():
    """
    Main function to train the Transformer model.

    Loads the MFCC features from ``data/features``, prints a short dataset
    summary and trains a ``SpeechTransformer``, saving results to ``models``.

    Raises:
        SystemExit: If no feature files could be loaded.
    """
    # Initialize model
    transformer = SpeechTransformer(
        input_dim=13,
        d_model=256,
        num_heads=8,
        num_layers=4,
        ff_dim=1024,
        max_len=500,
        dropout_rate=0.1,
        use_conformer=False  # Set True for Conformer
    )

    # Load features (the returned metadata frame is not needed here)
    X, y, _ = transformer.load_features(
        feature_dir='data/features',
        feature_name='mfcc'
    )

    # Stop with a clear message instead of failing on X[0] below.
    if len(X) == 0:
        raise SystemExit(
            "No 'mfcc' feature files were found in data/features; nothing to train on."
        )

    print("\nDataset Info:")
    print(f"Total samples: {len(X)}")
    print(f"Number of classes: {len(np.unique(y))}")
    print(f"Feature shape (sample): {X[0].shape}")

    # Train model
    transformer.train(
        X, y,
        epochs=50,
        batch_size=32,
        model_dir='models'
    )

    print("\nTransformer training completed!")


if __name__ == '__main__':
    main()

# 450 lines•14.9 KB
#
# python
#
# Theme Settings
#
# Color Scheme
#
# Display Options
#
# Font Size