RSK World - Speech Recognition Dataset - Project Files Browser | RSK World

scripts/train_model.py

"""
============================================================================
Speech Recognition Dataset - Model Training Script
============================================================================

Project: Speech Recognition Dataset
Description: Audio speech recognition dataset with labeled speech samples 
             for training speech-to-text and voice recognition models.

============================================================================
DEVELOPER INFORMATION
============================================================================
Website: https://rskworld.in
Founded by: Molla Samser
Designer & Tester: Rima Khatun
Email: help@rskworld.in
Support: support@rskworld.in
Phone: +91 93305 39277
Address: Nutanhat, Mongolkote, Purba Burdwan, West Bengal, India, 713147

============================================================================
COPYRIGHT NOTICE
============================================================================
© 2026 RSK World. All rights reserved.
This dataset is provided for educational and research purposes.

============================================================================
"""

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
import pickle

class SpeechRecognitionModel:
    """
    Bidirectional-LSTM classifier trained on pre-extracted audio features.

    Labels are taken from the ``speaker`` column of the feature metadata, so
    the network as written performs speaker classification. Trained
    artefacts (``best_model.h5``, ``final_model.h5``, ``label_encoder.pkl``)
    are written to ``model_dir``.
    """

    def __init__(self, feature_dir='data/features', model_dir='models'):
        """
        Initialize the model trainer

        Args:
            feature_dir: Directory containing extracted features
            model_dir: Directory to save trained models (created if missing)
        """
        self.feature_dir = Path(feature_dir)
        self.model_dir = Path(model_dir)
        self.model_dir.mkdir(parents=True, exist_ok=True)

        self.model = None
        self.label_encoder = LabelEncoder()

    def load_features(self, feature_name='mfcc'):
        """
        Load all features listed in ``features_metadata.csv``

        Each metadata row is expected to provide an ``id`` and a ``speaker``
        column; the feature for a row is read from
        ``<feature_dir>/<id>_<feature_name>.npy``. Rows whose feature file
        does not exist are skipped and reported in a single warning line.

        Args:
            feature_name: Name of the feature to load

        Returns:
            X: 1-D object array holding one feature array per loaded sample
            y: Array of speaker labels aligned with X
            metadata: The full metadata dataframe (NOT filtered to the
                loaded samples, so it may be longer than X)
        """
        metadata_path = self.feature_dir / 'features_metadata.csv'
        metadata = pd.read_csv(metadata_path)

        features = []
        labels = []
        missing = 0

        print(f"Loading {feature_name} features...")
        # Iterate the columns directly: DataFrame.iterrows() upcasts an
        # integer id to float when every column is numeric, which would
        # produce file names such as "1.0_mfcc.npy".
        for file_id, speaker in zip(metadata['id'], metadata['speaker']):
            feature_path = self.feature_dir / f"{file_id}_{feature_name}.npy"

            if not feature_path.exists():
                missing += 1
                continue

            features.append(np.load(feature_path))
            # Using speaker as label for speaker recognition
            # For speech-to-text, you would use transcript
            labels.append(speaker)

        if missing:
            print(f"Warning: skipped {missing} metadata rows with no "
                  f"{feature_name} feature file")

        # Fill a 1-D object array element by element. np.array(features,
        # dtype=object) would collapse equally-shaped features into one
        # N-D object array, which later cannot be converted to a tensor.
        X = np.empty(len(features), dtype=object)
        for i, feature in enumerate(features):
            X[i] = feature

        return X, np.array(labels), metadata

    def pad_sequences(self, sequences, max_length=None):
        """
        Pad (or truncate) sequences to the same length along axis 0

        Padding is appended at the end with zeros; sequences longer than
        ``max_length`` are cut to their first ``max_length`` steps. Only the
        first axis is changed, so all sequences must already agree on their
        remaining dimensions.
        NOTE(review): this assumes axis 0 is the time axis of each feature
        array - confirm against the feature extraction step.

        Args:
            sequences: Iterable of variable-length sequences
            max_length: Target length (if None, uses max sequence length)

        Returns:
            Padded sequences array of shape (n_sequences, max_length, ...)

        Raises:
            ValueError: If ``sequences`` is empty and ``max_length`` is None
        """
        arrays = []
        for seq in sequences:
            arr = np.asarray(seq)
            if arr.dtype == object:
                # Slices of an object array stay object-typed; convert to
                # a numeric dtype so the stacked result is a real tensor.
                arr = arr.astype(np.float32)
            arrays.append(arr)

        if max_length is None:
            if not arrays:
                raise ValueError("Cannot pad an empty collection of sequences.")
            max_length = max(len(arr) for arr in arrays)

        padded = []
        for arr in arrays:
            if len(arr) < max_length:
                # Pad with zeros at the end of axis 0 only
                pad_spec = [(0, max_length - len(arr))] + [(0, 0)] * (arr.ndim - 1)
                padded_seq = np.pad(arr, pad_spec, mode='constant')
            else:
                # Truncate if longer
                padded_seq = arr[:max_length]
            padded.append(padded_seq)

        return np.array(padded)

    def build_model(self, input_shape, num_classes):
        """
        Build LSTM model for speech recognition

        The network stacks two bidirectional LSTM layers (128 and 64 units),
        one unidirectional LSTM (32 units) and a 64-unit dense layer, with
        dropout after each, and ends in a softmax over ``num_classes``.

        Args:
            input_shape: Shape of input data (timesteps, features)
            num_classes: Number of output classes

        Returns:
            Compiled Keras model
        """
        model = Sequential([
            Bidirectional(LSTM(128, return_sequences=True), input_shape=input_shape),
            Dropout(0.3),
            Bidirectional(LSTM(64, return_sequences=True)),
            Dropout(0.3),
            LSTM(32),
            Dropout(0.3),
            Dense(64, activation='relu'),
            Dropout(0.2),
            Dense(num_classes, activation='softmax')
        ])

        # categorical_crossentropy matches the one-hot labels built in train()
        model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def train(self, X, y, test_size=0.2, validation_size=0.1, epochs=50, batch_size=32):
        """
        Train the model

        Encodes the labels, pads the features to a common length, splits the
        data into train/validation/test sets, fits the network, reports
        test metrics, and saves the model and label encoder to ``model_dir``.

        Args:
            X: Feature arrays
            y: Labels
            test_size: Proportion of data for testing
            validation_size: Proportion of training data for validation
            epochs: Number of training epochs
            batch_size: Batch size for training

        Returns:
            The Keras ``History`` object returned by ``model.fit``
        """
        # Encode labels
        y_encoded = self.label_encoder.fit_transform(y)
        num_classes = len(self.label_encoder.classes_)
        y_categorical = to_categorical(y_encoded, num_classes)

        # Pad sequences
        print("Padding sequences...")
        X_padded = self.pad_sequences(X)

        # Split data: the test split is stratified on the class labels
        X_train, X_test, y_train, y_test = train_test_split(
            X_padded, y_categorical, test_size=test_size, random_state=42, stratify=y_encoded
        )

        X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train, test_size=validation_size, random_state=42
        )

        # Build model
        input_shape = (X_padded.shape[1], X_padded.shape[2])
        self.model = self.build_model(input_shape, num_classes)

        print("\nModel Architecture:")
        self.model.summary()

        # Save the label encoder before training starts, so a checkpoint
        # written during an interrupted run is still usable by predict().
        with open(self.model_dir / 'label_encoder.pkl', 'wb') as f:
            pickle.dump(self.label_encoder, f)

        # Callbacks (paths are passed as str for older Keras releases)
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
            ModelCheckpoint(
                str(self.model_dir / 'best_model.h5'),
                monitor='val_accuracy',
                save_best_only=True,
                verbose=1
            )
        ]

        # Train model
        print("\nTraining model...")
        history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            verbose=1
        )

        # Evaluate on test set
        print("\nEvaluating on test set...")
        test_loss, test_accuracy = self.model.evaluate(X_test, y_test, verbose=0)
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(f"Test Loss: {test_loss:.4f}")

        # Save final model
        self.model.save(str(self.model_dir / 'final_model.h5'))

        print(f"\nModel saved to: {self.model_dir}")
        return history

    def predict(self, feature_array):
        """
        Make predictions on new audio features

        If no model is held in memory, ``best_model.h5`` is loaded from
        ``model_dir``; if the label encoder has not been fitted in this
        session, ``label_encoder.pkl`` is loaded as well. The input is
        padded or truncated to the sequence length the model expects.

        Args:
            feature_array: Feature array from audio file

        Returns:
            Predicted class and probabilities

        Raises:
            ValueError: If no trained model is available
        """
        if self.model is None:
            # Load saved model
            model_path = self.model_dir / 'best_model.h5'
            if not model_path.exists():
                raise ValueError("Model not found. Please train the model first.")
            self.model = tf.keras.models.load_model(str(model_path))

        # Load label encoder only when this instance has no fitted one; an
        # encoder fitted by train() already matches the in-memory model.
        if not hasattr(self.label_encoder, 'classes_'):
            # SECURITY: pickle.load executes arbitrary code from the file;
            # only load encoder files produced by a trusted training run.
            with open(self.model_dir / 'label_encoder.pkl', 'rb') as f:
                self.label_encoder = pickle.load(f)

        # Pad/truncate to the timestep count the model was built with.
        # Padding a single sample to its own length would leave it
        # unchanged and mismatch the model's fixed input shape.
        expected_steps = self.model.input_shape[1]
        X = self.pad_sequences([feature_array], max_length=expected_steps)

        # Predict
        probabilities = self.model.predict(X)[0]
        predicted_class_idx = np.argmax(probabilities)
        predicted_class = self.label_encoder.inverse_transform([predicted_class_idx])[0]

        return predicted_class, probabilities


def main():
    """
    Train the model on the MFCC features under ``data/features``

    Loads the features, prints a short dataset summary, trains the model
    and saves the resulting artefacts to ``models``.

    Raises:
        SystemExit: If no feature files could be loaded, since there is
            nothing to train on.
    """
    # Initialize model trainer
    trainer = SpeechRecognitionModel(
        feature_dir='data/features',
        model_dir='models'
    )

    # Load features (the metadata dataframe is not needed here)
    X, y, _metadata = trainer.load_features(feature_name='mfcc')

    # Without this guard the summary below fails with an opaque IndexError
    # on X[0] when the feature directory holds no matching files.
    if len(X) == 0:
        raise SystemExit(
            "No 'mfcc' feature files were found in data/features; "
            "nothing to train on."
        )

    print("\nDataset Info:")
    print(f"Total samples: {len(X)}")
    print(f"Number of classes: {len(np.unique(y))}")
    print(f"Feature shape (sample): {X[0].shape}")

    # Train model
    trainer.train(
        X, y,
        test_size=0.2,
        validation_size=0.1,
        epochs=50,
        batch_size=32
    )

    print("\nTraining completed successfully!")


if __name__ == '__main__':
    main()

291 lines•9.5 KB

python

data/transcripts.json

Raw Download

{
  "1": "Hello, how are you today?",
  "2": "Good morning",
  "3": "Please turn on the lights",
  "4": "What's the weather like outside?",
  "5": "Set a timer for five minutes",
  "6": "Thank you very much",
  "7": "Can you play some music?",
  "8": "How much does it cost?",
  "9": "Goodbye",
  "10": "What time is it now?",
  "11": "Turn off the television",
  "12": "Where is the nearest hospital?",
  "13": "Yes please",
  "14": "Call my mother",
  "15": "What is the capital of France?",
  "16": "Send a message to John",
  "17": "No thank you",
  "18": "Play the next song",
  "19": "How do I get there?",
  "20": "Set an alarm for seven AM",
  "21": "Nice to meet you",
  "22": "What movies are playing tonight?",
  "23": "Stop the music",
  "24": "Can you repeat that please?",
  "25": "Open the door",
  "26": "What is your name?",
  "27": "Volume up",
  "28": "Have a nice day",
  "29": "Search for Italian restaurants",
  "30": "Read my emails",
  "31": "What is the temperature today?",
  "32": "Lock the front door",
  "33": "See you later",
  "34": "Navigate to the airport",
  "35": "Pause the video",
  "36": "How far is the beach?",
  "37": "Dim the lights to fifty percent",
  "38": "Good evening",
  "39": "Find me a recipe for pasta",
  "40": "Skip this track",
  "41": "What are my appointments today?",
  "42": "Close the window",
  "43": "Thanks a lot",
  "44": "How do you spell that?",
  "45": "Increase the brightness",
  "46": "What is the stock price of Apple?",
  "47": "Add milk to my shopping list",
  "48": "Good night",
  "49": "Show me photos from last week",
  "50": "Translate hello to Spanish"
}

53 lines•1.6 KB

json

Theme Settings

Color Scheme

Display Options

Font Size