RSK World - Speech Recognition Dataset - Project Files Browser | RSK World

scripts/preprocess.py

"""
============================================================================
Speech Recognition Dataset - Preprocessing Script
============================================================================

Project: Speech Recognition Dataset
Description: Audio speech recognition dataset with labeled speech samples 
             for training speech-to-text and voice recognition models.

============================================================================
DEVELOPER INFORMATION
============================================================================
Website: https://rskworld.in
Founded by: Molla Samser
Designer & Tester: Rima Khatun
Email: help@rskworld.in
Support: support@rskworld.in
Phone: +91 93305 39277
Address: Nutanhat, Mongolkote, Purba Burdwan, West Bengal, India, 713147

============================================================================
COPYRIGHT NOTICE
============================================================================
© 2026 RSK World. All rights reserved.
This dataset is provided for educational and research purposes.

============================================================================
"""

import os
import numpy as np
import librosa
import pandas as pd
from pathlib import Path
import json
from tqdm import tqdm

class SpeechRecognitionPreprocessor:
    """
    Preprocessor for Speech Recognition Dataset

    Extracts features from audio files including:
    - MFCC (Mel-frequency cepstral coefficients)
    - Mel spectrograms
    - Chroma features
    - Spectral contrast

    Every ``extract_*`` method returns its feature matrix transposed, so the
    first axis is time (frames) and the second axis is the feature dimension.
    """

    # Columns copied from the input metadata into features_metadata.csv,
    # in output order.
    _METADATA_COLUMNS = ('id', 'file_name', 'speaker', 'duration', 'transcript')

    def __init__(self, audio_dir='data/audio', output_dir='data/features', sr=16000):
        """
        Initialize the preprocessor

        Args:
            audio_dir: Directory containing audio files
            output_dir: Directory to save extracted features (created,
                including parents, if it does not exist)
            sr: Sample rate for audio loading
        """
        self.audio_dir = Path(audio_dir)
        self.output_dir = Path(output_dir)
        self.sr = sr

        # Create output directory if it doesn't exist
        self.output_dir.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Private helpers: operate on an already-loaded signal so that one
    # decode/resample pass can feed several feature extractors.
    # ------------------------------------------------------------------

    def _load_audio(self, audio_path):
        """Load ``audio_path`` at ``self.sr``; return ``(signal, sample_rate)``."""
        return librosa.load(audio_path, sr=self.sr)

    @staticmethod
    def _mfcc(y, sr, n_mfcc=13, n_fft=2048, hop_length=512):
        """Compute MFCCs of signal ``y``, transposed to (time, features)."""
        mfcc = librosa.feature.mfcc(
            y=y,
            sr=sr,
            n_mfcc=n_mfcc,
            n_fft=n_fft,
            hop_length=hop_length
        )
        return mfcc.T

    @staticmethod
    def _mel_spectrogram(y, sr, n_mels=128, n_fft=2048, hop_length=512):
        """Compute the dB-scaled Mel spectrogram of ``y``, transposed."""
        mel_spec = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_mels=n_mels,
            n_fft=n_fft,
            hop_length=hop_length
        )
        # dB values are expressed relative to the spectrogram's own maximum.
        return librosa.power_to_db(mel_spec, ref=np.max).T

    @staticmethod
    def _chroma(y, sr, n_fft=2048, hop_length=512):
        """Compute the STFT chromagram of ``y``, transposed."""
        # librosa has no ``feature.chroma``; the STFT-based chromagram is
        # ``feature.chroma_stft``.
        chroma = librosa.feature.chroma_stft(
            y=y,
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length
        )
        return chroma.T

    @staticmethod
    def _spectral_contrast(y, sr, n_fft=2048, hop_length=512):
        """Compute the spectral contrast of ``y``, transposed."""
        contrast = librosa.feature.spectral_contrast(
            y=y,
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length
        )
        return contrast.T

    # ------------------------------------------------------------------
    # Public per-feature extractors
    # ------------------------------------------------------------------

    def extract_mfcc(self, audio_path, n_mfcc=13, n_fft=2048, hop_length=512):
        """
        Extract MFCC features from audio file

        Args:
            audio_path: Path to audio file
            n_mfcc: Number of MFCC coefficients
            n_fft: FFT window size
            hop_length: Number of samples between successive frames

        Returns:
            MFCC features array, transposed to (time, features)
        """
        y, sr = self._load_audio(audio_path)
        return self._mfcc(y, sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)

    def extract_mel_spectrogram(self, audio_path, n_mels=128, n_fft=2048, hop_length=512):
        """
        Extract Mel spectrogram from audio file

        Args:
            audio_path: Path to audio file
            n_mels: Number of mel filterbanks
            n_fft: FFT window size
            hop_length: Number of samples between successive frames

        Returns:
            Mel spectrogram array in dB (relative to its maximum), transposed
        """
        y, sr = self._load_audio(audio_path)
        return self._mel_spectrogram(
            y, sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length
        )

    def extract_chroma(self, audio_path, n_fft=2048, hop_length=512):
        """
        Extract Chroma features from audio file

        Args:
            audio_path: Path to audio file
            n_fft: FFT window size
            hop_length: Number of samples between successive frames

        Returns:
            Chroma features array, transposed
        """
        y, sr = self._load_audio(audio_path)
        return self._chroma(y, sr, n_fft=n_fft, hop_length=hop_length)

    def extract_spectral_contrast(self, audio_path, n_fft=2048, hop_length=512):
        """
        Extract Spectral Contrast features from audio file

        Args:
            audio_path: Path to audio file
            n_fft: FFT window size
            hop_length: Number of samples between successive frames

        Returns:
            Spectral contrast features array, transposed
        """
        y, sr = self._load_audio(audio_path)
        return self._spectral_contrast(y, sr, n_fft=n_fft, hop_length=hop_length)

    def extract_all_features(self, audio_path):
        """
        Extract all features from audio file

        The file is decoded once and the same signal is used for every
        feature, each computed with its default parameters.

        Args:
            audio_path: Path to audio file

        Returns:
            Dictionary containing all extracted features, keyed by
            'mfcc', 'mel_spectrogram', 'chroma' and 'spectral_contrast'
        """
        y, sr = self._load_audio(audio_path)
        return {
            'mfcc': self._mfcc(y, sr),
            'mel_spectrogram': self._mel_spectrogram(y, sr),
            'chroma': self._chroma(y, sr),
            'spectral_contrast': self._spectral_contrast(y, sr),
        }

    def process_dataset(self, metadata_path='data/metadata.csv'):
        """
        Process entire dataset and extract features

        For each metadata row whose audio file exists under ``audio_dir``,
        saves one ``<id>_<feature>.npy`` file per feature into ``output_dir``
        and records the row in ``output_dir/features_metadata.csv``. Rows whose
        audio file is missing are skipped and counted; rows that fail during
        processing are reported and skipped.

        Args:
            metadata_path: Path to metadata CSV file. Rows are read through
                the columns 'file_name', 'id', 'speaker', 'duration' and
                'transcript'.
        """
        # Load metadata
        metadata = pd.read_csv(metadata_path)

        all_features = []
        missing_files = 0

        print("Processing audio files and extracting features...")
        for _, row in tqdm(metadata.iterrows(), total=len(metadata)):
            audio_file = self.audio_dir / row['file_name']

            if not audio_file.exists():
                missing_files += 1
                continue

            try:
                # Extract features
                features = self.extract_all_features(str(audio_file))

                # Save individual feature files
                file_id = row['id']
                for feature_name, feature_data in features.items():
                    feature_path = self.output_dir / f"{file_id}_{feature_name}.npy"
                    np.save(feature_path, feature_data)

                # Store metadata
                all_features.append(
                    {column: row[column] for column in self._METADATA_COLUMNS}
                )
            except Exception as e:
                # Deliberately best-effort: one unreadable file must not
                # abort the rest of the dataset.
                print(f"Error processing {audio_file}: {str(e)}")
                continue

        # Save combined features metadata. Columns are given explicitly so the
        # CSV keeps its header even when no file was processed.
        features_metadata = pd.DataFrame(
            all_features, columns=list(self._METADATA_COLUMNS)
        )
        features_metadata.to_csv(
            self.output_dir / 'features_metadata.csv',
            index=False
        )

        print("\nProcessing complete!")
        print(f"Processed {len(all_features)} audio files")
        if missing_files:
            print(f"Skipped {missing_files} metadata entries with no audio file")
        print(f"Features saved to: {self.output_dir}")

    def load_features(self, file_id, feature_name='mfcc'):
        """
        Load pre-extracted features

        Args:
            file_id: ID of the audio file
            feature_name: Name of the feature to load

        Returns:
            Feature array

        Raises:
            FileNotFoundError: If ``<file_id>_<feature_name>.npy`` does not
                exist in the output directory
        """
        feature_path = self.output_dir / f"{file_id}_{feature_name}.npy"
        if feature_path.exists():
            return np.load(feature_path)
        else:
            raise FileNotFoundError(f"Feature file not found: {feature_path}")


def main():
    """Run feature extraction over the default dataset layout."""
    # Default locations and sample rate used by this script
    settings = {
        'audio_dir': 'data/audio',
        'output_dir': 'data/features',
        'sr': 16000,
    }
    extractor = SpeechRecognitionPreprocessor(**settings)

    # Extract features for the files listed in the metadata CSV
    extractor.process_dataset(metadata_path='data/metadata.csv')

    closing_lines = (
        "\nPreprocessing completed successfully!",
        "You can now use the extracted features for model training.",
    )
    for line in closing_lines:
        print(line)

# Run the preprocessing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

260 lines•8.5 KB

python

Theme Settings

Color Scheme

Display Options

Font Size