help@rskworld.in +91 93305 39277
RSK World
  • Home
  • Development
    • Web Development
    • Mobile Apps
    • Software
    • Games
    • Project
  • Technologies
    • Data Science
    • AI Development
    • Cloud Development
    • Blockchain
    • Cyber Security
    • Dev Tools
    • Testing Tools
  • About
  • Contact

Theme Settings

Color Scheme
Display Options
Font Size
100%
Back to Project
RSK World
speech-recognition
/
scripts
RSK World
speech-recognition
Speech Recognition Dataset - Audio AI + Speech-to-Text + Voice Recognition
scripts
  • __init__.py (848 B)
  • augmentation.py (13.9 KB)
  • evaluate_model.py (13.9 KB)
  • example_usage.py (5.3 KB)
  • generate_sample_audio.py (10.2 KB)
  • load_dataset.py (9 KB)
  • preprocess.py (8.5 KB)
  • train_model.py (9.5 KB)
  • transformer_model.py (14.9 KB)
preprocess.py | example_usage.py
scripts/preprocess.py
Raw Download
Find: Go to:
"""
============================================================================
Speech Recognition Dataset - Preprocessing Script
============================================================================

Project: Speech Recognition Dataset
Description: Audio speech recognition dataset with labeled speech samples 
             for training speech-to-text and voice recognition models.

============================================================================
DEVELOPER INFORMATION
============================================================================
Website: https://rskworld.in
Founded by: Molla Samser
Designer & Tester: Rima Khatun
Email: help@rskworld.in
Support: support@rskworld.in
Phone: +91 93305 39277
Address: Nutanhat, Mongolkote, Purba Burdwan, West Bengal, India, 713147

============================================================================
COPYRIGHT NOTICE
============================================================================
© 2026 RSK World. All rights reserved.
This dataset is provided for educational and research purposes.

============================================================================
"""

import os
import numpy as np
import librosa
import pandas as pd
from pathlib import Path
import json
from tqdm import tqdm

class SpeechRecognitionPreprocessor:
    """
    Preprocessor for Speech Recognition Dataset

    Extracts features from audio files including:
    - MFCC (Mel-frequency cepstral coefficients)
    - Mel spectrograms
    - Chroma features
    - Spectral contrast

    Every ``extract_*`` method returns the transposed librosa output, so
    the first axis is time (frames) and the second axis is the feature.
    """

    # Columns copied from the input metadata into features_metadata.csv.
    _METADATA_COLUMNS = ('id', 'file_name', 'speaker', 'duration', 'transcript')

    def __init__(self, audio_dir='data/audio', output_dir='data/features', sr=16000):
        """
        Initialize the preprocessor

        Args:
            audio_dir: Directory containing audio files
            output_dir: Directory to save extracted features
            sr: Sample rate for audio loading
        """
        self.audio_dir = Path(audio_dir)
        self.output_dir = Path(output_dir)
        self.sr = sr

        # Create output directory if it doesn't exist
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def _load_audio(self, audio_path):
        """Load ``audio_path`` at the configured sample rate; return (y, sr)."""
        return librosa.load(audio_path, sr=self.sr)

    @staticmethod
    def _compute_mfcc(y, sr, n_mfcc=13, n_fft=2048, hop_length=512):
        """Compute MFCCs for an already-loaded signal, time axis first."""
        mfcc = librosa.feature.mfcc(
            y=y,
            sr=sr,
            n_mfcc=n_mfcc,
            n_fft=n_fft,
            hop_length=hop_length
        )
        return mfcc.T  # Transpose to get (time, features)

    @staticmethod
    def _compute_mel_spectrogram(y, sr, n_mels=128, n_fft=2048, hop_length=512):
        """Compute a dB-scaled Mel spectrogram for a loaded signal."""
        mel_spec = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_mels=n_mels,
            n_fft=n_fft,
            hop_length=hop_length
        )
        # Convert power to decibels relative to the spectrogram's own peak.
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        return mel_spec_db.T

    @staticmethod
    def _compute_chroma(y, sr, n_fft=2048, hop_length=512):
        """Compute STFT-based chroma features for a loaded signal."""
        # librosa has no ``feature.chroma``; the STFT chromagram is
        # ``feature.chroma_stft``. The old name raised AttributeError.
        chroma = librosa.feature.chroma_stft(
            y=y,
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length
        )
        return chroma.T

    @staticmethod
    def _compute_spectral_contrast(y, sr, n_fft=2048, hop_length=512):
        """Compute spectral contrast for a loaded signal."""
        contrast = librosa.feature.spectral_contrast(
            y=y,
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length
        )
        return contrast.T

    def extract_mfcc(self, audio_path, n_mfcc=13, n_fft=2048, hop_length=512):
        """
        Extract MFCC features from audio file

        Args:
            audio_path: Path to audio file
            n_mfcc: Number of MFCC coefficients
            n_fft: FFT window size
            hop_length: Number of samples between successive frames

        Returns:
            MFCC features array
        """
        y, sr = self._load_audio(audio_path)
        return self._compute_mfcc(
            y, sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length
        )

    def extract_mel_spectrogram(self, audio_path, n_mels=128, n_fft=2048, hop_length=512):
        """
        Extract Mel spectrogram from audio file

        Args:
            audio_path: Path to audio file
            n_mels: Number of mel filterbanks
            n_fft: FFT window size
            hop_length: Number of samples between successive frames

        Returns:
            Mel spectrogram array (in dB)
        """
        y, sr = self._load_audio(audio_path)
        return self._compute_mel_spectrogram(
            y, sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length
        )

    def extract_chroma(self, audio_path, n_fft=2048, hop_length=512):
        """
        Extract Chroma features from audio file

        Args:
            audio_path: Path to audio file
            n_fft: FFT window size
            hop_length: Number of samples between successive frames

        Returns:
            Chroma features array
        """
        y, sr = self._load_audio(audio_path)
        return self._compute_chroma(y, sr, n_fft=n_fft, hop_length=hop_length)

    def extract_spectral_contrast(self, audio_path, n_fft=2048, hop_length=512):
        """
        Extract Spectral Contrast features from audio file

        Args:
            audio_path: Path to audio file
            n_fft: FFT window size
            hop_length: Number of samples between successive frames

        Returns:
            Spectral contrast features array
        """
        y, sr = self._load_audio(audio_path)
        return self._compute_spectral_contrast(
            y, sr, n_fft=n_fft, hop_length=hop_length
        )

    def extract_all_features(self, audio_path):
        """
        Extract all features from audio file

        The file is decoded and resampled once and the signal is shared by
        all four extractors, each using its default parameters.

        Args:
            audio_path: Path to audio file

        Returns:
            Dictionary containing all extracted features, keyed by
            'mfcc', 'mel_spectrogram', 'chroma' and 'spectral_contrast'
        """
        y, sr = self._load_audio(audio_path)
        features = {
            'mfcc': self._compute_mfcc(y, sr),
            'mel_spectrogram': self._compute_mel_spectrogram(y, sr),
            'chroma': self._compute_chroma(y, sr),
            'spectral_contrast': self._compute_spectral_contrast(y, sr)
        }
        return features

    def process_dataset(self, metadata_path='data/metadata.csv'):
        """
        Process entire dataset and extract features

        For every metadata row whose audio file exists, saves one
        ``{id}_{feature_name}.npy`` file per feature into the output
        directory, then writes ``features_metadata.csv`` listing the rows
        that were processed successfully. Missing files and files that
        fail during extraction are skipped and reported on stdout.

        Args:
            metadata_path: Path to metadata CSV file
        """
        # Load metadata
        metadata = pd.read_csv(metadata_path)

        # Rows that were processed successfully
        all_features = []
        # Count of rows whose audio file is absent, reported at the end
        missing_count = 0

        print("Processing audio files and extracting features...")
        for _, row in tqdm(metadata.iterrows(), total=len(metadata)):
            audio_file = self.audio_dir / row['file_name']

            if not audio_file.exists():
                missing_count += 1
                continue

            # Best-effort batch job: one bad file must not abort the run,
            # so any per-file failure is reported and the loop moves on.
            try:
                # Extract features
                features = self.extract_all_features(str(audio_file))

                # Save individual feature files
                file_id = row['id']
                for feature_name, feature_data in features.items():
                    feature_path = self.output_dir / f"{file_id}_{feature_name}.npy"
                    np.save(feature_path, feature_data)

                # Store metadata
                all_features.append(
                    {column: row[column] for column in self._METADATA_COLUMNS}
                )

            except Exception as e:
                print(f"Error processing {audio_file}: {str(e)}")
                continue

        # Save combined features metadata. Explicit columns keep the CSV
        # header present even when no file was processed.
        features_metadata = pd.DataFrame(
            all_features,
            columns=list(self._METADATA_COLUMNS)
        )
        features_metadata.to_csv(
            self.output_dir / 'features_metadata.csv',
            index=False
        )

        print("\nProcessing complete!")
        print(f"Processed {len(all_features)} audio files")
        if missing_count:
            print(f"Skipped {missing_count} missing audio files")
        print(f"Features saved to: {self.output_dir}")

    def load_features(self, file_id, feature_name='mfcc'):
        """
        Load pre-extracted features

        Args:
            file_id: ID of the audio file
            feature_name: Name of the feature to load

        Returns:
            Feature array

        Raises:
            FileNotFoundError: If no saved feature file exists for the
                given ID and feature name
        """
        feature_path = self.output_dir / f"{file_id}_{feature_name}.npy"
        if not feature_path.exists():
            raise FileNotFoundError(f"Feature file not found: {feature_path}")
        return np.load(feature_path)


def main():
    """Extract features for the dataset found at the default locations."""
    settings = {
        'audio_dir': 'data/audio',
        'output_dir': 'data/features',
        'sr': 16000,
    }

    # Build the preprocessor and run it over every row of the metadata file
    SpeechRecognitionPreprocessor(**settings).process_dataset(
        metadata_path='data/metadata.csv'
    )

    closing_lines = (
        "\nPreprocessing completed successfully!",
        "You can now use the extracted features for model training.",
    )
    for line in closing_lines:
        print(line)


# Run the preprocessing pipeline only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()

260 lines • 8.5 KB
python
scripts/example_usage.py
Raw Download
Find: Go to:
"""
============================================================================
Speech Recognition Dataset - Example Usage
============================================================================

Project: Speech Recognition Dataset
Description: Audio speech recognition dataset with labeled speech samples 
             for training speech-to-text and voice recognition models.

============================================================================
DEVELOPER INFORMATION
============================================================================
Website: https://rskworld.in
Founded by: Molla Samser
Designer & Tester: Rima Khatun
Email: help@rskworld.in
Support: support@rskworld.in
Phone: +91 93305 39277
Address: Nutanhat, Mongolkote, Purba Burdwan, West Bengal, India, 713147

============================================================================
COPYRIGHT NOTICE
============================================================================
© 2026 RSK World. All rights reserved.
This dataset is provided for educational and research purposes.

============================================================================

This script demonstrates how to use the Speech Recognition Dataset.
"""

import sys
from pathlib import Path

# Add parent directory to path
sys.path.append(str(Path(__file__).parent.parent))

from scripts.load_dataset import SpeechRecognitionDataset
from scripts.preprocess import SpeechRecognitionPreprocessor

def example_load_dataset():
    """Example: open the dataset and print a few views of its contents."""
    rule = "=" * 60
    print(rule)
    print("Example 1: Loading Dataset")
    print(rule)

    # Open the dataset rooted at the default data directory
    dataset = SpeechRecognitionDataset(data_dir='data')

    # Summary statistics, one per line
    stats = dataset.get_statistics()
    print("\nDataset Statistics:")
    for name in stats:
        print(f"  {name}: {stats[name]}")

    # A few rows, shown only when the dataset returns a sample
    print("\nSample files:")
    sample = dataset.sample(n=3)
    if sample is not None:
        shown_columns = ['id', 'file_name', 'speaker', 'duration', 'transcript']
        print(sample[shown_columns])

    # Number of files recorded by one speaker
    print("\nFiles by Speaker_001:")
    by_speaker = dataset.get_files_by_speaker('Speaker_001')
    print(f"  Found {len(by_speaker)} files")

    # Number of files in one category
    print("\nFiles by Category (Greeting):")
    by_category = dataset.get_files_by_category('Greeting')
    print(f"  Found {len(by_category)} files")


def example_preprocessing():
    """Example: build a preprocessor and print its configuration."""
    rule = "=" * 60
    print("\n" + rule)
    print("Example 2: Preprocessing Audio Files")
    print(rule)

    # Build the preprocessor with the default dataset layout
    options = {
        'audio_dir': 'data/audio',
        'output_dir': 'data/features',
        'sr': 16000,
    }
    preprocessor = SpeechRecognitionPreprocessor(**options)

    print("\nPreprocessor initialized with:")
    print(f"  Audio directory: {preprocessor.audio_dir}")
    print(f"  Output directory: {preprocessor.output_dir}")
    print(f"  Sample rate: {preprocessor.sr} Hz")

    # Note: Uncomment the following line to process the entire dataset
    # preprocessor.process_dataset(metadata_path='data/metadata.csv')

    print("\nTo process the dataset, uncomment the process_dataset() call")


def example_feature_extraction():
    """Example: extract features from one file and print their shapes."""
    rule = "=" * 60
    print("\n" + rule)
    print("Example 3: Feature Extraction")
    print(rule)

    audio_file = Path('data/audio/audio_001.wav')

    # Nothing to extract without the sample file; tell the user and stop
    if not audio_file.exists():
        print(f"\nAudio file not found: {audio_file}")
        print("Please add audio files to data/audio/ directory")
        return

    preprocessor = SpeechRecognitionPreprocessor()
    source = str(audio_file)

    print(f"\nExtracting features from: {audio_file}")

    # MFCC
    mfcc = preprocessor.extract_mfcc(source)
    print(f"  MFCC shape: {mfcc.shape}")

    # Mel spectrogram
    mel_spec = preprocessor.extract_mel_spectrogram(source)
    print(f"  Mel Spectrogram shape: {mel_spec.shape}")

    # Chroma
    chroma = preprocessor.extract_chroma(source)
    print(f"  Chroma shape: {chroma.shape}")


def example_model_training():
    """Example: print the steps needed to train a model."""
    rule = "=" * 60
    lines = (
        "\n" + rule,
        "Example 4: Model Training",
        rule,
        "\nTo train a model:",
        "1. First, extract features using preprocess.py",
        "2. Then, run train_model.py",
        "\nExample command:",
        "  python scripts/train_model.py",
    )
    for line in lines:
        print(line)


def main():
    """Print a header, run each example in order, then print a footer."""
    rule = "=" * 60

    print("\n" + rule)
    print("SPEECH RECOGNITION DATASET - USAGE EXAMPLES")
    print(rule)
    print("\nWebsite: https://rskworld.in")
    print("© 2026 RSK World. All rights reserved.\n")

    # Examples run in their numbered order
    examples = (
        example_load_dataset,
        example_preprocessing,
        example_feature_extraction,
        example_model_training,
    )
    for run_example in examples:
        run_example()

    print("\n" + rule)
    print("Examples completed!")
    print(rule)
    print("\nFor more information, visit: https://rskworld.in")


# Run the usage examples only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()

165 lines • 5.3 KB
python

About RSK World

Founded by Molla Samser, with Designer & Tester Rima Khatun, RSK World is your one-stop destination for free programming resources, source code, and development tools.

Founder: Molla Samser
Designer & Tester: Rima Khatun

Development

  • Game Development
  • Web Development
  • Mobile Development
  • AI Development
  • Development Tools

Legal

  • Terms & Conditions
  • Privacy Policy
  • Disclaimer

Contact Info

Nutanhat, Mongolkote
Purba Burdwan, West Bengal
India, 713147

+91 93305 39277

hello@rskworld.in
support@rskworld.in

© 2026 RSK World. All rights reserved.

Content used for educational purposes only. View Disclaimer