help@rskworld.in +91 93305 39277
RSK World
  • Home
  • Development
    • Web Development
    • Mobile Apps
    • Software
    • Games
    • Project
  • Technologies
    • Data Science
    • AI Development
    • Cloud Development
    • Blockchain
    • Cyber Security
    • Dev Tools
    • Testing Tools
  • About
  • Contact

Theme Settings

Color Scheme
Display Options
Font Size
100%
Back to Project
RSK World
speech-recognition
/
data
/
audio
RSK World
speech-recognition
Speech Recognition Dataset - Audio AI + Speech-to-Text + Voice Recognition
audio
  • README.txt (1.1 KB)
  • audio_001.wav (110.5 KB)
  • audio_002.wav (65.5 KB)
  • audio_003.wav (97.5 KB)
  • audio_004.wav (142.5 KB)
  • audio_005.wav (90.4 KB)
  • audio_006.wav (67.2 KB)
  • audio_007.wav (107.9 KB)
  • audio_008.wav (83.5 KB)
  • audio_009.wav (61.9 KB)
  • audio_010.wav (118.2 KB)
  • audio_011.wav (76.6 KB)
  • audio_012.wav (100.4 KB)
  • audio_013.wav (48.8 KB)
  • audio_014.wav (90.4 KB)
  • audio_015.wav (128.8 KB)
  • audio_016.wav (73.2 KB)
  • audio_017.wav (55.7 KB)
  • audio_018.wav (107.9 KB)
  • audio_019.wav (83.5 KB)
  • audio_020.wav (97.5 KB)
  • audio_021.wav (90.4 KB)
  • audio_022.wav (132.2 KB)
  • audio_023.wav (61.9 KB)
  • audio_024.wav (111.3 KB)
  • audio_025.wav (76.6 KB)
  • audio_026.wav (118.2 KB)
  • audio_027.wav (66.3 KB)
  • audio_028.wav (90.4 KB)
  • audio_029.wav (104.4 KB)
  • audio_030.wav (80 KB)
  • audio_031.wav (128.8 KB)
  • audio_032.wav (69.7 KB)
  • audio_033.wav (59.1 KB)
  • audio_034.wav (114.7 KB)
  • audio_035.wav (76.6 KB)
  • audio_036.wav (121.6 KB)
  • audio_037.wav (86.9 KB)
  • audio_038.wav (73.2 KB)
  • audio_039.wav (139.1 KB)
  • audio_040.wav (66.3 KB)
  • audio_041.wav (101 KB)
  • audio_042.wav (80 KB)
  • audio_043.wav (55.7 KB)
  • audio_044.wav (121.6 KB)
  • audio_045.wav (76.6 KB)
  • audio_046.wav (128.8 KB)
  • audio_047.wav (90.4 KB)
  • audio_048.wav (73.2 KB)
  • audio_049.wav (111.3 KB)
  • audio_050.wav (86.9 KB)
load_dataset.py
scripts/load_dataset.py
Raw Download
Find: Go to:
"""
============================================================================
Speech Recognition Dataset - Dataset Loader
============================================================================

Project: Speech Recognition Dataset
Description: Audio speech recognition dataset with labeled speech samples 
             for training speech-to-text and voice recognition models.

============================================================================
DEVELOPER INFORMATION
============================================================================
Website: https://rskworld.in
Founded by: Molla Samser
Designer & Tester: Rima Khatun
Email: help@rskworld.in
Support: support@rskworld.in
Phone: +91 93305 39277
Address: Nutanhat, Mongolkote, Purba Burdwan, West Bengal, India, 713147

============================================================================
COPYRIGHT NOTICE
============================================================================
© 2026 RSK World. All rights reserved.
This dataset is provided for educational and research purposes.

============================================================================
"""

import pandas as pd
import numpy as np
import librosa
from pathlib import Path
import json

class SpeechRecognitionDataset:
    """
    Dataset loader for Speech Recognition Dataset

    Provides easy access to audio files, metadata, and transcripts.
    Metadata is read from ``<data_dir>/metadata.csv`` into a pandas
    DataFrame and transcripts from ``<data_dir>/transcripts.json``; either
    file may be missing, in which case a warning is printed and the
    corresponding attribute is ``None`` (metadata) or ``{}`` (transcripts).
    """

    def __init__(self, data_dir='data'):
        """
        Initialize the dataset loader

        Args:
            data_dir: Root directory of the dataset
        """
        self.data_dir = Path(data_dir)
        self.audio_dir = self.data_dir / 'audio'
        self.metadata_path = self.data_dir / 'metadata.csv'
        self.transcripts_path = self.data_dir / 'transcripts.json'

        # Load metadata
        if self.metadata_path.exists():
            self.metadata = pd.read_csv(self.metadata_path)
        else:
            self.metadata = None
            print(f"Warning: Metadata file not found at {self.metadata_path}")

        # Load transcripts. The encoding is pinned to UTF-8 so that
        # non-ASCII transcripts decode the same way on every platform
        # instead of depending on the locale's default encoding.
        if self.transcripts_path.exists():
            with open(self.transcripts_path, 'r', encoding='utf-8') as f:
                self.transcripts = json.load(f)
        else:
            self.transcripts = {}
            print(f"Warning: Transcripts file not found at {self.transcripts_path}")

    def _find_row(self, file_id):
        """
        Look up the first metadata row whose 'id' equals file_id

        Args:
            file_id: ID of the audio file

        Returns:
            The matching row as a pandas Series, or None when metadata is
            not loaded or no row matches
        """
        if self.metadata is None:
            return None

        matches = self.metadata[self.metadata['id'] == file_id]
        if matches.empty:
            return None

        return matches.iloc[0]

    def get_audio_file(self, file_id):
        """
        Get path to audio file by ID

        Args:
            file_id: ID of the audio file

        Returns:
            Path to audio file

        Raises:
            ValueError: If metadata is not loaded or the ID is unknown
            FileNotFoundError: If the referenced audio file does not exist
        """
        if self.metadata is None:
            raise ValueError("Metadata not loaded")

        row = self._find_row(file_id)
        if row is None:
            raise ValueError(f"File ID {file_id} not found in metadata")

        audio_path = self.audio_dir / row['file_name']
        if not audio_path.exists():
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        return audio_path

    def load_audio(self, file_id, sr=16000):
        """
        Load audio file as numpy array

        Args:
            file_id: ID of the audio file
            sr: Sample rate passed to librosa.load

        Returns:
            Audio array and sample rate, as returned by librosa.load
        """
        audio_path = self.get_audio_file(file_id)
        y, sr = librosa.load(str(audio_path), sr=sr)
        return y, sr

    def get_transcript(self, file_id):
        """
        Get transcript for audio file

        The transcripts mapping is consulted first, then the 'transcript'
        column of the metadata (if present).

        Args:
            file_id: ID of the audio file

        Returns:
            Transcript text, or None if no transcript is available
        """
        if file_id in self.transcripts:
            return self.transcripts[file_id]

        # JSON object keys are always strings, while IDs read from the CSV
        # metadata may be integers, so retry with the string form of the ID.
        text_key = str(file_id)
        if text_key in self.transcripts:
            return self.transcripts[text_key]

        # Try to get from metadata
        row = self._find_row(file_id)
        if row is not None and 'transcript' in row.index:
            return row['transcript']

        return None

    def get_speaker(self, file_id):
        """
        Get speaker ID for audio file

        Args:
            file_id: ID of the audio file

        Returns:
            Speaker ID, or None if metadata is not loaded, the ID is
            unknown, or there is no 'speaker' column
        """
        row = self._find_row(file_id)
        if row is not None and 'speaker' in row.index:
            return row['speaker']

        return None

    def get_metadata(self, file_id):
        """
        Get all metadata for audio file

        Args:
            file_id: ID of the audio file

        Returns:
            Dictionary with metadata, or None if metadata is not loaded or
            the ID is unknown
        """
        row = self._find_row(file_id)
        if row is None:
            return None

        return row.to_dict()

    def get_files_by_speaker(self, speaker_id):
        """
        Get all file IDs for a specific speaker

        Args:
            speaker_id: ID of the speaker

        Returns:
            List of file IDs (empty if metadata is not loaded or has no
            'speaker' column)
        """
        if self.metadata is None:
            return []

        # Mirror get_files_by_category: a missing column means no matches
        # rather than a KeyError.
        if 'speaker' not in self.metadata.columns:
            return []

        rows = self.metadata[self.metadata['speaker'] == speaker_id]
        return rows['id'].tolist()

    def get_files_by_category(self, category):
        """
        Get all file IDs for a specific category

        Args:
            category: Category name (e.g., 'Greeting', 'Command')

        Returns:
            List of file IDs (empty if metadata is not loaded or has no
            'category' column)
        """
        if self.metadata is None:
            return []

        if 'category' not in self.metadata.columns:
            return []

        rows = self.metadata[self.metadata['category'] == category]
        return rows['id'].tolist()

    def get_statistics(self):
        """
        Get dataset statistics

        Returns:
            Dictionary with statistics; empty if metadata is not loaded.
            Speaker and duration figures are 0 when the corresponding
            column is absent, and 'categories' is only included when a
            'category' column exists.
        """
        if self.metadata is None:
            return {}

        meta = self.metadata
        columns = meta.columns

        stats = {
            'total_files': len(meta),
            'unique_speakers': meta['speaker'].nunique() if 'speaker' in columns else 0,
        }

        if 'duration' in columns:
            durations = meta['duration']
            stats['total_duration'] = durations.sum()
            stats['average_duration'] = durations.mean()
            stats['min_duration'] = durations.min()
            stats['max_duration'] = durations.max()
        else:
            stats['total_duration'] = 0
            stats['average_duration'] = 0
            stats['min_duration'] = 0
            stats['max_duration'] = 0

        if 'category' in columns:
            stats['categories'] = meta['category'].value_counts().to_dict()

        return stats

    def sample(self, n=5, speaker_id=None, category=None):
        """
        Get random sample of files

        Args:
            n: Number of samples (capped at the number of matching rows)
            speaker_id: Filter by speaker (optional)
            category: Filter by category (optional)

        Returns:
            DataFrame with sample metadata, or None if metadata is not
            loaded
        """
        if self.metadata is None:
            return None

        # Boolean indexing and DataFrame.sample both return new frames, so
        # self.metadata is never modified here.
        df = self.metadata

        # Compare against None instead of relying on truthiness so that
        # falsy but valid identifiers (e.g. speaker 0) still filter.
        # A filter on a column that does not exist is skipped.
        if speaker_id is not None and 'speaker' in df.columns:
            df = df[df['speaker'] == speaker_id]

        if category is not None and 'category' in df.columns:
            df = df[df['category'] == category]

        return df.sample(min(n, len(df)))


def main():
    """
    Example usage of the dataset loader

    Prints the dataset statistics, a preview of up to five randomly
    sampled metadata rows, and details of the first sampled audio file.
    """
    # Initialize dataset
    dataset = SpeechRecognitionDataset(data_dir='data')

    # Get statistics
    stats = dataset.get_statistics()
    print("Dataset Statistics:")
    for key, value in stats.items():
        print(f"  {key}: {value}")

    # Get a sample
    print("\nSample files:")
    sample = dataset.sample(n=5)
    if sample is not None:
        # Only preview the columns that are actually present: the loader
        # treats columns such as 'speaker' and 'transcript' as optional,
        # and selecting a missing column would raise KeyError.
        preview_columns = ['id', 'file_name', 'speaker', 'duration', 'transcript']
        available = [name for name in preview_columns if name in sample.columns]
        print(sample[available].head())

    # Load an audio file
    if sample is not None and len(sample) > 0:
        file_id = sample.iloc[0]['id']
        print(f"\nLoading audio file {file_id}...")
        try:
            audio, sr = dataset.load_audio(file_id)
            transcript = dataset.get_transcript(file_id)
            speaker = dataset.get_speaker(file_id)

            print(f"  Audio shape: {audio.shape}")
            print(f"  Sample rate: {sr}")
            print(f"  Duration: {len(audio) / sr:.2f} seconds")
            print(f"  Speaker: {speaker}")
            print(f"  Transcript: {transcript}")
        except Exception as e:
            # Demo boundary: report the failure instead of crashing.
            print(f"  Error loading audio: {str(e)}")


if __name__ == '__main__':
    main()

293 lines • 9 KB
python

About RSK World

Founded by Molla Samser, with Designer & Tester Rima Khatun, RSK World is your one-stop destination for free programming resources, source code, and development tools.

Founder: Molla Samser
Designer & Tester: Rima Khatun

Development

  • Game Development
  • Web Development
  • Mobile Development
  • AI Development
  • Development Tools

Legal

  • Terms & Conditions
  • Privacy Policy
  • Disclaimer

Contact Info

Nutanhat, Mongolkote
Purba Burdwan, West Bengal
India, 713147

+91 93305 39277

hello@rskworld.in
support@rskworld.in

© 2026 RSK World. All rights reserved.

Content used for educational purposes only. View Disclaimer