help@rskworld.in +91 93305 39277
RSK World
  • Home
  • Development
    • Web Development
    • Mobile Apps
    • Software
    • Games
    • Project
  • Technologies
    • Data Science
    • AI Development
    • Cloud Development
    • Blockchain
    • Cyber Security
    • Dev Tools
    • Testing Tools
  • About
  • Contact

Theme Settings

Color Scheme
Display Options
Font Size
100%
Back to Project
RSK World
dask-parallel
/
scripts
RSK World
dask-parallel
Parallel and distributed computing with Dask
scripts
  • advanced_data_processing.py (6.6 KB)
  • create_basic_data.py (4.9 KB)
  • distributed_workflow.py (4.3 KB)
  • generate_advanced_data.py (6.8 KB)
  • memory_efficient_ops.py (3.6 KB)
  • parallel_processing.py (2.2 KB)
  • performance_profiling.py (6 KB)
create_basic_data.py
scripts/create_basic_data.py
Raw Download
Find: Go to:
#!/usr/bin/env python3
"""
Create Basic Sample Data for Dask Examples
Author: Molla Samser
Designer & Tester: Rima Khatun
Website: https://rskworld.in
Email: help@rskworld.in, support@rskworld.in
Phone: +91 93305 39277
"""

import pandas as pd
import numpy as np
import os


def create_sample_data(n_rows: int = 100000, output_dir: str = 'data') -> None:
    """Create basic sample data for DataFrame examples.

    Builds a table with a sequential ``id``, two standard-normal columns
    (``value1``, ``value2``), a random ``category`` drawn from A-D and a
    ``date`` column at one-minute frequency starting 2020-01-01, then writes
    it to ``<output_dir>/sample_data.csv`` without the index.

    Args:
        n_rows: Number of rows to generate.
        output_dir: Directory that receives the CSV; created if missing.
    """
    print("Creating sample_data.csv...")

    # Create the target directory here so the function also works when it is
    # called on its own instead of through main().
    os.makedirs(output_dir, exist_ok=True)

    data = {
        'id': range(n_rows),
        'value1': np.random.randn(n_rows),
        'value2': np.random.randn(n_rows),
        'category': np.random.choice(['A', 'B', 'C', 'D'], n_rows),
        'date': pd.date_range('2020-01-01', periods=n_rows, freq='1min'),
    }

    df = pd.DataFrame(data)
    df.to_csv(os.path.join(output_dir, 'sample_data.csv'), index=False)
    print(f"Created sample_data.csv with {n_rows} rows")


def create_advanced_data(n_rows: int = 100000, output_dir: str = 'data') -> None:
    """Create advanced data files for the advanced DataFrame notebook.

    Writes two CSV files that share the same sequential ``id`` column:

    * ``advanced_data_1.csv`` - hourly ``timestamp`` starting 2020-01-01,
      standard-normal ``value``, random ``category`` (A-E), random ``region``
      and a uniform ``amount`` between 100 and 10000.
    * ``advanced_data_2.csv`` - random ``metadata`` and ``status`` labels and
      an integer ``score`` from 0 to 99.

    Args:
        n_rows: Number of rows written to each file.
        output_dir: Directory that receives the CSVs; created if missing.
    """
    print("\nCreating advanced_data_1.csv and advanced_data_2.csv...")

    # Create the target directory here so the function also works when it is
    # called on its own instead of through main().
    os.makedirs(output_dir, exist_ok=True)

    dates = pd.date_range('2020-01-01', periods=n_rows, freq='1h')

    df1 = pd.DataFrame({
        'id': range(n_rows),
        'timestamp': dates,
        'value': np.random.randn(n_rows),
        'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], n_rows),
        'region': np.random.choice(['North', 'South', 'East', 'West'], n_rows),
        'amount': np.random.uniform(100, 10000, n_rows),
    })

    df2 = pd.DataFrame({
        'id': range(n_rows),
        'metadata': np.random.choice(['Type1', 'Type2', 'Type3'], n_rows),
        'status': np.random.choice(['Active', 'Inactive'], n_rows),
        'score': np.random.randint(0, 100, n_rows),
    })

    df1.to_csv(os.path.join(output_dir, 'advanced_data_1.csv'), index=False)
    df2.to_csv(os.path.join(output_dir, 'advanced_data_2.csv'), index=False)
    print(f"Created advanced_data_1.csv and advanced_data_2.csv with {n_rows} rows each")


def create_timeseries_data(n_points: int = 100000, output_dir: str = 'data') -> None:
    """Create time series data for advanced processing.

    Generates an hourly ``timestamp`` column starting 2020-01-01, a ``value``
    column that is the cumulative sum of standard-normal steps, and uniform
    ``temperature`` (15-35), ``humidity`` (30-90) and ``pressure`` (980-1020)
    columns, then writes them to ``<output_dir>/timeseries_data.csv`` without
    the index.

    Args:
        n_points: Number of rows to generate.
        output_dir: Directory that receives the CSV; created if missing.
    """
    print("\nCreating timeseries_data.csv...")

    # Create the target directory here so the function also works when it is
    # called on its own instead of through main().
    os.makedirs(output_dir, exist_ok=True)

    dates = pd.date_range('2020-01-01', periods=n_points, freq='1h')

    data = {
        'timestamp': dates,
        'value': np.random.randn(n_points).cumsum(),
        'temperature': np.random.uniform(15, 35, n_points),
        'humidity': np.random.uniform(30, 90, n_points),
        'pressure': np.random.uniform(980, 1020, n_points),
    }

    df = pd.DataFrame(data)
    df.to_csv(os.path.join(output_dir, 'timeseries_data.csv'), index=False)
    print(f"Created timeseries_data.csv with {n_points} rows")


def create_profile_data(n_rows: int = 100000, output_dir: str = 'data') -> None:
    """Create data for performance profiling.

    Builds a table with a sequential ``id``, a standard-normal ``value``, a
    random ``category`` drawn from A-D and a uniform ``amount`` between 100
    and 10000, then writes it to ``<output_dir>/profile_data.csv`` without
    the index.

    Args:
        n_rows: Number of rows to generate.
        output_dir: Directory that receives the CSV; created if missing.
    """
    print("\nCreating profile_data.csv...")

    # Create the target directory here so the function also works when it is
    # called on its own instead of through main().
    os.makedirs(output_dir, exist_ok=True)

    data = {
        'id': range(n_rows),
        'value': np.random.randn(n_rows),
        'category': np.random.choice(['A', 'B', 'C', 'D'], n_rows),
        'amount': np.random.uniform(100, 10000, n_rows),
    }

    df = pd.DataFrame(data)
    df.to_csv(os.path.join(output_dir, 'profile_data.csv'), index=False)
    print(f"Created profile_data.csv with {n_rows} rows")


def create_complex_data(n_rows: int = 100000, output_dir: str = 'data') -> None:
    """Create complex data for advanced processing.

    Builds a table with a sequential ``id``, an hourly ``date`` starting
    2020-01-01, a uniform ``amount`` (10-1000), random ``category`` and
    ``region`` labels and a uniform ``discount`` (0-0.5). A derived
    ``final_amount`` column holds ``amount * (1 - discount)``. The result is
    written to ``<output_dir>/complex_data.csv`` without the index.

    Args:
        n_rows: Number of rows to generate.
        output_dir: Directory that receives the CSV; created if missing.
    """
    print("\nCreating complex_data.csv...")

    # Create the target directory here so the function also works when it is
    # called on its own instead of through main().
    os.makedirs(output_dir, exist_ok=True)

    data = {
        'id': range(n_rows),
        'date': pd.date_range('2020-01-01', periods=n_rows, freq='1h'),
        'amount': np.random.uniform(10, 1000, n_rows),
        'category': np.random.choice(['Electronics', 'Clothing', 'Food', 'Books'], n_rows),
        'region': np.random.choice(['US', 'EU', 'Asia', 'Other'], n_rows),
        'discount': np.random.uniform(0, 0.5, n_rows),
    }

    df = pd.DataFrame(data)
    # Price after applying the fractional discount.
    df['final_amount'] = df['amount'] * (1 - df['discount'])
    df.to_csv(os.path.join(output_dir, 'complex_data.csv'), index=False)
    print(f"Created complex_data.csv with {n_rows} rows")


def create_batch_files(n_files: int = 5, rows_per_file: int = 10000,
                       output_dir: str = 'data') -> None:
    """Create batch files for parallel processing.

    Writes ``file_0.csv`` .. ``file_<n_files - 1>.csv`` into ``output_dir``.
    Each file holds ``rows_per_file`` rows with a standard-normal ``value``
    and a random ``category`` drawn from A-C. The ``id`` column continues
    across files, so file ``i`` covers ``i * rows_per_file`` up to (but not
    including) ``(i + 1) * rows_per_file``.

    Args:
        n_files: Number of CSV files to write; must be at least 1.
        rows_per_file: Number of rows in each file.
        output_dir: Directory that receives the CSVs; created if missing.

    Raises:
        ValueError: If ``n_files`` is smaller than 1.
    """
    if n_files < 1:
        raise ValueError("n_files must be at least 1")

    # The message names the pattern actually written (file_<i>.csv).
    print("\nCreating file_*.csv files...")

    # Create the target directory here so the function also works when it is
    # called on its own instead of through main().
    os.makedirs(output_dir, exist_ok=True)

    for i in range(n_files):
        data = {
            'id': range(i * rows_per_file, (i + 1) * rows_per_file),
            'value': np.random.randn(rows_per_file),
            'category': np.random.choice(['A', 'B', 'C'], rows_per_file),
        }
        df = pd.DataFrame(data)
        df.to_csv(os.path.join(output_dir, f'file_{i}.csv'), index=False)

    print(f"Created {n_files} batch files (file_0.csv to file_{n_files - 1}.csv)")


def main():
    """Generate every basic sample data file, framed by a console banner."""
    separator = "=" * 60

    print("\n" + separator)
    print("Creating Basic Sample Data for Dask Examples")
    print(separator)

    # The generators write into ./data, so it has to exist before they run.
    os.makedirs('data', exist_ok=True)

    # Run the generators in the same fixed order as their output is listed.
    generators = (
        create_sample_data,
        create_advanced_data,
        create_timeseries_data,
        create_profile_data,
        create_complex_data,
        create_batch_files,
    )
    for generate in generators:
        generate()

    print("\n" + separator)
    print("All basic data files created successfully!")
    print(separator)


# Generate the data files only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()

159 lines • 4.9 KB
python

About RSK World

Founded by Molla Samser, with Designer & Tester Rima Khatun, RSK World is your one-stop destination for free programming resources, source code, and development tools.

Founder: Molla Samser
Designer & Tester: Rima Khatun

Development

  • Game Development
  • Web Development
  • Mobile Development
  • AI Development
  • Development Tools

Legal

  • Terms & Conditions
  • Privacy Policy
  • Disclaimer

Contact Info

Nutanhat, Mongolkote
Purba Burdwan, West Bengal
India, 713147

+91 93305 39277

hello@rskworld.in
support@rskworld.in

© 2026 RSK World. All rights reserved.

Content used for educational purposes only. View Disclaimer