help@rskworld.in +91 93305 39277
RSK World
  • Home
  • Development
    • Web Development
    • Mobile Apps
    • Software
    • Games
    • Project
  • Technologies
    • Data Science
    • AI Development
    • Cloud Development
    • Blockchain
    • Cyber Security
    • Dev Tools
    • Testing Tools
  • About
  • Contact

Theme Settings

Color Scheme
Display Options
Font Size
100%
Back to Project
RSK World
dask-parallel
/
scripts
RSK World
dask-parallel
Parallel and distributed computing with Dask
scripts
  • advanced_data_processing.py6.6 KB
  • create_basic_data.py4.9 KB
  • distributed_workflow.py4.3 KB
  • generate_advanced_data.py6.8 KB
  • memory_efficient_ops.py3.6 KB
  • parallel_processing.py2.2 KB
  • performance_profiling.py6 KB
generate_advanced_data.py
scripts/generate_advanced_data.py
Raw Download
Find: Go to:
#!/usr/bin/env python3
"""
Generate Advanced Sample Data for Dask Examples
Author: Molla Samser
Designer & Tester: Rima Khatun
Website: https://rskworld.in
Email: help@rskworld.in, support@rskworld.in
Phone: +91 93305 39277
"""

import pandas as pd
import numpy as np
import json
import os
from datetime import datetime, timedelta


def generate_time_series_data(n_rows=1000000, filename='data/timeseries_large.csv'):
    """Generate a large synthetic sensor time-series dataset and save it as CSV.

    Args:
        n_rows: Number of rows (one reading per minute, starting 2020-01-01).
        filename: Output CSV path (parent directory must already exist).

    Returns:
        pandas.DataFrame: The generated dataset.
    """
    print(f"Generating time series data with {n_rows} rows...")

    # One-minute-resolution timestamps.
    dates = pd.date_range('2020-01-01', periods=n_rows, freq='1min')

    data = {
        'timestamp': dates,
        'sensor_id': np.random.randint(1, 100, n_rows),
        'temperature': np.random.normal(25, 5, n_rows),
        'humidity': np.random.normal(60, 15, n_rows),
        'pressure': np.random.normal(1013, 10, n_rows),  # around standard atmosphere
        'value': np.random.randn(n_rows).cumsum(),       # random walk
        # Mostly healthy readings with occasional warnings/errors.
        'status': np.random.choice(['OK', 'WARNING', 'ERROR'], n_rows, p=[0.9, 0.08, 0.02])
    }

    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    # Fixed: the f-string previously contained the literal text "(unknown)"
    # instead of interpolating the output path.
    print(f"Saved to {filename}")
    return df


def generate_transaction_data(n_rows=2000000, filename='data/transactions_large.csv'):
    """Generate a large synthetic retail transaction dataset and save it as CSV.

    Args:
        n_rows: Number of transactions (one per minute, starting 2023-01-01).
        filename: Output CSV path (parent directory must already exist).

    Returns:
        pandas.DataFrame: The generated dataset, including a derived
        ``final_amount`` column (amount * quantity * (1 - discount)).
    """
    print(f"Generating transaction data with {n_rows} rows...")

    data = {
        'transaction_id': range(n_rows),
        'timestamp': pd.date_range('2023-01-01', periods=n_rows, freq='1min'),
        'customer_id': np.random.randint(1, 100000, n_rows),
        'product_id': np.random.randint(1, 10000, n_rows),
        'amount': np.random.uniform(10, 5000, n_rows),
        'quantity': np.random.randint(1, 10, n_rows),
        'category': np.random.choice(['Electronics', 'Clothing', 'Food', 'Books', 'Home'], n_rows),
        'region': np.random.choice(['North', 'South', 'East', 'West', 'Central'], n_rows),
        'payment_method': np.random.choice(['Credit', 'Debit', 'Cash', 'Online'], n_rows),
        'discount': np.random.uniform(0, 0.3, n_rows)  # fractional discount, 0-30%
    }

    df = pd.DataFrame(data)
    # Derived column: line total after quantity and discount.
    df['final_amount'] = df['amount'] * df['quantity'] * (1 - df['discount'])
    df.to_csv(filename, index=False)
    # Fixed: the f-string previously contained the literal text "(unknown)"
    # instead of interpolating the output path.
    print(f"Saved to {filename}")
    return df


def generate_ml_dataset(n_samples=500000, n_features=100, filename='data/ml_dataset.csv'):
    """Generate a synthetic classification dataset and save it as CSV.

    Args:
        n_samples: Number of rows.
        n_features: Number of standard-normal feature columns
            (named ``feature_0`` .. ``feature_{n_features-1}``).
        filename: Output CSV path (parent directory must already exist).

    Returns:
        pandas.DataFrame: Features plus a 3-class ``target`` column (0, 1, 2).
    """
    print(f"Generating ML dataset with {n_samples} samples and {n_features} features...")

    # Generate independent standard-normal features.
    features = {}
    for i in range(n_features):
        features[f'feature_{i}'] = np.random.randn(n_samples)

    # Target is random (not correlated with features) - suitable for
    # pipeline/throughput demos rather than model-accuracy demos.
    features['target'] = np.random.choice([0, 1, 2], n_samples)

    df = pd.DataFrame(features)
    df.to_csv(filename, index=False)
    # Fixed: the f-string previously contained the literal text "(unknown)"
    # instead of interpolating the output path.
    print(f"Saved to {filename}")
    return df


def generate_json_data(n_records=100000, filename='data/nested_data.jsonl'):
    """Generate nested user/order records as JSON Lines (one record per line).

    Args:
        n_records: Number of records to write.
        filename: Output .jsonl path (parent directory must already exist).
    """
    print(f"Generating JSON data with {n_records} records...")

    with open(filename, 'w') as f:
        for i in range(n_records):
            record = {
                'id': i,
                'user': {
                    'name': f'User_{i}',
                    # Cast NumPy scalars to builtins: json.dumps rejects
                    # np.int64/np.float64 on platforms where they do not
                    # subclass int/float (e.g. Windows).
                    'age': int(np.random.randint(18, 80)),
                    'email': f'user_{i}@example.com'
                },
                'orders': [
                    {
                        'order_id': j,
                        'amount': float(np.random.uniform(10, 500)),
                        # Random order date within the past year.
                        'date': (datetime.now() - timedelta(days=int(np.random.randint(0, 365)))).isoformat()
                    }
                    # 1 to 5 orders per user.
                    for j in range(int(np.random.randint(1, 6)))
                ],
                'metadata': {
                    'city': str(np.random.choice(['NYC', 'LA', 'Chicago', 'Houston', 'Phoenix'])),
                    'country': 'USA',
                    # .tolist() already yields plain Python strings.
                    'preferences': np.random.choice(['A', 'B', 'C'], 3).tolist()
                }
            }
            f.write(json.dumps(record) + '\n')

    # Fixed: the f-string previously contained the literal text "(unknown)"
    # instead of interpolating the output path.
    print(f"Saved to {filename}")


def generate_multiple_files(n_files=10, rows_per_file=100000, output_dir='data'):
    """Generate several CSV batch files for parallel-processing demos.

    Args:
        n_files: Number of files to create (``batch_file_000.csv`` ...).
        rows_per_file: Row count per file; ``id`` values are contiguous and
            globally unique across all files.
        output_dir: Target directory; created if it does not exist. Defaults
            to ``'data'`` for backward compatibility.
    """
    print(f"Generating {n_files} files with {rows_per_file} rows each...")

    # Unlike the other generators this function previously hard-coded 'data/'
    # and crashed if the directory was missing; create it up front.
    os.makedirs(output_dir, exist_ok=True)

    for i in range(n_files):
        data = {
            # Contiguous, globally unique ids across all files.
            'id': range(i * rows_per_file, (i + 1) * rows_per_file),
            'value': np.random.randn(rows_per_file),
            'category': np.random.choice(['A', 'B', 'C', 'D'], rows_per_file),
            'score': np.random.randint(0, 100, rows_per_file),
            # Hourly timestamps; 'h' is the modern alias (uppercase 'H' is
            # deprecated since pandas 2.2).
            'date': pd.date_range('2023-01-01', periods=rows_per_file, freq='h')
        }
        df = pd.DataFrame(data)
        df.to_csv(os.path.join(output_dir, f'batch_file_{i:03d}.csv'), index=False)

    print(f"Generated {n_files} files in {output_dir}/ directory")


def generate_network_data(n_nodes=10000, filename='data/network_data.csv'):
    """Generate random weighted-edge network/graph data and save it as CSV.

    Args:
        n_nodes: Number of nodes; roughly 10 edges per node are sampled.
        filename: Output CSV path (parent directory must already exist).

    Returns:
        pandas.DataFrame: Edge list (source, target, weight, timestamp) with
        self-loops removed, so it may contain fewer than ``n_nodes * 10`` rows.
    """
    print(f"Generating network data with {n_nodes} nodes...")

    # Sample endpoints uniformly; average degree ~10 before loop removal.
    n_edges = n_nodes * 10
    data = {
        'source': np.random.randint(0, n_nodes, n_edges),
        'target': np.random.randint(0, n_nodes, n_edges),
        'weight': np.random.uniform(0, 1, n_edges),
        'timestamp': pd.date_range('2023-01-01', periods=n_edges, freq='1min')
    }

    df = pd.DataFrame(data)
    # Remove self-loops (source == target).
    df = df[df['source'] != df['target']]
    df.to_csv(filename, index=False)
    # Fixed: the f-string previously contained the literal text "(unknown)"
    # instead of interpolating the output path.
    print(f"Saved to {filename}")
    return df


def main():
    """Run every dataset generator in sequence, writing under data/."""
    banner = "=" * 60
    print("\n" + banner)
    print("Generating Advanced Sample Data for Dask Examples")
    print(banner)

    # All outputs land under data/; create it once up front.
    os.makedirs('data', exist_ok=True)

    # (label, generator call) pairs, run in order.
    steps = [
        ("1. Time Series Data",
         lambda: generate_time_series_data(n_rows=1000000)),
        ("2. Transaction Data",
         lambda: generate_transaction_data(n_rows=2000000)),
        ("3. Machine Learning Dataset",
         lambda: generate_ml_dataset(n_samples=500000, n_features=100)),
        ("4. JSON/Nested Data",
         lambda: generate_json_data(n_records=100000)),
        ("5. Multiple Files for Parallel Processing",
         lambda: generate_multiple_files(n_files=10, rows_per_file=100000)),
        ("6. Network Data",
         lambda: generate_network_data(n_nodes=10000)),
    ]
    for label, run in steps:
        print("\n" + label)
        run()

    print("\n" + banner)
    print("All datasets generated successfully!")
    print(banner)

    print("\nGenerated files:")
    outputs = [
        "data/timeseries_large.csv",
        "data/transactions_large.csv",
        "data/ml_dataset.csv",
        "data/nested_data.jsonl",
        "data/batch_file_*.csv (10 files)",
        "data/network_data.csv",
    ]
    for entry in outputs:
        print(f"  - {entry}")


# Script entry point: generate all datasets when run directly.
if __name__ == "__main__":
    main()

196 lines•6.8 KB
python

About RSK World

Founded by Molla Samser, with Designer & Tester Rima Khatun, RSK World is your one-stop destination for free programming resources, source code, and development tools.

Founder: Molla Samser
Designer & Tester: Rima Khatun

Development

  • Game Development
  • Web Development
  • Mobile Development
  • AI Development
  • Development Tools

Legal

  • Terms & Conditions
  • Privacy Policy
  • Disclaimer

Contact Info

Nutanhat, Mongolkote
Purba Burdwan, West Bengal
India, 713147

+91 93305 39277

hello@rskworld.in
support@rskworld.in

© 2026 RSK World. All rights reserved.

Content used for educational purposes only. View Disclaimer