help@rskworld.in +91 93305 39277
RSK World
  • Home
  • Development
    • Web Development
    • Mobile Apps
    • Software
    • Games
    • Project
  • Technologies
    • Data Science
    • AI Development
    • Cloud Development
    • Blockchain
    • Cyber Security
    • Dev Tools
    • Testing Tools
  • About
  • Contact

Theme Settings

Color Scheme
Display Options
Font Size
100%
Back to Project
RSK World
polars-fastdataframes
/
scripts
RSK World
polars-fastdataframes
High-performance DataFrames with Polars
scripts
  • __pycache__
  • advanced_queries.py (7 KB)
  • basic_operations.py (3 KB)
  • data_generator.py (4.2 KB)
  • lazy_evaluation.py (3.2 KB)
  • performance_comparison.py (7.3 KB)
data_generator.py
scripts/data_generator.py
Raw Download
Find: Go to:
"""
Data Generator for Polars Fast DataFrames
Generates sample datasets for demonstration purposes

Author: RSK World
Website: https://rskworld.in
Email: help@rskworld.in
Phone: +91 93305 39277
"""

import polars as pl
import numpy as np
import pandas as pd
import os
from datetime import datetime, timedelta
import random

def generate_large_dataset(num_rows=1000000):
    """
    Generate a large dataset for performance testing

    The result has eight columns: ``id`` (1..num_rows), ``date`` (one day
    per row starting at 2020-01-01), ``category`` (one of 'A'-'E'),
    ``value1`` and ``value2`` (standard-normal draws scaled by 100 and 50),
    ``value3`` (integers from ``np.random.randint(1, 1000)``), ``status``
    ('active', 'inactive' or 'pending') and ``score`` (uniform draws between
    0 and 100).

    NumPy's and the stdlib's global random generators are re-seeded with 42
    on every call, so repeated calls with the same ``num_rows`` produce the
    same data.

    Args:
        num_rows: Number of rows to generate

    Returns:
        polars.DataFrame: Generated DataFrame
    """
    print(f"Generating {num_rows:,} rows of sample data...")

    # Generate random data
    np.random.seed(42)
    random.seed(42)

    start = datetime(2020, 1, 1)
    # datetime cannot represent anything past datetime.max (year 9999), so an
    # unbounded day offset raises OverflowError once num_rows exceeds roughly
    # 2.9 million. Wrapping the offset keeps every date representable while
    # leaving the output unchanged for row counts that already worked.
    day_span = (datetime.max - start).days + 1
    dates = [start + timedelta(days=x % day_span) for x in range(num_rows)]

    # Dict order fixes both the column order and the order of random draws.
    data = {
        'id': range(1, num_rows + 1),
        'date': dates,
        'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], num_rows),
        'value1': np.random.randn(num_rows) * 100,
        'value2': np.random.randn(num_rows) * 50,
        'value3': np.random.randint(1, 1000, num_rows),
        'status': np.random.choice(['active', 'inactive', 'pending'], num_rows),
        'score': np.random.uniform(0, 100, num_rows)
    }

    df = pl.DataFrame(data)
    print(f"Generated DataFrame with shape: {df.shape}")
    return df

def generate_sample_data(num_rows=10000):
    """
    Build a small product-style dataset for basic demonstrations

    Columns, in order: ``id``, ``name`` ('Product_<id>'), ``date``,
    ``category``, ``price``, ``quantity``, ``rating`` and ``in_stock``.
    The global NumPy and stdlib random generators are seeded with 42 before
    any values are drawn.

    Args:
        num_rows: Number of rows to generate

    Returns:
        polars.DataFrame: Generated DataFrame
    """
    print(f"Generating {num_rows:,} rows of sample data...")

    np.random.seed(42)
    random.seed(42)

    row_ids = range(1, num_rows + 1)

    # Dates cycle through 365 consecutive days beginning 2023-01-01, so the
    # distinct values are built once and then looked up per row.
    first_day = datetime(2023, 1, 1)
    calendar = [first_day + timedelta(days=offset) for offset in range(365)]
    dates = [calendar[row % 365] for row in range(num_rows)]

    # The random columns are drawn in this exact order so that the seeded
    # generator yields the same values for each column on every run.
    categories = np.random.choice(
        ['Electronics', 'Clothing', 'Food', 'Books', 'Toys'], num_rows
    )
    prices = np.random.uniform(10, 1000, num_rows)
    quantities = np.random.randint(1, 100, num_rows)
    ratings = np.random.uniform(1, 5, num_rows)
    stock_flags = np.random.choice([True, False], num_rows)

    df = pl.DataFrame({
        'id': row_ids,
        'name': [f'Product_{i}' for i in row_ids],
        'date': dates,
        'category': categories,
        'price': prices,
        'quantity': quantities,
        'rating': ratings,
        'in_stock': stock_flags,
    })
    print(f"Generated DataFrame with shape: {df.shape}")
    return df

def save_to_csv(df, filename=None):
    """
    Save DataFrame to CSV file

    When no filename is given, the file is written to
    ``data/sample_data.csv`` under the parent of this script's directory,
    and that ``data`` directory is created first if it is missing. An
    explicitly supplied filename is used as-is; its parent directory is not
    created.

    Args:
        df: Polars DataFrame
        filename: Output filename (defaults to data/sample_data.csv relative to project root)
    """
    if filename is None:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        project_root = os.path.dirname(script_dir)
        data_dir = os.path.join(project_root, 'data')
        # The default location may not exist yet (e.g. on a fresh checkout);
        # create it so the documented default works without prior setup.
        os.makedirs(data_dir, exist_ok=True)
        filename = os.path.join(data_dir, 'sample_data.csv')
    df.write_csv(filename)
    print(f"Data saved to {filename}")

def save_to_parquet(df, filename=None):
    """
    Save DataFrame to Parquet file

    When no filename is given, the file is written to
    ``data/sample_data.parquet`` under the parent of this script's
    directory, and that ``data`` directory is created first if it is
    missing. An explicitly supplied filename is used as-is; its parent
    directory is not created.

    Args:
        df: Polars DataFrame
        filename: Output filename (defaults to data/sample_data.parquet relative to project root)
    """
    if filename is None:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        project_root = os.path.dirname(script_dir)
        data_dir = os.path.join(project_root, 'data')
        # The default location may not exist yet (e.g. on a fresh checkout);
        # create it so the documented default works without prior setup.
        os.makedirs(data_dir, exist_ok=True)
        filename = os.path.join(data_dir, 'sample_data.parquet')
    df.write_parquet(filename)
    print(f"Data saved to {filename}")

if __name__ == "__main__":
    # Output goes to <project root>/data, where the project root is the
    # parent of the directory containing this script.
    here = os.path.abspath(__file__)
    data_dir = os.path.join(os.path.dirname(os.path.dirname(here)), 'data')

    # Ensure data directory exists
    os.makedirs(data_dir, exist_ok=True)

    # Each job: (generator, row count, CSV file name, Parquet file name).
    # The small demo set is produced first, then the large dataset used for
    # performance testing.
    jobs = (
        (generate_sample_data, 10000,
         'sample_data.csv', 'sample_data.parquet'),
        (generate_large_dataset, 1000000,
         'large_dataset.csv', 'large_dataset.parquet'),
    )
    for build, row_count, csv_name, parquet_name in jobs:
        frame = build(row_count)
        save_to_csv(frame, os.path.join(data_dir, csv_name))
        save_to_parquet(frame, os.path.join(data_dir, parquet_name))

    print("\nData generation complete!")

133 lines • 4.2 KB
python

About RSK World

Founded by Molla Samser, with Designer & Tester Rima Khatun, RSK World is your one-stop destination for free programming resources, source code, and development tools.

Founder: Molla Samser
Designer & Tester: Rima Khatun

Development

  • Game Development
  • Web Development
  • Mobile Development
  • AI Development
  • Development Tools

Legal

  • Terms & Conditions
  • Privacy Policy
  • Disclaimer

Contact Info

Nutanhat, Mongolkote
Purba Burdwan, West Bengal
India, 713147

+91 93305 39277

hello@rskworld.in
support@rskworld.in

© 2026 RSK World. All rights reserved.

Content used for educational purposes only. View Disclaimer