RSK World - Polars Fast DataFrames - Project Files | RSK World - Free Programming Resources & Source Code

data_generator.py transfer_learning.py generate_data_standalone.py transformers.py performance_comparison.py

scripts/performance_comparison.py

"""
Performance Comparison: Polars vs Pandas
Compares performance of Polars and Pandas for various operations

Author: RSK World
Website: https://rskworld.in
Email: help@rskworld.in
Phone: +91 93305 39277
"""

import polars as pl
import pandas as pd
import numpy as np
import time
from datetime import datetime

def compare_filtering(num_rows=1000000):
    """Compare row-filtering performance of Polars and Pandas.

    Builds the same synthetic dataset (NumPy seed 42) as a Polars and a
    Pandas DataFrame, times one ``value1 > 50`` filter in each library and
    prints the timings, result shapes and the Pandas/Polars time ratio.

    Each operation is timed once, without warm-up, so the numbers are
    indicative rather than statistically robust.

    Args:
        num_rows: Number of rows of test data to generate.

    Returns:
        Tuple ``(polars_time, pandas_time)`` of elapsed seconds.
    """
    print("=" * 60)
    print("FILTERING PERFORMANCE COMPARISON")
    print("=" * 60)

    print(f"\nGenerating {num_rows:,} rows of test data...")
    np.random.seed(42)
    data = {
        'id': range(1, num_rows + 1),
        'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], num_rows),
        'value1': np.random.randn(num_rows) * 100,
        'value2': np.random.randn(num_rows) * 50,
        'value3': np.random.randint(1, 1000, num_rows)
    }

    df_polars = pl.DataFrame(data)
    df_pandas = pd.DataFrame(data)

    # Polars filtering.
    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock whose coarse resolution can report 0 for sub-millisecond work.
    print("\n1. Filtering (value1 > 50):")
    start = time.perf_counter()
    result_polars = df_polars.filter(pl.col('value1') > 50)
    polars_time = time.perf_counter() - start
    print(f"   Polars time: {polars_time:.4f} seconds")
    print(f"   Result shape: {result_polars.shape}")

    # Pandas filtering
    start = time.perf_counter()
    result_pandas = df_pandas[df_pandas['value1'] > 50]
    pandas_time = time.perf_counter() - start
    print(f"   Pandas time: {pandas_time:.4f} seconds")
    print(f"   Result shape: {result_pandas.shape}")

    # Guard against division by zero if the timer reports no elapsed time.
    speedup = pandas_time / polars_time if polars_time > 0 else 0
    print(f"\n   Speedup: {speedup:.2f}x faster with Polars")

    return polars_time, pandas_time

def compare_groupby(num_rows=1000000):
    """Compare group-by/aggregate performance of Polars and Pandas.

    Builds the same synthetic dataset (NumPy seed 42) in both libraries,
    groups by ``category`` and computes the mean of ``value1`` and
    ``value2``, the sum of ``value3`` and the row count per group. Prints
    both results, the timings and the Pandas/Polars time ratio.

    Each operation is timed once, without warm-up, so the numbers are
    indicative rather than statistically robust.

    Args:
        num_rows: Number of rows of test data to generate.

    Returns:
        Tuple ``(polars_time, pandas_time)`` of elapsed seconds.
    """
    print("\n" + "=" * 60)
    print("GROUP BY PERFORMANCE COMPARISON")
    print("=" * 60)

    print(f"\nGenerating {num_rows:,} rows of test data...")
    np.random.seed(42)
    data = {
        'id': range(1, num_rows + 1),
        'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], num_rows),
        'value1': np.random.randn(num_rows) * 100,
        'value2': np.random.randn(num_rows) * 50,
        'value3': np.random.randint(1, 1000, num_rows)
    }

    df_polars = pl.DataFrame(data)
    df_pandas = pd.DataFrame(data)

    # Polars group by.
    # perf_counter() is monotonic and high-resolution, unlike time.time().
    print("\n1. Group By and Aggregate:")
    start = time.perf_counter()
    result_polars = df_polars.group_by('category').agg([
        pl.col('value1').mean().alias('avg_value1'),
        pl.col('value2').mean().alias('avg_value2'),
        pl.col('value3').sum().alias('sum_value3'),
        pl.count().alias('count')
    ])
    polars_time = time.perf_counter() - start
    print(f"   Polars time: {polars_time:.4f} seconds")
    print(result_polars)

    # Pandas group by.
    # One named-aggregation pass computes the count with the other
    # aggregates, so Pandas is not charged for a second groupby and the
    # count is aligned by group key rather than by position.
    start = time.perf_counter()
    result_pandas = df_pandas.groupby('category').agg(
        value1=('value1', 'mean'),
        value2=('value2', 'mean'),
        value3=('value3', 'sum'),
        count=('value1', 'size'),
    ).reset_index()
    pandas_time = time.perf_counter() - start
    print(f"\n   Pandas time: {pandas_time:.4f} seconds")
    print(result_pandas)

    # Guard against division by zero if the timer reports no elapsed time.
    speedup = pandas_time / polars_time if polars_time > 0 else 0
    print(f"\n   Speedup: {speedup:.2f}x faster with Polars")

    return polars_time, pandas_time

def compare_lazy_evaluation(num_rows=1000000):
    """Compare a Polars lazy query against the equivalent Pandas code.

    Builds the same synthetic dataset (NumPy seed 42) in both libraries and
    runs a filter (``value1 > 50`` and ``value2 < 20``) followed by a
    group-by on ``category`` that computes the mean of ``value1`` and the
    row count. The Polars side goes through ``lazy()`` / ``collect()``.
    Prints both results, the timings and the Pandas/Polars time ratio.

    Each operation is timed once, without warm-up, so the numbers are
    indicative rather than statistically robust.

    Args:
        num_rows: Number of rows of test data to generate.

    Returns:
        Tuple ``(polars_lazy_time, pandas_time)`` of elapsed seconds.
    """
    print("\n" + "=" * 60)
    print("LAZY EVALUATION PERFORMANCE COMPARISON")
    print("=" * 60)

    print(f"\nGenerating {num_rows:,} rows of test data...")
    np.random.seed(42)
    data = {
        'id': range(1, num_rows + 1),
        'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], num_rows),
        'value1': np.random.randn(num_rows) * 100,
        'value2': np.random.randn(num_rows) * 50,
        'value3': np.random.randint(1, 1000, num_rows)
    }

    df_polars = pl.DataFrame(data)
    df_pandas = pd.DataFrame(data)

    # Polars with lazy evaluation.
    # perf_counter() is monotonic and high-resolution, unlike time.time().
    print("\n1. Complex Query with Lazy Evaluation:")
    start = time.perf_counter()
    result_polars_lazy = (df_polars.lazy()
        .filter(pl.col('value1') > 50)
        .filter(pl.col('value2') < 20)
        .group_by('category')
        .agg([
            pl.col('value1').mean().alias('avg_value1'),
            pl.count().alias('count')
        ])
        .collect()
    )
    polars_lazy_time = time.perf_counter() - start
    print(f"   Polars lazy time: {polars_lazy_time:.4f} seconds")
    print(result_polars_lazy)

    # Pandas equivalent.
    # One named-aggregation pass yields both the mean and the group size,
    # so Pandas is not charged for a second groupby and the count is
    # aligned by group key rather than by position.
    start = time.perf_counter()
    filtered = df_pandas[(df_pandas['value1'] > 50) & (df_pandas['value2'] < 20)]
    result_pandas = filtered.groupby('category').agg(
        value1=('value1', 'mean'),
        count=('value1', 'size'),
    ).reset_index()
    pandas_time = time.perf_counter() - start
    print(f"\n   Pandas time: {pandas_time:.4f} seconds")
    print(result_pandas)

    # Guard against division by zero if the timer reports no elapsed time.
    speedup = pandas_time / polars_lazy_time if polars_lazy_time > 0 else 0
    print(f"\n   Speedup: {speedup:.2f}x faster with Polars lazy evaluation")

    return polars_lazy_time, pandas_time

def compare_memory_usage(num_rows=1000000):
    """Compare the in-memory size of identical Polars and Pandas DataFrames.

    Builds the same synthetic dataset (NumPy seed 42) in both libraries and
    prints each frame's size in MB: ``estimated_size()`` for Polars and
    ``memory_usage(deep=True).sum()`` for Pandas. Also prints the relative
    difference, worded correctly whichever library is smaller.

    Args:
        num_rows: Number of rows of test data to generate.

    Returns:
        Tuple ``(polars_memory, pandas_memory)`` in MB, matching the
        tuple-returning style of the other comparison functions.
    """
    print("\n" + "=" * 60)
    print("MEMORY USAGE COMPARISON")
    print("=" * 60)

    print(f"\nGenerating {num_rows:,} rows of test data...")
    np.random.seed(42)
    data = {
        'id': range(1, num_rows + 1),
        'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], num_rows),
        'value1': np.random.randn(num_rows) * 100,
        'value2': np.random.randn(num_rows) * 50,
        'value3': np.random.randint(1, 1000, num_rows)
    }

    df_polars = pl.DataFrame(data)
    df_pandas = pd.DataFrame(data)

    # Memory usage, converted from bytes to MB
    bytes_per_mb = 1024 * 1024
    polars_memory = df_polars.estimated_size() / bytes_per_mb
    pandas_memory = df_pandas.memory_usage(deep=True).sum() / bytes_per_mb

    print(f"\nPolars memory usage: {polars_memory:.2f} MB")
    print(f"Pandas memory usage: {pandas_memory:.2f} MB")

    if pandas_memory > 0:
        efficiency = (1 - polars_memory / pandas_memory) * 100
        if efficiency >= 0:
            print(f"Memory efficiency: {efficiency:.1f}% less memory with Polars")
        else:
            # Polars used more memory; avoid printing "-X% less memory".
            print(f"Memory efficiency: {-efficiency:.1f}% more memory with Polars")

    return polars_memory, pandas_memory

def run_all_comparisons(num_rows=100000):
    """Run all performance comparisons and print a summary.

    Calls the filtering, group-by, lazy-evaluation and memory comparisons
    in turn with the same dataset size, then prints general takeaways.

    Args:
        num_rows: Number of rows each comparison generates. The default of
            100,000 keeps the demo fast; the comparison functions default
            to 1,000,000 when called directly.
    """
    print("\n" + "=" * 60)
    print("POLARS VS PANDAS PERFORMANCE COMPARISON")
    print("=" * 60)
    # Report the size actually used; the banner used to claim 1,000,000
    # rows while the run used 100,000.
    print(f"\nNote: Using {num_rows:,} rows for testing")
    print("Adjust num_rows parameter for different dataset sizes\n")

    compare_filtering(num_rows)
    compare_groupby(num_rows)
    compare_lazy_evaluation(num_rows)
    compare_memory_usage(num_rows)

    print("\n" + "=" * 60)
    print("Performance comparison complete!")
    print("=" * 60)
    print("\nKey Takeaways:")
    print("- Polars is typically 5-30x faster than Pandas")
    print("- Polars uses less memory due to Apache Arrow format")
    print("- Lazy evaluation provides additional optimization")
    print("- Polars is ideal for large-scale data processing")

# Run the full comparison suite only when executed as a script, not on import.
if __name__ == "__main__":
    run_all_comparisons()

215 lines•7.3 KB

python

Theme Settings

Color Scheme

Display Options

Font Size