help@rskworld.in +91 93305 39277
RSK World
  • Home
  • Development
    • Web Development
    • Mobile Apps
    • Software
    • Games
    • Project
  • Technologies
    • Data Science
    • AI Development
    • Cloud Development
    • Blockchain
    • Cyber Security
    • Dev Tools
    • Testing Tools
  • About
  • Contact

Theme Settings

Color Scheme
Display Options
Font Size
100%
Back to Project
RSK World
polars-fastdataframes
/
scripts
RSK World
polars-fastdataframes
High-performance DataFrames with Polars
scripts
  • __pycache__
  • advanced_queries.py (7 KB)
  • basic_operations.py (3 KB)
  • data_generator.py (4.2 KB)
  • lazy_evaluation.py (3.2 KB)
  • performance_comparison.py (7.3 KB)
performance_comparison.py
scripts/performance_comparison.py
Raw Download
Find: Go to:
"""
Performance Comparison: Polars vs Pandas
Compares performance of Polars and Pandas for various operations

Author: RSK World
Website: https://rskworld.in
Email: help@rskworld.in
Phone: +91 93305 39277
"""

import polars as pl
import pandas as pd
import numpy as np
import time
from datetime import datetime

def compare_filtering(num_rows=1000000):
    """Compare row-filtering performance of Polars and Pandas.

    Builds the same seeded random dataset in both libraries, filters the
    rows where ``value1 > 50`` in each, and prints the elapsed time, the
    result shape and the Pandas/Polars time ratio.

    Args:
        num_rows: Number of rows of test data to generate.

    Returns:
        A ``(polars_time, pandas_time)`` tuple of elapsed seconds.
    """
    print("=" * 60)
    print("FILTERING PERFORMANCE COMPARISON")
    print("=" * 60)

    print(f"\nGenerating {num_rows:,} rows of test data...")
    np.random.seed(42)  # fixed seed so every run sees identical data
    data = {
        'id': range(1, num_rows + 1),
        'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], num_rows),
        'value1': np.random.randn(num_rows) * 100,
        'value2': np.random.randn(num_rows) * 50,
        'value3': np.random.randint(1, 1000, num_rows)
    }

    df_polars = pl.DataFrame(data)
    df_pandas = pd.DataFrame(data)

    # Polars filtering
    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring short intervals; time.time() can be coarse or jump.
    print("\n1. Filtering (value1 > 50):")
    start = time.perf_counter()
    result_polars = df_polars.filter(pl.col('value1') > 50)
    polars_time = time.perf_counter() - start
    print(f"   Polars time: {polars_time:.4f} seconds")
    print(f"   Result shape: {result_polars.shape}")

    # Pandas filtering
    start = time.perf_counter()
    result_pandas = df_pandas[df_pandas['value1'] > 50]
    pandas_time = time.perf_counter() - start
    print(f"   Pandas time: {pandas_time:.4f} seconds")
    print(f"   Result shape: {result_pandas.shape}")

    # Guard against a zero denominator if the Polars run was too fast to time.
    speedup = pandas_time / polars_time if polars_time > 0 else 0
    print(f"\n   Speedup: {speedup:.2f}x faster with Polars")

    return polars_time, pandas_time

def compare_groupby(num_rows=1000000):
    """Compare group-by/aggregate performance of Polars and Pandas.

    Builds the same seeded random dataset in both libraries, groups by
    ``category`` and computes the mean of ``value1`` and ``value2``, the
    sum of ``value3`` and the row count per group, then prints timings,
    both result frames and the Pandas/Polars time ratio.

    Args:
        num_rows: Number of rows of test data to generate.

    Returns:
        A ``(polars_time, pandas_time)`` tuple of elapsed seconds.
    """
    print("\n" + "=" * 60)
    print("GROUP BY PERFORMANCE COMPARISON")
    print("=" * 60)

    print(f"\nGenerating {num_rows:,} rows of test data...")
    np.random.seed(42)  # fixed seed so every run sees identical data
    data = {
        'id': range(1, num_rows + 1),
        'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], num_rows),
        'value1': np.random.randn(num_rows) * 100,
        'value2': np.random.randn(num_rows) * 50,
        'value3': np.random.randint(1, 1000, num_rows)
    }

    df_polars = pl.DataFrame(data)
    df_pandas = pd.DataFrame(data)

    # Argument-less pl.count() is deprecated in newer Polars releases in
    # favour of pl.len(); fall back to pl.count() where pl.len() is missing.
    row_count = pl.len() if hasattr(pl, 'len') else pl.count()

    # Polars group by
    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring short intervals; time.time() can be coarse or jump.
    print("\n1. Group By and Aggregate:")
    start = time.perf_counter()
    result_polars = df_polars.group_by('category').agg([
        pl.col('value1').mean().alias('avg_value1'),
        pl.col('value2').mean().alias('avg_value2'),
        pl.col('value3').sum().alias('sum_value3'),
        row_count.alias('count')
    ])
    polars_time = time.perf_counter() - start
    print(f"   Polars time: {polars_time:.4f} seconds")
    print(result_polars)

    # Pandas group by
    start = time.perf_counter()
    # Build the groupby once and reuse it for both the aggregation and the
    # per-group size, so the grouping is not set up twice inside the timing.
    grouped = df_pandas.groupby('category')
    result_pandas = grouped.agg({
        'value1': 'mean',
        'value2': 'mean',
        'value3': 'sum'
    }).reset_index()
    result_pandas['count'] = grouped.size().values
    pandas_time = time.perf_counter() - start
    print(f"\n   Pandas time: {pandas_time:.4f} seconds")
    print(result_pandas)

    # Guard against a zero denominator if the Polars run was too fast to time.
    speedup = pandas_time / polars_time if polars_time > 0 else 0
    print(f"\n   Speedup: {speedup:.2f}x faster with Polars")

    return polars_time, pandas_time

def compare_lazy_evaluation(num_rows=1000000):
    """Compare a Polars lazy query against the equivalent eager Pandas code.

    Builds the same seeded random dataset in both libraries, keeps rows
    with ``value1 > 50`` and ``value2 < 20``, groups by ``category`` and
    computes the mean of ``value1`` plus the row count per group. Prints
    timings, both result frames and the Pandas/Polars time ratio.

    Args:
        num_rows: Number of rows of test data to generate.

    Returns:
        A ``(polars_lazy_time, pandas_time)`` tuple of elapsed seconds.
    """
    print("\n" + "=" * 60)
    print("LAZY EVALUATION PERFORMANCE COMPARISON")
    print("=" * 60)

    print(f"\nGenerating {num_rows:,} rows of test data...")
    np.random.seed(42)  # fixed seed so every run sees identical data
    data = {
        'id': range(1, num_rows + 1),
        'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], num_rows),
        'value1': np.random.randn(num_rows) * 100,
        'value2': np.random.randn(num_rows) * 50,
        'value3': np.random.randint(1, 1000, num_rows)
    }

    df_polars = pl.DataFrame(data)
    df_pandas = pd.DataFrame(data)

    # Argument-less pl.count() is deprecated in newer Polars releases in
    # favour of pl.len(); fall back to pl.count() where pl.len() is missing.
    row_count = pl.len() if hasattr(pl, 'len') else pl.count()

    # Polars with lazy evaluation
    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring short intervals; time.time() can be coarse or jump.
    print("\n1. Complex Query with Lazy Evaluation:")
    start = time.perf_counter()
    result_polars_lazy = (df_polars.lazy()
        .filter(pl.col('value1') > 50)
        .filter(pl.col('value2') < 20)
        .group_by('category')
        .agg([
            pl.col('value1').mean().alias('avg_value1'),
            row_count.alias('count')
        ])
        .collect()  # the timed region includes building and executing the query
    )
    polars_lazy_time = time.perf_counter() - start
    print(f"   Polars lazy time: {polars_lazy_time:.4f} seconds")
    print(result_polars_lazy)

    # Pandas equivalent
    start = time.perf_counter()
    filtered = df_pandas[(df_pandas['value1'] > 50) & (df_pandas['value2'] < 20)]
    # Build the groupby once and reuse it for both the mean and the size.
    grouped = filtered.groupby('category')
    result_pandas = grouped.agg({
        'value1': 'mean'
    }).reset_index()
    result_pandas['count'] = grouped.size().values
    pandas_time = time.perf_counter() - start
    print(f"\n   Pandas time: {pandas_time:.4f} seconds")
    print(result_pandas)

    # Guard against a zero denominator if the Polars run was too fast to time.
    speedup = pandas_time / polars_lazy_time if polars_lazy_time > 0 else 0
    print(f"\n   Speedup: {speedup:.2f}x faster with Polars lazy evaluation")

    return polars_lazy_time, pandas_time

def compare_memory_usage(num_rows=1000000):
    """Compare the in-memory size of the same data in Polars and Pandas.

    Builds the same seeded random dataset in both libraries and prints the
    size reported by ``DataFrame.estimated_size()`` (Polars) and by
    ``DataFrame.memory_usage(deep=True)`` (Pandas), both converted to MB,
    followed by the relative difference.

    Args:
        num_rows: Number of rows of test data to generate.

    Returns:
        A ``(polars_memory, pandas_memory)`` tuple of sizes in MB, matching
        the tuple-returning style of the other ``compare_*`` functions.
    """
    print("\n" + "=" * 60)
    print("MEMORY USAGE COMPARISON")
    print("=" * 60)

    print(f"\nGenerating {num_rows:,} rows of test data...")
    np.random.seed(42)  # fixed seed so every run sees identical data
    data = {
        'id': range(1, num_rows + 1),
        'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], num_rows),
        'value1': np.random.randn(num_rows) * 100,
        'value2': np.random.randn(num_rows) * 50,
        'value3': np.random.randint(1, 1000, num_rows)
    }

    df_polars = pl.DataFrame(data)
    df_pandas = pd.DataFrame(data)

    # Memory usage
    bytes_per_mb = 1024 * 1024
    polars_memory = df_polars.estimated_size() / bytes_per_mb  # MB
    pandas_memory = df_pandas.memory_usage(deep=True).sum() / bytes_per_mb  # MB

    print(f"\nPolars memory usage: {polars_memory:.2f} MB")
    print(f"Pandas memory usage: {pandas_memory:.2f} MB")

    if pandas_memory > 0:
        efficiency = (1 - polars_memory / pandas_memory) * 100
        if efficiency >= 0:
            print(f"Memory efficiency: {efficiency:.1f}% less memory with Polars")
        else:
            # Polars used more memory; avoid printing a negative "less" figure.
            print(f"Memory efficiency: {-efficiency:.1f}% more memory with Polars")

    return polars_memory, pandas_memory

def run_all_comparisons(num_rows=100000):
    """Run every Polars-vs-Pandas comparison in sequence.

    Prints a banner, runs the filtering, group-by, lazy-evaluation and
    memory comparisons on datasets of ``num_rows`` rows, then prints a
    closing summary.

    Args:
        num_rows: Number of rows generated for each comparison. Defaults to
            100,000, a reduced size chosen for a faster demo run.
    """
    print("\n" + "=" * 60)
    print("POLARS VS PANDAS PERFORMANCE COMPARISON")
    print("=" * 60)
    # Report the row count actually used rather than a hard-coded figure.
    print(f"\nNote: Using {num_rows:,} rows for testing")
    print("Adjust num_rows parameter for different dataset sizes\n")

    compare_filtering(num_rows)
    compare_groupby(num_rows)
    compare_lazy_evaluation(num_rows)
    compare_memory_usage(num_rows)

    print("\n" + "=" * 60)
    print("Performance comparison complete!")
    print("=" * 60)
    # General claims printed as-is; they are not derived from this run's timings.
    print("\nKey Takeaways:")
    print("- Polars is typically 5-30x faster than Pandas")
    print("- Polars uses less memory due to Apache Arrow format")
    print("- Lazy evaluation provides additional optimization")
    print("- Polars is ideal for large-scale data processing")

# Run the full comparison suite only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_all_comparisons()

215 lines • 7.3 KB
python

About RSK World

Founded by Molla Samser, with Designer & Tester Rima Khatun, RSK World is your one-stop destination for free programming resources, source code, and development tools.

Founder: Molla Samser
Designer & Tester: Rima Khatun

Development

  • Game Development
  • Web Development
  • Mobile Development
  • AI Development
  • Development Tools

Legal

  • Terms & Conditions
  • Privacy Policy
  • Disclaimer

Contact Info

Nutanhat, Mongolkote
Purba Burdwan, West Bengal
India, 713147

+91 93305 39277

hello@rskworld.in
support@rskworld.in

© 2026 RSK World. All rights reserved.

Content used for educational purposes only. View Disclaimer