Skip to content

polars_vs_pandas #682

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file not shown.
Binary file not shown.
205 changes: 205 additions & 0 deletions polars-vs-pandas/DataFrame_Plots.ipynb

Large diffs are not rendered by default.

Binary file added polars-vs-pandas/Online_Retail.parquet
Binary file not shown.
14 changes: 14 additions & 0 deletions polars-vs-pandas/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
The materials contained in this download are designed to complement the Real Python tutorial [Polars vs pandas - What's the Difference](https://realpython.com/polars-vs-pandas-difference/).

You should create a new folder named `polars-vs-pandas` on your computer and place each of these files inside it. You may also consider creating a [Python virtual environment](https://realpython.com/python-virtual-environments-a-primer/) within this folder.

Your download bundle contains the following files:

- `Online_Retail.parquet` - This Parquet file contains retail data used in some of the queries.
- `data_generation.py` - This script contains the `generate_data()` function used to generate different quantities of data.
- `dataframe_and_lazyframe_time_tests.py` - This script performs time tests for DataFrames and a LazyFrame.
- `streaming_test.py` - This script performs time tests for a LazyFrame with streaming enabled.
- `dataframe_conversions.py` - This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example.
- `sample_pandas_and_polars_code.py` - This file contains the code used to illustrate the differences between pandas and Polars syntax.
- `DataFrame_Plots.ipynb` - This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities.
19 changes: 19 additions & 0 deletions polars-vs-pandas/data_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import numpy as np


def generate_data(number_of_rows, seed=None):
    """Generate random sales records as a dictionary of columns.

    Args:
        number_of_rows: Number of records to generate.
        seed: Optional seed passed to ``np.random.default_rng``. Leave it
            as ``None`` (the default) for fresh random data on every call,
            or pass a value to get the same data on every call.

    Returns:
        A dict with the keys ``order_id``, ``region``, ``sales_person``,
        ``product`` and ``sales_income``. ``order_id`` is a ``range``
        counting from 1; every other value is a NumPy array of length
        ``number_of_rows``.
    """
    rng = np.random.default_rng(seed)

    return {
        "order_id": range(1, number_of_rows + 1),
        "region": rng.choice(
            ["North", "South", "East", "West"], size=number_of_rows
        ),
        "sales_person": rng.choice(
            ["Armstrong", "Aldrin", "Collins"], size=number_of_rows
        ),
        "product": rng.choice(
            ["Helmet", "Oxygen", "Boots", "Gloves"], size=number_of_rows
        ),
        # The upper bound of rng.integers is exclusive, so values are 1..5000.
        "sales_income": rng.integers(1, 5001, size=number_of_rows),
    }
93 changes: 93 additions & 0 deletions polars-vs-pandas/dataframe_and_lazyframe_time_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import functools
import sys
from timeit import Timer

import pandas as pd
import polars as pl
from data_generation import generate_data


def create_pandas_dataframe(test_data):
return pd.DataFrame(test_data).convert_dtypes(dtype_backend="pyarrow")


def create_polars_dataframe(test_data):
    """Build and return a Polars DataFrame from ``test_data``."""
    eager_frame = pl.DataFrame(test_data)
    return eager_frame


def create_polars_lazyframe(test_data):
    """Build and return a Polars LazyFrame from ``test_data``."""
    lazy_frame = pl.LazyFrame(test_data)
    return lazy_frame


def analyze_pandas_dataframe(pandas_df):
    """Sum ``sales_income`` per region, product and sales person.

    Returns the pandas Series produced by the grouped sum, indexed by the
    three grouping columns.
    """
    group_keys = ["region", "product", "sales_person"]
    grouped_income = pandas_df.groupby(group_keys)["sales_income"]
    return grouped_income.sum()


def analyze_polars_dataframe(polars_df):
    """Sum ``sales_income`` per region, product and sales person.

    The aggregated column in the returned frame is named ``total_sales``.
    """
    group_keys = ["region", "product", "sales_person"]
    income_total = pl.col("sales_income").sum()
    return polars_df.group_by(group_keys).agg(total_sales=income_total)


def analyze_polars_lazyframe(polars_lf):
    """Sum ``sales_income`` per region, product and sales person.

    The query is built on the LazyFrame and then collected, so the
    collected result is returned. The aggregated column is named
    ``total_sales``.
    """
    group_keys = ["region", "product", "sales_person"]
    income_total = pl.col("sales_income").sum()
    lazy_query = polars_lf.group_by(group_keys).agg(total_sales=income_total)
    return lazy_query.collect()


# Parse the row count once, and exit with a usage hint rather than a bare
# traceback when the argument is missing or is not an integer.
try:
    number_of_rows = int(sys.argv[1])
except (IndexError, ValueError):
    sys.exit(f"Usage: python {sys.argv[0]} <number_of_rows>")


def _time_100_calls(function, argument):
    """Return the total time timeit reports for 100 calls of function(argument)."""
    return Timer(functools.partial(function, argument)).timeit(100)


test_data = generate_data(number_of_rows)

# Time the construction of each frame type from the same input data.
print("Creating Dataframes...")
print(f"Pandas dataframe creation time for {number_of_rows} rows:")
print(_time_100_calls(create_pandas_dataframe, test_data))
print(f"\nPolars dataframe creation time for {number_of_rows} rows:")
print(_time_100_calls(create_polars_dataframe, test_data))
print(f"\nPolars lazyframe creation time for {number_of_rows} rows:")
print(_time_100_calls(create_polars_lazyframe, test_data))

# Build each frame once so the analysis timings exclude construction.
pandas_df = create_pandas_dataframe(test_data)
polars_df = create_polars_dataframe(test_data)
polars_lf = create_polars_lazyframe(test_data)

print("\nAnalyzing Dataframes...")
print(f"Pandas dataframe analysis time for {number_of_rows} rows:")
print(_time_100_calls(analyze_pandas_dataframe, pandas_df))

print(f"\nPolars dataframe analysis time for {number_of_rows} rows:")
print(_time_100_calls(analyze_polars_dataframe, polars_df))

print(f"\nPolars lazyframe analysis time for {number_of_rows} rows:")
print(_time_100_calls(analyze_polars_lazyframe, polars_lf))

# Print the same slice of each result so the outputs can be compared.
print("\nShow Boots sales in the East region for pandas DataFrame")
print(analyze_pandas_dataframe(pandas_df)["East"]["Boots"])

print("\nShow Boots sales in the East region for polars DataFrame")
print(
    analyze_polars_dataframe(polars_df).filter(
        pl.col("region") == "East",
        pl.col("product") == "Boots",
    )
)

# This label used to say "pandas LazyFrame"; the LazyFrame is a Polars object.
print("\nShow Boots sales in the East region for polars LazyFrame")
print(
    analyze_polars_lazyframe(polars_lf).filter(
        pl.col("region") == "East",
        pl.col("product") == "Boots",
    )
)
32 changes: 32 additions & 0 deletions polars-vs-pandas/sample_pandas_and_polars_code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import pandas as pd
import polars as pl

# The three snippets below express the same query in different syntaxes:
# add a total (Quantity * UnitPrice), keep four columns, keep rows whose
# total exceeds 100, and take the first three rows.
#
# The file name matches the bundled "Online_Retail.parquet" exactly, so the
# reads also work on case-sensitive filesystems.

# Pandas index-based syntax
orders_pandas = pd.read_parquet("Online_Retail.parquet")

orders_pandas["Total"] = orders_pandas["Quantity"] * orders_pandas["UnitPrice"]

orders_pandas[["InvoiceNo", "Quantity", "UnitPrice", "Total"]][
    orders_pandas["Total"] > 100
].head(3)

# Pandas method chaining syntax
orders_pandas = pd.read_parquet("Online_Retail.parquet")

(
    orders_pandas.assign(
        Total=orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
    )
    .filter(["InvoiceNo", "Quantity", "UnitPrice", "Total"])
    .query("Total > 100")
).head(3)

# Polars method chaining syntax
orders_polars = pl.read_parquet("Online_Retail.parquet")

(
    orders_polars.select(
        pl.col(["InvoiceNo", "Quantity", "UnitPrice"]),
        total=pl.col("Quantity") * pl.col("UnitPrice"),
    ).filter(pl.col("total") > 100)
).head(3)
37 changes: 37 additions & 0 deletions polars-vs-pandas/streaming_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import functools
import sys
from timeit import Timer

import polars as pl
from data_generation import generate_data


def create_polars_lazyframe(test_data):
    """Build and return a Polars LazyFrame from ``test_data``."""
    lazy_frame = pl.LazyFrame(test_data)
    return lazy_frame


def analyze_polars_lazyframe(polars_lf):
    """Sum ``sales_income`` per region, product and sales person.

    The query is collected with the default ``collect()`` call and the
    collected result is returned, so callers can inspect it. Callers that
    only time the call can ignore the return value.
    """
    return (
        polars_lf.group_by(["region", "product", "sales_person"])
        .agg(total_sales=pl.col("sales_income").sum())
        .collect()
    )


def analyze_polars_streaming(polars_lf):
    """Sum ``sales_income`` per region, product and sales person.

    Runs the same query as ``analyze_polars_lazyframe`` but collects it with
    ``engine="streaming"``. The collected result is returned so callers can
    inspect it. Callers that only time the call can ignore the return value.
    """
    return (
        polars_lf.group_by(["region", "product", "sales_person"])
        .agg(total_sales=pl.col("sales_income").sum())
        .collect(engine="streaming")
    )


# Parse the row count once, and exit with a usage hint rather than a bare
# traceback when the argument is missing or is not an integer.
try:
    number_of_rows = int(sys.argv[1])
except (IndexError, ValueError):
    sys.exit(f"Usage: python {sys.argv[0]} <number_of_rows>")

test_data = generate_data(number_of_rows)

# Both timings run against the same LazyFrame; only the collect call differs.
polars_lf = create_polars_lazyframe(test_data)

print(f"Polars lazyframe analysis time for {number_of_rows} rows:")
print(
    Timer(functools.partial(analyze_polars_lazyframe, polars_lf)).timeit(100)
)

print(f"\nPolars streaming analysis time for {number_of_rows} rows:")
print(
    Timer(functools.partial(analyze_polars_streaming, polars_lf)).timeit(100)
)