realpython · eyrei123 · Jul 2, 2025 · Jul 2, 2025 · Jul 2, 2025 · Jul 2, 2025
diff --git a/...ediaFmultiuploaddufevrierFeltonjohnFfreFRFeltonjohnEjpgxqualityeltonjohnjpg.jpg b/...ediaFmultiuploaddufevrierFeltonjohnFfreFRFeltonjohnEjpgxqualityeltonjohnjpg.jpg
diff --git a/...ediaFmultiuploaddufevrierFeltonjohnFfreFRFeltonjohnEjpgxqualityeltonjohnjpg.jpg b/...ediaFmultiuploaddufevrierFeltonjohnFfreFRFeltonjohnEjpgxqualityeltonjohnjpg.jpg
diff --git a/...ediaFmultiuploaddufevrierFeltonjohnFfreFRFeltonjohnEjpgxqualityeltonjohnjpg.jpg b/...ediaFmultiuploaddufevrierFeltonjohnFfreFRFeltonjohnEjpgxqualityeltonjohnjpg.jpg
diff --git a/polars-vs-pandas/DataFrame_Plots.ipynb b/polars-vs-pandas/DataFrame_Plots.ipynb
diff --git a/polars-vs-pandas/Online_Retail.parquet b/polars-vs-pandas/Online_Retail.parquet
diff --git a/polars-vs-pandas/README.md b/polars-vs-pandas/README.md
@@ -0,0 +1,14 @@
+The materials contained in this download are designed to complement the Real Python tutorial [Polars vs pandas: What's the Difference?](https://realpython.com/polars-vs-pandas-difference/).
+
+You should create a new folder named polars-vs-pandas on your computer and place each of these files inside it. You may also consider creating a [Python virtual environment](https://realpython.com/python-virtual-environments-a-primer/) within this folder.
+
+Your download bundle contains the following files:
+
+- `Online_Retail.parquet` - This Parquet file contains retail data used in some of the queries.
+- `data_generation.py` - This script contains the `generate_data()` function used to generate different quantities of data.
+- `dataframe_and_lazyframe_time_tests.py` - This script performs time tests for DataFrames and a LazyFrame.
+- `streaming_test.py` - This script performs time tests for a LazyFrame with streaming enabled.
+
+- `dataframe_conversions.py` - This file contains the code used to convert between pandas and Polars DataFrames, plus a Narwhals example.
+- `sample_pandas_and_polars_code.py` - This file contains the code used to illustrate the differences between pandas and Polars syntax.
+- `DataFrame_Plots.ipynb` - This Jupyter Notebook file contains the plotting code to demonstrate default plotting capabilities.
diff --git a/polars-vs-pandas/data_generation.py b/polars-vs-pandas/data_generation.py
@@ -0,0 +1,36 @@
+"""Generate random sales data for the pandas and Polars timing scripts."""
+
+import numpy as np
+
+
+def generate_data(number_of_rows, seed=None):
+    """Return a dictionary of randomly generated sales columns.
+
+    Args:
+        number_of_rows: Number of values to generate for each column.
+        seed: Optional seed passed to ``np.random.default_rng()``. The
+            default of ``None`` produces different data on every call;
+            pass a fixed value to make the generated data reproducible.
+
+    Returns:
+        A dictionary with the keys ``order_id`` (a ``range`` from 1 to
+        ``number_of_rows``), ``region``, ``sales_person``, ``product``
+        (NumPy arrays of randomly chosen labels), and ``sales_income``
+        (a NumPy array of random integers from 1 to 5000 inclusive).
+    """
+    rng = np.random.default_rng(seed)
+
+    return {
+        "order_id": range(1, number_of_rows + 1),
+        "region": rng.choice(
+            ["North", "South", "East", "West"], size=number_of_rows
+        ),
+        "sales_person": rng.choice(
+            ["Armstrong", "Aldrin", "Collins"], size=number_of_rows
+        ),
+        "product": rng.choice(
+            ["Helmet", "Oxygen", "Boots", "Gloves"], size=number_of_rows
+        ),
+        # The upper bound is exclusive, so incomes range from 1 to 5000.
+        "sales_income": rng.integers(1, 5001, size=number_of_rows),
+    }
diff --git a/polars-vs-pandas/dataframe_and_lazyframe_time_tests.py b/polars-vs-pandas/dataframe_and_lazyframe_time_tests.py
@@ -0,0 +1,109 @@
+"""Time DataFrame and LazyFrame creation and analysis in pandas and Polars.
+
+Usage: python dataframe_and_lazyframe_time_tests.py NUMBER_OF_ROWS
+"""
+
+import functools
+import sys
+from timeit import Timer
+
+import pandas as pd
+import polars as pl
+from data_generation import generate_data
+
+# Number of times timeit runs each timed call.
+REPETITIONS = 100
+
+
+def create_pandas_dataframe(test_data):
+    """Return a pandas DataFrame of test_data using pyarrow-backed dtypes."""
+    return pd.DataFrame(test_data).convert_dtypes(dtype_backend="pyarrow")
+
+
+def create_polars_dataframe(test_data):
+    """Return a Polars DataFrame built from test_data."""
+    return pl.DataFrame(test_data)
+
+
+def create_polars_lazyframe(test_data):
+    """Return a Polars LazyFrame built from test_data."""
+    return pl.LazyFrame(test_data)
+
+
+def analyze_pandas_dataframe(pandas_df):
+    """Return sales_income summed per region, product, and sales_person."""
+    return pandas_df.groupby(["region", "product", "sales_person"])[
+        "sales_income"
+    ].sum()
+
+
+def analyze_polars_dataframe(polars_df):
+    """Return total_sales per region, product, and sales_person."""
+    return polars_df.group_by(["region", "product", "sales_person"]).agg(
+        total_sales=pl.col("sales_income").sum()
+    )
+
+
+def analyze_polars_lazyframe(polars_lf):
+    """Return the collected total_sales per region, product, sales_person."""
+    return (
+        polars_lf.group_by(["region", "product", "sales_person"])
+        .agg(total_sales=pl.col("sales_income").sum())
+        .collect()
+    )
+
+
+def time_call(function, argument):
+    """Return the total seconds taken by REPETITIONS calls of function."""
+    return Timer(functools.partial(function, argument)).timeit(REPETITIONS)
+
+
+# Fail with a usage hint instead of an IndexError when the count is missing.
+if len(sys.argv) < 2:
+    sys.exit(f"Usage: python {sys.argv[0]} NUMBER_OF_ROWS")
+
+number_of_rows = int(sys.argv[1])
+test_data = generate_data(number_of_rows)
+
+print("Creating Dataframes...")
+print(f"Pandas dataframe creation time for {number_of_rows} rows:")
+print(time_call(create_pandas_dataframe, test_data))
+print(f"\nPolars dataframe creation time for {number_of_rows} rows:")
+print(time_call(create_polars_dataframe, test_data))
+print(f"\nPolars lazyframe creation time for {number_of_rows} rows:")
+print(time_call(create_polars_lazyframe, test_data))
+
+pandas_df = create_pandas_dataframe(test_data)
+polars_df = create_polars_dataframe(test_data)
+polars_lf = create_polars_lazyframe(test_data)
+
+print("\nAnalyzing Dataframes...")
+print(f"Pandas dataframe analysis time for {number_of_rows} rows:")
+print(time_call(analyze_pandas_dataframe, pandas_df))
+
+print(f"\nPolars dataframe analysis time for {number_of_rows} rows:")
+print(time_call(analyze_polars_dataframe, polars_df))
+
+print(f"\nPolars lazyframe analysis time for {number_of_rows} rows:")
+print(time_call(analyze_polars_lazyframe, polars_lf))
+
+# The pandas result is a Series indexed by region, product, and
+# sales_person, so chained lookups select the East/Boots rows.
+print("\nShow Boots sales in the East region for pandas DataFrame")
+print(analyze_pandas_dataframe(pandas_df)["East"]["Boots"])
+
+print("\nShow Boots sales in the East region for polars DataFrame")
+print(
+    analyze_polars_dataframe(polars_df).filter(
+        pl.col("region") == "East",
+        pl.col("product") == "Boots",
+    )
+)
+
+print("\nShow Boots sales in the East region for polars LazyFrame")
+print(
+    analyze_polars_lazyframe(polars_lf).filter(
+        pl.col("region") == "East",
+        pl.col("product") == "Boots",
+    )
+)
diff --git a/polars-vs-pandas/sample_pandas_and_polars_code.py b/polars-vs-pandas/sample_pandas_and_polars_code.py
@@ -0,0 +1,40 @@
+"""Compare pandas and Polars syntax for the same retail data query.
+
+Each query is a bare expression, so its result is displayed only when the
+code runs in a REPL or notebook, not when this file is run as a script.
+"""
+
+import pandas as pd
+import polars as pl
+
+# Pandas index-based syntax
+# The file name must match the bundled Online_Retail.parquet exactly:
+# file names are case-sensitive on some systems.
+orders_pandas = pd.read_parquet("Online_Retail.parquet")
+
+orders_pandas["Total"] = orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
+
+orders_pandas[["InvoiceNo", "Quantity", "UnitPrice", "Total"]][
+    orders_pandas["Total"] > 100
+].head(3)
+
+# Pandas method chaining syntax
+orders_pandas = pd.read_parquet("Online_Retail.parquet")
+
+(
+    orders_pandas.assign(
+        Total=orders_pandas["Quantity"] * orders_pandas["UnitPrice"]
+    )
+    .filter(["InvoiceNo", "Quantity", "UnitPrice", "Total"])
+    .query("Total > 100")
+).head(3)
+
+# Polars method chaining syntax
+orders_polars = pl.read_parquet("Online_Retail.parquet")
+
+(
+    orders_polars.select(
+        pl.col(["InvoiceNo", "Quantity", "UnitPrice"]),
+        total=pl.col("Quantity") * pl.col("UnitPrice"),
+    ).filter(pl.col("total") > 100)
+).head(3)
diff --git a/polars-vs-pandas/streaming_test.py b/polars-vs-pandas/streaming_test.py
@@ -0,0 +1,58 @@
+"""Time a Polars LazyFrame query with and without the streaming engine.
+
+Usage: python streaming_test.py NUMBER_OF_ROWS
+"""
+
+import functools
+import sys
+from timeit import Timer
+
+import polars as pl
+from data_generation import generate_data
+
+# Number of times timeit runs each timed call.
+REPETITIONS = 100
+
+
+def create_polars_lazyframe(test_data):
+    """Return a Polars LazyFrame built from test_data."""
+    return pl.LazyFrame(test_data)
+
+
+def analyze_polars_lazyframe(polars_lf):
+    """Return total_sales per group using a plain collect() call."""
+    return (
+        polars_lf.group_by(["region", "product", "sales_person"])
+        .agg(total_sales=pl.col("sales_income").sum())
+        .collect()
+    )
+
+
+def analyze_polars_streaming(polars_lf):
+    """Return total_sales per group, collected with engine="streaming"."""
+    return (
+        polars_lf.group_by(["region", "product", "sales_person"])
+        .agg(total_sales=pl.col("sales_income").sum())
+        .collect(engine="streaming")
+    )
+
+
+def time_call(function, argument):
+    """Return the total seconds taken by REPETITIONS calls of function."""
+    return Timer(functools.partial(function, argument)).timeit(REPETITIONS)
+
+
+# Fail with a usage hint instead of an IndexError when the count is missing.
+if len(sys.argv) < 2:
+    sys.exit(f"Usage: python {sys.argv[0]} NUMBER_OF_ROWS")
+
+number_of_rows = int(sys.argv[1])
+test_data = generate_data(number_of_rows)
+
+polars_lf = create_polars_lazyframe(test_data)
+
+print(f"Polars lazyframe analysis time for {number_of_rows} rows:")
+print(time_call(analyze_polars_lazyframe, polars_lf))
+
+print(f"\nPolars streaming analysis time for {number_of_rows} rows:")
+print(time_call(analyze_polars_streaming, polars_lf))