From 2a2cddfc6c9805046f1289c68daa72f6bb956bfe Mon Sep 17 00:00:00 2001 From: Marc Jones Date: Mon, 28 Jul 2025 10:25:07 +0100 Subject: [PATCH 1/2] Update documentation for sort_values and natural sorting --- pandas/core/frame.py | 68 +++++++++++++++++++++++++++++------------- pandas/core/generic.py | 61 +++++++++++++++++++++++++++---------- 2 files changed, 92 insertions(+), 37 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 48a5596e00061..d5fe5a98723e6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7173,35 +7173,61 @@ def sort_values( `natural sorting <https://en.wikipedia.org/wiki/Natural_sort_order>`__. This can be done using ``natsort`` `package <https://github.com/SethMMorton/natsort>`__, - which provides sorted indices according - to their natural order, as shown below: + which provides a function to generate a key + to sort data in their natural order: >>> df = pd.DataFrame( ... { - ... "time": ["0hr", "128hr", "72hr", "48hr", "96hr"], - ... "value": [10, 20, 30, 40, 50], + ... "hours": ["0hr", "128hr", "0hr", "64hr", "64hr", "128hr"], + ... "mins": [ + ... "10mins", + ... "40mins", + ... "40mins", + ... "40mins", + ... "10mins", + ... "10mins", + ... ], + ... "value": [10, 20, 30, 40, 50, 60], ... } ... ) >>> df - time value - 0 0hr 10 - 1 128hr 20 - 2 72hr 30 - 3 48hr 40 - 4 96hr 50 - >>> from natsort import index_natsorted - >>> index_natsorted(df["time"]) - [0, 3, 2, 4, 1] + hours mins value + 0 0hr 10mins 10 + 1 128hr 40mins 20 + 2 0hr 40mins 30 + 3 64hr 40mins 40 + 4 64hr 10mins 50 + 5 128hr 10mins 60 + >>> from natsort import natsort_keygen + >>> natsort_keygen()(df["hours"]) + ( + ('', 0, 'hr'), + ('', 128, 'hr'), + ('', 0, 'hr'), + ('', 64, 'hr'), + ('', 64, 'hr'), + ('', 128, 'hr'), + ) + >>> natsort_keygen()(df["mins"]) + ( + ('', 10, 'mins'), + ('', 40, 'mins'), + ('', 40, 'mins'), + ('', 40, 'mins'), + ('', 10, 'mins'), + ('', 10, 'mins'), + ) >>> df.sort_values( - ... by="time", - ... key=lambda x: np.argsort(index_natsorted(x)), + ... by=["hours", "mins"], + ... 
key=natsort_keygen(), ... ) - time value - 0 0hr 10 - 3 48hr 40 - 2 72hr 30 - 4 96hr 50 - 1 128hr 20 + hours mins value + 0 0hr 10mins 10 + 2 0hr 40mins 30 + 4 64hr 10mins 50 + 3 64hr 40mins 40 + 5 128hr 10mins 60 + 1 128hr 40mins 20 """ inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7b70ac3588f2a..a13cd19dc5ac2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5004,27 +5004,56 @@ def sort_values( >>> df = pd.DataFrame( ... { - ... "time": ["0hr", "128hr", "72hr", "48hr", "96hr"], - ... "value": [10, 20, 30, 40, 50], + ... "hours": ["0hr", "128hr", "0hr", "64hr", "64hr", "128hr"], + ... "mins": [ + ... "10mins", + ... "40mins", + ... "40mins", + ... "40mins", + ... "10mins", + ... "10mins", + ... ], + ... "value": [10, 20, 30, 40, 50, 60], ... } ... ) >>> df - time value - 0 0hr 10 - 1 128hr 20 - 2 72hr 30 - 3 48hr 40 - 4 96hr 50 - >>> from natsort import index_natsorted + hours mins value + 0 0hr 10mins 10 + 1 128hr 40mins 20 + 2 0hr 40mins 30 + 3 64hr 40mins 40 + 4 64hr 10mins 50 + 5 128hr 10mins 60 + >>> from natsort import natsort_keygen + >>> natsort_keygen()(df["hours"]) + ( + ('', 0, 'hr'), + ('', 128, 'hr'), + ('', 0, 'hr'), + ('', 64, 'hr'), + ('', 64, 'hr'), + ('', 128, 'hr'), + ) + >>> natsort_keygen()(df["mins"]) + ( + ('', 10, 'mins'), + ('', 40, 'mins'), + ('', 40, 'mins'), + ('', 40, 'mins'), + ('', 10, 'mins'), + ('', 10, 'mins'), + ) >>> df.sort_values( - ... by="time", key=lambda x: np.argsort(index_natsorted(df["time"])) + ... by=["hours", "mins"], + ... key=natsort_keygen(), ... 
) - time value - 0 0hr 10 - 3 48hr 40 - 2 72hr 30 - 4 96hr 50 - 1 128hr 20 + hours mins value + 0 0hr 10mins 10 + 2 0hr 40mins 30 + 4 64hr 10mins 50 + 3 64hr 40mins 40 + 5 128hr 10mins 60 + 1 128hr 40mins 20 """ raise AbstractMethodError(self) From 713cc47dc73c00cb6b837b229e952b1175c9ce8f Mon Sep 17 00:00:00 2001 From: Marc Jones Date: Mon, 28 Jul 2025 12:22:06 +0100 Subject: [PATCH 2/2] Remove natsort_keygen calls --- pandas/core/frame.py | 18 ------------------ pandas/core/generic.py | 18 ------------------ 2 files changed, 36 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d5fe5a98723e6..d657f2124c61f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7199,24 +7199,6 @@ def sort_values( 4 64hr 10mins 50 5 128hr 10mins 60 >>> from natsort import natsort_keygen - >>> natsort_keygen()(df["hours"]) - ( - ('', 0, 'hr'), - ('', 128, 'hr'), - ('', 0, 'hr'), - ('', 64, 'hr'), - ('', 64, 'hr'), - ('', 128, 'hr'), - ) - >>> natsort_keygen()(df["mins"]) - ( - ('', 10, 'mins'), - ('', 40, 'mins'), - ('', 40, 'mins'), - ('', 40, 'mins'), - ('', 10, 'mins'), - ('', 10, 'mins'), - ) >>> df.sort_values( ... by=["hours", "mins"], ... key=natsort_keygen(), diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a13cd19dc5ac2..cbd853886a0f4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5025,24 +5025,6 @@ def sort_values( 4 64hr 10mins 50 5 128hr 10mins 60 >>> from natsort import natsort_keygen - >>> natsort_keygen()(df["hours"]) - ( - ('', 0, 'hr'), - ('', 128, 'hr'), - ('', 0, 'hr'), - ('', 64, 'hr'), - ('', 64, 'hr'), - ('', 128, 'hr'), - ) - >>> natsort_keygen()(df["mins"]) - ( - ('', 10, 'mins'), - ('', 40, 'mins'), - ('', 40, 'mins'), - ('', 40, 'mins'), - ('', 10, 'mins'), - ('', 10, 'mins'), - ) >>> df.sort_values( ... by=["hours", "mins"], ... key=natsort_keygen(),