Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@ import warnings

from pandas.util._exceptions import find_stack_level

from pandas import StringDtype
from pandas import (
ArrowDtype,
StringDtype,
)
from pandas.core.arrays import (
ArrowExtensionArray,
BooleanArray,
Expand Down Expand Up @@ -43,7 +46,6 @@ from libc.string cimport (
strncpy,
)


import numpy as np

cimport numpy as cnp
Expand Down Expand Up @@ -1452,7 +1454,13 @@ def _maybe_upcast(

elif arr.dtype == np.object_:
if use_dtype_backend:
dtype = StringDtype()
if dtype_backend == "pyarrow":
# using the StringDtype below would use large_string by default
# keep pyarrow's default of string here
import pyarrow as pa
dtype = ArrowDtype(pa.string())
else:
dtype = StringDtype()
Comment on lines +1457 to +1463
Copy link
Member Author

@jorisvandenbossche jorisvandenbossche Aug 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is one behaviour question we should decide on. Currently, pd.read_csv(..., dtype_backend="pyarrow") gives you the ArrowDtype using pa.string() (string[pyarrow]) in general, but without the change above, with this PR this now gives you pa.large_string() (large_string[pyarrow]).

The reason for this is because this implementation first goes through creating a column with StringDtype, and then in a next step below converts those to the equivalent ArrowExtensionArray. Before, with "python" storage string dtype, pyarrow converts that to pa.string(), but now with using "pyarrow" storage by default for the StringDtype (which uses pa.large_string()), the pyarrow conversion gives large_string.

But we could also not special case this here in the code, and just update the tests to expect large_string for the c parser

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would support this returning pa.large_string instead of pa.string to align with Patrick's changes in 2.2 #56220.

And generally ArrowExtensionArray._from_sequence should probably return pa.large_string instead of pa.string for a sequence of strings (if it doesn't already)

Copy link
Member

@WillAyd WillAyd Aug 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a blocker but I would prefer returning whatever pyarrow provides and not forcing this to a particular type with dtype_backend="pyarrow".

Sure, many users of pandas probably couldn't care less about the differences between string / large_string, and there likely isn't too much overhead in upcasting the format to the latter in pandas. However, it gets murky when you start thinking about string_view and the larger I/O system. I don't think the pyarrow backend should force string_view to large_string, because that can have non-trivial performance impacts, and if we follow that train of thought it would be inconsistent to cast string to large_string

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm totally ignorant on the differences between string/large_string/string_view/whatever. I lean toward "always give a pd.StringDtype"

Copy link
Member Author

@jorisvandenbossche jorisvandenbossche Aug 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I lean toward "always give a pd.StringDtype"

To be clear, this is not about StringDtype .. but about ArrowDtype. If the user asks for dtype_backend="pyarrow", we currently give you a dataframe with all ArrowDtype columns. Unless that is what you would change? (specifically for strings, use StringDtype backed by pyarrow instead of ArrowDtype(string))

EDIT: I see the link to #62129 now, so yes that is what you meant ;)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And generally ArrowExtensionArray._from_sequence should probably return pa.large_string instead of pa.string for a sequence of strings (if it doesn't already)

It does not, because there it just relies on the type inference of pyarrow, and pyarrow will always prefer pa.string() over pa.large_string() when inferring (unless the data source has specific data type information).

And I think I agree with Will's comment about preferring to rely on what pyarrow gives, and not forcing specific types, specifically for ArrowDtype (but no strong opinion here)

The reason we get large_string here is because for StringDtype, we do make a very specific choice on our side to use large_string instead of string. And at that point of course pyarrow preserves that choice when converting to a pyarrow array.

cls = dtype.construct_array_type()
arr = cls._from_sequence(arr, dtype=dtype)

Expand Down
27 changes: 11 additions & 16 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,10 @@ class StringDtype(StorageExtensionDtype):
Examples
--------
>>> pd.StringDtype()
<StringDtype(storage='python', na_value=<NA>)>

>>> pd.StringDtype(storage="pyarrow")
<StringDtype(na_value=<NA>)>

>>> pd.StringDtype(storage="python")
<StringDtype(storage='python', na_value=<NA>)>
"""

@property
Expand All @@ -154,16 +154,11 @@ def __init__(
) -> None:
# infer defaults
if storage is None:
if na_value is not libmissing.NA:
storage = get_option("mode.string_storage")
if storage == "auto":
if HAS_PYARROW:
storage = "pyarrow"
else:
storage = "python"
else:
storage = get_option("mode.string_storage")
if storage == "auto":
storage = get_option("mode.string_storage")
if storage == "auto":
if HAS_PYARROW:
storage = "pyarrow"
else:
storage = "python"

if storage == "pyarrow_numpy":
Expand Down Expand Up @@ -617,7 +612,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc]
Examples
--------
>>> pd.array(["This is", "some text", None, "data."], dtype="string")
<StringArray>
<ArrowStringArray>
['This is', 'some text', <NA>, 'data.']
Length: 4, dtype: string

Expand All @@ -629,15 +624,15 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc]
['1', 1]
Length: 2, dtype: object
>>> pd.array(["1", 1], dtype="string")
<StringArray>
<ArrowStringArray>
['1', '1']
Length: 2, dtype: string

However, instantiating StringArrays directly with non-strings will raise an error.

For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:

>>> pd.array(["a", None, "c"], dtype="string") == "a"
>>> pd.array(["a", None, "c"], dtype="string[python]") == "a"
<BooleanArray>
[True, <NA>, False]
Length: 3, dtype: boolean
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,14 +230,14 @@ def array(
Length: 2, dtype: Float64

>>> pd.array(["a", None, "c"])
<StringArray>
<ArrowStringArray>
['a', <NA>, 'c']
Length: 3, dtype: string

>>> with pd.option_context("string_storage", "pyarrow"):
>>> with pd.option_context("string_storage", "python"):
... arr = pd.array(["a", None, "c"])
>>> arr
<ArrowStringArray>
<StringArray>
['a', <NA>, 'c']
Length: 3, dtype: string

Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -736,9 +736,7 @@ def test_interval(self):

def test_categorical_extension_array_nullable(self, nulls_fixture):
# GH:
arr = pd.arrays.StringArray._from_sequence(
[nulls_fixture] * 2, dtype=pd.StringDtype()
)
arr = pd.array([nulls_fixture] * 2, dtype=pd.StringDtype())
result = Categorical(arr)
assert arr.dtype == result.categories.dtype
expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype))
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
import pytest

from pandas.compat import HAS_PYARROW
import pandas.util._test_decorators as td

import pandas as pd
Expand All @@ -26,10 +27,10 @@ def test_eq_all_na():
tm.assert_extension_array_equal(result, expected)


def test_config(string_storage, using_infer_string):
def test_config(string_storage):
# with the default string_storage setting
# always "python" at the moment
assert StringDtype().storage == "python"
assert StringDtype().storage == "pyarrow" if HAS_PYARROW else "python"

with pd.option_context("string_storage", string_storage):
assert StringDtype().storage == string_storage
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/copy_view/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def test_dataframe_array_ea_dtypes():


def test_dataframe_array_string_dtype():
df = DataFrame({"a": ["a", "b"]}, dtype="string")
df = DataFrame({"a": ["a", "b"]}, dtype="string[python]")
arr = np.asarray(df)
assert np.shares_memory(arr, get_array(df, "a"))
assert arr.flags.writeable is False
Expand Down
8 changes: 3 additions & 5 deletions pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def test_astype_numpy_to_ea():


@pytest.mark.parametrize(
"dtype, new_dtype", [("object", "string"), ("string", "object")]
"dtype, new_dtype", [("object", "string[python]"), ("string[python]", "object")]
)
def test_astype_string_and_object(dtype, new_dtype):
df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
Expand All @@ -96,7 +96,7 @@ def test_astype_string_and_object(dtype, new_dtype):


@pytest.mark.parametrize(
"dtype, new_dtype", [("object", "string"), ("string", "object")]
"dtype, new_dtype", [("object", "string[python]"), ("string[python]", "object")]
)
def test_astype_string_and_object_update_original(dtype, new_dtype):
df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
Expand Down Expand Up @@ -224,9 +224,7 @@ def test_convert_dtypes(using_infer_string):
df_orig = df.copy()
df2 = df.convert_dtypes()

if using_infer_string and HAS_PYARROW:
# TODO the default nullable string dtype still uses python storage
# this should be changed to pyarrow if installed
if HAS_PYARROW:
assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/dtypes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def test_period_dtype(self, dtype):
"float": np.dtype(np.float64),
"object": np.dtype(object),
"category": com.pandas_dtype("category"),
"string": pd.StringDtype(),
"string": pd.StringDtype("python"),
}


Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/methods/test_convert_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def test_convert_dtypes_avoid_block_splitting(self):
{
"a": [1, 2, 3],
"b": [4, 5, 6],
"c": pd.Series(["a"] * 3, dtype="string[python]"),
"c": pd.Series(["a"] * 3, dtype="string"),
}
)
tm.assert_frame_equal(result, expected)
Expand All @@ -209,7 +209,7 @@ def test_convert_dtypes_from_arrow(self):
# GH#56581
df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"])
result = df.convert_dtypes()
expected = df.astype({"a": "string[python]"})
expected = df.astype({"a": "string"})
tm.assert_frame_equal(result, expected)

def test_convert_dtype_pyarrow_timezone_preserve(self):
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -657,6 +657,10 @@ def test_dtype_backend(self, read_ext, dtype_backend, engine, tmp_excel):
for col in df.columns
}
)

# pandas uses large_string by default, but pyarrow infers string
expected["d"] = expected["d"].astype(pd.ArrowDtype(pa.string()))
expected["h"] = expected["h"].astype(pd.ArrowDtype(pa.string()))
# pyarrow by default infers timestamp resolution as us, not ns
expected["i"] = ArrowExtensionArray(
expected["i"].array._pa_array.cast(pa.timestamp(unit="us"))
Expand Down
6 changes: 1 addition & 5 deletions pandas/tests/io/test_clipboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,11 +350,7 @@ def test_read_clipboard_dtype_backend(
# GH#50502
if dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
if engine == "c" and string_storage == "pyarrow":
# TODO avoid this exception?
string_dtype = pd.ArrowDtype(pa.large_string())
else:
string_dtype = pd.ArrowDtype(pa.string())
string_dtype = pd.ArrowDtype(pa.string())
else:
string_dtype = pd.StringDtype(string_storage)

Expand Down
11 changes: 3 additions & 8 deletions pandas/tests/io/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import pandas as pd
from pandas import read_orc
import pandas._testing as tm
from pandas.core.arrays import StringArray

pytest.importorskip("pyarrow.orc")

Expand Down Expand Up @@ -368,13 +367,9 @@ def test_orc_dtype_backend_numpy_nullable():

expected = pd.DataFrame(
{
"string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)),
"string_with_nan": StringArray(
np.array(["a", pd.NA, "c"], dtype=np.object_)
),
"string_with_none": StringArray(
np.array(["a", pd.NA, "c"], dtype=np.object_)
),
"string": pd.array(np.array(["a", "b", "c"], dtype=np.object_)),
"string_with_nan": pd.array(np.array(["a", pd.NA, "c"], dtype=np.object_)),
"string_with_none": pd.array(np.array(["a", pd.NA, "c"], dtype=np.object_)),
"int": pd.Series([1, 2, 3], dtype="Int64"),
"int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
"na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2137,7 +2137,9 @@ def test_series_string_inference_storage_definition(self):
# but after PDEP-14 (string dtype), it was decided to keep dtype="string"
# returning the NA string dtype, so expected is changed from
# "string[pyarrow_numpy]" to "string[python]"
expected = Series(["a", "b"], dtype="string[python]")
expected = Series(
["a", "b"], dtype="string[pyarrow]" if HAS_PYARROW else "string[python]"
)
with pd.option_context("future.infer_string", True):
result = Series(["a", "b"], dtype="string")
tm.assert_series_equal(result, expected)
Expand Down
Loading