From 71b1b3fe66ee49dbc6b94ede6bd724ea8fde137f Mon Sep 17 00:00:00 2001 From: Jay <2594jaypatel@gmail.com> Date: Thu, 17 Jul 2025 00:58:06 -0400 Subject: [PATCH 1/9] BUG: Fix warning for extra fields in read_csv with on_bad_lines callable --- doc/source/whatsnew/v3.0.0.rst | 4 ++++ pandas/io/parsers/base_parser.py | 8 +++----- .../tests/io/parser/test_python_parser_only.py | 17 +++++++++++++++++ 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index be7a07dface0a..855b5b1f17eb5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -817,6 +817,10 @@ I/O - Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`) - Bug in :meth:`to_csv` where ``quotechar``` is not escaped when ``escapechar`` is not None (:issue:`61407`) - Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`) +- Bug in :func:`read_csv` with ``engine="python"`` and callable ``on_bad_lines`` + where a ``ParserWarning`` for extra fields returned by the callable was only + raised when ``index_col`` was ``None``. Now the warning is consistently raised + regardless of ``index_col`` (:issue:`#61837`) Period ^^^^^^ diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 23efc9c87e07c..dc52daad7f470 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -615,15 +615,13 @@ def _check_data_length( data: list of array-likes containing the data column-wise. 
""" if not self.index_col and len(columns) != len(data) and columns: - empty_str = is_object_dtype(data[-1]) and data[-1] == "" # error: No overload variant of "__ror__" of "ndarray" matches # argument type "ExtensionArray" - empty_str_or_na = empty_str | isna(data[-1]) # type: ignore[operator] - if len(columns) == len(data) - 1 and np.all(empty_str_or_na): + if len(data) > len(columns) : return warnings.warn( - "Length of header or names does not match length of data. This leads " - "to a loss of data with index_col=False.", + f"Length of header or names ({len(columns)}) does not match number of " + f"fields in line ({len(data)}). Extra field will be dropped.", ParserWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index a5bb151e84f47..a0a949a515bea 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -322,6 +322,23 @@ def test_malformed_skipfooter(python_parser_only): parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) +def test_on_bad_lines_extra_fields_warns(python_parser_only): + parser = python_parser_only + data = """id,field_1,field_2 +101,A,B +102,C,D, E +103,F,G +""" + + def line_fixer(_line): + return ["1", "2", "3", "4", "5"] + for index_col in [None, 0]: + with tm.assert_produces_warning(ParserWarning): + parser.read_csv( + StringIO(data), on_bad_lines=line_fixer, index_col=index_col + ) + + def test_python_engine_file_no_next(python_parser_only): parser = python_parser_only From bf675551d119cc3ac1348a042393bc721bc5d031 Mon Sep 17 00:00:00 2001 From: Jay <2594jaypatel@gmail.com> Date: Thu, 17 Jul 2025 01:53:45 -0400 Subject: [PATCH 2/9] mend --- doc/source/whatsnew/v3.0.0.rst | 5 +---- pandas/io/parsers/base_parser.py | 6 +----- pandas/tests/io/parser/test_python_parser_only.py | 5 +++-- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git 
a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 855b5b1f17eb5..91bb463d6ac11 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -817,10 +817,7 @@ I/O - Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`) - Bug in :meth:`to_csv` where ``quotechar``` is not escaped when ``escapechar`` is not None (:issue:`61407`) - Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`) -- Bug in :func:`read_csv` with ``engine="python"`` and callable ``on_bad_lines`` - where a ``ParserWarning`` for extra fields returned by the callable was only - raised when ``index_col`` was ``None``. Now the warning is consistently raised - regardless of ``index_col`` (:issue:`#61837`) +- Bug in :func:`read_csv` with ``engine="python"`` and callable ``on_bad_lines`` where a ``ParserWarning`` for extra fields returned by the callable was only raised when ``index_col`` was ``None``. Now the warning is consistently raised regardless of ``index_col`` (:issue:`61837`) Period ^^^^^^ diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index dc52daad7f470..d59e49056fd59 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -614,11 +614,7 @@ def _check_data_length( columns: list of column names data: list of array-likes containing the data column-wise. """ - if not self.index_col and len(columns) != len(data) and columns: - # error: No overload variant of "__ror__" of "ndarray" matches - # argument type "ExtensionArray" - if len(data) > len(columns) : - return + if columns and len(data)!=len(columns): warnings.warn( f"Length of header or names ({len(columns)}) does not match number of " f"fields in line ({len(data)}). 
Extra field will be dropped.", diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index a0a949a515bea..d72cf98c4cc33 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -332,11 +332,12 @@ def test_on_bad_lines_extra_fields_warns(python_parser_only): def line_fixer(_line): return ["1", "2", "3", "4", "5"] + for index_col in [None, 0]: with tm.assert_produces_warning(ParserWarning): parser.read_csv( - StringIO(data), on_bad_lines=line_fixer, index_col=index_col - ) + StringIO(data), on_bad_lines=line_fixer, index_col=index_col + ) def test_python_engine_file_no_next(python_parser_only): From ea362d8885ca07b43197247c05d007040eb8ebc0 Mon Sep 17 00:00:00 2001 From: Jay <2594jaypatel@gmail.com> Date: Mon, 28 Jul 2025 21:28:21 -0400 Subject: [PATCH 3/9] warn-extra-fields-on-bad-lines --- pandas/tests/io/parser/test_python_parser_only.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index d72cf98c4cc33..9e25621894ff7 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -333,8 +333,11 @@ def test_on_bad_lines_extra_fields_warns(python_parser_only): def line_fixer(_line): return ["1", "2", "3", "4", "5"] + expected_warning = ( + r"Length of head or names \(3)\ does not match number of fields in line \(5\)\. Extra field will be dropped\." 
+ for index_col in [None, 0]: - with tm.assert_produces_warning(ParserWarning): + with pytest.warns(ParserWarning, match=expected_warning): parser.read_csv( StringIO(data), on_bad_lines=line_fixer, index_col=index_col ) From ad1d1dafd2b47f51e02b357fcf1217173bed3b61 Mon Sep 17 00:00:00 2001 From: Jay <2594jaypatel@gmail.com> Date: Mon, 28 Jul 2025 21:57:03 -0400 Subject: [PATCH 4/9] warn-extra-fields-on-bad-lines --- pandas/tests/io/parser/test_python_parser_only.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 9e25621894ff7..acc0428f2c620 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -335,6 +335,7 @@ def line_fixer(_line): expected_warning = ( r"Length of head or names \(3)\ does not match number of fields in line \(5\)\. Extra field will be dropped\." + ) for index_col in [None, 0]: with pytest.warns(ParserWarning, match=expected_warning): From e78977e66f7b9d333975d1ef9d244b5e316fe1aa Mon Sep 17 00:00:00 2001 From: Jay <2594jaypatel@gmail.com> Date: Mon, 28 Jul 2025 23:20:23 -0400 Subject: [PATCH 5/9] warn-extra-fields-on-bad-lines --- doc/make.py | 2 +- pandas/tests/io/parser/test_python_parser_only.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/make.py b/doc/make.py index 9542563dc037b..06f7c84d8753e 100755 --- a/doc/make.py +++ b/doc/make.py @@ -130,7 +130,7 @@ def _sphinx_build(self, kind: str): Examples -------- - >>> DocBuilder(num_jobs=4)._sphinx_build("html") + >>> DocBuilder(num_jobs=1)._sphinx_build("html") """ if kind not in ("html", "latex", "linkcheck"): raise ValueError(f"kind must be html, latex or linkcheck, not {kind}") diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index acc0428f2c620..94bec8844ca34 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ 
b/pandas/tests/io/parser/test_python_parser_only.py @@ -334,7 +334,7 @@ def line_fixer(_line): return ["1", "2", "3", "4", "5"] expected_warning = ( - r"Length of head or names \(3)\ does not match number of fields in line \(5\)\. Extra field will be dropped\." + r"Length of header or names \(3\) does not match number of fields in line \(5\)\. Extra field will be dropped\." ) for index_col in [None, 0]: From 8c592d530b11fb6ab9193e22d4c5f6fea356e9ae Mon Sep 17 00:00:00 2001 From: Jay <2594jaypatel@gmail.com> Date: Mon, 28 Jul 2025 23:37:44 -0400 Subject: [PATCH 6/9] warn-extra-fields-on-bad-lines --- pandas/tests/io/parser/test_python_parser_only.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 94bec8844ca34..47e2abadc1a67 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -334,11 +334,12 @@ def line_fixer(_line): return ["1", "2", "3", "4", "5"] expected_warning = ( - r"Length of header or names \(3\) does not match number of fields in line \(5\)\. Extra field will be dropped\." + r"Length of header or names \(3\) does not match number of fields in line \(5\)\. " + r"Extra field will be dropped\." 
) for index_col in [None, 0]: - with pytest.warns(ParserWarning, match=expected_warning): + with tm.assert_produces_warning(ParserWarning, match=expected_warning): parser.read_csv( StringIO(data), on_bad_lines=line_fixer, index_col=index_col ) From 57698c25525b90cf43f94b9eb1d173aaff03cd85 Mon Sep 17 00:00:00 2001 From: Jay <2594jaypatel@gmail.com> Date: Mon, 28 Jul 2025 23:42:40 -0400 Subject: [PATCH 7/9] warn-extra-fields-on-bad-lines --- doc/make.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/make.py b/doc/make.py index 06f7c84d8753e..a22bdeb1c6296 100755 --- a/doc/make.py +++ b/doc/make.py @@ -137,7 +137,7 @@ def _sphinx_build(self, kind: str): cmd = ["sphinx-build", "-b", kind] if self.num_jobs: - cmd += ["-j", self.num_jobs] + cmd += ["-j", "1"] if self.warnings_are_errors: cmd += ["-W", "--keep-going"] if self.verbosity: From ccb1706afaa053426b424e9ac8183ff0742d33c8 Mon Sep 17 00:00:00 2001 From: Jay <2594jaypatel@gmail.com> Date: Mon, 28 Jul 2025 23:57:04 -0400 Subject: [PATCH 8/9] warn-extra-fields-on-bad-lines --- pandas/tests/io/parser/test_python_parser_only.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 47e2abadc1a67..8557b151bf13a 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -334,8 +334,8 @@ def line_fixer(_line): return ["1", "2", "3", "4", "5"] expected_warning = ( - r"Length of header or names \(3\) does not match number of fields in line \(5\)\. " - r"Extra field will be dropped\." + r"Length of header or names \(3\) does not match number of fields in " + r"line \(5\)\. Extra field will be dropped\." 
) for index_col in [None, 0]: From 28403575f193d6446ead56119780963de2d0a099 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 29 Jul 2025 04:03:15 +0000 Subject: [PATCH 9/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/io/parsers/base_parser.py | 2 +- pandas/tests/io/parser/test_python_parser_only.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index d59e49056fd59..acb458efa71b2 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -614,7 +614,7 @@ def _check_data_length( columns: list of column names data: list of array-likes containing the data column-wise. """ - if columns and len(data)!=len(columns): + if columns and len(data) != len(columns): warnings.warn( f"Length of header or names ({len(columns)}) does not match number of " f"fields in line ({len(data)}). Extra field will be dropped.", diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 8557b151bf13a..c5524b1f89ee1 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -334,9 +334,9 @@ def line_fixer(_line): return ["1", "2", "3", "4", "5"] expected_warning = ( - r"Length of header or names \(3\) does not match number of fields in " - r"line \(5\)\. Extra field will be dropped\." - ) + r"Length of header or names \(3\) does not match number of fields in " + r"line \(5\)\. Extra field will be dropped\." + ) for index_col in [None, 0]: with tm.assert_produces_warning(ParserWarning, match=expected_warning):