From 1b26060952b0a04fa0a7da33054cc0c33f4dd30e Mon Sep 17 00:00:00 2001 From: Mohammad Reza Yoosefiha Date: Sun, 13 Jul 2025 14:52:17 +0330 Subject: [PATCH 1/6] DOC: Simplify pandas theme footer - Use built-in copyright template from pydata-sphinx-theme - Simplify custom footer template to only include sponsor links - Add CSS for proper horizontal layout --- .cursor/rules/contributing-to-codebase.mdc | 656 ++++++++++++++++ .cursor/rules/copy-on-write-mechanism.mdc | 14 + .../creating-developement-environment.mdc | 212 +++++ .cursor/rules/debugging-c-extentions.mdc | 46 ++ .cursor/rules/extending-pandas.mdc | 349 +++++++++ .cursor/rules/internals.mdc | 66 ++ .cursor/rules/pandas-contribution.mdc | 221 ++++++ .cursor/rules/pandas-doc-string-guid.mdc | 740 ++++++++++++++++++ .cursor/rules/pandas-documentation.mdc | 128 +++ .cursor/rules/pandas-maintenace.mdc | 333 ++++++++ .cursor/rules/policies.mdc | 34 + doc/_templates/pandas_footer.html | 6 +- doc/source/_static/css/pandas.css | 13 +- doc/source/conf.py | 4 +- 14 files changed, 2816 insertions(+), 6 deletions(-) create mode 100644 .cursor/rules/contributing-to-codebase.mdc create mode 100644 .cursor/rules/copy-on-write-mechanism.mdc create mode 100644 .cursor/rules/creating-developement-environment.mdc create mode 100644 .cursor/rules/debugging-c-extentions.mdc create mode 100644 .cursor/rules/extending-pandas.mdc create mode 100644 .cursor/rules/internals.mdc create mode 100644 .cursor/rules/pandas-contribution.mdc create mode 100644 .cursor/rules/pandas-doc-string-guid.mdc create mode 100644 .cursor/rules/pandas-documentation.mdc create mode 100644 .cursor/rules/pandas-maintenace.mdc create mode 100644 .cursor/rules/policies.mdc diff --git a/.cursor/rules/contributing-to-codebase.mdc b/.cursor/rules/contributing-to-codebase.mdc new file mode 100644 index 0000000000000..7f7dba60afa69 --- /dev/null +++ b/.cursor/rules/contributing-to-codebase.mdc @@ -0,0 +1,656 @@ +--- +description: whenever we are contribuing to codebase +alwaysApply: false +--- +Contributing to the code base +Table of Contents: + +Code standards + +Pre-commit + +Optional dependencies + +Backwards compatibility + +Type hints + +Style guidelines + +pandas-specific types + +Validating type hints + +Testing type hints in code using pandas + +Testing with continuous integration + +Test-driven development + +Writing tests + +Using pytest + +Test structure + +Preferred pytest idioms + +Testing a warning + +Testing an exception + +Testing involving files + +Testing involving network connectivity + +Example + +Using hypothesis + +Running the test suite + +Running the performance test suite + +Documenting your code + +Code standards +Writing good code is not just about what you write. It is also about how you write it. During Continuous Integration testing, several tools will be run to check your code for stylistic errors. Generating any warnings will cause the test to fail. Thus, good style is a requirement for submitting code to pandas. + +There are a couple of tools in pandas to help contributors verify their changes before contributing to the project + +./ci/code_checks.sh: a script validates the doctests, formatting in docstrings, and imported modules. It is possible to run the checks independently by using the parameters docstrings, code, and doctests (e.g. ./ci/code_checks.sh doctests); + +pre-commit, which we go into detail on in the next section. 
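For example, each group of code_checks.sh checks listed above can be run on its own from the repository root (the group names are the parameters mentioned above):

./ci/code_checks.sh code        # check imported modules
./ci/code_checks.sh doctests    # run the doctests
./ci/code_checks.sh docstrings  # validate docstring formatting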
+ +In addition, because a lot of people use our library, it is important that we do not make sudden changes to the code that could have the potential to break a lot of user code as a result, that is, we need it to be as backwards compatible as possible to avoid mass breakages. + +Pre-commit +Additionally, Continuous Integration will run code formatting checks like ruff, isort, and clang-format and more using pre-commit hooks. Any warnings from these checks will cause the Continuous Integration to fail; therefore, it is helpful to run the check yourself before submitting code. This can be done by installing pre-commit (which should already have happened if you followed the instructions in Setting up your development environment) and then running: + +pre-commit install +from the root of the pandas repository. Now all of the styling checks will be run each time you commit changes without your needing to run each one manually. In addition, using pre-commit will also allow you to more easily remain up-to-date with our code checks as they change. + +Note that if needed, you can skip these checks with git commit --no-verify. + +If you don’t want to use pre-commit as part of your workflow, you can still use it to run its checks with one of the following: + +pre-commit run --files +pre-commit run --from-ref=upstream/main --to-ref=HEAD --all-files +without needing to have done pre-commit install beforehand. + +Finally, we also have some slow pre-commit checks, which don’t run on each commit but which do run during continuous integration. You can trigger them manually with: + +pre-commit run --hook-stage manual --all-files +Note + +You may want to periodically run pre-commit gc, to clean up repos which are no longer used. + +Note + +If you have conflicting installations of virtualenv, then you may get an error - see here. + +Also, due to a bug in virtualenv, you may run into issues if you’re using conda. To solve this, you can downgrade virtualenv to version 20.0.33. + +Note + +If you have recently merged in main from the upstream branch, some of the dependencies used by pre-commit may have changed. Make sure to update your development environment. + +Optional dependencies +Optional dependencies (e.g. matplotlib) should be imported with the private helper pandas.compat._optional.import_optional_dependency. This ensures a consistent error message when the dependency is not met. + +All methods using an optional dependency should include a test asserting that an ImportError is raised when the optional dependency is not found. This test should be skipped if the library is present. + +All optional dependencies should be documented in Optional dependencies and the minimum required version should be set in the pandas.compat._optional.VERSIONS dict. + +Backwards compatibility +Please try to maintain backward compatibility. pandas has lots of users with lots of existing code, so don’t break it if at all possible. If you think breakage is required, clearly state why as part of the pull request. Also, be careful when changing method signatures and add deprecation warnings where needed. Also, add the deprecated sphinx directive to the deprecated functions or methods. 
+ +If a function with the same arguments as the one being deprecated exist, you can use the pandas.util._decorators.deprecate: + +from pandas.util._decorators import deprecate + +deprecate('old_func', 'new_func', '1.1.0') +Otherwise, you need to do it manually: + +import warnings +from pandas.util._exceptions import find_stack_level + + +def old_func(): + """Summary of the function. + + .. deprecated:: 1.1.0 + Use new_func instead. + """ + warnings.warn( + 'Use new_func instead.', + FutureWarning, + stacklevel=find_stack_level(), + ) + new_func() + + +def new_func(): + pass +You’ll also need to + +Write a new test that asserts a warning is issued when calling with the deprecated argument + +Update all of pandas existing tests and code to use the new argument + +See Testing a warning for more. + +Type hints +pandas strongly encourages the use of PEP 484 style type hints. New development should contain type hints and pull requests to annotate existing code are accepted as well! + +Style guidelines +Type imports should follow the from typing import ... convention. Your code may be automatically re-written to use some modern constructs (e.g. using the built-in list instead of typing.List) by the pre-commit checks. + +In some cases in the code base classes may define class variables that shadow builtins. This causes an issue as described in Mypy 1775. The defensive solution here is to create an unambiguous alias of the builtin and use that without your annotation. For example, if you come across a definition like + +class SomeClass1: + str = None +The appropriate way to annotate this would be as follows + +str_type = str + +class SomeClass2: + str: str_type = None +In some cases you may be tempted to use cast from the typing module when you know better than the analyzer. This occurs particularly when using custom inference functions. For example + +from typing import cast + +from pandas.core.dtypes.common import is_number + +def cannot_infer_bad(obj: Union[str, int, float]): + + if is_number(obj): + ... + else: # Reasonably only str objects would reach this but... + obj = cast(str, obj) # Mypy complains without this! + return obj.upper() +The limitation here is that while a human can reasonably understand that is_number would catch the int and float types mypy cannot make that same inference just yet (see mypy #5206). While the above works, the use of cast is strongly discouraged. Where applicable a refactor of the code to appease static analysis is preferable + +def cannot_infer_good(obj: Union[str, int, float]): + + if isinstance(obj, str): + return obj.upper() + else: + ... +With custom types and inference this is not always possible so exceptions are made, but every effort should be exhausted to avoid cast before going down such paths. + +pandas-specific types +Commonly used types specific to pandas will appear in pandas._typing and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas. + +For example, quite a few functions in pandas accept a dtype argument. This can be expressed as a string like "object", a numpy.dtype like np.int64 or even a pandas ExtensionDtype like pd.CategoricalDtype. Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the pandas._typing module + +from pandas._typing import Dtype + +def as_type(dtype: Dtype) -> ...: + ... 
+This module will ultimately house types for repeatedly used concepts like “path-like”, “array-like”, “numeric”, etc… and can also hold aliases for commonly appearing parameters like axis. Development of this module is active so be sure to refer to the source for the most up to date list of available types. + +Validating type hints +pandas uses mypy and pyright to statically analyze the code base and type hints. After making any change you can ensure your type hints are consistent by running + +pre-commit run --hook-stage manual --all-files mypy +pre-commit run --hook-stage manual --all-files pyright +pre-commit run --hook-stage manual --all-files pyright_reportGeneralTypeIssues +# the following might fail if the installed pandas version does not correspond to your local git version +pre-commit run --hook-stage manual --all-files stubtest +in your python environment. + +Warning + +Please be aware that the above commands will use the current python environment. If your python packages are older/newer than those installed by the pandas CI, the above commands might fail. This is often the case when the mypy or numpy versions do not match. Please see how to setup the python environment or select a recently succeeded workflow, select the “Docstring validation, typing, and other manual pre-commit hooks” job, then click on “Set up Conda” and “Environment info” to see which versions the pandas CI installs. + +Testing type hints in code using pandas +Warning + +pandas is not yet a py.typed library (PEP 561)! The primary purpose of locally declaring pandas as a py.typed library is to test and improve the pandas-builtin type annotations. + +Until pandas becomes a py.typed library, it is possible to easily experiment with the type annotations shipped with pandas by creating an empty file named “py.typed” in the pandas installation folder: + +python -c "import pandas; import pathlib; (pathlib.Path(pandas.__path__[0]) / 'py.typed').touch()" +The existence of the py.typed file signals to type checkers that pandas is already a py.typed library. This makes type checkers aware of the type annotations shipped with pandas. + +Testing with continuous integration +The pandas test suite will run automatically on GitHub Actions continuous integration services, once your pull request is submitted. However, if you wish to run the test suite on a branch prior to submitting the pull request, then the continuous integration services need to be hooked to your GitHub repository. Instructions are here for GitHub Actions. + +A pull-request will be considered for merging when you have an all ‘green’ build. If any tests are failing, then you will get a red ‘X’, where you can click through to see the individual failed tests. This is an example of a green build. + +../_images/ci.png +Test-driven development +pandas is serious about testing and strongly encourages contributors to embrace test-driven development (TDD). This development process “relies on the repetition of a very short development cycle: first the developer writes an (initially failing) automated test case that defines a desired improvement or new function, then produces the minimum amount of code to pass that test.” So, before actually writing any code, you should write your tests. Often the test can be taken from the original GitHub issue. However, it is always worth considering additional use cases and writing corresponding tests. + +We use code coverage to help understand the amount of code which is covered by a test. 
We recommend striving to ensure code you add or change within Pandas is covered by a test. Please see our code coverage dashboard through Codecov for more information. + +Adding tests is one of the most common requests after code is pushed to pandas. Therefore, it is worth getting in the habit of writing tests ahead of time so this is never an issue. + +Writing tests +All tests should go into the tests subdirectory of the specific package. This folder contains many current examples of tests, and we suggest looking to these for inspiration. + +As a general tip, you can use the search functionality in your integrated development environment (IDE) or the git grep command in a terminal to find test files in which the method is called. If you are unsure of the best location to put your test, take your best guess, but note that reviewers may request that you move the test to a different location. + +To use git grep, you can run the following command in a terminal: + +git grep "function_name(" + +This will search through all files in your repository for the text function_name(. This can be a useful way to quickly locate the function in the codebase and determine the best location to add a test for it. + +Ideally, there should be one, and only one, obvious place for a test to reside. Until we reach that ideal, these are some rules of thumb for where a test should be located. + +Does your test depend only on code in pd._libs.tslibs? This test likely belongs in one of: + +tests.tslibs + +Note + +No file in tests.tslibs should import from any pandas modules outside of pd._libs.tslibs + +tests.scalar + +tests.tseries.offsets + +Does your test depend only on code in pd._libs? This test likely belongs in one of: + +tests.libs + +tests.groupby.test_libgroupby + +Is your test for an arithmetic or comparison method? This test likely belongs in one of: + +tests.arithmetic + +Note + +These are intended for tests that can be shared to test the behavior of DataFrame/Series/Index/ExtensionArray using the box_with_array fixture. + +tests.frame.test_arithmetic + +tests.series.test_arithmetic + +Is your test for a reduction method (min, max, sum, prod, …)? This test likely belongs in one of: + +tests.reductions + +Note + +These are intended for tests that can be shared to test the behavior of DataFrame/Series/Index/ExtensionArray. + +tests.frame.test_reductions + +tests.series.test_reductions + +tests.test_nanops + +Is your test for an indexing method? This is the most difficult case for deciding where a test belongs, because there are many of these tests, and many of them test more than one method (e.g. both Series.__getitem__ and Series.loc.__getitem__) + +Is the test specifically testing an Index method (e.g. Index.get_loc, Index.get_indexer)? This test likely belongs in one of: + +tests.indexes.test_indexing + +tests.indexes.fooindex.test_indexing + +Within that files there should be a method-specific test class e.g. TestGetLoc. + +In most cases, neither Series nor DataFrame objects should be needed in these tests. + +Is the test for a Series or DataFrame indexing method other than __getitem__ or __setitem__, e.g. xs, where, take, mask, lookup, or insert? This test likely belongs in one of: + +tests.frame.indexing.test_methodname + +tests.series.indexing.test_methodname + +Is the test for any of loc, iloc, at, or iat? 
This test likely belongs in one of:

tests.indexing.test_loc

tests.indexing.test_iloc

tests.indexing.test_at

tests.indexing.test_iat

Within the appropriate file, test classes correspond to either types of indexers (e.g. TestLocBooleanMask) or major use cases (e.g. TestLocSetitemWithExpansion).

See the note in section D) about tests that test multiple indexing methods.

Is the test for Series.__getitem__, Series.__setitem__, DataFrame.__getitem__, or DataFrame.__setitem__? This test likely belongs in one of:

tests.series.test_getitem

tests.series.test_setitem

tests.frame.test_getitem

tests.frame.test_setitem

In many cases such a test may test multiple similar methods, e.g.

import pandas as pd
import pandas._testing as tm

def test_getitem_listlike_of_ints():
    ser = pd.Series(range(5))

    result = ser[[3, 4]]
    expected = pd.Series([3, 4], index=[3, 4])
    tm.assert_series_equal(result, expected)

    result = ser.loc[[3, 4]]
    tm.assert_series_equal(result, expected)
In cases like this, the test location should be based on the underlying method being tested, or, in the case of a test for a bugfix, on the location of the actual bug. In this example, we know that Series.__getitem__ calls Series.loc.__getitem__, so this is really a test for loc.__getitem__. This test therefore belongs in tests.indexing.test_loc.

Is your test for a DataFrame or Series method?

Is the method a plotting method? This test likely belongs in one of:

tests.plotting

Is the method an IO method? This test likely belongs in one of:

tests.io

Note

This includes to_string but excludes __repr__, which is tested in tests.frame.test_repr and tests.series.test_repr. Other classes often have a test_formats file.

Otherwise, this test likely belongs in one of:

tests.series.methods.test_mymethod

tests.frame.methods.test_mymethod

Note

If a test can be shared between DataFrame/Series using the frame_or_series fixture, by convention it goes in the tests.frame file.

Is your test for an Index method, not depending on Series/DataFrame? This test likely belongs in one of:

tests.indexes

Is your test for one of the pandas-provided ExtensionArrays (Categorical, DatetimeArray, TimedeltaArray, PeriodArray, IntervalArray, NumpyExtensionArray, FloatArray, BoolArray, StringArray)? This test likely belongs in one of:

tests.arrays

Is your test for all ExtensionArray subclasses (the “EA Interface”)? This test likely belongs in one of:

tests.extension

Using pytest
Test structure
The existing pandas test structure is mostly class-based, meaning that you will typically find tests wrapped in a class.

class TestReallyCoolFeature:
    def test_cool_feature_aspect(self):
        pass
We prefer a more functional style using the pytest framework, which facilitates testing and development. Thus, instead of writing test classes, we will write test functions like this:

def test_really_cool_feature():
    pass
Preferred pytest idioms
Functional tests should be named def test_* and should only take arguments that are either fixtures or parameters.

Use a bare assert for testing scalars and truth-testing.

Use tm.assert_series_equal(result, expected) and tm.assert_frame_equal(result, expected) for comparing Series and DataFrame results respectively.

Use @pytest.mark.parametrize when testing multiple cases.

Use pytest.mark.xfail when a test case is expected to fail.

Use pytest.mark.skip when a test case is never expected to pass.
+ +Use pytest.param when a test case needs a particular mark. + +Use @pytest.fixture if multiple tests can share a setup object. + +Warning + +Do not use pytest.xfail (which is different than pytest.mark.xfail) since it immediately stops the test and does not check if the test will fail. If this is the behavior you desire, use pytest.skip instead. + +If a test is known to fail but the manner in which it fails is not meant to be captured, use pytest.mark.xfail. It is common to use this method for a test that exhibits buggy behavior or a non-implemented feature. If the failing test has flaky behavior, use the argument strict=False. This will make it so pytest does not fail if the test happens to pass. Using strict=False is highly undesirable, please use it only as a last resort. + +Prefer the decorator @pytest.mark.xfail and the argument pytest.param over usage within a test so that the test is appropriately marked during the collection phase of pytest. For xfailing a test that involves multiple parameters, a fixture, or a combination of these, it is only possible to xfail during the testing phase. To do so, use the request fixture: + +def test_xfail(request): + mark = pytest.mark.xfail(raises=TypeError, reason="Indicate why here") + request.applymarker(mark) +xfail is not to be used for tests involving failure due to invalid user arguments. For these tests, we need to verify the correct exception type and error message is being raised, using pytest.raises instead. + +Testing a warning +Use tm.assert_produces_warning as a context manager to check that a block of code raises a warning and specify the warning message using the match argument. + +with tm.assert_produces_warning(DeprecationWarning, match="the warning message"): + pd.deprecated_function() +If a warning should specifically not happen in a block of code, pass False into the context manager. + +with tm.assert_produces_warning(False): + pd.no_warning_function() +If you have a test that would emit a warning, but you aren’t actually testing the warning itself (say because it’s going to be removed in the future, or because we’re matching a 3rd-party library’s behavior), then use pytest.mark.filterwarnings to ignore the error. + +@pytest.mark.filterwarnings("ignore:msg:category") +def test_thing(self): + pass +Testing an exception +Use pytest.raises as a context manager with the specific exception subclass (i.e. never use Exception) and the exception message in match. + +with pytest.raises(ValueError, match="an error"): + raise ValueError("an error") +Testing involving files +The temp_file pytest fixture creates a temporary file Pathlib object for testing: + +def test_something(temp_file): + pd.DataFrame([1]).to_csv(str(temp_file)) +Please reference pytest’s documentation for the file retention policy. + +Testing involving network connectivity +A unit test should not access a public data set over the internet due to flakiness of network connections and lack of ownership of the server that is being connected to. To mock this interaction, use the httpserver fixture from the pytest-localserver plugin. with synthetic data. + +@pytest.mark.network +@pytest.mark.single_cpu +def test_network(httpserver): + httpserver.serve_content(content="content") + result = pd.read_html(httpserver.url) +Example +Here is an example of a self-contained set of tests in a file pandas/tests/test_cool_feature.py that illustrate multiple features that we like to use. Please remember to add the GitHub Issue Number as a comment to a new test. 
+ +import pytest +import numpy as np +import pandas as pd + + +@pytest.mark.parametrize('dtype', ['int8', 'int16', 'int32', 'int64']) +def test_dtypes(dtype): + assert str(np.dtype(dtype)) == dtype + + +@pytest.mark.parametrize( + 'dtype', ['float32', pytest.param('int16', marks=pytest.mark.skip), + pytest.param('int32', marks=pytest.mark.xfail( + reason='to show how it works'))]) +def test_mark(dtype): + assert str(np.dtype(dtype)) == 'float32' + + +@pytest.fixture +def series(): + return pd.Series([1, 2, 3]) + + +@pytest.fixture(params=['int8', 'int16', 'int32', 'int64']) +def dtype(request): + return request.param + + +def test_series(series, dtype): + # GH + result = series.astype(dtype) + assert result.dtype == dtype + + expected = pd.Series([1, 2, 3], dtype=dtype) + tm.assert_series_equal(result, expected) +A test run of this yields + +((pandas) bash-3.2$ pytest test_cool_feature.py -v +=========================== test session starts =========================== +platform darwin -- Python 3.6.2, pytest-3.6.0, py-1.4.31, pluggy-0.4.0 +collected 11 items + +tester.py::test_dtypes[int8] PASSED +tester.py::test_dtypes[int16] PASSED +tester.py::test_dtypes[int32] PASSED +tester.py::test_dtypes[int64] PASSED +tester.py::test_mark[float32] PASSED +tester.py::test_mark[int16] SKIPPED +tester.py::test_mark[int32] xfail +tester.py::test_series[int8] PASSED +tester.py::test_series[int16] PASSED +tester.py::test_series[int32] PASSED +tester.py::test_series[int64] PASSED +Tests that we have parametrized are now accessible via the test name, for example we could run these with -k int8 to sub-select only those tests which match int8. + +((pandas) bash-3.2$ pytest test_cool_feature.py -v -k int8 +=========================== test session starts =========================== +platform darwin -- Python 3.6.2, pytest-3.6.0, py-1.4.31, pluggy-0.4.0 +collected 11 items + +test_cool_feature.py::test_dtypes[int8] PASSED +test_cool_feature.py::test_series[int8] PASSED +Using hypothesis +Hypothesis is a library for property-based testing. Instead of explicitly parametrizing a test, you can describe all valid inputs and let Hypothesis try to find a failing input. Even better, no matter how many random examples it tries, Hypothesis always reports a single minimal counterexample to your assertions - often an example that you would never have thought to test. + +See Getting Started with Hypothesis for more of an introduction, then refer to the Hypothesis documentation for details. + +import json +from hypothesis import given, strategies as st + +any_json_value = st.deferred(lambda: st.one_of( + st.none(), st.booleans(), st.floats(allow_nan=False), st.text(), + st.lists(any_json_value), st.dictionaries(st.text(), any_json_value) +)) + + +@given(value=any_json_value) +def test_json_roundtrip(value): + result = json.loads(json.dumps(value)) + assert value == result +This test shows off several useful features of Hypothesis, as well as demonstrating a good use-case: checking properties that should hold over a large or complicated domain of inputs. + +To keep the pandas test suite running quickly, parametrized tests are preferred if the inputs or logic are simple, with Hypothesis tests reserved for cases with complex logic or where there are too many combinations of options or subtle interactions to test (or think of!) all of them. 
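As a rough pandas-flavoured sketch of the same idea (the property and the strategy bounds here are illustrative, not an existing pandas test):

from hypothesis import given, strategies as st

import pandas as pd


@given(st.lists(st.integers(min_value=-(2**31), max_value=2**31 - 1), min_size=1))
def test_series_roundtrips_through_list(values):
    # any list of machine-sized integers should survive Series construction unchanged
    assert pd.Series(values).tolist() == values

Hypothesis will generate many candidate lists, shrink any failure it finds, and report the smallest failing input.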
+ +Running the test suite +The tests can then be run directly inside your Git clone (without having to install pandas) by typing: + +pytest pandas +Note + +If a handful of tests don’t pass, it may not be an issue with your pandas installation. Some tests (e.g. some SQLAlchemy ones) require additional setup, others might start failing because a non-pinned library released a new version, and others might be flaky if run in parallel. As long as you can import pandas from your locally built version, your installation is probably fine and you can start contributing! + +Often it is worth running only a subset of tests first around your changes before running the entire suite. + +The easiest way to do this is with: + +pytest pandas/path/to/test.py -k regex_matching_test_name +Or with one of the following constructs: + +pytest pandas/tests/[test-module].py +pytest pandas/tests/[test-module].py::[TestClass] +pytest pandas/tests/[test-module].py::[TestClass]::[test_method] +Using pytest-xdist, which is included in our ‘pandas-dev’ environment, one can speed up local testing on multicore machines. The -n number flag then can be specified when running pytest to parallelize a test run across the number of specified cores or auto to utilize all the available cores on your machine. + +# Utilize 4 cores +pytest -n 4 pandas + +# Utilizes all available cores +pytest -n auto pandas +If you’d like to speed things along further a more advanced use of this command would look like this + +pytest pandas -n 4 -m "not slow and not network and not db and not single_cpu" -r sxX +In addition to the multithreaded performance increase this improves test speed by skipping some tests using the -m mark flag: + +slow: any test taking long (think seconds rather than milliseconds) + +network: tests requiring network connectivity + +db: tests requiring a database (mysql or postgres) + +single_cpu: tests that should run on a single cpu only + +You might want to enable the following option if it’s relevant for you: + +arm_slow: any test taking long on arm64 architecture + +These markers are defined in this toml file , under [tool.pytest.ini_options] in a list called markers, in case you want to check if new ones have been created which are of interest to you. + +The -r report flag will display a short summary info (see pytest documentation) . Here we are displaying the number of: + +s: skipped tests + +x: xfailed tests + +X: xpassed tests + +The summary is optional and can be removed if you don’t need the added information. Using the parallelization option can significantly reduce the time it takes to locally run tests before submitting a pull request. + +If you require assistance with the results, which has happened in the past, please set a seed before running the command and opening a bug report, that way we can reproduce it. Here’s an example for setting a seed on windows + +set PYTHONHASHSEED=314159265 +pytest pandas -n 4 -m "not slow and not network and not db and not single_cpu" -r sxX +On Unix use + +export PYTHONHASHSEED=314159265 +pytest pandas -n 4 -m "not slow and not network and not db and not single_cpu" -r sxX +For more, see the pytest documentation. + +Furthermore one can run + +pd.test() +with an imported pandas to run tests similarly. + +Running the performance test suite +Performance matters and it is worth considering whether your code has introduced performance regressions. pandas is in the process of migrating to asv benchmarks to enable easy monitoring of the performance of critical pandas operations. 
These benchmarks are all found in the pandas/asv_bench directory, and the test results can be found here. + +To use all features of asv, you will need either conda or virtualenv. For more details please check the asv installation webpage. + +To install asv: + +pip install git+https://github.com/airspeed-velocity/asv +If you need to run a benchmark, change your directory to asv_bench/ and run: + +asv continuous -f 1.1 upstream/main HEAD +You can replace HEAD with the name of the branch you are working on, and report benchmarks that changed by more than 10%. The command uses conda by default for creating the benchmark environments. If you want to use virtualenv instead, write: + +asv continuous -f 1.1 -E virtualenv upstream/main HEAD +The -E virtualenv option should be added to all asv commands that run benchmarks. The default value is defined in asv.conf.json. + +Running the full benchmark suite can be an all-day process, depending on your hardware and its resource utilization. However, usually it is sufficient to paste only a subset of the results into the pull request to show that the committed changes do not cause unexpected performance regressions. You can run specific benchmarks using the -b flag, which takes a regular expression. For example, this will only run benchmarks from a pandas/asv_bench/benchmarks/groupby.py file: + +asv continuous -f 1.1 upstream/main HEAD -b ^groupby +If you want to only run a specific group of benchmarks from a file, you can do it using . as a separator. For example: + +asv continuous -f 1.1 upstream/main HEAD -b groupby.GroupByMethods +will only run the GroupByMethods benchmark defined in groupby.py. + +You can also run the benchmark suite using the version of pandas already installed in your current Python environment. This can be useful if you do not have virtualenv or conda, or are using the setup.py develop approach discussed above; for the in-place build you need to set PYTHONPATH, e.g. PYTHONPATH="$PWD/.." asv [remaining arguments]. You can run benchmarks using an existing Python environment by: + +asv run -e -E existing +or, to use a specific Python interpreter,: + +asv run -e -E existing:python3.6 +This will display stderr from the benchmarks, and use your local python that comes from your $PATH. + +Information on how to write a benchmark and how to use asv can be found in the asv documentation. + +Documenting your code +Changes should be reflected in the release notes located in doc/source/whatsnew/vx.y.z.rst. This file contains an ongoing change log for each release. Add an entry to this file to document your fix, enhancement or (unavoidable) breaking change. Make sure to include the GitHub issue number when adding your entry (using :issue:`1234` where 1234 is the issue/pull request number). Your entry should be written using full sentences and proper grammar. + +When mentioning parts of the API, use a Sphinx :func:, :meth:, or :class: directive as appropriate. Not all public API functions and methods have a documentation page; ideally links would only be added if they resolve. You can usually find similar examples by checking the release notes for one of the previous versions. + +If your code is a bugfix, add your entry to the relevant bugfix section. Avoid adding to the Other section; only in rare cases should entries go there. Being as concise as possible, the description of the bug should include how the user may encounter it and an indication of the bug itself, e.g. “produces incorrect results” or “incorrectly raises”. 
It may be necessary to also indicate the new behavior. + +If your code is an enhancement, it is most likely necessary to add usage examples to the existing documentation. This can be done following the section regarding documentation. Further, to let users know when this feature was added, the versionadded directive is used. The sphinx syntax for that is: + +.. versionadded:: 2.1.0 +This will put the text New in version 2.1.0 wherever you put the sphinx directive. This should also be put in the docstring when adding a new function or method (example) or a new keyword argument (example). \ No newline at end of file diff --git a/.cursor/rules/copy-on-write-mechanism.mdc b/.cursor/rules/copy-on-write-mechanism.mdc new file mode 100644 index 0000000000000..0c1d84de14589 --- /dev/null +++ b/.cursor/rules/copy-on-write-mechanism.mdc @@ -0,0 +1,14 @@ +--- +alwaysApply: true +--- +Copy on write +Copy on Write is a mechanism to simplify the indexing API and improve performance through avoiding copies if possible. CoW means that any DataFrame or Series derived from another in any way always behaves as a copy. An explanation on how to use Copy on Write efficiently can be found here. + +Reference tracking +To be able to determine if we have to make a copy when writing into a DataFrame, we have to be aware if the values are shared with another DataFrame. pandas keeps track of all Blocks that share values with another block internally to be able to tell when a copy needs to be triggered. The reference tracking mechanism is implemented on the Block level. + +We use a custom reference tracker object, BlockValuesRefs, that keeps track of every block, whose values share memory with each other. The reference is held through a weak-reference. Every pair of blocks that share some memory should point to the same BlockValuesRefs object. If one block goes out of scope, the reference to this block dies. As a consequence, the reference tracker object always knows how many blocks are alive and share memory. + +Whenever a DataFrame or Series object is sharing data with another object, it is required that each of those objects have its own BlockManager and Block objects. Thus, in other words, one Block instance (that is held by a DataFrame, not necessarily for intermediate objects) should always be uniquely used for only a single DataFrame/Series object. For example, when you want to use the same Block for another object, you can create a shallow copy of the Block instance with block.copy(deep=False) (which will create a new Block instance with the same underlying values and which will correctly set up the references). + +We can ask the reference tracking object if there is another block alive that shares data with us before writing into the values. We can trigger a copy before writing if there is in fact another block alive. \ No newline at end of file diff --git a/.cursor/rules/creating-developement-environment.mdc b/.cursor/rules/creating-developement-environment.mdc new file mode 100644 index 0000000000000..d901fadff3962 --- /dev/null +++ b/.cursor/rules/creating-developement-environment.mdc @@ -0,0 +1,212 @@ +--- +description: when ever agent needs Creating a development environment +alwaysApply: false +--- +Creating a development environment +To test out code changes, you’ll need to build pandas from source, which requires a C/C++ compiler and Python environment. 
If you’re making documentation changes, you can skip to contributing to the documentation but if you skip creating the development environment you won’t be able to build the documentation locally before pushing your changes. It’s recommended to also install the pre-commit hooks. + +Step 1: install a C compiler +How to do this will depend on your platform. If you choose to use Docker or GitPod in the next step, then you can skip this step. + +Windows + +You will need Build Tools for Visual Studio 2022. + +Note + +You DO NOT need to install Visual Studio 2022. You only need “Build Tools for Visual Studio 2022” found by scrolling down to “All downloads” -> “Tools for Visual Studio”. In the installer, select the “Desktop development with C++” Workloads. + +If you encounter an error indicating cl.exe is not found when building with Meson, reopen the installer and also select the optional component MSVC v142 - VS 2019 C++ x64/x86 build tools in the right pane for installation. + +Alternatively, you can install the necessary components on the commandline using vs_BuildTools.exe + +Alternatively, you could use the WSL and consult the Linux instructions below. + +macOS + +To use the conda-based compilers, you will need to install the Developer Tools using xcode-select --install. + +If you prefer to use a different compiler, general information can be found here: https://devguide.python.org/setup/#macos + +Linux + +For Linux-based conda installations, you won’t have to install any additional components outside of the conda environment. The instructions below are only needed if your setup isn’t based on conda environments. + +Some Linux distributions will come with a pre-installed C compiler. To find out which compilers (and versions) are installed on your system: + +# for Debian/Ubuntu: +dpkg --list | grep compiler +# for Red Hat/RHEL/CentOS/Fedora: +yum list installed | grep -i --color compiler +GCC (GNU Compiler Collection), is a widely used compiler, which supports C and a number of other languages. If GCC is listed as an installed compiler nothing more is required. + +If no C compiler is installed, or you wish to upgrade, or you’re using a different Linux distribution, consult your favorite search engine for compiler installation/update instructions. + +Let us know if you have any difficulties by opening an issue or reaching out on our contributor community Slack. + +Step 2: create an isolated environment +Before we begin, please: + +Make sure that you have cloned the repository + +cd to the pandas source directory you just created with the clone command + +Option 1: using conda (recommended) +Install miniforge to get conda + +Create and activate the pandas-dev conda environment using the following commands: + +conda env create --file environment.yml +conda activate pandas-dev +Option 2: using pip +You’ll need to have at least the minimum Python version that pandas supports. You also need to have setuptools 51.0.0 or later to build pandas. + +Unix/macOS with virtualenv + +# Create a virtual environment +# Use an ENV_DIR of your choice. We'll use ~/virtualenvs/pandas-dev +# Any parent directories should already exist +python3 -m venv ~/virtualenvs/pandas-dev + +# Activate the virtualenv +. ~/virtualenvs/pandas-dev/bin/activate + +# Install the build dependencies +python -m pip install -r requirements-dev.txt +Unix/macOS with pyenv + +Consult the docs for setting up pyenv here. + +# Create a virtual environment +# Use an ENV_DIR of your choice. 
We'll use ~/Users//.pyenv/versions/pandas-dev +pyenv virtualenv + +# For instance: +pyenv virtualenv 3.10 pandas-dev + +# Activate the virtualenv +pyenv activate pandas-dev + +# Now install the build dependencies in the cloned pandas repo +python -m pip install -r requirements-dev.txt +Windows + +Below is a brief overview on how to set-up a virtual environment with Powershell under Windows. For details please refer to the official virtualenv user guide. + +Use an ENV_DIR of your choice. We’ll use ~\\virtualenvs\\pandas-dev where ~ is the folder pointed to by either $env:USERPROFILE (Powershell) or %USERPROFILE% (cmd.exe) environment variable. Any parent directories should already exist. + +# Create a virtual environment +python -m venv $env:USERPROFILE\virtualenvs\pandas-dev + +# Activate the virtualenv. Use activate.bat for cmd.exe +~\virtualenvs\pandas-dev\Scripts\Activate.ps1 + +# Install the build dependencies +python -m pip install -r requirements-dev.txt +Option 3: using Docker +pandas provides a DockerFile in the root directory to build a Docker image with a full pandas development environment. + +Docker Commands + +Build the Docker image: + +# Build the image +docker build -t pandas-dev . +Run Container: + +# Run a container and bind your local repo to the container +# This command assumes you are running from your local repo +# but if not alter ${PWD} to match your local repo path +docker run -it --rm -v ${PWD}:/home/pandas pandas-dev +Even easier, you can integrate Docker with the following IDEs: + +Visual Studio Code + +You can use the DockerFile to launch a remote session with Visual Studio Code, a popular free IDE, using the .devcontainer.json file. See https://code.visualstudio.com/docs/remote/containers for details. + +PyCharm (Professional) + +Enable Docker support and use the Services tool window to build and manage images as well as run and interact with containers. See https://www.jetbrains.com/help/pycharm/docker.html for details. + +Option 4: using Gitpod +Gitpod is an open-source platform that automatically creates the correct development environment right in your browser, reducing the need to install local development environments and deal with incompatible dependencies. + +If you are a Windows user, unfamiliar with using the command line or building pandas for the first time, it is often faster to build with Gitpod. Here are the in-depth instructions for building pandas with GitPod. + +Step 3: build and install pandas +There are currently two supported ways of building pandas, pip/meson and setuptools(setup.py). Historically, pandas has only supported using setuptools to build pandas. However, this method requires a lot of convoluted code in setup.py and also has many issues in compiling pandas in parallel due to limitations in setuptools. + +The newer build system, invokes the meson backend through pip (via a PEP 517 build). It automatically uses all available cores on your CPU, and also avoids the need for manual rebuilds by rebuilding automatically whenever pandas is imported (with an editable install). + +For these reasons, you should compile pandas with meson. Because the meson build system is newer, you may find bugs/minor issues as it matures. You can report these bugs here. 
+ +To compile pandas with meson, run: + +# Build and install pandas +# By default, this will print verbose output +# showing the "rebuild" taking place on import (see section below for explanation) +# If you do not want to see this, omit everything after --no-build-isolation +python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true +Note + +The version number is pulled from the latest repository tag. Be sure to fetch the latest tags from upstream before building: + +# set the upstream repository, if not done already, and fetch the latest tags +git remote add upstream https://github.com/pandas-dev/pandas.git +git fetch upstream --tags +Build options + +It is possible to pass options from the pip frontend to the meson backend if you would like to configure your install. Occasionally, you’ll want to use this to adjust the build directory, and/or toggle debug/optimization levels. + +You can pass a build directory to pandas by appending -Cbuilddir="your builddir here" to your pip command. This option allows you to configure where meson stores your built C extensions, and allows for fast rebuilds. + +Sometimes, it might be useful to compile pandas with debugging symbols, when debugging C extensions. Appending -Csetup-args="-Ddebug=true" will do the trick. + +With pip, it is possible to chain together multiple config settings. For example, specifying both a build directory and building with debug symbols would look like -Cbuilddir="your builddir here" -Csetup-args="-Dbuildtype=debug". + +Compiling pandas with setup.py + +Note + +This method of compiling pandas will be deprecated and removed very soon, as the meson backend matures. + +To compile pandas with setuptools, run: + +python setup.py develop +Note + +If pandas is already installed (via meson), you have to uninstall it first: + +python -m pip uninstall pandas +This is because python setup.py develop will not uninstall the loader script that meson-python uses to import the extension from the build folder, which may cause errors such as an FileNotFoundError to be raised. + +Note + +You will need to repeat this step each time the C extensions change, for example if you modified any file in pandas/_libs or if you did a fetch and merge from upstream/main. + +Checking the build + +At this point you should be able to import pandas from your locally built version: + +$ python +>>> import pandas +>>> print(pandas.__version__) # note: the exact output may differ +2.0.0.dev0+880.g2b9e661fbb.dirty +At this point you may want to try running the test suite. + +Keeping up to date with the latest build + +When building pandas with meson, importing pandas will automatically trigger a rebuild, even when C/Cython files are modified. By default, no output will be produced by this rebuild (the import will just take longer). If you would like to see meson’s output when importing pandas, you can set the environment variable MESONPY_EDITABLE_VERBOSE. For example, this would be: + +# On Linux/macOS +MESONPY_EDITABLE_VERBOSE=1 python + +# Windows +set MESONPY_EDITABLE_VERBOSE=1 # Only need to set this once per session +python +If you would like to see this verbose output every time, you can set the editable-verbose config setting to true like so: + +python -m pip install -ve . -Ceditable-verbose=true +Tip + +If you ever find yourself wondering whether setuptools or meson was used to build your pandas, you can check the value of pandas._built_with_meson, which will be true if meson was used to compile pandas. 
\ No newline at end of file diff --git a/.cursor/rules/debugging-c-extentions.mdc b/.cursor/rules/debugging-c-extentions.mdc new file mode 100644 index 0000000000000..52c894a405346 --- /dev/null +++ b/.cursor/rules/debugging-c-extentions.mdc @@ -0,0 +1,46 @@ +--- +description: when ever want to debug c extention +alwaysApply: false +--- +Debugging C extensions +pandas uses Cython and C/C++ extension modules to optimize performance. Unfortunately, the standard Python debugger does not allow you to step into these extensions. Cython extensions can be debugged with the Cython debugger and C/C++ extensions can be debugged using the tools shipped with your platform’s compiler. + +For Python developers with limited or no C/C++ experience this can seem a daunting task. Core developer Will Ayd has written a 3 part blog series to help guide you from the standard Python debugger into these other tools: + +Fundamental Python Debugging Part 1 - Python + +Fundamental Python Debugging Part 2 - Python Extensions + +Fundamental Python Debugging Part 3 - Cython Extensions + +Debugging locally +By default building pandas from source will generate a release build. To generate a development build you can type: + +pip install -ve . --no-build-isolation -Cbuilddir="debug" -Csetup-args="-Dbuildtype=debug" +Note + +conda environments update CFLAGS/CPPFLAGS with flags that are geared towards generating releases, and may work counter towards usage in a development environment. If using conda, you should unset these environment variables via export CFLAGS= and export CPPFLAGS= + +By specifying builddir="debug" all of the targets will be built and placed in the debug directory relative to the project root. This helps to keep your debug and release artifacts separate; you are of course able to choose a different directory name or omit altogether if you do not care to separate build types. + +Using Docker +To simplify the debugging process, pandas has created a Docker image with a debug build of Python and the gdb/Cython debuggers pre-installed. You may either docker pull pandas/pandas-debug to get access to this image or build it from the tooling/debug folder locally. + +You can then mount your pandas repository into this image via: + +docker run --rm -it -w /data -v ${PWD}:/data pandas/pandas-debug +Inside the image, you can use meson to build/install pandas and place the build artifacts into a debug folder using a command as follows: + +python -m pip install -ve . --no-build-isolation -Cbuilddir="debug" -Csetup-args="-Dbuildtype=debug" +If planning to use cygdb, the files required by that application are placed within the build folder. So you have to first cd to the build folder, then start that application. + +cd debug +cygdb +Within the debugger you can use cygdb commands to navigate cython extensions. + +Editor support +The meson build system generates a compilation database automatically and places it in the build directory. Many language servers and IDEs can use this information to provide code-completion, go-to-definition and error checking support as you type. + +How each language server / IDE chooses to look for the compilation database may vary. When in doubt you may want to create a symlink at the root of the project that points to the compilation database in your build directory. Assuming you used debug as your directory name, you can run: + +ln -s debug/compile_commands.json . 
\ No newline at end of file diff --git a/.cursor/rules/extending-pandas.mdc b/.cursor/rules/extending-pandas.mdc new file mode 100644 index 0000000000000..d86de70b1b390 --- /dev/null +++ b/.cursor/rules/extending-pandas.mdc @@ -0,0 +1,349 @@ +--- +description: whenever you want to extend the current library or codebase +alwaysApply: false +--- +Extending pandas +While pandas provides a rich set of methods, containers, and data types, your needs may not be fully satisfied. pandas offers a few options for extending pandas. + +Registering custom accessors +Libraries can use the decorators pandas.api.extensions.register_dataframe_accessor(), pandas.api.extensions.register_series_accessor(), and pandas.api.extensions.register_index_accessor(), to add additional “namespaces” to pandas objects. All of these follow a similar convention: you decorate a class, providing the name of attribute to add. The class’s __init__ method gets the object being decorated. For example: + +@pd.api.extensions.register_dataframe_accessor("geo") +class GeoAccessor: + def __init__(self, pandas_obj): + self._validate(pandas_obj) + self._obj = pandas_obj + + @staticmethod + def _validate(obj): + # verify there is a column latitude and a column longitude + if "latitude" not in obj.columns or "longitude" not in obj.columns: + raise AttributeError("Must have 'latitude' and 'longitude'.") + + @property + def center(self): + # return the geographic center point of this DataFrame + lat = self._obj.latitude + lon = self._obj.longitude + return (float(lon.mean()), float(lat.mean())) + + def plot(self): + # plot this array's data on a map, e.g., using Cartopy + pass +Now users can access your methods using the geo namespace: + +ds = pd.DataFrame( + {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)} +) +ds.geo.center +(5.0, 10.0) +ds.geo.plot() +# plots data on a map +This can be a convenient way to extend pandas objects without subclassing them. If you write a custom accessor, make a pull request adding it to our ecosystem page. + +We highly recommend validating the data in your accessor’s __init__. In our GeoAccessor, we validate that the data contains the expected columns, raising an AttributeError when the validation fails. For a Series accessor, you should validate the dtype if the accessor applies only to certain dtypes. + +Extension types +Note + +The pandas.api.extensions.ExtensionDtype and pandas.api.extensions.ExtensionArray APIs were experimental prior to pandas 1.5. Starting with version 1.5, future changes will follow the pandas deprecation policy. + +pandas defines an interface for implementing data types and arrays that extend NumPy’s type system. pandas itself uses the extension system for some types that aren’t built into NumPy (categorical, period, interval, datetime with timezone). + +Libraries can define a custom array and data type. When pandas encounters these objects, they will be handled properly (i.e. not converted to an ndarray of objects). Many methods like pandas.isna() will dispatch to the extension type’s implementation. + +If you’re building a library that implements the interface, please publicize it on the ecosystem page. + +The interface consists of two classes. + +ExtensionDtype +A pandas.api.extensions.ExtensionDtype is similar to a numpy.dtype object. It describes the data type. Implementers are responsible for a few unique items like the name. + +One particularly important item is the type property. This should be the class that is the scalar type for your data. 
For example, if you were writing an extension array for IP Address data, this might be ipaddress.IPv4Address. + +See the extension dtype source for interface definition. + +pandas.api.extensions.ExtensionDtype can be registered to pandas to allow creation via a string dtype name. This allows one to instantiate Series and .astype() with a registered string name, for example 'category' is a registered string accessor for the CategoricalDtype. + +See the extension dtype dtypes for more on how to register dtypes. + +ExtensionArray +This class provides all the array-like functionality. ExtensionArrays are limited to 1 dimension. An ExtensionArray is linked to an ExtensionDtype via the dtype attribute. + +pandas makes no restrictions on how an extension array is created via its __new__ or __init__, and puts no restrictions on how you store your data. We do require that your array be convertible to a NumPy array, even if this is relatively expensive (as it is for Categorical). + +They may be backed by none, one, or many NumPy arrays. For example, pandas.Categorical is an extension array backed by two arrays, one for codes and one for categories. An array of IPv6 addresses may be backed by a NumPy structured array with two fields, one for the lower 64 bits and one for the upper 64 bits. Or they may be backed by some other storage type, like Python lists. + +See the extension array source for the interface definition. The docstrings and comments contain guidance for properly implementing the interface. + +ExtensionArray operator support +By default, there are no operators defined for the class ExtensionArray. There are two approaches for providing operator support for your ExtensionArray: + +Define each of the operators on your ExtensionArray subclass. + +Use an operator implementation from pandas that depends on operators that are already defined on the underlying elements (scalars) of the ExtensionArray. + +Note + +Regardless of the approach, you may want to set __array_priority__ if you want your implementation to be called when involved in binary operations with NumPy arrays. + +For the first approach, you define selected operators, e.g., __add__, __le__, etc. that you want your ExtensionArray subclass to support. + +The second approach assumes that the underlying elements (i.e., scalar type) of the ExtensionArray have the individual operators already defined. In other words, if your ExtensionArray named MyExtensionArray is implemented so that each element is an instance of the class MyExtensionElement, then if the operators are defined for MyExtensionElement, the second approach will automatically define the operators for MyExtensionArray. + +A mixin class, ExtensionScalarOpsMixin supports this second approach. If developing an ExtensionArray subclass, for example MyExtensionArray, can simply include ExtensionScalarOpsMixin as a parent class of MyExtensionArray, and then call the methods _add_arithmetic_ops() and/or _add_comparison_ops() to hook the operators into your MyExtensionArray class, as follows: + +from pandas.api.extensions import ExtensionArray, ExtensionScalarOpsMixin + + +class MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin): + pass + + +MyExtensionArray._add_arithmetic_ops() +MyExtensionArray._add_comparison_ops() +Note + +Since pandas automatically calls the underlying operator on each element one-by-one, this might not be as performant as implementing your own version of the associated operators directly on the ExtensionArray. 
+ +For arithmetic operations, this implementation will try to reconstruct a new ExtensionArray with the result of the element-wise operation. Whether or not that succeeds depends on whether the operation returns a result that’s valid for the ExtensionArray. If an ExtensionArray cannot be reconstructed, an ndarray containing the scalars returned instead. + +For ease of implementation and consistency with operations between pandas and NumPy ndarrays, we recommend not handling Series and Indexes in your binary ops. Instead, you should detect these cases and return NotImplemented. When pandas encounters an operation like op(Series, ExtensionArray), pandas will + +unbox the array from the Series (Series.array) + +call result = op(values, ExtensionArray) + +re-box the result in a Series + +NumPy universal functions +Series implements __array_ufunc__. As part of the implementation, pandas unboxes the ExtensionArray from the Series, applies the ufunc, and re-boxes it if necessary. + +If applicable, we highly recommend that you implement __array_ufunc__ in your extension array to avoid coercion to an ndarray. See the NumPy documentation for an example. + +As part of your implementation, we require that you defer to pandas when a pandas container (Series, DataFrame, Index) is detected in inputs. If any of those is present, you should return NotImplemented. pandas will take care of unboxing the array from the container and re-calling the ufunc with the unwrapped input. + +Testing extension arrays +We provide a test suite for ensuring that your extension arrays satisfy the expected behavior. To use the test suite, you must provide several pytest fixtures and inherit from the base test class. The required fixtures are found in pandas-dev/pandas. + +To use a test, subclass it: + +from pandas.tests.extension import base + + +class TestConstructors(base.BaseConstructorsTests): + pass +See pandas-dev/pandas for a list of all the tests available. + +Compatibility with Apache Arrow +An ExtensionArray can support conversion to / from pyarrow arrays (and thus support for example serialization to the Parquet file format) by implementing two methods: ExtensionArray.__arrow_array__ and ExtensionDtype.__from_arrow__. + +The ExtensionArray.__arrow_array__ ensures that pyarrow knowns how to convert the specific extension array into a pyarrow.Array (also when included as a column in a pandas DataFrame): + +class MyExtensionArray(ExtensionArray): + ... + + def __arrow_array__(self, type=None): + # convert the underlying array values to a pyarrow Array + import pyarrow + + return pyarrow.array(..., type=type) +The ExtensionDtype.__from_arrow__ method then controls the conversion back from pyarrow to a pandas ExtensionArray. This method receives a pyarrow Array or ChunkedArray as only argument and is expected to return the appropriate pandas ExtensionArray for this dtype and the passed values: + +class ExtensionDtype: + ... + + def __from_arrow__(self, array: pyarrow.Array/ChunkedArray) -> ExtensionArray: + ... +See more in the Arrow documentation. + +Those methods have been implemented for the nullable integer and string extension dtypes included in pandas, and ensure roundtrip to pyarrow and the Parquet file format. + +Subclassing pandas data structures +Warning + +There are some easier alternatives before considering subclassing pandas data structures. + +Extensible method chains with pipe + +Use composition. See here. 
+ +Extending by registering an accessor + +Extending by extension type + +This section describes how to subclass pandas data structures to meet more specific needs. There are two points that need attention: + +Override constructor properties. + +Define original properties + +Note + +You can find a nice example in geopandas project. + +Override constructor properties +Each data structure has several constructor properties for returning a new data structure as the result of an operation. By overriding these properties, you can retain subclasses through pandas data manipulations. + +There are 3 possible constructor properties to be defined on a subclass: + +DataFrame/Series._constructor: Used when a manipulation result has the same dimension as the original. + +DataFrame._constructor_sliced: Used when a DataFrame (sub-)class manipulation result should be a Series (sub-)class. + +Series._constructor_expanddim: Used when a Series (sub-)class manipulation result should be a DataFrame (sub-)class, e.g. Series.to_frame(). + +Below example shows how to define SubclassedSeries and SubclassedDataFrame overriding constructor properties. + +class SubclassedSeries(pd.Series): + @property + def _constructor(self): + return SubclassedSeries + + @property + def _constructor_expanddim(self): + return SubclassedDataFrame + + +class SubclassedDataFrame(pd.DataFrame): + @property + def _constructor(self): + return SubclassedDataFrame + + @property + def _constructor_sliced(self): + return SubclassedSeries +s = SubclassedSeries([1, 2, 3]) +type(s) + + +to_framed = s.to_frame() +type(to_framed) + + +df = SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) +df + A B C +0 1 4 7 +1 2 5 8 +2 3 6 9 + +type(df) + + +sliced1 = df[["A", "B"]] +sliced1 + A B +0 1 4 +1 2 5 +2 3 6 + +type(sliced1) + + +sliced2 = df["A"] +sliced2 +0 1 +1 2 +2 3 +Name: A, dtype: int64 + +type(sliced2) + +Define original properties +To let original data structures have additional properties, you should let pandas know what properties are added. pandas maps unknown properties to data names overriding __getattribute__. Defining original properties can be done in one of 2 ways: + +Define _internal_names and _internal_names_set for temporary properties which WILL NOT be passed to manipulation results. + +Define _metadata for normal properties which will be passed to manipulation results. + +Below is an example to define two original properties, “internal_cache” as a temporary property and “added_property” as a normal property + +class SubclassedDataFrame2(pd.DataFrame): + + # temporary properties + _internal_names = pd.DataFrame._internal_names + ["internal_cache"] + _internal_names_set = set(_internal_names) + + # normal properties + _metadata = ["added_property"] + + @property + def _constructor(self): + return SubclassedDataFrame2 +df = SubclassedDataFrame2({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) +df + A B C +0 1 4 7 +1 2 5 8 +2 3 6 9 + +df.internal_cache = "cached" +df.added_property = "property" + +df.internal_cache +cached +df.added_property +property + +# properties defined in _internal_names is reset after manipulation +df[["A", "B"]].internal_cache +AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache' + +# properties defined in _metadata are retained +df[["A", "B"]].added_property +property +Plotting backends +pandas can be extended with third-party plotting backends. The main idea is letting users select a plotting backend different than the provided one based on Matplotlib. 
For example: + +pd.set_option("plotting.backend", "backend.module") +pd.Series([1, 2, 3]).plot() +This would be more or less equivalent to: + +import backend.module +backend.module.plot(pd.Series([1, 2, 3])) +The backend module can then use other visualization tools (Bokeh, Altair,…) to generate the plots. + +Libraries implementing the plotting backend should use entry points to make their backend discoverable to pandas. The key is "pandas_plotting_backends". For example, pandas registers the default “matplotlib” backend as follows. + +# in setup.py +setup( # noqa: F821 + ..., + entry_points={ + "pandas_plotting_backends": [ + "matplotlib = pandas:plotting._matplotlib", + ], + }, +) +More information on how to implement a third-party plotting backend can be found at pandas-dev/pandas. + +Arithmetic with 3rd party types +In order to control how arithmetic works between a custom type and a pandas type, implement __pandas_priority__. Similar to numpy’s __array_priority__ semantics, arithmetic methods on DataFrame, Series, and Index objects will delegate to other, if it has an attribute __pandas_priority__ with a higher value. + +By default, pandas objects try to operate with other objects, even if they are not types known to pandas: + +pd.Series([1, 2]) + [10, 20] +0 11 +1 22 +dtype: int64 +In the example above, if [10, 20] was a custom type that can be understood as a list, pandas objects will still operate with it in the same way. + +In some cases, it is useful to delegate to the other type the operation. For example, consider I implement a custom list object, and I want the result of adding my custom list with a pandas Series to be an instance of my list and not a Series as seen in the previous example. This is now possible by defining the __pandas_priority__ attribute of my custom list, and setting it to a higher value, than the priority of the pandas objects I want to operate with. + +The __pandas_priority__ of DataFrame, Series, and Index are 4000, 3000, and 2000 respectively. The base ExtensionArray.__pandas_priority__ is 1000. + +class CustomList(list): + __pandas_priority__ = 5000 + + def __radd__(self, other): + # return `self` and not the addition for simplicity + return self + +custom = CustomList() +series = pd.Series([1, 2, 3]) + +# Series refuses to add custom, since it's an unknown type with higher priority +assert series.__add__(custom) is NotImplemented + +# This will cause the custom class `__radd__` being used instead +assert series + custom is custom diff --git a/.cursor/rules/internals.mdc b/.cursor/rules/internals.mdc new file mode 100644 index 0000000000000..28b0572ba8924 --- /dev/null +++ b/.cursor/rules/internals.mdc @@ -0,0 +1,66 @@ +--- +alwaysApply: true +--- +Internals +This section will provide a look into some of pandas internals. It’s primarily intended for developers of pandas itself. + +Indexing +In pandas there are a few objects implemented which can serve as valid containers for the axis labels: + +Index: the generic “ordered set” object, an ndarray of object dtype assuming nothing about its contents. The labels must be hashable (and likely immutable) and unique. Populates a dict of label to location in Cython to do O(1) lookups. 
+ +MultiIndex: the standard hierarchical index object + +DatetimeIndex: An Index object with Timestamp boxed elements (impl are the int64 values) + +TimedeltaIndex: An Index object with Timedelta boxed elements (impl are the in64 values) + +PeriodIndex: An Index object with Period elements + +There are functions that make the creation of a regular index easy: + +date_range(): fixed frequency date range generated from a time rule or DateOffset. An ndarray of Python datetime objects + +period_range(): fixed frequency date range generated from a time rule or DateOffset. An ndarray of Period objects, representing timespans + +Warning + +Custom Index subclasses are not supported, custom behavior should be implemented using the ExtensionArray interface instead. + +MultiIndex +Internally, the MultiIndex consists of a few things: the levels, the integer codes, and the level names: + +index = pd.MultiIndex.from_product( + [range(3), ["one", "two"]], names=["first", "second"] +) + + +index +Out[2]: +MultiIndex([(0, 'one'), + (0, 'two'), + (1, 'one'), + (1, 'two'), + (2, 'one'), + (2, 'two')], + names=['first', 'second']) + +index.levels +Out[3]: FrozenList([[0, 1, 2], ['one', 'two']]) + +index.codes +Out[4]: FrozenList([[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + +index.names +Out[5]: FrozenList(['first', 'second']) +You can probably guess that the codes determine which unique element is identified with that location at each layer of the index. It’s important to note that sortedness is determined solely from the integer codes and does not check (or care) whether the levels themselves are sorted. Fortunately, the constructors from_tuples() and from_arrays() ensure that this is true, but if you compute the levels and codes yourself, please be careful. + +Values +pandas extends NumPy’s type system with custom types, like Categorical or datetimes with a timezone, so we have multiple notions of “values”. For 1-D containers (Index classes and Series) we have the following convention: + +cls._values refers is the “best possible” array. This could be an ndarray or ExtensionArray. + +So, for example, Series[category]._values is a Categorical. + +Subclassing pandas data structures +This section has been moved to Subclassing pandas data structures. \ No newline at end of file diff --git a/.cursor/rules/pandas-contribution.mdc b/.cursor/rules/pandas-contribution.mdc new file mode 100644 index 0000000000000..fb1ad2eed515f --- /dev/null +++ b/.cursor/rules/pandas-contribution.mdc @@ -0,0 +1,221 @@ +--- +alwaysApply: true +--- +Contributing to pandas +Table of contents: + +Bug reports and enhancement requests + +Finding an issue to contribute to + +Submitting a pull request + +Version control, Git, and GitHub + +Getting started with Git + +Create a fork of pandas + +Creating a feature branch + +Making code changes + +Pushing your changes + +Making a pull request + +Updating your pull request + +Updating the development environment + +Tips for a successful pull request + +All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome. + +Bug reports and enhancement requests +Bug reports and enhancement requests are an important part of making pandas more stable and are curated though Github issues. When reporting an issue or request, please select the appropriate category and fill out the issue form fully to ensure others and the core development team can fully understand the scope of the issue. 
+ +The issue will then show up to the pandas community and be open to comments/ideas from others. + +Finding an issue to contribute to +If you are brand new to pandas or open-source development, we recommend searching the GitHub “issues” tab to find issues that interest you. Unassigned issues labeled Docs and good first issue are typically good for newer contributors. + +Once you’ve found an interesting issue, it’s a good idea to assign the issue to yourself, so nobody else duplicates the work on it. On the Github issue, a comment with the exact text take to automatically assign you the issue (this will take seconds and may require refreshing the page to see it). + +If for whatever reason you are not able to continue working with the issue, please unassign it, so other people know it’s available again. You can check the list of assigned issues, since people may not be working in them anymore. If you want to work on one that is assigned, feel free to kindly ask the current assignee if you can take it (please allow at least a week of inactivity before considering work in the issue discontinued). + +We have several contributor community communication channels, which you are welcome to join, and ask questions as you figure things out. Among them are regular meetings for new contributors, dev meetings, a dev mailing list, and a Slack for the contributor community. All pandas contributors are welcome to these spaces, where they can connect with each other. Even maintainers who have been with us for a long time felt just like you when they started out, and are happy to welcome you and support you as you get to know how we work, and where things are. Take a look at the next sections to learn more. + +Submitting a pull request +Version control, Git, and GitHub +pandas is hosted on GitHub, and to contribute, you will need to sign up for a free GitHub account. We use Git for version control to allow many people to work together on the project. + +If you are new to Git, you can reference some of these resources for learning Git. Feel free to reach out to the contributor community for help if needed: + +Git documentation. + +Also, the project follows a forking workflow further described on this page whereby contributors fork the repository, make changes and then create a pull request. So please be sure to read and follow all the instructions in this guide. + +If you are new to contributing to projects through forking on GitHub, take a look at the GitHub documentation for contributing to projects. GitHub provides a quick tutorial using a test repository that may help you become more familiar with forking a repository, cloning a fork, creating a feature branch, pushing changes and making pull requests. + +Below are some useful resources for learning more about forking and pull requests on GitHub: + +the GitHub documentation for forking a repo. + +the GitHub documentation for collaborating with pull requests. + +the GitHub documentation for working with forks. + +Getting started with Git +GitHub has instructions for installing git, setting up your SSH key, and configuring git. All these steps need to be completed before you can work seamlessly between your local repository and GitHub. + +Create a fork of pandas +You will need your own copy of pandas (aka fork) to work on the code. Go to the pandas project page and hit the Fork button. Please uncheck the box to copy only the main branch before selecting Create Fork. 
You will want to clone your fork to your machine + +git clone https://github.com/your-user-name/pandas.git pandas-yourname +cd pandas-yourname +git remote add upstream https://github.com/pandas-dev/pandas.git +git fetch upstream +This creates the directory pandas-yourname and connects your repository to the upstream (main project) pandas repository. + +Note + +Performing a shallow clone (with --depth==N, for some N greater or equal to 1) might break some tests and features as pd.show_versions() as the version number cannot be computed anymore. + +Creating a feature branch +Your local main branch should always reflect the current state of pandas repository. First ensure it’s up-to-date with the main pandas repository. + +git checkout main +git pull upstream main --ff-only +Then, create a feature branch for making your changes. For example + +git checkout -b shiny-new-feature +This changes your working branch from main to the shiny-new-feature branch. Keep any changes in this branch specific to one bug or feature so it is clear what the branch brings to pandas. You can have many feature branches and switch in between them using the git checkout command. + +When you want to update the feature branch with changes in main after you created the branch, check the section on updating a PR. + +Making code changes +Before modifying any code, ensure you follow the contributing environment guidelines to set up an appropriate development environment. + +Then once you have made code changes, you can see all the changes you’ve currently made by running. + +git status +For files you intended to modify or add, run. + +git add path/to/file-to-be-added-or-changed.py +Running git status again should display + +On branch shiny-new-feature + + modified: /relative/path/to/file-to-be-added-or-changed.py +Finally, commit your changes to your local repository with an explanatory commit message + +git commit -m "your commit message goes here" +Pushing your changes +When you want your changes to appear publicly on your GitHub page, push your forked feature branch’s commits + +git push origin shiny-new-feature +Here origin is the default name given to your remote repository on GitHub. You can see the remote repositories + +git remote -v +If you added the upstream repository as described above you will see something like + +origin git@github.com:yourname/pandas.git (fetch) +origin git@github.com:yourname/pandas.git (push) +upstream git://github.com/pandas-dev/pandas.git (fetch) +upstream git://github.com/pandas-dev/pandas.git (push) +Now your code is on GitHub, but it is not yet a part of the pandas project. For that to happen, a pull request needs to be submitted on GitHub. + +Making a pull request +One you have finished your code changes, your code change will need to follow the pandas contribution guidelines to be successfully accepted. + +If everything looks good, you are ready to make a pull request. A pull request is how code from your local repository becomes available to the GitHub community to review and merged into project to appear the in the next release. To submit a pull request: + +Navigate to your repository on GitHub + +Click on the Compare & pull request button + +You can then click on Commits and Files Changed to make sure everything looks okay one last time + +Write a descriptive title that includes prefixes. pandas uses a convention for title prefixes. 
Here are some common ones along with general guidelines for when to use them: + +ENH: Enhancement, new functionality + +BUG: Bug fix + +DOC: Additions/updates to documentation + +TST: Additions/updates to tests + +BLD: Updates to the build process/scripts + +PERF: Performance improvement + +TYP: Type annotations + +CLN: Code cleanup + +Write a description of your changes in the Preview Discussion tab + +Click Send Pull Request. + +This request then goes to the repository maintainers, and they will review the code. + +Updating your pull request +Based on the review you get on your pull request, you will probably need to make some changes to the code. You can follow the code committing steps again to address any feedback and update your pull request. + +It is also important that updates in the pandas main branch are reflected in your pull request. To update your feature branch with changes in the pandas main branch, run: + +git checkout shiny-new-feature +git fetch upstream +git merge upstream/main +If there are no conflicts (or they could be fixed automatically), a file with a default commit message will open, and you can simply save and quit this file. + +If there are merge conflicts, you need to solve those conflicts. See for example at https://help.github.com/articles/resolving-a-merge-conflict-using-the-command-line/ for an explanation on how to do this. + +Once the conflicts are resolved, run: + +git add -u to stage any files you’ve updated; + +git commit to finish the merge. + +Note + +If you have uncommitted changes at the moment you want to update the branch with main, you will need to stash them prior to updating (see the stash docs). This will effectively store your changes and they can be reapplied after updating. + +After the feature branch has been update locally, you can now update your pull request by pushing to the branch on GitHub: + +git push origin shiny-new-feature +Any git push will automatically update your pull request with your branch’s changes and restart the Continuous Integration checks. + +Updating the development environment +It is important to periodically update your local main branch with updates from the pandas main branch and update your development environment to reflect any changes to the various packages that are used during development. + +If using conda, run: + +git checkout main +git fetch upstream +git merge upstream/main +conda activate pandas-dev +conda env update -f environment.yml --prune +If using pip , do: + +git checkout main +git fetch upstream +git merge upstream/main +# activate the virtual environment based on your platform +python -m pip install --upgrade -r requirements-dev.txt +Tips for a successful pull request +If you have made it to the Making a pull request phase, one of the core contributors may take a look. Please note however that a handful of people are responsible for reviewing all of the contributions, which can often lead to bottlenecks. + +To improve the chances of your pull request being reviewed, you should: + +Reference an open issue for non-trivial changes to clarify the PR’s purpose + +Ensure you have appropriate tests. These should be the first part of any PR + +Keep your pull requests as simple as possible. Larger PRs take longer to review + +Ensure that CI is in a green state. 
Reviewers may not even look otherwise + +Keep Updating your pull request, either by request or every few days \ No newline at end of file diff --git a/.cursor/rules/pandas-doc-string-guid.mdc b/.cursor/rules/pandas-doc-string-guid.mdc new file mode 100644 index 0000000000000..e5cce1964d454 --- /dev/null +++ b/.cursor/rules/pandas-doc-string-guid.mdc @@ -0,0 +1,740 @@ +--- +description: when ever docstring is neeeded +alwaysApply: false +--- +pandas docstring guide +About docstrings and standards +A Python docstring is a string used to document a Python module, class, function or method, so programmers can understand what it does without having to read the details of the implementation. + +Also, it is a common practice to generate online (html) documentation automatically from docstrings. Sphinx serves this purpose. + +The next example gives an idea of what a docstring looks like: + +def add(num1, num2): + """ + Add up two integer numbers. + + This function simply wraps the ``+`` operator, and does not + do anything interesting, except for illustrating what + the docstring of a very simple function looks like. + + Parameters + ---------- + num1 : int + First number to add. + num2 : int + Second number to add. + + Returns + ------- + int + The sum of ``num1`` and ``num2``. + + See Also + -------- + subtract : Subtract one integer from another. + + Examples + -------- + >>> add(2, 2) + 4 + >>> add(25, 0) + 25 + >>> add(10, -10) + 0 + """ + return num1 + num2 +Some standards regarding docstrings exist, which make them easier to read, and allow them be easily exported to other formats such as html or pdf. + +The first conventions every Python docstring should follow are defined in PEP-257. + +As PEP-257 is quite broad, other more specific standards also exist. In the case of pandas, the NumPy docstring convention is followed. These conventions are explained in this document: + +numpydoc docstring guide + +numpydoc is a Sphinx extension to support the NumPy docstring convention. + +The standard uses reStructuredText (reST). reStructuredText is a markup language that allows encoding styles in plain text files. Documentation about reStructuredText can be found in: + +Sphinx reStructuredText primer + +Quick reStructuredText reference + +Full reStructuredText specification + +pandas has some helpers for sharing docstrings between related classes, see Sharing docstrings. + +The rest of this document will summarize all the above guidelines, and will provide additional conventions specific to the pandas project. + +Writing a docstring +General rules +Docstrings must be defined with three double-quotes. No blank lines should be left before or after the docstring. The text starts in the next line after the opening quotes. The closing quotes have their own line (meaning that they are not at the end of the last sentence). + +On rare occasions reST styles like bold text or italics will be used in docstrings, but is it common to have inline code, which is presented between backticks. The following are considered inline code: + +The name of a parameter + +Python code, a module, function, built-in, type, literal… (e.g. os, list, numpy.abs, datetime.date, True) + +A pandas class (in the form :class:`pandas.Series`) + +A pandas method (in the form :meth:`pandas.Series.sum`) + +A pandas function (in the form :func:`pandas.to_datetime`) + +Note + +To display only the last component of the linked class, method or function, prefix it with ~. 
For example, :class:`~pandas.Series` will link to pandas.Series but only display the last part, Series as the link text. See Sphinx cross-referencing syntax for details. + +Good: + +def add_values(arr): + """ + Add the values in ``arr``. + + This is equivalent to Python ``sum`` of :meth:`pandas.Series.sum`. + + Some sections are omitted here for simplicity. + """ + return sum(arr) +Bad: + +def func(): + + """Some function. + + With several mistakes in the docstring. + + It has a blank line after the signature ``def func():``. + + The text 'Some function' should go in the line after the + opening quotes of the docstring, not in the same line. + + There is a blank line between the docstring and the first line + of code ``foo = 1``. + + The closing quotes should be in the next line, not in this one.""" + + foo = 1 + bar = 2 + return foo + bar +Section 1: short summary +The short summary is a single sentence that expresses what the function does in a concise way. + +The short summary must start with a capital letter, end with a dot, and fit in a single line. It needs to express what the object does without providing details. For functions and methods, the short summary must start with an infinitive verb. + +Good: + +def astype(dtype): + """ + Cast Series type. + + This section will provide further details. + """ + pass +Bad: + +def astype(dtype): + """ + Casts Series type. + + Verb in third-person of the present simple, should be infinitive. + """ + pass +def astype(dtype): + """ + Method to cast Series type. + + Does not start with verb. + """ + pass +def astype(dtype): + """ + Cast Series type + + Missing dot at the end. + """ + pass +def astype(dtype): + """ + Cast Series type from its current type to the new type defined in + the parameter dtype. + + Summary is too verbose and doesn't fit in a single line. + """ + pass +Section 2: extended summary +The extended summary provides details on what the function does. It should not go into the details of the parameters, or discuss implementation notes, which go in other sections. + +A blank line is left between the short summary and the extended summary. Every paragraph in the extended summary ends with a dot. + +The extended summary should provide details on why the function is useful and their use cases, if it is not too generic. + +def unstack(): + """ + Pivot a row index to columns. + + When using a MultiIndex, a level can be pivoted so each value in + the index becomes a column. This is especially useful when a subindex + is repeated for the main index, and data is easier to visualize as a + pivot table. + + The index level will be automatically removed from the index when added + as columns. + """ + pass +Section 3: parameters +The details of the parameters will be added in this section. This section has the title “Parameters”, followed by a line with a hyphen under each letter of the word “Parameters”. A blank line is left before the section title, but not after, and not between the line with the word “Parameters” and the one with the hyphens. + +After the title, each parameter in the signature must be documented, including *args and **kwargs, but not self. + +The parameters are defined by their name, followed by a space, a colon, another space, and the type (or types). Note that the space between the name and the colon is important. Types are not defined for *args and **kwargs, but must be defined for all other parameters. 
After the parameter definition, it is required to have a line with the parameter description, which is indented, and can have multiple lines. The description must start with a capital letter, and finish with a dot. + +For keyword arguments with a default value, the default will be listed after a comma at the end of the type. The exact form of the type in this case will be “int, default 0”. In some cases it may be useful to explain what the default argument means, which can be added after a comma “int, default -1, meaning all cpus”. + +In cases where the default value is None, meaning that the value will not be used. Instead of "str, default None", it is preferred to write "str, optional". When None is a value being used, we will keep the form “str, default None”. For example, in df.to_csv(compression=None), None is not a value being used, but means that compression is optional, and no compression is being used if not provided. In this case we will use "str, optional". Only in cases like func(value=None) and None is being used in the same way as 0 or foo would be used, then we will specify “str, int or None, default None”. + +Good: + +class Series: + def plot(self, kind, color='blue', **kwargs): + """ + Generate a plot. + + Render the data in the Series as a matplotlib plot of the + specified kind. + + Parameters + ---------- + kind : str + Kind of matplotlib plot. + color : str, default 'blue' + Color name or rgb code. + **kwargs + These parameters will be passed to the matplotlib plotting + function. + """ + pass +Bad: + +class Series: + def plot(self, kind, **kwargs): + """ + Generate a plot. + + Render the data in the Series as a matplotlib plot of the + specified kind. + + Note the blank line between the parameters title and the first + parameter. Also, note that after the name of the parameter ``kind`` + and before the colon, a space is missing. + + Also, note that the parameter descriptions do not start with a + capital letter, and do not finish with a dot. + + Finally, the ``**kwargs`` parameter is missing. + + Parameters + ---------- + + kind: str + kind of matplotlib plot + """ + pass +Parameter types +When specifying the parameter types, Python built-in data types can be used directly (the Python type is preferred to the more verbose string, integer, boolean, etc): + +int + +float + +str + +bool + +For complex types, define the subtypes. For dict and tuple, as more than one type is present, we use the brackets to help read the type (curly brackets for dict and normal brackets for tuple): + +list of int + +dict of {str : int} + +tuple of (str, int, int) + +tuple of (str,) + +set of str + +In case where there are just a set of values allowed, list them in curly brackets and separated by commas (followed by a space). If the values are ordinal and they have an order, list them in this order. Otherwise, list the default value first, if there is one: + +{0, 10, 25} + +{‘simple’, ‘advanced’} + +{‘low’, ‘medium’, ‘high’} + +{‘cat’, ‘dog’, ‘bird’} + +If the type is defined in a Python module, the module must be specified: + +datetime.date + +datetime.datetime + +decimal.Decimal + +If the type is in a package, the module must be also specified: + +numpy.ndarray + +scipy.sparse.coo_matrix + +If the type is a pandas type, also specify pandas except for Series and DataFrame: + +Series + +DataFrame + +pandas.Index + +pandas.Categorical + +pandas.arrays.SparseArray + +If the exact type is not relevant, but must be compatible with a NumPy array, array-like can be specified. 
If Any type that can be iterated is accepted, iterable can be used: + +array-like + +iterable + +If more than one type is accepted, separate them by commas, except the last two types, that need to be separated by the word ‘or’: + +int or float + +float, decimal.Decimal or None + +str or list of str + +If None is one of the accepted values, it always needs to be the last in the list. + +For axis, the convention is to use something like: + +axis : {0 or ‘index’, 1 or ‘columns’, None}, default None + +Section 4: returns or yields +If the method returns a value, it will be documented in this section. Also if the method yields its output. + +The title of the section will be defined in the same way as the “Parameters”. With the names “Returns” or “Yields” followed by a line with as many hyphens as the letters in the preceding word. + +The documentation of the return is also similar to the parameters. But in this case, no name will be provided, unless the method returns or yields more than one value (a tuple of values). + +The types for “Returns” and “Yields” are the same as the ones for the “Parameters”. Also, the description must finish with a dot. + +For example, with a single value: + +def sample(): + """ + Generate and return a random number. + + The value is sampled from a continuous uniform distribution between + 0 and 1. + + Returns + ------- + float + Random number generated. + """ + return np.random.random() +With more than one value: + +import string + +def random_letters(): + """ + Generate and return a sequence of random letters. + + The length of the returned string is also random, and is also + returned. + + Returns + ------- + length : int + Length of the returned string. + letters : str + String of random letters. + """ + length = np.random.randint(1, 10) + letters = ''.join(np.random.choice(string.ascii_lowercase) + for i in range(length)) + return length, letters +If the method yields its value: + +def sample_values(): + """ + Generate an infinite sequence of random numbers. + + The values are sampled from a continuous uniform distribution between + 0 and 1. + + Yields + ------ + float + Random number generated. + """ + while True: + yield np.random.random() +Section 5: see also +This section is used to let users know about pandas functionality related to the one being documented. In rare cases, if no related methods or functions can be found at all, this section can be skipped. + +An obvious example would be the head() and tail() methods. As tail() does the equivalent as head() but at the end of the Series or DataFrame instead of at the beginning, it is good to let the users know about it. 
+ +To give an intuition on what can be considered related, here there are some examples: + +loc and iloc, as they do the same, but in one case providing indices and in the other positions + +max and min, as they do the opposite + +iterrows, itertuples and items, as it is easy that a user looking for the method to iterate over columns ends up in the method to iterate over rows, and vice-versa + +fillna and dropna, as both methods are used to handle missing values + +read_csv and to_csv, as they are complementary + +merge and join, as one is a generalization of the other + +astype and pandas.to_datetime, as users may be reading the documentation of astype to know how to cast as a date, and the way to do it is with pandas.to_datetime + +where is related to numpy.where, as its functionality is based on it + +When deciding what is related, you should mainly use your common sense and think about what can be useful for the users reading the documentation, especially the less experienced ones. + +When relating to other libraries (mainly numpy), use the name of the module first (not an alias like np). If the function is in a module which is not the main one, like scipy.sparse, list the full module (e.g. scipy.sparse.coo_matrix). + +This section has a header, “See Also” (note the capital S and A), followed by the line with hyphens and preceded by a blank line. + +After the header, we will add a line for each related method or function, followed by a space, a colon, another space, and a short description that illustrates what this method or function does, why is it relevant in this context, and what the key differences are between the documented function and the one being referenced. The description must also end with a dot. + +Note that in “Returns” and “Yields”, the description is located on the line after the type. In this section, however, it is located on the same line, with a colon in between. If the description does not fit on the same line, it can continue onto other lines which must be further indented. + +For example: + +class Series: + def head(self): + """ + Return the first 5 elements of the Series. + + This function is mainly useful to preview the values of the + Series without displaying the whole of it. + + Returns + ------- + Series + Subset of the original series with the 5 first values. + + See Also + -------- + Series.tail : Return the last 5 elements of the Series. + Series.iloc : Return a slice of the elements in the Series, + which can also be used to return the first or last n. + """ + return self.iloc[:5] +Section 6: notes +This is an optional section used for notes about the implementation of the algorithm, or to document technical aspects of the function behavior. + +Feel free to skip it, unless you are familiar with the implementation of the algorithm, or you discover some counter-intuitive behavior while writing the examples for the function. + +This section follows the same format as the extended summary section. + +Section 7: examples +This is one of the most important sections of a docstring, despite being placed in the last position, as often people understand concepts better by example than through accurate explanations. + +Examples in docstrings, besides illustrating the usage of the function or method, must be valid Python code, that returns the given output in a deterministic way, and that can be copied and run by users. + +Examples are presented as a session in the Python terminal. >>> is used to present code. ... 
is used for code continuing from the previous line. Output is presented immediately after the last line of code generating the output (no blank lines in between). Comments describing the examples can be added with blank lines before and after them. + +The way to present examples is as follows: + +Import required libraries (except numpy and pandas) + +Create the data required for the example + +Show a very basic example that gives an idea of the most common use case + +Add examples with explanations that illustrate how the parameters can be used for extended functionality + +A simple example could be: + +class Series: + + def head(self, n=5): + """ + Return the first elements of the Series. + + This function is mainly useful to preview the values of the + Series without displaying all of it. + + Parameters + ---------- + n : int + Number of values to return. + + Return + ------ + pandas.Series + Subset of the original series with the n first values. + + See Also + -------- + tail : Return the last n elements of the Series. + + Examples + -------- + >>> ser = pd.Series(['Ant', 'Bear', 'Cow', 'Dog', 'Falcon', + ... 'Lion', 'Monkey', 'Rabbit', 'Zebra']) + >>> ser.head() + 0 Ant + 1 Bear + 2 Cow + 3 Dog + 4 Falcon + dtype: object + + With the ``n`` parameter, we can change the number of returned rows: + + >>> ser.head(n=3) + 0 Ant + 1 Bear + 2 Cow + dtype: object + """ + return self.iloc[:n] +The examples should be as concise as possible. In cases where the complexity of the function requires long examples, is recommended to use blocks with headers in bold. Use double star ** to make a text bold, like in **this example**. + +Conventions for the examples +Code in examples is assumed to always start with these two lines which are not shown: + +import numpy as np +import pandas as pd +Any other module used in the examples must be explicitly imported, one per line (as recommended in PEP 8#imports) and avoiding aliases. Avoid excessive imports, but if needed, imports from the standard library go first, followed by third-party libraries (like matplotlib). + +When illustrating examples with a single Series use the name ser, and if illustrating with a single DataFrame use the name df. For indices, idx is the preferred name. If a set of homogeneous Series or DataFrame is used, name them ser1, ser2, ser3… or df1, df2, df3… If the data is not homogeneous, and more than one structure is needed, name them with something meaningful, for example df_main and df_to_join. + +Data used in the example should be as compact as possible. The number of rows is recommended to be around 4, but make it a number that makes sense for the specific example. For example in the head method, it requires to be higher than 5, to show the example with the default values. If doing the mean, we could use something like [1, 2, 3], so it is easy to see that the value returned is the mean. + +For more complex examples (grouping for example), avoid using data without interpretation, like a matrix of random numbers with columns A, B, C, D… And instead use a meaningful example, which makes it easier to understand the concept. Unless required by the example, use names of animals, to keep examples consistent. And numerical properties of them. + +When calling the method, keywords arguments head(n=3) are preferred to positional arguments head(3). + +Good: + +class Series: + + def mean(self): + """ + Compute the mean of the input. 
+ + Examples + -------- + >>> ser = pd.Series([1, 2, 3]) + >>> ser.mean() + 2 + """ + pass + + + def fillna(self, value): + """ + Replace missing values by ``value``. + + Examples + -------- + >>> ser = pd.Series([1, np.nan, 3]) + >>> ser.fillna(0) + [1, 0, 3] + """ + pass + + def groupby_mean(self): + """ + Group by index and return mean. + + Examples + -------- + >>> ser = pd.Series([380., 370., 24., 26], + ... name='max_speed', + ... index=['falcon', 'falcon', 'parrot', 'parrot']) + >>> ser.groupby_mean() + index + falcon 375.0 + parrot 25.0 + Name: max_speed, dtype: float64 + """ + pass + + def contains(self, pattern, case_sensitive=True, na=numpy.nan): + """ + Return whether each value contains ``pattern``. + + In this case, we are illustrating how to use sections, even + if the example is simple enough and does not require them. + + Examples + -------- + >>> ser = pd.Series('Antelope', 'Lion', 'Zebra', np.nan) + >>> ser.contains(pattern='a') + 0 False + 1 False + 2 True + 3 NaN + dtype: bool + + **Case sensitivity** + + With ``case_sensitive`` set to ``False`` we can match ``a`` with both + ``a`` and ``A``: + + >>> s.contains(pattern='a', case_sensitive=False) + 0 True + 1 False + 2 True + 3 NaN + dtype: bool + + **Missing values** + + We can fill missing values in the output using the ``na`` parameter: + + >>> ser.contains(pattern='a', na=False) + 0 False + 1 False + 2 True + 3 False + dtype: bool + """ + pass +Bad: + +def method(foo=None, bar=None): + """ + A sample DataFrame method. + + Do not import NumPy and pandas. + + Try to use meaningful data, when it makes the example easier + to understand. + + Try to avoid positional arguments like in ``df.method(1)``. They + can be all right if previously defined with a meaningful name, + like in ``present_value(interest_rate)``, but avoid them otherwise. + + When presenting the behavior with different parameters, do not place + all the calls one next to the other. Instead, add a short sentence + explaining what the example shows. + + Examples + -------- + >>> import numpy as np + >>> import pandas as pd + >>> df = pd.DataFrame(np.random.randn(3, 3), + ... columns=('a', 'b', 'c')) + >>> df.method(1) + 21 + >>> df.method(bar=14) + 123 + """ + pass +Tips for getting your examples pass the doctests +Getting the examples pass the doctests in the validation script can sometimes be tricky. Here are some attention points: + +Import all needed libraries (except for pandas and NumPy, those are already imported as import pandas as pd and import numpy as np) and define all variables you use in the example. + +Try to avoid using random data. However random data might be OK in some cases, like if the function you are documenting deals with probability distributions, or if the amount of data needed to make the function result meaningful is too much, such that creating it manually is very cumbersome. In those cases, always use a fixed random seed to make the generated examples predictable. 
Example: + +np.random.seed(42) +df = pd.DataFrame({'normal': np.random.normal(100, 5, 20)}) +If you have a code snippet that wraps multiple lines, you need to use ‘…’ on the continued lines: + +df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=['a', 'b', 'c'], + columns=['A', 'B']) +If you want to show a case where an exception is raised, you can do: + +pd.to_datetime(["712-01-01"]) +Traceback (most recent call last): +OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 712-01-01 00:00:00 +It is essential to include the “Traceback (most recent call last):”, but for the actual error only the error name is sufficient. + +If there is a small part of the result that can vary (e.g. a hash in an object representation), you can use ... to represent this part. + +If you want to show that s.plot() returns a matplotlib AxesSubplot object, this will fail the doctest + +s.plot() + +However, you can do (notice the comment that needs to be added) + +s.plot() + +Plots in examples +There are some methods in pandas returning plots. To render the plots generated by the examples in the documentation, the .. plot:: directive exists. + +To use it, place the next code after the “Examples” header as shown below. The plot will be generated automatically when building the documentation. + +class Series: + def plot(self): + """ + Generate a plot with the ``Series`` data. + + Examples + -------- + + .. plot:: + :context: close-figs + + >>> ser = pd.Series([1, 2, 3]) + >>> ser.plot() + """ + pass +Sharing docstrings +pandas has a system for sharing docstrings, with slight variations, between classes. This helps us keep docstrings consistent, while keeping things clear for the user reading. It comes at the cost of some complexity when writing. + +Each shared docstring will have a base template with variables, like {klass}. The variables filled in later on using the doc decorator. Finally, docstrings can also be appended to with the doc decorator. + +In this example, we’ll create a parent docstring normally (this is like pandas.core.generic.NDFrame). Then we’ll have two children (like pandas.Series and pandas.DataFrame). We’ll substitute the class names in this docstring. + +class Parent: + @doc(klass="Parent") + def my_function(self): + """Apply my function to {klass}.""" + ... + + +class ChildA(Parent): + @doc(Parent.my_function, klass="ChildA") + def my_function(self): + ... + + +class ChildB(Parent): + @doc(Parent.my_function, klass="ChildB") + def my_function(self): + ... +The resulting docstrings are + +print(Parent.my_function.__doc__) +Apply my function to Parent. +print(ChildA.my_function.__doc__) +Apply my function to ChildA. +print(ChildB.my_function.__doc__) +Apply my function to ChildB. +Notice: + +We “append” the parent docstring to the children docstrings, which are initially empty. + +Our files will often contain a module-level _shared_doc_kwargs with some common substitution values (things like klass, axes, etc). + +You can substitute and append in one shot with something like + +@doc(template, **_shared_doc_kwargs) +def my_function(self): + ... +where template may come from a module-level _shared_docs dictionary mapping function names to docstrings. Wherever possible, we prefer using doc, since the docstring-writing processes is slightly closer to normal. + +See pandas.core.generic.NDFrame.fillna for an example template, and pandas.Series.fillna and pandas.core.generic.frame.fillna for the filled versions. 
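As a rough, one-shot illustration of that pattern (a minimal sketch only: it assumes the decorator is importable as pandas.util._decorators.doc, and the _shared_docs / _shared_doc_kwargs names below are hypothetical stand-ins for the module-level dictionaries described above):

from pandas.util._decorators import doc

# Hypothetical module-level template and substitution values, mirroring the
# convention described in this section.
_shared_docs = {
    "describe": """
    Summarize the contents of a {klass}.

    Returns
    -------
    {klass}
        Summary of the original {klass}.
    """,
}
_shared_doc_kwargs = {"klass": "Series"}


class MySeries:
    # Substitute the {klass} placeholder and attach the result as __doc__.
    @doc(_shared_docs["describe"], **_shared_doc_kwargs)
    def describe(self):
        ...


# The rendered docstring is the template with {klass} replaced by "Series".
print(MySeries.describe.__doc__)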
\ No newline at end of file diff --git a/.cursor/rules/pandas-documentation.mdc b/.cursor/rules/pandas-documentation.mdc new file mode 100644 index 0000000000000..bf1d3fa21fe6a --- /dev/null +++ b/.cursor/rules/pandas-documentation.mdc @@ -0,0 +1,128 @@ +--- +description: when ever task is about improving or changing or adding documentation to pandas +alwaysApply: false +--- +Contributing to the documentation +Contributing to the documentation benefits everyone who uses pandas. We encourage you to help us improve the documentation, and you don’t have to be an expert on pandas to do so! In fact, there are sections of the docs that are worse off after being written by experts. If something in the docs doesn’t make sense to you, updating the relevant section after you figure it out is a great way to ensure it will help the next person. Please visit the issues page for a full list of issues that are currently open regarding the pandas documentation. + +Documentation: + +About the pandas documentation + +Updating a pandas docstring + +How to build the pandas documentation + +Requirements + +Building the documentation + +Building main branch documentation + +Previewing changes + +About the pandas documentation +The documentation is written in reStructuredText, which is almost like writing in plain English, and built using Sphinx. The Sphinx Documentation has an excellent introduction to reST. Review the Sphinx docs to perform more complex changes to the documentation as well. + +Some other important things to know about the docs: + +The pandas documentation consists of two parts: the docstrings in the code itself and the docs in this folder doc/. + +The docstrings provide a clear explanation of the usage of the individual functions, while the documentation in this folder consists of tutorial-like overviews per topic together with some other information (what’s new, installation, etc). + +The docstrings follow a pandas convention, based on the Numpy Docstring Standard. Follow the pandas docstring guide for detailed instructions on how to write a correct docstring. + +pandas docstring guide +About docstrings and standards +Writing a docstring +Sharing docstrings +The tutorials make heavy use of the IPython directive sphinx extension. This directive lets you put code in the documentation which will be run during the doc build. For example: + +.. ipython:: python + + x = 2 + x**3 +will be rendered as: + +In [1]: x = 2 + +In [2]: x**3 +Out[2]: 8 +Almost all code examples in the docs are run (and the output saved) during the doc build. This approach means that code examples will always be up to date, but it does make the doc building a bit more complex. + +Our API documentation files in doc/source/reference house the auto-generated documentation from the docstrings. For classes, there are a few subtleties around controlling which methods and attributes have pages auto-generated. + +We have two autosummary templates for classes. + +_templates/autosummary/class.rst. Use this when you want to automatically generate a page for every public method and attribute on the class. The Attributes and Methods sections will be automatically added to the class’ rendered documentation by numpydoc. See DataFrame for an example. + +_templates/autosummary/class_without_autosummary. Use this when you want to pick a subset of methods / attributes to auto-generate pages for. When using this template, you should include an Attributes and Methods section in the class docstring. See CategoricalIndex for an example. 
+ +Every method should be included in a toctree in one of the documentation files in doc/source/reference, else Sphinx will emit a warning. + +The utility script scripts/validate_docstrings.py can be used to get a csv summary of the API documentation. And also validate common errors in the docstring of a specific class, function or method. The summary also compares the list of methods documented in the files in doc/source/reference (which is used to generate the API Reference page) and the actual public methods. This will identify methods documented in doc/source/reference that are not actually class methods, and existing methods that are not documented in doc/source/reference. + +Updating a pandas docstring +When improving a single function or method’s docstring, it is not necessarily needed to build the full documentation (see next section). However, there is a script that checks a docstring (for example for the DataFrame.mean method): + +python scripts/validate_docstrings.py pandas.DataFrame.mean +This script will indicate some formatting errors if present, and will also run and test the examples included in the docstring. Check the pandas docstring guide for a detailed guide on how to format the docstring. + +The examples in the docstring (‘doctests’) must be valid Python code, that in a deterministic way returns the presented output, and that can be copied and run by users. This can be checked with the script above, and is also tested on Travis. A failing doctest will be a blocker for merging a PR. Check the examples section in the docstring guide for some tips and tricks to get the doctests passing. + +When doing a PR with a docstring update, it is good to post the output of the validation script in a comment on github. + +How to build the pandas documentation +Requirements +First, you need to have a development environment to be able to build pandas (see the docs on creating a development environment). + +Building the documentation +So how do you build the docs? Navigate to your local doc/ directory in the console and run: + +python make.py html +Then you can find the HTML output in the folder doc/build/html/. + +The first time you build the docs, it will take quite a while because it has to run all the code examples and build all the generated docstring pages. In subsequent evocations, sphinx will try to only build the pages that have been modified. + +If you want to do a full clean build, do: + +python make.py clean +python make.py html +You can tell make.py to compile only a single section of the docs, greatly reducing the turn-around time for checking your changes. + +# omit autosummary and API section +python make.py clean +python make.py --no-api + +# compile the docs with only a single section, relative to the "source" folder. +# For example, compiling only this guide (doc/source/development/contributing.rst) +python make.py clean +python make.py --single development/contributing.rst + +# compile the reference docs for a single function +python make.py clean +python make.py --single pandas.DataFrame.join + +# compile whatsnew and API section (to resolve links in the whatsnew) +python make.py clean +python make.py --whatsnew +For comparison, a full documentation build may take 15 minutes, but a single section may take 15 seconds. Subsequent builds, which only process portions you have changed, will be faster. + +The build will automatically use the number of cores available on your machine to speed up the documentation build. 
You can override this: + +python make.py html --num-jobs 4 +Open the following file in a web browser to see the full documentation you just built: doc/build/html/index.html. + +And you'll have the satisfaction of seeing your new and improved documentation! + +Building main branch documentation +When pull requests are merged into the pandas main branch, the main parts of the documentation are also built by our continuous integration services. These docs are then hosted here; see also the Continuous Integration section. + +Previewing changes +Once the pull request is submitted, GitHub Actions will automatically build the documentation. To view the built site: + +Wait for the CI / Web and docs check to complete. + +Click Details next to it. + +From the Artifacts drop-down, click docs or website to download the site as a ZIP file. \ No newline at end of file diff --git a/.cursor/rules/pandas-maintenace.mdc b/.cursor/rules/pandas-maintenace.mdc new file mode 100644 index 0000000000000..03e5d6c2908c9 --- /dev/null +++ b/.cursor/rules/pandas-maintenace.mdc @@ -0,0 +1,333 @@ +--- +alwaysApply: true +--- +pandas maintenance +This guide is for pandas' maintainers. It may also be interesting to contributors looking to understand the pandas development process and what steps are necessary to become a maintainer. + +The main contributing guide is available at Contributing to pandas. + +Roles +pandas uses two levels of permissions: triage and core team members. + +Triage members can label and close issues and pull requests. + +Core team members can label and close issues and pull requests, and can merge pull requests. + +GitHub publishes the full list of permissions. + +Tasks +pandas is largely a volunteer project, so these tasks shouldn't be read as "expectations" of triage and maintainers. Rather, they're general descriptions of what it means to be a maintainer. + +Triage newly filed issues (see Issue triage) + +Review newly opened pull requests + +Respond to updates on existing issues and pull requests + +Drive discussion and decisions on stalled issues and pull requests + +Provide experience / wisdom on API design questions to ensure consistency and maintainability + +Project organization (run / attend developer meetings, represent pandas) + +https://matthewrocklin.com/blog/2019/05/18/maintainer may be interesting background reading. + +Issue triage +Triage is an important first step in addressing issues reported by the community, and even partial contributions are a great way to help maintain pandas. Only remove the "Needs Triage" tag once all of the steps below have been completed. + +Here's a typical workflow for triaging a newly opened issue. + +Thank the reporter for opening an issue + +The issue tracker is many people's first interaction with the pandas project itself, beyond just using the library. As such, we want it to be a welcoming, pleasant experience. + +Is the necessary information provided? + +Ideally reporters would fill out the issue template, but many don't. If crucial information (like the version of pandas they used) is missing, feel free to ask for that and label the issue with "Needs info". The report should follow the guidelines in Bug reports and enhancement requests. You may want to link to that if they didn't follow the template. + +Make sure that the title accurately reflects the issue. Edit it yourself if it's not clear. + +Is this a duplicate issue? + +We have many open issues.
If a new issue is clearly a duplicate, label the new issue as “Duplicate” and close the issue with a link to the original issue. Make sure to still thank the reporter, and encourage them to chime in on the original issue, and perhaps try to fix it. + +If the new issue provides relevant information, such as a better or slightly different example, add it to the original issue as a comment or an edit to the original post. + +Is the issue minimal and reproducible? + +For bug reports, we ask that the reporter provide a minimal reproducible example. See https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports for a good explanation. If the example is not reproducible, or if it’s clearly not minimal, feel free to ask the reporter if they can provide an example or simplify the provided one. Do acknowledge that writing minimal reproducible examples is hard work. If the reporter is struggling, you can try to write one yourself and we’ll edit the original post to include it. + +If a reproducible example can’t be provided, add the “Needs info” label. + +If a reproducible example is provided, but you see a simplification, edit the original post with your simpler reproducible example. + +If this is a regression report, post the result of a git bisect run. More info on this can be found in the Investigating regressions section. + +Ensure the issue exists on the main branch and that it has the “Needs Triage” tag until all steps have been completed. Add a comment to the issue once you have verified it exists on the main branch, so others know it has been confirmed. + +Is this a clearly defined feature request? + +Generally, pandas prefers to discuss and design new features in issues, before a pull request is made. Encourage the submitter to include a proposed API for the new feature. Having them write a full docstring is a good way to pin down specifics. + +Tag new feature requests with “Needs Discussion”, as we’ll need a discussion from several pandas maintainers before deciding whether the proposal is in scope for pandas. + +Is this a usage question? + +We prefer that usage questions are asked on StackOverflow with the pandas tag. https://stackoverflow.com/questions/tagged/pandas + +If it’s easy to answer, feel free to link to the relevant documentation section, let them know that in the future this kind of question should be on StackOverflow, and close the issue. + +What labels and milestones should I add? + +Apply the relevant labels. This is a bit of an art, and comes with experience. Look at similar issues to get a feel for how things are labeled. + +If the issue is clearly defined and the fix seems relatively straightforward, label the issue as “Good first issue”. + +If the issue is a regression report, add the “Regression” label and the next patch release milestone. + +Once you have completed the above, make sure to remove the “Needs Triage” label. + +Investigating regressions +Regressions are bugs that unintentionally break previously working code. The common way to investigate regressions is by using git bisect, which finds the first commit that introduced the bug. + +For example: a user reports that pd.Series([1, 1]).sum() returns 3 in pandas version 1.5.0 while in version 1.4.0 it returned 2. To begin, create a file t.py in your pandas directory, which contains + +import pandas as pd +assert pd.Series([1, 1]).sum() == 2 +and then run: + +git bisect start +git bisect good v1.4.0 +git bisect bad v1.5.0 +git bisect run bash -c "python -m pip install -ve . 
--no-build-isolation -Ceditable-verbose=true; python t.py" +This finds the first commit that changed the behavior. The C extensions have to be rebuilt at every step, so the search can take a while. + +Exit bisect and rebuild the current version: + +git bisect reset +python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true +Report your findings under the corresponding issue and ping the commit author to get their input. + +Note + +In the bisect run command above, commits are considered good if t.py exits with 0 and bad otherwise. When raising an exception is the desired behavior, wrap the code in an appropriate try/except statement. See GH 35685 for more examples. + +Closing issues +Be delicate here: many people interpret closing an issue as us saying that the conversation is over. It’s typically best to give the reporter some time to respond or self-close their issue if it’s determined that the behavior is not a bug, or the feature is out of scope. Sometimes reporters just go away though, and we’ll close the issue after the conversation has died. If you think an issue should be closed but are not completely sure, please apply the “closing candidate” label and wait for other maintainers to take a look. + +Reviewing pull requests +Anybody can review a pull request: regular contributors, triagers, or core-team members. But only core-team members can merge pull requests when they’re ready. + +Here are some things to check when reviewing a pull request. + +Tests should be in a sensible location: in the same file as closely related tests. + +New public APIs should be included somewhere in doc/source/reference/. + +New / changed API should use the versionadded or versionchanged directives in the docstring. + +User-facing changes should have a whatsnew in the appropriate file. + +Regression tests should reference the original GitHub issue number like # GH-1234. + +The pull request should be labeled and assigned the appropriate milestone (the next patch release for regression fixes and small bug fixes, the next minor milestone otherwise) + +Changes should comply with our Version policy. + +Backporting +pandas supports point releases (e.g. 1.4.3) that aim to: + +Fix bugs in new features introduced in the first minor version release. + +e.g. If a new feature was added in 1.4 and contains a bug, a fix can be applied in 1.4.3 + +Fix bugs that used to work in a few minor releases prior. There should be agreement between core team members that a backport is appropriate. + +e.g. If a feature worked in 1.2 and stopped working since 1.3, a fix can be applied in 1.4.3. + +Since pandas minor releases are based on GitHub branches (e.g. point release of 1.4 are based off the 1.4.x branch), “backporting” means merging a pull request fix to the main branch and correct minor branch associated with the next point release. + +By default, if a pull request is assigned to the next point release milestone within the GitHub interface, the backporting process should happen automatically by the @meeseeksdev bot once the pull request is merged. A new pull request will be made backporting the pull request to the correct version branch. Sometimes due to merge conflicts, a manual pull request will need to be made addressing the code conflict. 
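+
+When a manual backport is needed, it usually amounts to cherry-picking the merged commit onto the release branch and opening a pull request against that branch. A rough sketch, assuming a 1.4.x release branch and a hypothetical issue number and commit hash (adapt all three to the actual backport):
+
+# fetch the release branch and create a backport branch from it
+git checkout 1.4.x
+git pull upstream 1.4.x
+git checkout -b backport-gh-12345-to-1.4.x
+# -x records the original commit hash in the new commit message
+git cherry-pick -x abc1234
+git push origin backport-gh-12345-to-1.4.x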
+ +If the bot does not automatically start the backporting process, you can also write a GitHub comment in the merged pull request to trigger the backport: + +@meeseeksdev backport version-branch +This will trigger a workflow which will backport a given change to a branch (e.g. @meeseeksdev backport 1.4.x) + +Cleaning up old issues +Every open issue in pandas has a cost. Open issues make finding duplicates harder, and can make it harder to know what needs to be done in pandas. That said, closing issues isn't a goal on its own. Our goal is to make pandas the best it can be, and that's best done by ensuring that the quality of our open issues is high. + +Occasionally, bugs are fixed but the issue isn't linked to in the Pull Request. In these cases, comment that "This has been fixed, but could use a test." and label the issue as "Good First Issue" and "Needs Test". + +If an older issue doesn't follow our issue template, edit the original post to include a minimal example, the actual output, and the expected output. Uniformity in issue reports is valuable. + +If an older issue lacks a reproducible example, label it as "Needs Info" and ask the reporter to provide one (or write one yourself if possible). If one isn't provided reasonably soon, close it according to the policies in Closing issues. + +Cleaning up old pull requests +Occasionally, contributors are unable to finish off a pull request. If some time has passed (two weeks, say) since the last review requesting changes, gently ask if they're still interested in working on this. If another two weeks or so passes with no response, thank them for their work and then either: + +close the pull request; + +push to the contributor's branch to carry their work over the finish line (if you're part of pandas-core). This can be helpful for pushing an important PR across the line, or for fixing a small merge conflict. + +If closing the pull request, then please comment on the original issue that "There's a stalled PR at #1234 that may be helpful.", and perhaps label the issue as "Good first issue" if the PR was relatively close to being accepted. + +Becoming a pandas maintainer +The full process is outlined in our governance documents. In summary, we're happy to give triage permissions to anyone who shows interest by being helpful on the issue tracker. + +The required steps for adding a maintainer are: + +Contact the contributor and ask if they are interested in joining. + +Add the contributor to the appropriate GitHub Team if they accept the invitation. + +pandas-core is for core team members + +pandas-triage is for pandas triage members + +If adding to pandas-core, there are two additional steps: + +Add the contributor to the pandas Google group. + +Create a pull request to add the contributor's GitHub handle to pandas-dev/pandas/web/pandas/config.yml. + +The current list of core-team members is at pandas-dev/pandas + +Merging pull requests +Only core team members can merge pull requests. We have a few guidelines. + +You should typically not self-merge your own pull requests without approval. Exceptions include things like small changes to fix CI (e.g. pinning a package version). Self-merging with approval from other core team members is fine if the change is something you're very confident about. + +You should not merge pull requests that have an active discussion, or pull requests that have any -1 votes from a core maintainer. pandas operates by consensus. + +For larger changes, it's good to have a +1 from at least two core team members.
+ +In addition to the items listed in Closing issues, you should verify that the pull request is assigned the correct milestone. + +Pull requests merged with a patch-release milestone will typically be backported by our bot. Verify that the bot noticed the merge (it will typically leave a comment within a minute). If a manual backport is needed, please do that, and remove the "Needs backport" label once you've done it manually. If you forget to assign a milestone before tagging, you can request the bot to backport it with: + +@Meeseeksdev backport <branch> +Release process +The release process makes a snapshot of pandas (a git commit) available to users with a particular version number. After the release, the new pandas version will be available in the following places: + +Git repo with a new tag + +Source distribution in a GitHub release + +Pip packages on PyPI + +Conda packages in conda-forge + +The process for releasing a new version of pandas is detailed in the next section. + +The instructions contain <version>, which needs to be replaced with the version to be released (e.g. 1.5.2), and the branch to be released <branch>, which depends on whether the version being released is the release candidate of a new version, or any other version. Release candidates are released from main, while other versions are released from their branch (e.g. 1.5.x). + +Prerequisites +In order to be able to release a new pandas version, the following permissions are needed: + +Merge rights to the pandas and pandas-feedstock repositories. For the latter, open a PR adding your GitHub username to the conda-forge recipe. + +Permissions to push to main in the pandas repository, to push the new tags. + +Write permissions to PyPI. + +Access to our website / documentation server. Share your public key with the infrastructure committee to be added to the authorized_keys file of the main server user. + +Access to the social media accounts, to publish the announcements. + +Pre-release +Agree with the core team on the following topics: + +Release date (major/minor releases usually happen every 6 months, and patch releases monthly until x.x.5, just before the next major/minor) + +Blockers (issues and PRs that must be part of the release) + +Next version after the one being released + +Update and clean release notes for the version to be released, including: + +Set the final date of the release + +Remove any unused bullet point + +Make sure there are no formatting issues, typos, etc. + +Make sure the CI is green for the last commit of the branch being released. + +If not a release candidate, make sure all backporting pull requests to the branch being released are merged. + +Create a new issue and milestone for the version after the one being released. If the release was a release candidate, we would usually want to create issues and milestones for both the next major/minor, and the next patch release. In the milestone of a patch release, we add the description on-merge: backport to <branch>, so tagged PRs are automatically backported to the release branch by our bot (an example is shown after this list). + +Change the milestone of all issues and PRs in the milestone being released to the next milestone.
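+
+As an illustration of the on-merge description mentioned in the list above (the branch name here is hypothetical), the milestone description for a 1.5.3 patch release would contain the single line:
+
+on-merge: backport to 1.5.x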
+ +Release +Create an empty commit and a tag in the last commit of the branch to be released: + +git checkout <branch> +git pull --ff-only upstream <branch> +git clean -xdf +git commit --allow-empty --author="pandas Development Team " -m "RLS: <version>" +git tag -a v<version> -m "Version <version>" # NOTE that the tag is v1.5.2 with "v" not 1.5.2 +git push upstream --follow-tags +The docs for the new version will be built and published automatically with the docs job in the CI, which will be triggered when the tag is pushed. + +Only if the release is a release candidate do we want to create a new branch for it, immediately after creating the tag. For example, if we are releasing pandas 1.4.0rc0, we would like to create the branch 1.4.x to backport commits to the 1.4 versions, as well as create a tag to mark the start of the development of 1.5.0 (assuming it is the next version): + +git checkout -b 1.4.x +git push upstream 1.4.x +git checkout main +git commit --allow-empty -m "Start 1.5.0" +git tag -a v1.5.0.dev0 -m "DEV: Start 1.5.0" +git push upstream main --follow-tags +Download the source distribution and wheels from the wheel staging area. Be careful to make sure that no wheels are missing (e.g. due to failed builds). + +Running scripts/download_wheels.sh with the version that you want to download wheels/the sdist for should do the trick. This script will make a dist folder inside your clone of pandas and put the downloaded wheels and sdist there: + +scripts/download_wheels.sh <version> +Create a new GitHub release: + +Tag: <version> + +Title: pandas <version> + +Description: Copy the description of the last release of the same kind (release candidate, major/minor or patch release) + +Files: pandas-<version>.tar.gz source distribution just generated + +Set as a pre-release: Only check for a release candidate + +Set as the latest release: Leave checked, unless releasing a patch release for an older version (e.g. releasing 1.4.5 after 1.5 has been released) + +Upload wheels to PyPI: + +twine upload pandas/dist/pandas-*.{whl,tar.gz} --skip-existing +After some hours, the GitHub release will trigger an automated conda-forge PR. (If you don't want to wait, you can open an issue titled @conda-forge-admin, please update version to trigger the bot.) Merge it once the CI is green, and it will generate the conda-forge packages. + +In case a manual PR needs to be done, the version, sha256 and build fields are the ones that usually need to be changed. If anything else in the recipe has changed since the last release, those changes should be available in ci/meta.yaml. + +Post-Release +Update symlinks to stable documentation by logging in to our web server, and editing /var/www/html/pandas-docs/stable to point to version/<minor> for major and minor releases, or version/<minor> to point to version/<patch> for patch releases. The exact instructions are (replace the example version numbers by the appropriate ones for the version you are releasing): + +Log in to the server and use the correct user. + +cd /var/www/html/pandas-docs/ + +ln -sfn version/2.1 stable (for a major or minor release) + +ln -sfn version/2.0.3 version/2.0 (for a patch release) + +If releasing a major or minor release, open a PR in our source code to update web/pandas/versions.json, to have the desired versions in the documentation dropdown menu. + +Close the milestone and the issue for the released version. + +Create a new issue for the next release, with the estimated date of release. + +Open a PR with the placeholder for the release notes of the next version. See for example the PR for 1.5.3.
Note that the template to use depends on whether it is a major, minor or patch release. + +Announce the new release in the official channels (use previous announcements for reference): + +The pandas-dev and pydata mailing lists + +X, Mastodon, Telegram and LinkedIn + +Update these release instructions to fix anything incorrect and to reflect any changes since the last release. \ No newline at end of file diff --git a/.cursor/rules/policies.mdc b/.cursor/rules/policies.mdc new file mode 100644 index 0000000000000..9c72057d7117d --- /dev/null +++ b/.cursor/rules/policies.mdc @@ -0,0 +1,34 @@ +--- +alwaysApply: true +--- +Policies +Version policy +pandas uses a loose variant of semantic versioning (SemVer) to govern deprecations, API compatibility, and version numbering. + +A pandas release number is made up of MAJOR.MINOR.PATCH. + +API breaking changes should only occur in major releases. These changes will be documented, with clear guidance on what is changing, why it's changing, and how to migrate existing code to the new behavior. + +Whenever possible, a deprecation path will be provided rather than an outright breaking change. + +pandas will introduce deprecations in minor releases. These deprecations will preserve the existing behavior while emitting a warning that provides guidance on: + +How to achieve similar behavior if an alternative is available + +The pandas version in which the deprecation will be enforced. + +We will not introduce new deprecations in patch releases. + +Deprecations will only be enforced in major releases. For example, if a behavior is deprecated in pandas 1.2.0, it will continue to work, with a warning, for all releases in the 1.x series. The behavior will change and the deprecation will be removed in the next major release (2.0.0). + +Note + +pandas will sometimes make behavior-changing bug fixes, as part of minor or patch releases. Whether or not a change is a bug fix or an API-breaking change is a judgement call. We'll do our best, and we invite you to participate in development discussion on the issue tracker or mailing list. + +These policies do not apply to features marked as experimental in the documentation. pandas may change the behavior of experimental features at any time. + +Python support +pandas mirrors the SPEC 0 guideline for Python support. + +Security policy +To report a security vulnerability to pandas, please go to pandas-dev/pandas and see the instructions there. \ No newline at end of file diff --git a/doc/_templates/pandas_footer.html b/doc/_templates/pandas_footer.html index 6d8caa4d6c741..8d781d909a2fe 100644 --- a/doc/_templates/pandas_footer.html +++ b/doc/_templates/pandas_footer.html @@ -1,3 +1,3 @@ - + + via NumFOCUS, Inc. Hosted by OVHcloud. + diff --git a/doc/source/_static/css/pandas.css b/doc/source/_static/css/pandas.css index 1145177898737..25b8e36ed214d 100644 --- a/doc/source/_static/css/pandas.css +++ b/doc/source/_static/css/pandas.css @@ -48,5 +48,16 @@ table { } .card, .card img { - background-color: var(--pst-color-background); + background: none !important; +} + +/* Footer styling */ +.footer-items__start { + flex-direction: row; + align-items: center; + gap: 0.5rem; +} + +.footer-sponsors { + display: inline-block; } diff --git a/doc/source/conf.py b/doc/source/conf.py index f222a228531ff..8c06026df469e 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -164,7 +164,7 @@ # General information about the project.
project = "pandas" # We have our custom "pandas_footer.html" template, using copyright for the current year -copyright = f"{datetime.now().year}," +copyright = f"{datetime.now().year}, pandas" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -242,7 +242,7 @@ html_theme_options = { "external_links": [], - "footer_start": ["pandas_footer", "sphinx-version"], + "footer_start": ["copyright", "pandas_footer", "sphinx-version"], "github_url": "https://github.com/pandas-dev/pandas", "analytics": { "plausible_analytics_domain": "pandas.pydata.org", From ec306e46f747a56880eb83ac12794da599b9df68 Mon Sep 17 00:00:00 2001 From: Mohammad Reza Yoosefiha Date: Sun, 13 Jul 2025 14:59:50 +0330 Subject: [PATCH 2/6] DOC: Add documentation changes for v3.0.0 - Simplified pandas theme footer implementation by leveraging built-in templates from pydata-sphinx-theme v0.16 (:issue:`51536`) --- doc/source/whatsnew/v3.0.0.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3e58be09372a3..caa4fce037c97 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -943,6 +943,14 @@ Other - - +.. --------------------------------------------------------------------------- +.. _whatsnew_300.documentation: + +Documentation changes +~~~~~~~~~~~~~~~~~~~~~ + +- Simplified pandas theme footer implementation by leveraging built-in templates from pydata-sphinx-theme v0.16 (:issue:`51536`) + .. --------------------------------------------------------------------------- .. _whatsnew_300.contributors: From aa9896e4f021a0b5478ec6ac1d73282cd6b34abf Mon Sep 17 00:00:00 2001 From: siryoos Date: Tue, 15 Jul 2025 09:19:47 +0330 Subject: [PATCH 3/6] update the git ignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d951f3fb9cbad..bf3a4f5db21a4 100644 --- a/.gitignore +++ b/.gitignore @@ -23,7 +23,8 @@ .tags .cache/ .vscode/ - +*.cursorrules +*.cursor # Compiled source # ################### *.a From f5421a1fc10b5e822680d32b69f8ca538275b6f5 Mon Sep 17 00:00:00 2001 From: siryoos Date: Tue, 15 Jul 2025 09:22:48 +0330 Subject: [PATCH 4/6] DOC: Remove outdated contributing guidelines and documentation files - Deleted multiple obsolete markdown files related to contributing, debugging, and extending pandas. - Cleaned up the repository by removing files that are no longer relevant to the current development practices. 
--- .cursor/rules/contributing-to-codebase.mdc | 656 ---------------- .cursor/rules/copy-on-write-mechanism.mdc | 14 - .../creating-developement-environment.mdc | 212 ----- .cursor/rules/debugging-c-extentions.mdc | 46 -- .cursor/rules/extending-pandas.mdc | 349 --------- .cursor/rules/internals.mdc | 66 -- .cursor/rules/pandas-contribution.mdc | 221 ------ .cursor/rules/pandas-doc-string-guid.mdc | 740 ------------------ .cursor/rules/pandas-documentation.mdc | 128 --- .cursor/rules/pandas-maintenace.mdc | 333 -------- .cursor/rules/policies.mdc | 34 - 11 files changed, 2799 deletions(-) delete mode 100644 .cursor/rules/contributing-to-codebase.mdc delete mode 100644 .cursor/rules/copy-on-write-mechanism.mdc delete mode 100644 .cursor/rules/creating-developement-environment.mdc delete mode 100644 .cursor/rules/debugging-c-extentions.mdc delete mode 100644 .cursor/rules/extending-pandas.mdc delete mode 100644 .cursor/rules/internals.mdc delete mode 100644 .cursor/rules/pandas-contribution.mdc delete mode 100644 .cursor/rules/pandas-doc-string-guid.mdc delete mode 100644 .cursor/rules/pandas-documentation.mdc delete mode 100644 .cursor/rules/pandas-maintenace.mdc delete mode 100644 .cursor/rules/policies.mdc diff --git a/.cursor/rules/contributing-to-codebase.mdc b/.cursor/rules/contributing-to-codebase.mdc deleted file mode 100644 index 7f7dba60afa69..0000000000000 --- a/.cursor/rules/contributing-to-codebase.mdc +++ /dev/null @@ -1,656 +0,0 @@ ---- -description: whenever we are contribuing to codebase -alwaysApply: false ---- -Contributing to the code base -Table of Contents: - -Code standards - -Pre-commit - -Optional dependencies - -Backwards compatibility - -Type hints - -Style guidelines - -pandas-specific types - -Validating type hints - -Testing type hints in code using pandas - -Testing with continuous integration - -Test-driven development - -Writing tests - -Using pytest - -Test structure - -Preferred pytest idioms - -Testing a warning - -Testing an exception - -Testing involving files - -Testing involving network connectivity - -Example - -Using hypothesis - -Running the test suite - -Running the performance test suite - -Documenting your code - -Code standards -Writing good code is not just about what you write. It is also about how you write it. During Continuous Integration testing, several tools will be run to check your code for stylistic errors. Generating any warnings will cause the test to fail. Thus, good style is a requirement for submitting code to pandas. - -There are a couple of tools in pandas to help contributors verify their changes before contributing to the project - -./ci/code_checks.sh: a script validates the doctests, formatting in docstrings, and imported modules. It is possible to run the checks independently by using the parameters docstrings, code, and doctests (e.g. ./ci/code_checks.sh doctests); - -pre-commit, which we go into detail on in the next section. - -In addition, because a lot of people use our library, it is important that we do not make sudden changes to the code that could have the potential to break a lot of user code as a result, that is, we need it to be as backwards compatible as possible to avoid mass breakages. - -Pre-commit -Additionally, Continuous Integration will run code formatting checks like ruff, isort, and clang-format and more using pre-commit hooks. Any warnings from these checks will cause the Continuous Integration to fail; therefore, it is helpful to run the check yourself before submitting code. 
This can be done by installing pre-commit (which should already have happened if you followed the instructions in Setting up your development environment) and then running: - -pre-commit install -from the root of the pandas repository. Now all of the styling checks will be run each time you commit changes without your needing to run each one manually. In addition, using pre-commit will also allow you to more easily remain up-to-date with our code checks as they change. - -Note that if needed, you can skip these checks with git commit --no-verify. - -If you don’t want to use pre-commit as part of your workflow, you can still use it to run its checks with one of the following: - -pre-commit run --files -pre-commit run --from-ref=upstream/main --to-ref=HEAD --all-files -without needing to have done pre-commit install beforehand. - -Finally, we also have some slow pre-commit checks, which don’t run on each commit but which do run during continuous integration. You can trigger them manually with: - -pre-commit run --hook-stage manual --all-files -Note - -You may want to periodically run pre-commit gc, to clean up repos which are no longer used. - -Note - -If you have conflicting installations of virtualenv, then you may get an error - see here. - -Also, due to a bug in virtualenv, you may run into issues if you’re using conda. To solve this, you can downgrade virtualenv to version 20.0.33. - -Note - -If you have recently merged in main from the upstream branch, some of the dependencies used by pre-commit may have changed. Make sure to update your development environment. - -Optional dependencies -Optional dependencies (e.g. matplotlib) should be imported with the private helper pandas.compat._optional.import_optional_dependency. This ensures a consistent error message when the dependency is not met. - -All methods using an optional dependency should include a test asserting that an ImportError is raised when the optional dependency is not found. This test should be skipped if the library is present. - -All optional dependencies should be documented in Optional dependencies and the minimum required version should be set in the pandas.compat._optional.VERSIONS dict. - -Backwards compatibility -Please try to maintain backward compatibility. pandas has lots of users with lots of existing code, so don’t break it if at all possible. If you think breakage is required, clearly state why as part of the pull request. Also, be careful when changing method signatures and add deprecation warnings where needed. Also, add the deprecated sphinx directive to the deprecated functions or methods. - -If a function with the same arguments as the one being deprecated exist, you can use the pandas.util._decorators.deprecate: - -from pandas.util._decorators import deprecate - -deprecate('old_func', 'new_func', '1.1.0') -Otherwise, you need to do it manually: - -import warnings -from pandas.util._exceptions import find_stack_level - - -def old_func(): - """Summary of the function. - - .. deprecated:: 1.1.0 - Use new_func instead. - """ - warnings.warn( - 'Use new_func instead.', - FutureWarning, - stacklevel=find_stack_level(), - ) - new_func() - - -def new_func(): - pass -You’ll also need to - -Write a new test that asserts a warning is issued when calling with the deprecated argument - -Update all of pandas existing tests and code to use the new argument - -See Testing a warning for more. - -Type hints -pandas strongly encourages the use of PEP 484 style type hints. 
New development should contain type hints and pull requests to annotate existing code are accepted as well! - -Style guidelines -Type imports should follow the from typing import ... convention. Your code may be automatically re-written to use some modern constructs (e.g. using the built-in list instead of typing.List) by the pre-commit checks. - -In some cases in the code base classes may define class variables that shadow builtins. This causes an issue as described in Mypy 1775. The defensive solution here is to create an unambiguous alias of the builtin and use that without your annotation. For example, if you come across a definition like - -class SomeClass1: - str = None -The appropriate way to annotate this would be as follows - -str_type = str - -class SomeClass2: - str: str_type = None -In some cases you may be tempted to use cast from the typing module when you know better than the analyzer. This occurs particularly when using custom inference functions. For example - -from typing import cast - -from pandas.core.dtypes.common import is_number - -def cannot_infer_bad(obj: Union[str, int, float]): - - if is_number(obj): - ... - else: # Reasonably only str objects would reach this but... - obj = cast(str, obj) # Mypy complains without this! - return obj.upper() -The limitation here is that while a human can reasonably understand that is_number would catch the int and float types mypy cannot make that same inference just yet (see mypy #5206). While the above works, the use of cast is strongly discouraged. Where applicable a refactor of the code to appease static analysis is preferable - -def cannot_infer_good(obj: Union[str, int, float]): - - if isinstance(obj, str): - return obj.upper() - else: - ... -With custom types and inference this is not always possible so exceptions are made, but every effort should be exhausted to avoid cast before going down such paths. - -pandas-specific types -Commonly used types specific to pandas will appear in pandas._typing and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas. - -For example, quite a few functions in pandas accept a dtype argument. This can be expressed as a string like "object", a numpy.dtype like np.int64 or even a pandas ExtensionDtype like pd.CategoricalDtype. Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the pandas._typing module - -from pandas._typing import Dtype - -def as_type(dtype: Dtype) -> ...: - ... -This module will ultimately house types for repeatedly used concepts like “path-like”, “array-like”, “numeric”, etc… and can also hold aliases for commonly appearing parameters like axis. Development of this module is active so be sure to refer to the source for the most up to date list of available types. - -Validating type hints -pandas uses mypy and pyright to statically analyze the code base and type hints. After making any change you can ensure your type hints are consistent by running - -pre-commit run --hook-stage manual --all-files mypy -pre-commit run --hook-stage manual --all-files pyright -pre-commit run --hook-stage manual --all-files pyright_reportGeneralTypeIssues -# the following might fail if the installed pandas version does not correspond to your local git version -pre-commit run --hook-stage manual --all-files stubtest -in your python environment. 
- -Warning - -Please be aware that the above commands will use the current python environment. If your python packages are older/newer than those installed by the pandas CI, the above commands might fail. This is often the case when the mypy or numpy versions do not match. Please see how to setup the python environment or select a recently succeeded workflow, select the “Docstring validation, typing, and other manual pre-commit hooks” job, then click on “Set up Conda” and “Environment info” to see which versions the pandas CI installs. - -Testing type hints in code using pandas -Warning - -pandas is not yet a py.typed library (PEP 561)! The primary purpose of locally declaring pandas as a py.typed library is to test and improve the pandas-builtin type annotations. - -Until pandas becomes a py.typed library, it is possible to easily experiment with the type annotations shipped with pandas by creating an empty file named “py.typed” in the pandas installation folder: - -python -c "import pandas; import pathlib; (pathlib.Path(pandas.__path__[0]) / 'py.typed').touch()" -The existence of the py.typed file signals to type checkers that pandas is already a py.typed library. This makes type checkers aware of the type annotations shipped with pandas. - -Testing with continuous integration -The pandas test suite will run automatically on GitHub Actions continuous integration services, once your pull request is submitted. However, if you wish to run the test suite on a branch prior to submitting the pull request, then the continuous integration services need to be hooked to your GitHub repository. Instructions are here for GitHub Actions. - -A pull-request will be considered for merging when you have an all ‘green’ build. If any tests are failing, then you will get a red ‘X’, where you can click through to see the individual failed tests. This is an example of a green build. - -../_images/ci.png -Test-driven development -pandas is serious about testing and strongly encourages contributors to embrace test-driven development (TDD). This development process “relies on the repetition of a very short development cycle: first the developer writes an (initially failing) automated test case that defines a desired improvement or new function, then produces the minimum amount of code to pass that test.” So, before actually writing any code, you should write your tests. Often the test can be taken from the original GitHub issue. However, it is always worth considering additional use cases and writing corresponding tests. - -We use code coverage to help understand the amount of code which is covered by a test. We recommend striving to ensure code you add or change within Pandas is covered by a test. Please see our code coverage dashboard through Codecov for more information. - -Adding tests is one of the most common requests after code is pushed to pandas. Therefore, it is worth getting in the habit of writing tests ahead of time so this is never an issue. - -Writing tests -All tests should go into the tests subdirectory of the specific package. This folder contains many current examples of tests, and we suggest looking to these for inspiration. - -As a general tip, you can use the search functionality in your integrated development environment (IDE) or the git grep command in a terminal to find test files in which the method is called. If you are unsure of the best location to put your test, take your best guess, but note that reviewers may request that you move the test to a different location. 
- -To use git grep, you can run the following command in a terminal: - -git grep "function_name(" - -This will search through all files in your repository for the text function_name(. This can be a useful way to quickly locate the function in the codebase and determine the best location to add a test for it. - -Ideally, there should be one, and only one, obvious place for a test to reside. Until we reach that ideal, these are some rules of thumb for where a test should be located. - -Does your test depend only on code in pd._libs.tslibs? This test likely belongs in one of: - -tests.tslibs - -Note - -No file in tests.tslibs should import from any pandas modules outside of pd._libs.tslibs - -tests.scalar - -tests.tseries.offsets - -Does your test depend only on code in pd._libs? This test likely belongs in one of: - -tests.libs - -tests.groupby.test_libgroupby - -Is your test for an arithmetic or comparison method? This test likely belongs in one of: - -tests.arithmetic - -Note - -These are intended for tests that can be shared to test the behavior of DataFrame/Series/Index/ExtensionArray using the box_with_array fixture. - -tests.frame.test_arithmetic - -tests.series.test_arithmetic - -Is your test for a reduction method (min, max, sum, prod, …)? This test likely belongs in one of: - -tests.reductions - -Note - -These are intended for tests that can be shared to test the behavior of DataFrame/Series/Index/ExtensionArray. - -tests.frame.test_reductions - -tests.series.test_reductions - -tests.test_nanops - -Is your test for an indexing method? This is the most difficult case for deciding where a test belongs, because there are many of these tests, and many of them test more than one method (e.g. both Series.__getitem__ and Series.loc.__getitem__) - -Is the test specifically testing an Index method (e.g. Index.get_loc, Index.get_indexer)? This test likely belongs in one of: - -tests.indexes.test_indexing - -tests.indexes.fooindex.test_indexing - -Within that files there should be a method-specific test class e.g. TestGetLoc. - -In most cases, neither Series nor DataFrame objects should be needed in these tests. - -Is the test for a Series or DataFrame indexing method other than __getitem__ or __setitem__, e.g. xs, where, take, mask, lookup, or insert? This test likely belongs in one of: - -tests.frame.indexing.test_methodname - -tests.series.indexing.test_methodname - -Is the test for any of loc, iloc, at, or iat? This test likely belongs in one of: - -tests.indexing.test_loc - -tests.indexing.test_iloc - -tests.indexing.test_at - -tests.indexing.test_iat - -Within the appropriate file, test classes correspond to either types of indexers (e.g. TestLocBooleanMask) or major use cases (e.g. TestLocSetitemWithExpansion). - -See the note in section D) about tests that test multiple indexing methods. - -Is the test for Series.__getitem__, Series.__setitem__, DataFrame.__getitem__, or DataFrame.__setitem__? This test likely belongs in one of: - -tests.series.test_getitem - -tests.series.test_setitem - -tests.frame.test_getitem - -tests.frame.test_setitem - -If many cases such a test may test multiple similar methods, e.g. 
- -import pandas as pd -import pandas._testing as tm - -def test_getitem_listlike_of_ints(): - ser = pd.Series(range(5)) - - result = ser[[3, 4]] - expected = pd.Series([2, 3]) - tm.assert_series_equal(result, expected) - - result = ser.loc[[3, 4]] - tm.assert_series_equal(result, expected) -In cases like this, the test location should be based on the underlying method being tested. Or in the case of a test for a bugfix, the location of the actual bug. So in this example, we know that Series.__getitem__ calls Series.loc.__getitem__, so this is really a test for loc.__getitem__. So this test belongs in tests.indexing.test_loc. - -Is your test for a DataFrame or Series method? - -Is the method a plotting method? This test likely belongs in one of: - -tests.plotting - -Is the method an IO method? This test likely belongs in one of: - -tests.io - -Note - -This includes to_string but excludes __repr__, which is tested in tests.frame.test_repr and tests.series.test_repr. Other classes often have a test_formats file. - -Otherwise This test likely belongs in one of: - -tests.series.methods.test_mymethod - -tests.frame.methods.test_mymethod - -Note - -If a test can be shared between DataFrame/Series using the frame_or_series fixture, by convention it goes in the tests.frame file. - -Is your test for an Index method, not depending on Series/DataFrame? This test likely belongs in one of: - -tests.indexes - -Is your test for one of the pandas-provided ExtensionArrays (Categorical, DatetimeArray, TimedeltaArray, PeriodArray, IntervalArray, NumpyExtensionArray, FloatArray, BoolArray, StringArray)? This test likely belongs in one of: - -tests.arrays - -Is your test for all ExtensionArray subclasses (the “EA Interface”)? This test likely belongs in one of: - -tests.extension - -Using pytest -Test structure -pandas existing test structure is mostly class-based, meaning that you will typically find tests wrapped in a class. - -class TestReallyCoolFeature: - def test_cool_feature_aspect(self): - pass -We prefer a more functional style using the pytest framework, which offers a richer testing framework that will facilitate testing and developing. Thus, instead of writing test classes, we will write test functions like this: - -def test_really_cool_feature(): - pass -Preferred pytest idioms -Functional tests named def test_* and only take arguments that are either fixtures or parameters. - -Use a bare assert for testing scalars and truth-testing - -Use tm.assert_series_equal(result, expected) and tm.assert_frame_equal(result, expected) for comparing Series and DataFrame results respectively. - -Use @pytest.mark.parameterize when testing multiple cases. - -Use pytest.mark.xfail when a test case is expected to fail. - -Use pytest.mark.skip when a test case is never expected to pass. - -Use pytest.param when a test case needs a particular mark. - -Use @pytest.fixture if multiple tests can share a setup object. - -Warning - -Do not use pytest.xfail (which is different than pytest.mark.xfail) since it immediately stops the test and does not check if the test will fail. If this is the behavior you desire, use pytest.skip instead. - -If a test is known to fail but the manner in which it fails is not meant to be captured, use pytest.mark.xfail. It is common to use this method for a test that exhibits buggy behavior or a non-implemented feature. If the failing test has flaky behavior, use the argument strict=False. This will make it so pytest does not fail if the test happens to pass. 
Using strict=False is highly undesirable, please use it only as a last resort. - -Prefer the decorator @pytest.mark.xfail and the argument pytest.param over usage within a test so that the test is appropriately marked during the collection phase of pytest. For xfailing a test that involves multiple parameters, a fixture, or a combination of these, it is only possible to xfail during the testing phase. To do so, use the request fixture: - -def test_xfail(request): - mark = pytest.mark.xfail(raises=TypeError, reason="Indicate why here") - request.applymarker(mark) -xfail is not to be used for tests involving failure due to invalid user arguments. For these tests, we need to verify the correct exception type and error message is being raised, using pytest.raises instead. - -Testing a warning -Use tm.assert_produces_warning as a context manager to check that a block of code raises a warning and specify the warning message using the match argument. - -with tm.assert_produces_warning(DeprecationWarning, match="the warning message"): - pd.deprecated_function() -If a warning should specifically not happen in a block of code, pass False into the context manager. - -with tm.assert_produces_warning(False): - pd.no_warning_function() -If you have a test that would emit a warning, but you aren’t actually testing the warning itself (say because it’s going to be removed in the future, or because we’re matching a 3rd-party library’s behavior), then use pytest.mark.filterwarnings to ignore the error. - -@pytest.mark.filterwarnings("ignore:msg:category") -def test_thing(self): - pass -Testing an exception -Use pytest.raises as a context manager with the specific exception subclass (i.e. never use Exception) and the exception message in match. - -with pytest.raises(ValueError, match="an error"): - raise ValueError("an error") -Testing involving files -The temp_file pytest fixture creates a temporary file Pathlib object for testing: - -def test_something(temp_file): - pd.DataFrame([1]).to_csv(str(temp_file)) -Please reference pytest’s documentation for the file retention policy. - -Testing involving network connectivity -A unit test should not access a public data set over the internet due to flakiness of network connections and lack of ownership of the server that is being connected to. To mock this interaction, use the httpserver fixture from the pytest-localserver plugin. with synthetic data. - -@pytest.mark.network -@pytest.mark.single_cpu -def test_network(httpserver): - httpserver.serve_content(content="content") - result = pd.read_html(httpserver.url) -Example -Here is an example of a self-contained set of tests in a file pandas/tests/test_cool_feature.py that illustrate multiple features that we like to use. Please remember to add the GitHub Issue Number as a comment to a new test. 
- -import pytest -import numpy as np -import pandas as pd - - -@pytest.mark.parametrize('dtype', ['int8', 'int16', 'int32', 'int64']) -def test_dtypes(dtype): - assert str(np.dtype(dtype)) == dtype - - -@pytest.mark.parametrize( - 'dtype', ['float32', pytest.param('int16', marks=pytest.mark.skip), - pytest.param('int32', marks=pytest.mark.xfail( - reason='to show how it works'))]) -def test_mark(dtype): - assert str(np.dtype(dtype)) == 'float32' - - -@pytest.fixture -def series(): - return pd.Series([1, 2, 3]) - - -@pytest.fixture(params=['int8', 'int16', 'int32', 'int64']) -def dtype(request): - return request.param - - -def test_series(series, dtype): - # GH - result = series.astype(dtype) - assert result.dtype == dtype - - expected = pd.Series([1, 2, 3], dtype=dtype) - tm.assert_series_equal(result, expected) -A test run of this yields - -((pandas) bash-3.2$ pytest test_cool_feature.py -v -=========================== test session starts =========================== -platform darwin -- Python 3.6.2, pytest-3.6.0, py-1.4.31, pluggy-0.4.0 -collected 11 items - -tester.py::test_dtypes[int8] PASSED -tester.py::test_dtypes[int16] PASSED -tester.py::test_dtypes[int32] PASSED -tester.py::test_dtypes[int64] PASSED -tester.py::test_mark[float32] PASSED -tester.py::test_mark[int16] SKIPPED -tester.py::test_mark[int32] xfail -tester.py::test_series[int8] PASSED -tester.py::test_series[int16] PASSED -tester.py::test_series[int32] PASSED -tester.py::test_series[int64] PASSED -Tests that we have parametrized are now accessible via the test name, for example we could run these with -k int8 to sub-select only those tests which match int8. - -((pandas) bash-3.2$ pytest test_cool_feature.py -v -k int8 -=========================== test session starts =========================== -platform darwin -- Python 3.6.2, pytest-3.6.0, py-1.4.31, pluggy-0.4.0 -collected 11 items - -test_cool_feature.py::test_dtypes[int8] PASSED -test_cool_feature.py::test_series[int8] PASSED -Using hypothesis -Hypothesis is a library for property-based testing. Instead of explicitly parametrizing a test, you can describe all valid inputs and let Hypothesis try to find a failing input. Even better, no matter how many random examples it tries, Hypothesis always reports a single minimal counterexample to your assertions - often an example that you would never have thought to test. - -See Getting Started with Hypothesis for more of an introduction, then refer to the Hypothesis documentation for details. - -import json -from hypothesis import given, strategies as st - -any_json_value = st.deferred(lambda: st.one_of( - st.none(), st.booleans(), st.floats(allow_nan=False), st.text(), - st.lists(any_json_value), st.dictionaries(st.text(), any_json_value) -)) - - -@given(value=any_json_value) -def test_json_roundtrip(value): - result = json.loads(json.dumps(value)) - assert value == result -This test shows off several useful features of Hypothesis, as well as demonstrating a good use-case: checking properties that should hold over a large or complicated domain of inputs. - -To keep the pandas test suite running quickly, parametrized tests are preferred if the inputs or logic are simple, with Hypothesis tests reserved for cases with complex logic or where there are too many combinations of options or subtle interactions to test (or think of!) all of them. 
- -Running the test suite -The tests can then be run directly inside your Git clone (without having to install pandas) by typing: - -pytest pandas -Note - -If a handful of tests don’t pass, it may not be an issue with your pandas installation. Some tests (e.g. some SQLAlchemy ones) require additional setup, others might start failing because a non-pinned library released a new version, and others might be flaky if run in parallel. As long as you can import pandas from your locally built version, your installation is probably fine and you can start contributing! - -Often it is worth running only a subset of tests first around your changes before running the entire suite. - -The easiest way to do this is with: - -pytest pandas/path/to/test.py -k regex_matching_test_name -Or with one of the following constructs: - -pytest pandas/tests/[test-module].py -pytest pandas/tests/[test-module].py::[TestClass] -pytest pandas/tests/[test-module].py::[TestClass]::[test_method] -Using pytest-xdist, which is included in our ‘pandas-dev’ environment, one can speed up local testing on multicore machines. The -n number flag then can be specified when running pytest to parallelize a test run across the number of specified cores or auto to utilize all the available cores on your machine. - -# Utilize 4 cores -pytest -n 4 pandas - -# Utilizes all available cores -pytest -n auto pandas -If you’d like to speed things along further a more advanced use of this command would look like this - -pytest pandas -n 4 -m "not slow and not network and not db and not single_cpu" -r sxX -In addition to the multithreaded performance increase this improves test speed by skipping some tests using the -m mark flag: - -slow: any test taking long (think seconds rather than milliseconds) - -network: tests requiring network connectivity - -db: tests requiring a database (mysql or postgres) - -single_cpu: tests that should run on a single cpu only - -You might want to enable the following option if it’s relevant for you: - -arm_slow: any test taking long on arm64 architecture - -These markers are defined in this toml file , under [tool.pytest.ini_options] in a list called markers, in case you want to check if new ones have been created which are of interest to you. - -The -r report flag will display a short summary info (see pytest documentation) . Here we are displaying the number of: - -s: skipped tests - -x: xfailed tests - -X: xpassed tests - -The summary is optional and can be removed if you don’t need the added information. Using the parallelization option can significantly reduce the time it takes to locally run tests before submitting a pull request. - -If you require assistance with the results, which has happened in the past, please set a seed before running the command and opening a bug report, that way we can reproduce it. Here’s an example for setting a seed on windows - -set PYTHONHASHSEED=314159265 -pytest pandas -n 4 -m "not slow and not network and not db and not single_cpu" -r sxX -On Unix use - -export PYTHONHASHSEED=314159265 -pytest pandas -n 4 -m "not slow and not network and not db and not single_cpu" -r sxX -For more, see the pytest documentation. - -Furthermore one can run - -pd.test() -with an imported pandas to run tests similarly. - -Running the performance test suite -Performance matters and it is worth considering whether your code has introduced performance regressions. pandas is in the process of migrating to asv benchmarks to enable easy monitoring of the performance of critical pandas operations. 
These benchmarks are all found in the pandas/asv_bench directory, and the test results can be found here. - -To use all features of asv, you will need either conda or virtualenv. For more details please check the asv installation webpage. - -To install asv: - -pip install git+https://github.com/airspeed-velocity/asv -If you need to run a benchmark, change your directory to asv_bench/ and run: - -asv continuous -f 1.1 upstream/main HEAD -You can replace HEAD with the name of the branch you are working on, and report benchmarks that changed by more than 10%. The command uses conda by default for creating the benchmark environments. If you want to use virtualenv instead, write: - -asv continuous -f 1.1 -E virtualenv upstream/main HEAD -The -E virtualenv option should be added to all asv commands that run benchmarks. The default value is defined in asv.conf.json. - -Running the full benchmark suite can be an all-day process, depending on your hardware and its resource utilization. However, usually it is sufficient to paste only a subset of the results into the pull request to show that the committed changes do not cause unexpected performance regressions. You can run specific benchmarks using the -b flag, which takes a regular expression. For example, this will only run benchmarks from a pandas/asv_bench/benchmarks/groupby.py file: - -asv continuous -f 1.1 upstream/main HEAD -b ^groupby -If you want to only run a specific group of benchmarks from a file, you can do it using . as a separator. For example: - -asv continuous -f 1.1 upstream/main HEAD -b groupby.GroupByMethods -will only run the GroupByMethods benchmark defined in groupby.py. - -You can also run the benchmark suite using the version of pandas already installed in your current Python environment. This can be useful if you do not have virtualenv or conda, or are using the setup.py develop approach discussed above; for the in-place build you need to set PYTHONPATH, e.g. PYTHONPATH="$PWD/.." asv [remaining arguments]. You can run benchmarks using an existing Python environment by: - -asv run -e -E existing -or, to use a specific Python interpreter,: - -asv run -e -E existing:python3.6 -This will display stderr from the benchmarks, and use your local python that comes from your $PATH. - -Information on how to write a benchmark and how to use asv can be found in the asv documentation. - -Documenting your code -Changes should be reflected in the release notes located in doc/source/whatsnew/vx.y.z.rst. This file contains an ongoing change log for each release. Add an entry to this file to document your fix, enhancement or (unavoidable) breaking change. Make sure to include the GitHub issue number when adding your entry (using :issue:`1234` where 1234 is the issue/pull request number). Your entry should be written using full sentences and proper grammar. - -When mentioning parts of the API, use a Sphinx :func:, :meth:, or :class: directive as appropriate. Not all public API functions and methods have a documentation page; ideally links would only be added if they resolve. You can usually find similar examples by checking the release notes for one of the previous versions. - -If your code is a bugfix, add your entry to the relevant bugfix section. Avoid adding to the Other section; only in rare cases should entries go there. Being as concise as possible, the description of the bug should include how the user may encounter it and an indication of the bug itself, e.g. “produces incorrect results” or “incorrectly raises”. 
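For instance, a bugfix entry might read as follows (the method, wording and issue number are purely illustrative):

- Bug in :meth:`DataFrame.groupby` incorrectly raising ``TypeError`` when grouping on a categorical column (:issue:`12345`)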
It may be necessary to also indicate the new behavior. - -If your code is an enhancement, it is most likely necessary to add usage examples to the existing documentation. This can be done following the section regarding documentation. Further, to let users know when this feature was added, the versionadded directive is used. The sphinx syntax for that is: - -.. versionadded:: 2.1.0 -This will put the text New in version 2.1.0 wherever you put the sphinx directive. This should also be put in the docstring when adding a new function or method (example) or a new keyword argument (example). \ No newline at end of file diff --git a/.cursor/rules/copy-on-write-mechanism.mdc b/.cursor/rules/copy-on-write-mechanism.mdc deleted file mode 100644 index 0c1d84de14589..0000000000000 --- a/.cursor/rules/copy-on-write-mechanism.mdc +++ /dev/null @@ -1,14 +0,0 @@ ---- -alwaysApply: true ---- -Copy on write -Copy on Write is a mechanism to simplify the indexing API and improve performance through avoiding copies if possible. CoW means that any DataFrame or Series derived from another in any way always behaves as a copy. An explanation on how to use Copy on Write efficiently can be found here. - -Reference tracking -To be able to determine if we have to make a copy when writing into a DataFrame, we have to be aware if the values are shared with another DataFrame. pandas keeps track of all Blocks that share values with another block internally to be able to tell when a copy needs to be triggered. The reference tracking mechanism is implemented on the Block level. - -We use a custom reference tracker object, BlockValuesRefs, that keeps track of every block, whose values share memory with each other. The reference is held through a weak-reference. Every pair of blocks that share some memory should point to the same BlockValuesRefs object. If one block goes out of scope, the reference to this block dies. As a consequence, the reference tracker object always knows how many blocks are alive and share memory. - -Whenever a DataFrame or Series object is sharing data with another object, it is required that each of those objects have its own BlockManager and Block objects. Thus, in other words, one Block instance (that is held by a DataFrame, not necessarily for intermediate objects) should always be uniquely used for only a single DataFrame/Series object. For example, when you want to use the same Block for another object, you can create a shallow copy of the Block instance with block.copy(deep=False) (which will create a new Block instance with the same underlying values and which will correctly set up the references). - -We can ask the reference tracking object if there is another block alive that shares data with us before writing into the values. We can trigger a copy before writing if there is in fact another block alive. \ No newline at end of file diff --git a/.cursor/rules/creating-developement-environment.mdc b/.cursor/rules/creating-developement-environment.mdc deleted file mode 100644 index d901fadff3962..0000000000000 --- a/.cursor/rules/creating-developement-environment.mdc +++ /dev/null @@ -1,212 +0,0 @@ ---- -description: when ever agent needs Creating a development environment -alwaysApply: false ---- -Creating a development environment -To test out code changes, you’ll need to build pandas from source, which requires a C/C++ compiler and Python environment. 
If you’re making documentation changes, you can skip to contributing to the documentation but if you skip creating the development environment you won’t be able to build the documentation locally before pushing your changes. It’s recommended to also install the pre-commit hooks. - -Step 1: install a C compiler -How to do this will depend on your platform. If you choose to use Docker or GitPod in the next step, then you can skip this step. - -Windows - -You will need Build Tools for Visual Studio 2022. - -Note - -You DO NOT need to install Visual Studio 2022. You only need “Build Tools for Visual Studio 2022” found by scrolling down to “All downloads” -> “Tools for Visual Studio”. In the installer, select the “Desktop development with C++” Workloads. - -If you encounter an error indicating cl.exe is not found when building with Meson, reopen the installer and also select the optional component MSVC v142 - VS 2019 C++ x64/x86 build tools in the right pane for installation. - -Alternatively, you can install the necessary components on the commandline using vs_BuildTools.exe - -Alternatively, you could use the WSL and consult the Linux instructions below. - -macOS - -To use the conda-based compilers, you will need to install the Developer Tools using xcode-select --install. - -If you prefer to use a different compiler, general information can be found here: https://devguide.python.org/setup/#macos - -Linux - -For Linux-based conda installations, you won’t have to install any additional components outside of the conda environment. The instructions below are only needed if your setup isn’t based on conda environments. - -Some Linux distributions will come with a pre-installed C compiler. To find out which compilers (and versions) are installed on your system: - -# for Debian/Ubuntu: -dpkg --list | grep compiler -# for Red Hat/RHEL/CentOS/Fedora: -yum list installed | grep -i --color compiler -GCC (GNU Compiler Collection), is a widely used compiler, which supports C and a number of other languages. If GCC is listed as an installed compiler nothing more is required. - -If no C compiler is installed, or you wish to upgrade, or you’re using a different Linux distribution, consult your favorite search engine for compiler installation/update instructions. - -Let us know if you have any difficulties by opening an issue or reaching out on our contributor community Slack. - -Step 2: create an isolated environment -Before we begin, please: - -Make sure that you have cloned the repository - -cd to the pandas source directory you just created with the clone command - -Option 1: using conda (recommended) -Install miniforge to get conda - -Create and activate the pandas-dev conda environment using the following commands: - -conda env create --file environment.yml -conda activate pandas-dev -Option 2: using pip -You’ll need to have at least the minimum Python version that pandas supports. You also need to have setuptools 51.0.0 or later to build pandas. - -Unix/macOS with virtualenv - -# Create a virtual environment -# Use an ENV_DIR of your choice. We'll use ~/virtualenvs/pandas-dev -# Any parent directories should already exist -python3 -m venv ~/virtualenvs/pandas-dev - -# Activate the virtualenv -. ~/virtualenvs/pandas-dev/bin/activate - -# Install the build dependencies -python -m pip install -r requirements-dev.txt -Unix/macOS with pyenv - -Consult the docs for setting up pyenv here. - -# Create a virtual environment -# Use an ENV_DIR of your choice. 
We'll use ~/Users//.pyenv/versions/pandas-dev -pyenv virtualenv - -# For instance: -pyenv virtualenv 3.10 pandas-dev - -# Activate the virtualenv -pyenv activate pandas-dev - -# Now install the build dependencies in the cloned pandas repo -python -m pip install -r requirements-dev.txt -Windows - -Below is a brief overview on how to set-up a virtual environment with Powershell under Windows. For details please refer to the official virtualenv user guide. - -Use an ENV_DIR of your choice. We’ll use ~\\virtualenvs\\pandas-dev where ~ is the folder pointed to by either $env:USERPROFILE (Powershell) or %USERPROFILE% (cmd.exe) environment variable. Any parent directories should already exist. - -# Create a virtual environment -python -m venv $env:USERPROFILE\virtualenvs\pandas-dev - -# Activate the virtualenv. Use activate.bat for cmd.exe -~\virtualenvs\pandas-dev\Scripts\Activate.ps1 - -# Install the build dependencies -python -m pip install -r requirements-dev.txt -Option 3: using Docker -pandas provides a DockerFile in the root directory to build a Docker image with a full pandas development environment. - -Docker Commands - -Build the Docker image: - -# Build the image -docker build -t pandas-dev . -Run Container: - -# Run a container and bind your local repo to the container -# This command assumes you are running from your local repo -# but if not alter ${PWD} to match your local repo path -docker run -it --rm -v ${PWD}:/home/pandas pandas-dev -Even easier, you can integrate Docker with the following IDEs: - -Visual Studio Code - -You can use the DockerFile to launch a remote session with Visual Studio Code, a popular free IDE, using the .devcontainer.json file. See https://code.visualstudio.com/docs/remote/containers for details. - -PyCharm (Professional) - -Enable Docker support and use the Services tool window to build and manage images as well as run and interact with containers. See https://www.jetbrains.com/help/pycharm/docker.html for details. - -Option 4: using Gitpod -Gitpod is an open-source platform that automatically creates the correct development environment right in your browser, reducing the need to install local development environments and deal with incompatible dependencies. - -If you are a Windows user, unfamiliar with using the command line or building pandas for the first time, it is often faster to build with Gitpod. Here are the in-depth instructions for building pandas with GitPod. - -Step 3: build and install pandas -There are currently two supported ways of building pandas, pip/meson and setuptools(setup.py). Historically, pandas has only supported using setuptools to build pandas. However, this method requires a lot of convoluted code in setup.py and also has many issues in compiling pandas in parallel due to limitations in setuptools. - -The newer build system, invokes the meson backend through pip (via a PEP 517 build). It automatically uses all available cores on your CPU, and also avoids the need for manual rebuilds by rebuilding automatically whenever pandas is imported (with an editable install). - -For these reasons, you should compile pandas with meson. Because the meson build system is newer, you may find bugs/minor issues as it matures. You can report these bugs here. 
- -To compile pandas with meson, run: - -# Build and install pandas -# By default, this will print verbose output -# showing the "rebuild" taking place on import (see section below for explanation) -# If you do not want to see this, omit everything after --no-build-isolation -python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true -Note - -The version number is pulled from the latest repository tag. Be sure to fetch the latest tags from upstream before building: - -# set the upstream repository, if not done already, and fetch the latest tags -git remote add upstream https://github.com/pandas-dev/pandas.git -git fetch upstream --tags -Build options - -It is possible to pass options from the pip frontend to the meson backend if you would like to configure your install. Occasionally, you’ll want to use this to adjust the build directory, and/or toggle debug/optimization levels. - -You can pass a build directory to pandas by appending -Cbuilddir="your builddir here" to your pip command. This option allows you to configure where meson stores your built C extensions, and allows for fast rebuilds. - -Sometimes, it might be useful to compile pandas with debugging symbols, when debugging C extensions. Appending -Csetup-args="-Ddebug=true" will do the trick. - -With pip, it is possible to chain together multiple config settings. For example, specifying both a build directory and building with debug symbols would look like -Cbuilddir="your builddir here" -Csetup-args="-Dbuildtype=debug". - -Compiling pandas with setup.py - -Note - -This method of compiling pandas will be deprecated and removed very soon, as the meson backend matures. - -To compile pandas with setuptools, run: - -python setup.py develop -Note - -If pandas is already installed (via meson), you have to uninstall it first: - -python -m pip uninstall pandas -This is because python setup.py develop will not uninstall the loader script that meson-python uses to import the extension from the build folder, which may cause errors such as an FileNotFoundError to be raised. - -Note - -You will need to repeat this step each time the C extensions change, for example if you modified any file in pandas/_libs or if you did a fetch and merge from upstream/main. - -Checking the build - -At this point you should be able to import pandas from your locally built version: - -$ python ->>> import pandas ->>> print(pandas.__version__) # note: the exact output may differ -2.0.0.dev0+880.g2b9e661fbb.dirty -At this point you may want to try running the test suite. - -Keeping up to date with the latest build - -When building pandas with meson, importing pandas will automatically trigger a rebuild, even when C/Cython files are modified. By default, no output will be produced by this rebuild (the import will just take longer). If you would like to see meson’s output when importing pandas, you can set the environment variable MESONPY_EDITABLE_VERBOSE. For example, this would be: - -# On Linux/macOS -MESONPY_EDITABLE_VERBOSE=1 python - -# Windows -set MESONPY_EDITABLE_VERBOSE=1 # Only need to set this once per session -python -If you would like to see this verbose output every time, you can set the editable-verbose config setting to true like so: - -python -m pip install -ve . -Ceditable-verbose=true -Tip - -If you ever find yourself wondering whether setuptools or meson was used to build your pandas, you can check the value of pandas._built_with_meson, which will be true if meson was used to compile pandas. 
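For example, in an interactive session (the exact output will depend on how your local copy was built):

$ python
>>> import pandas
>>> pandas._built_with_meson
True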
\ No newline at end of file diff --git a/.cursor/rules/debugging-c-extentions.mdc b/.cursor/rules/debugging-c-extentions.mdc deleted file mode 100644 index 52c894a405346..0000000000000 --- a/.cursor/rules/debugging-c-extentions.mdc +++ /dev/null @@ -1,46 +0,0 @@ ---- -description: when ever want to debug c extention -alwaysApply: false ---- -Debugging C extensions -pandas uses Cython and C/C++ extension modules to optimize performance. Unfortunately, the standard Python debugger does not allow you to step into these extensions. Cython extensions can be debugged with the Cython debugger and C/C++ extensions can be debugged using the tools shipped with your platform’s compiler. - -For Python developers with limited or no C/C++ experience this can seem a daunting task. Core developer Will Ayd has written a 3 part blog series to help guide you from the standard Python debugger into these other tools: - -Fundamental Python Debugging Part 1 - Python - -Fundamental Python Debugging Part 2 - Python Extensions - -Fundamental Python Debugging Part 3 - Cython Extensions - -Debugging locally -By default building pandas from source will generate a release build. To generate a development build you can type: - -pip install -ve . --no-build-isolation -Cbuilddir="debug" -Csetup-args="-Dbuildtype=debug" -Note - -conda environments update CFLAGS/CPPFLAGS with flags that are geared towards generating releases, and may work counter towards usage in a development environment. If using conda, you should unset these environment variables via export CFLAGS= and export CPPFLAGS= - -By specifying builddir="debug" all of the targets will be built and placed in the debug directory relative to the project root. This helps to keep your debug and release artifacts separate; you are of course able to choose a different directory name or omit altogether if you do not care to separate build types. - -Using Docker -To simplify the debugging process, pandas has created a Docker image with a debug build of Python and the gdb/Cython debuggers pre-installed. You may either docker pull pandas/pandas-debug to get access to this image or build it from the tooling/debug folder locally. - -You can then mount your pandas repository into this image via: - -docker run --rm -it -w /data -v ${PWD}:/data pandas/pandas-debug -Inside the image, you can use meson to build/install pandas and place the build artifacts into a debug folder using a command as follows: - -python -m pip install -ve . --no-build-isolation -Cbuilddir="debug" -Csetup-args="-Dbuildtype=debug" -If planning to use cygdb, the files required by that application are placed within the build folder. So you have to first cd to the build folder, then start that application. - -cd debug -cygdb -Within the debugger you can use cygdb commands to navigate cython extensions. - -Editor support -The meson build system generates a compilation database automatically and places it in the build directory. Many language servers and IDEs can use this information to provide code-completion, go-to-definition and error checking support as you type. - -How each language server / IDE chooses to look for the compilation database may vary. When in doubt you may want to create a symlink at the root of the project that points to the compilation database in your build directory. Assuming you used debug as your directory name, you can run: - -ln -s debug/compile_commands.json . 
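Putting the pieces together, a minimal gdb session against a debug build might look like this (the reproducer script name is a placeholder, not part of the official workflow):

# run the interpreter under gdb with a small script that triggers the code path
gdb --args python repro.py
(gdb) run
(gdb) backtrace        # inspect the C-level stack after a crash or Ctrl-C

For Cython sources, cygdb (described above) is usually more convenient than raw gdb, since it can map the generated C code back to the original .pyx files.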
\ No newline at end of file diff --git a/.cursor/rules/extending-pandas.mdc b/.cursor/rules/extending-pandas.mdc deleted file mode 100644 index d86de70b1b390..0000000000000 --- a/.cursor/rules/extending-pandas.mdc +++ /dev/null @@ -1,349 +0,0 @@ ---- -description: whenever you want to extend the current library or codebase -alwaysApply: false ---- -Extending pandas -While pandas provides a rich set of methods, containers, and data types, your needs may not be fully satisfied. pandas offers a few options for extending pandas. - -Registering custom accessors -Libraries can use the decorators pandas.api.extensions.register_dataframe_accessor(), pandas.api.extensions.register_series_accessor(), and pandas.api.extensions.register_index_accessor(), to add additional “namespaces” to pandas objects. All of these follow a similar convention: you decorate a class, providing the name of attribute to add. The class’s __init__ method gets the object being decorated. For example: - -@pd.api.extensions.register_dataframe_accessor("geo") -class GeoAccessor: - def __init__(self, pandas_obj): - self._validate(pandas_obj) - self._obj = pandas_obj - - @staticmethod - def _validate(obj): - # verify there is a column latitude and a column longitude - if "latitude" not in obj.columns or "longitude" not in obj.columns: - raise AttributeError("Must have 'latitude' and 'longitude'.") - - @property - def center(self): - # return the geographic center point of this DataFrame - lat = self._obj.latitude - lon = self._obj.longitude - return (float(lon.mean()), float(lat.mean())) - - def plot(self): - # plot this array's data on a map, e.g., using Cartopy - pass -Now users can access your methods using the geo namespace: - -ds = pd.DataFrame( - {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)} -) -ds.geo.center -(5.0, 10.0) -ds.geo.plot() -# plots data on a map -This can be a convenient way to extend pandas objects without subclassing them. If you write a custom accessor, make a pull request adding it to our ecosystem page. - -We highly recommend validating the data in your accessor’s __init__. In our GeoAccessor, we validate that the data contains the expected columns, raising an AttributeError when the validation fails. For a Series accessor, you should validate the dtype if the accessor applies only to certain dtypes. - -Extension types -Note - -The pandas.api.extensions.ExtensionDtype and pandas.api.extensions.ExtensionArray APIs were experimental prior to pandas 1.5. Starting with version 1.5, future changes will follow the pandas deprecation policy. - -pandas defines an interface for implementing data types and arrays that extend NumPy’s type system. pandas itself uses the extension system for some types that aren’t built into NumPy (categorical, period, interval, datetime with timezone). - -Libraries can define a custom array and data type. When pandas encounters these objects, they will be handled properly (i.e. not converted to an ndarray of objects). Many methods like pandas.isna() will dispatch to the extension type’s implementation. - -If you’re building a library that implements the interface, please publicize it on the ecosystem page. - -The interface consists of two classes. - -ExtensionDtype -A pandas.api.extensions.ExtensionDtype is similar to a numpy.dtype object. It describes the data type. Implementers are responsible for a few unique items like the name. - -One particularly important item is the type property. This should be the class that is the scalar type for your data. 
For example, if you were writing an extension array for IP Address data, this might be ipaddress.IPv4Address. - -See the extension dtype source for interface definition. - -pandas.api.extensions.ExtensionDtype can be registered to pandas to allow creation via a string dtype name. This allows one to instantiate Series and .astype() with a registered string name, for example 'category' is a registered string accessor for the CategoricalDtype. - -See the extension dtype dtypes for more on how to register dtypes. - -ExtensionArray -This class provides all the array-like functionality. ExtensionArrays are limited to 1 dimension. An ExtensionArray is linked to an ExtensionDtype via the dtype attribute. - -pandas makes no restrictions on how an extension array is created via its __new__ or __init__, and puts no restrictions on how you store your data. We do require that your array be convertible to a NumPy array, even if this is relatively expensive (as it is for Categorical). - -They may be backed by none, one, or many NumPy arrays. For example, pandas.Categorical is an extension array backed by two arrays, one for codes and one for categories. An array of IPv6 addresses may be backed by a NumPy structured array with two fields, one for the lower 64 bits and one for the upper 64 bits. Or they may be backed by some other storage type, like Python lists. - -See the extension array source for the interface definition. The docstrings and comments contain guidance for properly implementing the interface. - -ExtensionArray operator support -By default, there are no operators defined for the class ExtensionArray. There are two approaches for providing operator support for your ExtensionArray: - -Define each of the operators on your ExtensionArray subclass. - -Use an operator implementation from pandas that depends on operators that are already defined on the underlying elements (scalars) of the ExtensionArray. - -Note - -Regardless of the approach, you may want to set __array_priority__ if you want your implementation to be called when involved in binary operations with NumPy arrays. - -For the first approach, you define selected operators, e.g., __add__, __le__, etc. that you want your ExtensionArray subclass to support. - -The second approach assumes that the underlying elements (i.e., scalar type) of the ExtensionArray have the individual operators already defined. In other words, if your ExtensionArray named MyExtensionArray is implemented so that each element is an instance of the class MyExtensionElement, then if the operators are defined for MyExtensionElement, the second approach will automatically define the operators for MyExtensionArray. - -A mixin class, ExtensionScalarOpsMixin supports this second approach. If developing an ExtensionArray subclass, for example MyExtensionArray, can simply include ExtensionScalarOpsMixin as a parent class of MyExtensionArray, and then call the methods _add_arithmetic_ops() and/or _add_comparison_ops() to hook the operators into your MyExtensionArray class, as follows: - -from pandas.api.extensions import ExtensionArray, ExtensionScalarOpsMixin - - -class MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin): - pass - - -MyExtensionArray._add_arithmetic_ops() -MyExtensionArray._add_comparison_ops() -Note - -Since pandas automatically calls the underlying operator on each element one-by-one, this might not be as performant as implementing your own version of the associated operators directly on the ExtensionArray. 
- -For arithmetic operations, this implementation will try to reconstruct a new ExtensionArray with the result of the element-wise operation. Whether or not that succeeds depends on whether the operation returns a result that’s valid for the ExtensionArray. If an ExtensionArray cannot be reconstructed, an ndarray containing the scalars returned instead. - -For ease of implementation and consistency with operations between pandas and NumPy ndarrays, we recommend not handling Series and Indexes in your binary ops. Instead, you should detect these cases and return NotImplemented. When pandas encounters an operation like op(Series, ExtensionArray), pandas will - -unbox the array from the Series (Series.array) - -call result = op(values, ExtensionArray) - -re-box the result in a Series - -NumPy universal functions -Series implements __array_ufunc__. As part of the implementation, pandas unboxes the ExtensionArray from the Series, applies the ufunc, and re-boxes it if necessary. - -If applicable, we highly recommend that you implement __array_ufunc__ in your extension array to avoid coercion to an ndarray. See the NumPy documentation for an example. - -As part of your implementation, we require that you defer to pandas when a pandas container (Series, DataFrame, Index) is detected in inputs. If any of those is present, you should return NotImplemented. pandas will take care of unboxing the array from the container and re-calling the ufunc with the unwrapped input. - -Testing extension arrays -We provide a test suite for ensuring that your extension arrays satisfy the expected behavior. To use the test suite, you must provide several pytest fixtures and inherit from the base test class. The required fixtures are found in pandas-dev/pandas. - -To use a test, subclass it: - -from pandas.tests.extension import base - - -class TestConstructors(base.BaseConstructorsTests): - pass -See pandas-dev/pandas for a list of all the tests available. - -Compatibility with Apache Arrow -An ExtensionArray can support conversion to / from pyarrow arrays (and thus support for example serialization to the Parquet file format) by implementing two methods: ExtensionArray.__arrow_array__ and ExtensionDtype.__from_arrow__. - -The ExtensionArray.__arrow_array__ ensures that pyarrow knowns how to convert the specific extension array into a pyarrow.Array (also when included as a column in a pandas DataFrame): - -class MyExtensionArray(ExtensionArray): - ... - - def __arrow_array__(self, type=None): - # convert the underlying array values to a pyarrow Array - import pyarrow - - return pyarrow.array(..., type=type) -The ExtensionDtype.__from_arrow__ method then controls the conversion back from pyarrow to a pandas ExtensionArray. This method receives a pyarrow Array or ChunkedArray as only argument and is expected to return the appropriate pandas ExtensionArray for this dtype and the passed values: - -class ExtensionDtype: - ... - - def __from_arrow__(self, array: pyarrow.Array/ChunkedArray) -> ExtensionArray: - ... -See more in the Arrow documentation. - -Those methods have been implemented for the nullable integer and string extension dtypes included in pandas, and ensure roundtrip to pyarrow and the Parquet file format. - -Subclassing pandas data structures -Warning - -There are some easier alternatives before considering subclassing pandas data structures. - -Extensible method chains with pipe - -Use composition. See here. 
- -Extending by registering an accessor - -Extending by extension type - -This section describes how to subclass pandas data structures to meet more specific needs. There are two points that need attention: - -Override constructor properties. - -Define original properties - -Note - -You can find a nice example in geopandas project. - -Override constructor properties -Each data structure has several constructor properties for returning a new data structure as the result of an operation. By overriding these properties, you can retain subclasses through pandas data manipulations. - -There are 3 possible constructor properties to be defined on a subclass: - -DataFrame/Series._constructor: Used when a manipulation result has the same dimension as the original. - -DataFrame._constructor_sliced: Used when a DataFrame (sub-)class manipulation result should be a Series (sub-)class. - -Series._constructor_expanddim: Used when a Series (sub-)class manipulation result should be a DataFrame (sub-)class, e.g. Series.to_frame(). - -Below example shows how to define SubclassedSeries and SubclassedDataFrame overriding constructor properties. - -class SubclassedSeries(pd.Series): - @property - def _constructor(self): - return SubclassedSeries - - @property - def _constructor_expanddim(self): - return SubclassedDataFrame - - -class SubclassedDataFrame(pd.DataFrame): - @property - def _constructor(self): - return SubclassedDataFrame - - @property - def _constructor_sliced(self): - return SubclassedSeries -s = SubclassedSeries([1, 2, 3]) -type(s) - - -to_framed = s.to_frame() -type(to_framed) - - -df = SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) -df - A B C -0 1 4 7 -1 2 5 8 -2 3 6 9 - -type(df) - - -sliced1 = df[["A", "B"]] -sliced1 - A B -0 1 4 -1 2 5 -2 3 6 - -type(sliced1) - - -sliced2 = df["A"] -sliced2 -0 1 -1 2 -2 3 -Name: A, dtype: int64 - -type(sliced2) - -Define original properties -To let original data structures have additional properties, you should let pandas know what properties are added. pandas maps unknown properties to data names overriding __getattribute__. Defining original properties can be done in one of 2 ways: - -Define _internal_names and _internal_names_set for temporary properties which WILL NOT be passed to manipulation results. - -Define _metadata for normal properties which will be passed to manipulation results. - -Below is an example to define two original properties, “internal_cache” as a temporary property and “added_property” as a normal property - -class SubclassedDataFrame2(pd.DataFrame): - - # temporary properties - _internal_names = pd.DataFrame._internal_names + ["internal_cache"] - _internal_names_set = set(_internal_names) - - # normal properties - _metadata = ["added_property"] - - @property - def _constructor(self): - return SubclassedDataFrame2 -df = SubclassedDataFrame2({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) -df - A B C -0 1 4 7 -1 2 5 8 -2 3 6 9 - -df.internal_cache = "cached" -df.added_property = "property" - -df.internal_cache -cached -df.added_property -property - -# properties defined in _internal_names is reset after manipulation -df[["A", "B"]].internal_cache -AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache' - -# properties defined in _metadata are retained -df[["A", "B"]].added_property -property -Plotting backends -pandas can be extended with third-party plotting backends. The main idea is letting users select a plotting backend different than the provided one based on Matplotlib. 
For example: - -pd.set_option("plotting.backend", "backend.module") -pd.Series([1, 2, 3]).plot() -This would be more or less equivalent to: - -import backend.module -backend.module.plot(pd.Series([1, 2, 3])) -The backend module can then use other visualization tools (Bokeh, Altair,…) to generate the plots. - -Libraries implementing the plotting backend should use entry points to make their backend discoverable to pandas. The key is "pandas_plotting_backends". For example, pandas registers the default “matplotlib” backend as follows. - -# in setup.py -setup( # noqa: F821 - ..., - entry_points={ - "pandas_plotting_backends": [ - "matplotlib = pandas:plotting._matplotlib", - ], - }, -) -More information on how to implement a third-party plotting backend can be found at pandas-dev/pandas. - -Arithmetic with 3rd party types -In order to control how arithmetic works between a custom type and a pandas type, implement __pandas_priority__. Similar to numpy’s __array_priority__ semantics, arithmetic methods on DataFrame, Series, and Index objects will delegate to other, if it has an attribute __pandas_priority__ with a higher value. - -By default, pandas objects try to operate with other objects, even if they are not types known to pandas: - -pd.Series([1, 2]) + [10, 20] -0 11 -1 22 -dtype: int64 -In the example above, if [10, 20] was a custom type that can be understood as a list, pandas objects will still operate with it in the same way. - -In some cases, it is useful to delegate to the other type the operation. For example, consider I implement a custom list object, and I want the result of adding my custom list with a pandas Series to be an instance of my list and not a Series as seen in the previous example. This is now possible by defining the __pandas_priority__ attribute of my custom list, and setting it to a higher value, than the priority of the pandas objects I want to operate with. - -The __pandas_priority__ of DataFrame, Series, and Index are 4000, 3000, and 2000 respectively. The base ExtensionArray.__pandas_priority__ is 1000. - -class CustomList(list): - __pandas_priority__ = 5000 - - def __radd__(self, other): - # return `self` and not the addition for simplicity - return self - -custom = CustomList() -series = pd.Series([1, 2, 3]) - -# Series refuses to add custom, since it's an unknown type with higher priority -assert series.__add__(custom) is NotImplemented - -# This will cause the custom class `__radd__` being used instead -assert series + custom is custom diff --git a/.cursor/rules/internals.mdc b/.cursor/rules/internals.mdc deleted file mode 100644 index 28b0572ba8924..0000000000000 --- a/.cursor/rules/internals.mdc +++ /dev/null @@ -1,66 +0,0 @@ ---- -alwaysApply: true ---- -Internals -This section will provide a look into some of pandas internals. It’s primarily intended for developers of pandas itself. - -Indexing -In pandas there are a few objects implemented which can serve as valid containers for the axis labels: - -Index: the generic “ordered set” object, an ndarray of object dtype assuming nothing about its contents. The labels must be hashable (and likely immutable) and unique. Populates a dict of label to location in Cython to do O(1) lookups. 
- -MultiIndex: the standard hierarchical index object - -DatetimeIndex: An Index object with Timestamp boxed elements (impl are the int64 values) - -TimedeltaIndex: An Index object with Timedelta boxed elements (impl are the in64 values) - -PeriodIndex: An Index object with Period elements - -There are functions that make the creation of a regular index easy: - -date_range(): fixed frequency date range generated from a time rule or DateOffset. An ndarray of Python datetime objects - -period_range(): fixed frequency date range generated from a time rule or DateOffset. An ndarray of Period objects, representing timespans - -Warning - -Custom Index subclasses are not supported, custom behavior should be implemented using the ExtensionArray interface instead. - -MultiIndex -Internally, the MultiIndex consists of a few things: the levels, the integer codes, and the level names: - -index = pd.MultiIndex.from_product( - [range(3), ["one", "two"]], names=["first", "second"] -) - - -index -Out[2]: -MultiIndex([(0, 'one'), - (0, 'two'), - (1, 'one'), - (1, 'two'), - (2, 'one'), - (2, 'two')], - names=['first', 'second']) - -index.levels -Out[3]: FrozenList([[0, 1, 2], ['one', 'two']]) - -index.codes -Out[4]: FrozenList([[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) - -index.names -Out[5]: FrozenList(['first', 'second']) -You can probably guess that the codes determine which unique element is identified with that location at each layer of the index. It’s important to note that sortedness is determined solely from the integer codes and does not check (or care) whether the levels themselves are sorted. Fortunately, the constructors from_tuples() and from_arrays() ensure that this is true, but if you compute the levels and codes yourself, please be careful. - -Values -pandas extends NumPy’s type system with custom types, like Categorical or datetimes with a timezone, so we have multiple notions of “values”. For 1-D containers (Index classes and Series) we have the following convention: - -cls._values refers is the “best possible” array. This could be an ndarray or ExtensionArray. - -So, for example, Series[category]._values is a Categorical. - -Subclassing pandas data structures -This section has been moved to Subclassing pandas data structures. \ No newline at end of file diff --git a/.cursor/rules/pandas-contribution.mdc b/.cursor/rules/pandas-contribution.mdc deleted file mode 100644 index fb1ad2eed515f..0000000000000 --- a/.cursor/rules/pandas-contribution.mdc +++ /dev/null @@ -1,221 +0,0 @@ ---- -alwaysApply: true ---- -Contributing to pandas -Table of contents: - -Bug reports and enhancement requests - -Finding an issue to contribute to - -Submitting a pull request - -Version control, Git, and GitHub - -Getting started with Git - -Create a fork of pandas - -Creating a feature branch - -Making code changes - -Pushing your changes - -Making a pull request - -Updating your pull request - -Updating the development environment - -Tips for a successful pull request - -All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome. - -Bug reports and enhancement requests -Bug reports and enhancement requests are an important part of making pandas more stable and are curated though Github issues. When reporting an issue or request, please select the appropriate category and fill out the issue form fully to ensure others and the core development team can fully understand the scope of the issue. 
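A short, self-contained reproducer pasted into the form helps enormously. Something along these lines (the data and the operation are placeholders) is usually enough:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, None]})
result = df["a"].sum()
print(result)       # show the output you actually get
# ...and describe the output you expected instead

pd.show_versions()  # include this output in the issue as well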
- -The issue will then show up to the pandas community and be open to comments/ideas from others. - -Finding an issue to contribute to -If you are brand new to pandas or open-source development, we recommend searching the GitHub “issues” tab to find issues that interest you. Unassigned issues labeled Docs and good first issue are typically good for newer contributors. - -Once you’ve found an interesting issue, it’s a good idea to assign the issue to yourself, so nobody else duplicates the work on it. On the Github issue, a comment with the exact text take to automatically assign you the issue (this will take seconds and may require refreshing the page to see it). - -If for whatever reason you are not able to continue working with the issue, please unassign it, so other people know it’s available again. You can check the list of assigned issues, since people may not be working in them anymore. If you want to work on one that is assigned, feel free to kindly ask the current assignee if you can take it (please allow at least a week of inactivity before considering work in the issue discontinued). - -We have several contributor community communication channels, which you are welcome to join, and ask questions as you figure things out. Among them are regular meetings for new contributors, dev meetings, a dev mailing list, and a Slack for the contributor community. All pandas contributors are welcome to these spaces, where they can connect with each other. Even maintainers who have been with us for a long time felt just like you when they started out, and are happy to welcome you and support you as you get to know how we work, and where things are. Take a look at the next sections to learn more. - -Submitting a pull request -Version control, Git, and GitHub -pandas is hosted on GitHub, and to contribute, you will need to sign up for a free GitHub account. We use Git for version control to allow many people to work together on the project. - -If you are new to Git, you can reference some of these resources for learning Git. Feel free to reach out to the contributor community for help if needed: - -Git documentation. - -Also, the project follows a forking workflow further described on this page whereby contributors fork the repository, make changes and then create a pull request. So please be sure to read and follow all the instructions in this guide. - -If you are new to contributing to projects through forking on GitHub, take a look at the GitHub documentation for contributing to projects. GitHub provides a quick tutorial using a test repository that may help you become more familiar with forking a repository, cloning a fork, creating a feature branch, pushing changes and making pull requests. - -Below are some useful resources for learning more about forking and pull requests on GitHub: - -the GitHub documentation for forking a repo. - -the GitHub documentation for collaborating with pull requests. - -the GitHub documentation for working with forks. - -Getting started with Git -GitHub has instructions for installing git, setting up your SSH key, and configuring git. All these steps need to be completed before you can work seamlessly between your local repository and GitHub. - -Create a fork of pandas -You will need your own copy of pandas (aka fork) to work on the code. Go to the pandas project page and hit the Fork button. Please uncheck the box to copy only the main branch before selecting Create Fork. 
You will want to clone your fork to your machine - -git clone https://github.com/your-user-name/pandas.git pandas-yourname -cd pandas-yourname -git remote add upstream https://github.com/pandas-dev/pandas.git -git fetch upstream -This creates the directory pandas-yourname and connects your repository to the upstream (main project) pandas repository. - -Note - -Performing a shallow clone (with --depth==N, for some N greater or equal to 1) might break some tests and features as pd.show_versions() as the version number cannot be computed anymore. - -Creating a feature branch -Your local main branch should always reflect the current state of pandas repository. First ensure it’s up-to-date with the main pandas repository. - -git checkout main -git pull upstream main --ff-only -Then, create a feature branch for making your changes. For example - -git checkout -b shiny-new-feature -This changes your working branch from main to the shiny-new-feature branch. Keep any changes in this branch specific to one bug or feature so it is clear what the branch brings to pandas. You can have many feature branches and switch in between them using the git checkout command. - -When you want to update the feature branch with changes in main after you created the branch, check the section on updating a PR. - -Making code changes -Before modifying any code, ensure you follow the contributing environment guidelines to set up an appropriate development environment. - -Then once you have made code changes, you can see all the changes you’ve currently made by running. - -git status -For files you intended to modify or add, run. - -git add path/to/file-to-be-added-or-changed.py -Running git status again should display - -On branch shiny-new-feature - - modified: /relative/path/to/file-to-be-added-or-changed.py -Finally, commit your changes to your local repository with an explanatory commit message - -git commit -m "your commit message goes here" -Pushing your changes -When you want your changes to appear publicly on your GitHub page, push your forked feature branch’s commits - -git push origin shiny-new-feature -Here origin is the default name given to your remote repository on GitHub. You can see the remote repositories - -git remote -v -If you added the upstream repository as described above you will see something like - -origin git@github.com:yourname/pandas.git (fetch) -origin git@github.com:yourname/pandas.git (push) -upstream git://github.com/pandas-dev/pandas.git (fetch) -upstream git://github.com/pandas-dev/pandas.git (push) -Now your code is on GitHub, but it is not yet a part of the pandas project. For that to happen, a pull request needs to be submitted on GitHub. - -Making a pull request -One you have finished your code changes, your code change will need to follow the pandas contribution guidelines to be successfully accepted. - -If everything looks good, you are ready to make a pull request. A pull request is how code from your local repository becomes available to the GitHub community to review and merged into project to appear the in the next release. To submit a pull request: - -Navigate to your repository on GitHub - -Click on the Compare & pull request button - -You can then click on Commits and Files Changed to make sure everything looks okay one last time - -Write a descriptive title that includes prefixes. pandas uses a convention for title prefixes. 
Here are some common ones along with general guidelines for when to use them: - -ENH: Enhancement, new functionality - -BUG: Bug fix - -DOC: Additions/updates to documentation - -TST: Additions/updates to tests - -BLD: Updates to the build process/scripts - -PERF: Performance improvement - -TYP: Type annotations - -CLN: Code cleanup - -Write a description of your changes in the Preview Discussion tab - -Click Send Pull Request. - -This request then goes to the repository maintainers, and they will review the code. - -Updating your pull request -Based on the review you get on your pull request, you will probably need to make some changes to the code. You can follow the code committing steps again to address any feedback and update your pull request. - -It is also important that updates in the pandas main branch are reflected in your pull request. To update your feature branch with changes in the pandas main branch, run: - -git checkout shiny-new-feature -git fetch upstream -git merge upstream/main -If there are no conflicts (or they could be fixed automatically), a file with a default commit message will open, and you can simply save and quit this file. - -If there are merge conflicts, you need to solve those conflicts. See for example at https://help.github.com/articles/resolving-a-merge-conflict-using-the-command-line/ for an explanation on how to do this. - -Once the conflicts are resolved, run: - -git add -u to stage any files you’ve updated; - -git commit to finish the merge. - -Note - -If you have uncommitted changes at the moment you want to update the branch with main, you will need to stash them prior to updating (see the stash docs). This will effectively store your changes and they can be reapplied after updating. - -After the feature branch has been update locally, you can now update your pull request by pushing to the branch on GitHub: - -git push origin shiny-new-feature -Any git push will automatically update your pull request with your branch’s changes and restart the Continuous Integration checks. - -Updating the development environment -It is important to periodically update your local main branch with updates from the pandas main branch and update your development environment to reflect any changes to the various packages that are used during development. - -If using conda, run: - -git checkout main -git fetch upstream -git merge upstream/main -conda activate pandas-dev -conda env update -f environment.yml --prune -If using pip , do: - -git checkout main -git fetch upstream -git merge upstream/main -# activate the virtual environment based on your platform -python -m pip install --upgrade -r requirements-dev.txt -Tips for a successful pull request -If you have made it to the Making a pull request phase, one of the core contributors may take a look. Please note however that a handful of people are responsible for reviewing all of the contributions, which can often lead to bottlenecks. - -To improve the chances of your pull request being reviewed, you should: - -Reference an open issue for non-trivial changes to clarify the PR’s purpose - -Ensure you have appropriate tests. These should be the first part of any PR - -Keep your pull requests as simple as possible. Larger PRs take longer to review - -Ensure that CI is in a green state. 
Reviewers may not even look otherwise - -Keep Updating your pull request, either by request or every few days \ No newline at end of file diff --git a/.cursor/rules/pandas-doc-string-guid.mdc b/.cursor/rules/pandas-doc-string-guid.mdc deleted file mode 100644 index e5cce1964d454..0000000000000 --- a/.cursor/rules/pandas-doc-string-guid.mdc +++ /dev/null @@ -1,740 +0,0 @@ ---- -description: when ever docstring is neeeded -alwaysApply: false ---- -pandas docstring guide -About docstrings and standards -A Python docstring is a string used to document a Python module, class, function or method, so programmers can understand what it does without having to read the details of the implementation. - -Also, it is a common practice to generate online (html) documentation automatically from docstrings. Sphinx serves this purpose. - -The next example gives an idea of what a docstring looks like: - -def add(num1, num2): - """ - Add up two integer numbers. - - This function simply wraps the ``+`` operator, and does not - do anything interesting, except for illustrating what - the docstring of a very simple function looks like. - - Parameters - ---------- - num1 : int - First number to add. - num2 : int - Second number to add. - - Returns - ------- - int - The sum of ``num1`` and ``num2``. - - See Also - -------- - subtract : Subtract one integer from another. - - Examples - -------- - >>> add(2, 2) - 4 - >>> add(25, 0) - 25 - >>> add(10, -10) - 0 - """ - return num1 + num2 -Some standards regarding docstrings exist, which make them easier to read, and allow them be easily exported to other formats such as html or pdf. - -The first conventions every Python docstring should follow are defined in PEP-257. - -As PEP-257 is quite broad, other more specific standards also exist. In the case of pandas, the NumPy docstring convention is followed. These conventions are explained in this document: - -numpydoc docstring guide - -numpydoc is a Sphinx extension to support the NumPy docstring convention. - -The standard uses reStructuredText (reST). reStructuredText is a markup language that allows encoding styles in plain text files. Documentation about reStructuredText can be found in: - -Sphinx reStructuredText primer - -Quick reStructuredText reference - -Full reStructuredText specification - -pandas has some helpers for sharing docstrings between related classes, see Sharing docstrings. - -The rest of this document will summarize all the above guidelines, and will provide additional conventions specific to the pandas project. - -Writing a docstring -General rules -Docstrings must be defined with three double-quotes. No blank lines should be left before or after the docstring. The text starts in the next line after the opening quotes. The closing quotes have their own line (meaning that they are not at the end of the last sentence). - -On rare occasions reST styles like bold text or italics will be used in docstrings, but is it common to have inline code, which is presented between backticks. The following are considered inline code: - -The name of a parameter - -Python code, a module, function, built-in, type, literal… (e.g. os, list, numpy.abs, datetime.date, True) - -A pandas class (in the form :class:`pandas.Series`) - -A pandas method (in the form :meth:`pandas.Series.sum`) - -A pandas function (in the form :func:`pandas.to_datetime`) - -Note - -To display only the last component of the linked class, method or function, prefix it with ~. 
For example, :class:`~pandas.Series` will link to pandas.Series but only display the last part, Series as the link text. See Sphinx cross-referencing syntax for details. - -Good: - -def add_values(arr): - """ - Add the values in ``arr``. - - This is equivalent to Python ``sum`` of :meth:`pandas.Series.sum`. - - Some sections are omitted here for simplicity. - """ - return sum(arr) -Bad: - -def func(): - - """Some function. - - With several mistakes in the docstring. - - It has a blank line after the signature ``def func():``. - - The text 'Some function' should go in the line after the - opening quotes of the docstring, not in the same line. - - There is a blank line between the docstring and the first line - of code ``foo = 1``. - - The closing quotes should be in the next line, not in this one.""" - - foo = 1 - bar = 2 - return foo + bar -Section 1: short summary -The short summary is a single sentence that expresses what the function does in a concise way. - -The short summary must start with a capital letter, end with a dot, and fit in a single line. It needs to express what the object does without providing details. For functions and methods, the short summary must start with an infinitive verb. - -Good: - -def astype(dtype): - """ - Cast Series type. - - This section will provide further details. - """ - pass -Bad: - -def astype(dtype): - """ - Casts Series type. - - Verb in third-person of the present simple, should be infinitive. - """ - pass -def astype(dtype): - """ - Method to cast Series type. - - Does not start with verb. - """ - pass -def astype(dtype): - """ - Cast Series type - - Missing dot at the end. - """ - pass -def astype(dtype): - """ - Cast Series type from its current type to the new type defined in - the parameter dtype. - - Summary is too verbose and doesn't fit in a single line. - """ - pass -Section 2: extended summary -The extended summary provides details on what the function does. It should not go into the details of the parameters, or discuss implementation notes, which go in other sections. - -A blank line is left between the short summary and the extended summary. Every paragraph in the extended summary ends with a dot. - -The extended summary should provide details on why the function is useful and their use cases, if it is not too generic. - -def unstack(): - """ - Pivot a row index to columns. - - When using a MultiIndex, a level can be pivoted so each value in - the index becomes a column. This is especially useful when a subindex - is repeated for the main index, and data is easier to visualize as a - pivot table. - - The index level will be automatically removed from the index when added - as columns. - """ - pass -Section 3: parameters -The details of the parameters will be added in this section. This section has the title “Parameters”, followed by a line with a hyphen under each letter of the word “Parameters”. A blank line is left before the section title, but not after, and not between the line with the word “Parameters” and the one with the hyphens. - -After the title, each parameter in the signature must be documented, including *args and **kwargs, but not self. - -The parameters are defined by their name, followed by a space, a colon, another space, and the type (or types). Note that the space between the name and the colon is important. Types are not defined for *args and **kwargs, but must be defined for all other parameters. 
After the parameter definition, it is required to have a line with the parameter description, which is indented, and can have multiple lines. The description must start with a capital letter, and finish with a dot. - -For keyword arguments with a default value, the default will be listed after a comma at the end of the type. The exact form of the type in this case will be “int, default 0”. In some cases it may be useful to explain what the default argument means, which can be added after a comma “int, default -1, meaning all cpus”. - -In cases where the default value is None, meaning that the value will not be used. Instead of "str, default None", it is preferred to write "str, optional". When None is a value being used, we will keep the form “str, default None”. For example, in df.to_csv(compression=None), None is not a value being used, but means that compression is optional, and no compression is being used if not provided. In this case we will use "str, optional". Only in cases like func(value=None) and None is being used in the same way as 0 or foo would be used, then we will specify “str, int or None, default None”. - -Good: - -class Series: - def plot(self, kind, color='blue', **kwargs): - """ - Generate a plot. - - Render the data in the Series as a matplotlib plot of the - specified kind. - - Parameters - ---------- - kind : str - Kind of matplotlib plot. - color : str, default 'blue' - Color name or rgb code. - **kwargs - These parameters will be passed to the matplotlib plotting - function. - """ - pass -Bad: - -class Series: - def plot(self, kind, **kwargs): - """ - Generate a plot. - - Render the data in the Series as a matplotlib plot of the - specified kind. - - Note the blank line between the parameters title and the first - parameter. Also, note that after the name of the parameter ``kind`` - and before the colon, a space is missing. - - Also, note that the parameter descriptions do not start with a - capital letter, and do not finish with a dot. - - Finally, the ``**kwargs`` parameter is missing. - - Parameters - ---------- - - kind: str - kind of matplotlib plot - """ - pass -Parameter types -When specifying the parameter types, Python built-in data types can be used directly (the Python type is preferred to the more verbose string, integer, boolean, etc): - -int - -float - -str - -bool - -For complex types, define the subtypes. For dict and tuple, as more than one type is present, we use the brackets to help read the type (curly brackets for dict and normal brackets for tuple): - -list of int - -dict of {str : int} - -tuple of (str, int, int) - -tuple of (str,) - -set of str - -In case where there are just a set of values allowed, list them in curly brackets and separated by commas (followed by a space). If the values are ordinal and they have an order, list them in this order. Otherwise, list the default value first, if there is one: - -{0, 10, 25} - -{‘simple’, ‘advanced’} - -{‘low’, ‘medium’, ‘high’} - -{‘cat’, ‘dog’, ‘bird’} - -If the type is defined in a Python module, the module must be specified: - -datetime.date - -datetime.datetime - -decimal.Decimal - -If the type is in a package, the module must be also specified: - -numpy.ndarray - -scipy.sparse.coo_matrix - -If the type is a pandas type, also specify pandas except for Series and DataFrame: - -Series - -DataFrame - -pandas.Index - -pandas.Categorical - -pandas.arrays.SparseArray - -If the exact type is not relevant, but must be compatible with a NumPy array, array-like can be specified. 
If Any type that can be iterated is accepted, iterable can be used: - -array-like - -iterable - -If more than one type is accepted, separate them by commas, except the last two types, that need to be separated by the word ‘or’: - -int or float - -float, decimal.Decimal or None - -str or list of str - -If None is one of the accepted values, it always needs to be the last in the list. - -For axis, the convention is to use something like: - -axis : {0 or ‘index’, 1 or ‘columns’, None}, default None - -Section 4: returns or yields -If the method returns a value, it will be documented in this section. Also if the method yields its output. - -The title of the section will be defined in the same way as the “Parameters”. With the names “Returns” or “Yields” followed by a line with as many hyphens as the letters in the preceding word. - -The documentation of the return is also similar to the parameters. But in this case, no name will be provided, unless the method returns or yields more than one value (a tuple of values). - -The types for “Returns” and “Yields” are the same as the ones for the “Parameters”. Also, the description must finish with a dot. - -For example, with a single value: - -def sample(): - """ - Generate and return a random number. - - The value is sampled from a continuous uniform distribution between - 0 and 1. - - Returns - ------- - float - Random number generated. - """ - return np.random.random() -With more than one value: - -import string - -def random_letters(): - """ - Generate and return a sequence of random letters. - - The length of the returned string is also random, and is also - returned. - - Returns - ------- - length : int - Length of the returned string. - letters : str - String of random letters. - """ - length = np.random.randint(1, 10) - letters = ''.join(np.random.choice(string.ascii_lowercase) - for i in range(length)) - return length, letters -If the method yields its value: - -def sample_values(): - """ - Generate an infinite sequence of random numbers. - - The values are sampled from a continuous uniform distribution between - 0 and 1. - - Yields - ------ - float - Random number generated. - """ - while True: - yield np.random.random() -Section 5: see also -This section is used to let users know about pandas functionality related to the one being documented. In rare cases, if no related methods or functions can be found at all, this section can be skipped. - -An obvious example would be the head() and tail() methods. As tail() does the equivalent as head() but at the end of the Series or DataFrame instead of at the beginning, it is good to let the users know about it. 
- -To give an intuition on what can be considered related, here there are some examples: - -loc and iloc, as they do the same, but in one case providing indices and in the other positions - -max and min, as they do the opposite - -iterrows, itertuples and items, as it is easy that a user looking for the method to iterate over columns ends up in the method to iterate over rows, and vice-versa - -fillna and dropna, as both methods are used to handle missing values - -read_csv and to_csv, as they are complementary - -merge and join, as one is a generalization of the other - -astype and pandas.to_datetime, as users may be reading the documentation of astype to know how to cast as a date, and the way to do it is with pandas.to_datetime - -where is related to numpy.where, as its functionality is based on it - -When deciding what is related, you should mainly use your common sense and think about what can be useful for the users reading the documentation, especially the less experienced ones. - -When relating to other libraries (mainly numpy), use the name of the module first (not an alias like np). If the function is in a module which is not the main one, like scipy.sparse, list the full module (e.g. scipy.sparse.coo_matrix). - -This section has a header, “See Also” (note the capital S and A), followed by the line with hyphens and preceded by a blank line. - -After the header, we will add a line for each related method or function, followed by a space, a colon, another space, and a short description that illustrates what this method or function does, why is it relevant in this context, and what the key differences are between the documented function and the one being referenced. The description must also end with a dot. - -Note that in “Returns” and “Yields”, the description is located on the line after the type. In this section, however, it is located on the same line, with a colon in between. If the description does not fit on the same line, it can continue onto other lines which must be further indented. - -For example: - -class Series: - def head(self): - """ - Return the first 5 elements of the Series. - - This function is mainly useful to preview the values of the - Series without displaying the whole of it. - - Returns - ------- - Series - Subset of the original series with the 5 first values. - - See Also - -------- - Series.tail : Return the last 5 elements of the Series. - Series.iloc : Return a slice of the elements in the Series, - which can also be used to return the first or last n. - """ - return self.iloc[:5] -Section 6: notes -This is an optional section used for notes about the implementation of the algorithm, or to document technical aspects of the function behavior. - -Feel free to skip it, unless you are familiar with the implementation of the algorithm, or you discover some counter-intuitive behavior while writing the examples for the function. - -This section follows the same format as the extended summary section. - -Section 7: examples -This is one of the most important sections of a docstring, despite being placed in the last position, as often people understand concepts better by example than through accurate explanations. - -Examples in docstrings, besides illustrating the usage of the function or method, must be valid Python code, that returns the given output in a deterministic way, and that can be copied and run by users. - -Examples are presented as a session in the Python terminal. >>> is used to present code. ... 
is used for code continuing from the previous line. Output is presented immediately after the last line of code generating the output (no blank lines in between). Comments describing the examples can be added with blank lines before and after them. - -The way to present examples is as follows: - -Import required libraries (except numpy and pandas) - -Create the data required for the example - -Show a very basic example that gives an idea of the most common use case - -Add examples with explanations that illustrate how the parameters can be used for extended functionality - -A simple example could be: - -class Series: - - def head(self, n=5): - """ - Return the first elements of the Series. - - This function is mainly useful to preview the values of the - Series without displaying all of it. - - Parameters - ---------- - n : int - Number of values to return. - - Return - ------ - pandas.Series - Subset of the original series with the n first values. - - See Also - -------- - tail : Return the last n elements of the Series. - - Examples - -------- - >>> ser = pd.Series(['Ant', 'Bear', 'Cow', 'Dog', 'Falcon', - ... 'Lion', 'Monkey', 'Rabbit', 'Zebra']) - >>> ser.head() - 0 Ant - 1 Bear - 2 Cow - 3 Dog - 4 Falcon - dtype: object - - With the ``n`` parameter, we can change the number of returned rows: - - >>> ser.head(n=3) - 0 Ant - 1 Bear - 2 Cow - dtype: object - """ - return self.iloc[:n] -The examples should be as concise as possible. In cases where the complexity of the function requires long examples, is recommended to use blocks with headers in bold. Use double star ** to make a text bold, like in **this example**. - -Conventions for the examples -Code in examples is assumed to always start with these two lines which are not shown: - -import numpy as np -import pandas as pd -Any other module used in the examples must be explicitly imported, one per line (as recommended in PEP 8#imports) and avoiding aliases. Avoid excessive imports, but if needed, imports from the standard library go first, followed by third-party libraries (like matplotlib). - -When illustrating examples with a single Series use the name ser, and if illustrating with a single DataFrame use the name df. For indices, idx is the preferred name. If a set of homogeneous Series or DataFrame is used, name them ser1, ser2, ser3… or df1, df2, df3… If the data is not homogeneous, and more than one structure is needed, name them with something meaningful, for example df_main and df_to_join. - -Data used in the example should be as compact as possible. The number of rows is recommended to be around 4, but make it a number that makes sense for the specific example. For example in the head method, it requires to be higher than 5, to show the example with the default values. If doing the mean, we could use something like [1, 2, 3], so it is easy to see that the value returned is the mean. - -For more complex examples (grouping for example), avoid using data without interpretation, like a matrix of random numbers with columns A, B, C, D… And instead use a meaningful example, which makes it easier to understand the concept. Unless required by the example, use names of animals, to keep examples consistent. And numerical properties of them. - -When calling the method, keywords arguments head(n=3) are preferred to positional arguments head(3). - -Good: - -class Series: - - def mean(self): - """ - Compute the mean of the input. 
- - Examples - -------- - >>> ser = pd.Series([1, 2, 3]) - >>> ser.mean() - 2 - """ - pass - - - def fillna(self, value): - """ - Replace missing values by ``value``. - - Examples - -------- - >>> ser = pd.Series([1, np.nan, 3]) - >>> ser.fillna(0) - [1, 0, 3] - """ - pass - - def groupby_mean(self): - """ - Group by index and return mean. - - Examples - -------- - >>> ser = pd.Series([380., 370., 24., 26], - ... name='max_speed', - ... index=['falcon', 'falcon', 'parrot', 'parrot']) - >>> ser.groupby_mean() - index - falcon 375.0 - parrot 25.0 - Name: max_speed, dtype: float64 - """ - pass - - def contains(self, pattern, case_sensitive=True, na=numpy.nan): - """ - Return whether each value contains ``pattern``. - - In this case, we are illustrating how to use sections, even - if the example is simple enough and does not require them. - - Examples - -------- - >>> ser = pd.Series('Antelope', 'Lion', 'Zebra', np.nan) - >>> ser.contains(pattern='a') - 0 False - 1 False - 2 True - 3 NaN - dtype: bool - - **Case sensitivity** - - With ``case_sensitive`` set to ``False`` we can match ``a`` with both - ``a`` and ``A``: - - >>> s.contains(pattern='a', case_sensitive=False) - 0 True - 1 False - 2 True - 3 NaN - dtype: bool - - **Missing values** - - We can fill missing values in the output using the ``na`` parameter: - - >>> ser.contains(pattern='a', na=False) - 0 False - 1 False - 2 True - 3 False - dtype: bool - """ - pass -Bad: - -def method(foo=None, bar=None): - """ - A sample DataFrame method. - - Do not import NumPy and pandas. - - Try to use meaningful data, when it makes the example easier - to understand. - - Try to avoid positional arguments like in ``df.method(1)``. They - can be all right if previously defined with a meaningful name, - like in ``present_value(interest_rate)``, but avoid them otherwise. - - When presenting the behavior with different parameters, do not place - all the calls one next to the other. Instead, add a short sentence - explaining what the example shows. - - Examples - -------- - >>> import numpy as np - >>> import pandas as pd - >>> df = pd.DataFrame(np.random.randn(3, 3), - ... columns=('a', 'b', 'c')) - >>> df.method(1) - 21 - >>> df.method(bar=14) - 123 - """ - pass -Tips for getting your examples pass the doctests -Getting the examples pass the doctests in the validation script can sometimes be tricky. Here are some attention points: - -Import all needed libraries (except for pandas and NumPy, those are already imported as import pandas as pd and import numpy as np) and define all variables you use in the example. - -Try to avoid using random data. However random data might be OK in some cases, like if the function you are documenting deals with probability distributions, or if the amount of data needed to make the function result meaningful is too much, such that creating it manually is very cumbersome. In those cases, always use a fixed random seed to make the generated examples predictable. 
Example: - -np.random.seed(42) -df = pd.DataFrame({'normal': np.random.normal(100, 5, 20)}) -If you have a code snippet that wraps multiple lines, you need to use ‘…’ on the continued lines: - -df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=['a', 'b', 'c'], - columns=['A', 'B']) -If you want to show a case where an exception is raised, you can do: - -pd.to_datetime(["712-01-01"]) -Traceback (most recent call last): -OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 712-01-01 00:00:00 -It is essential to include the “Traceback (most recent call last):”, but for the actual error only the error name is sufficient. - -If there is a small part of the result that can vary (e.g. a hash in an object representation), you can use ... to represent this part. - -If you want to show that s.plot() returns a matplotlib AxesSubplot object, this will fail the doctest - -s.plot() - -However, you can do (notice the comment that needs to be added) - -s.plot() - -Plots in examples -There are some methods in pandas returning plots. To render the plots generated by the examples in the documentation, the .. plot:: directive exists. - -To use it, place the next code after the “Examples” header as shown below. The plot will be generated automatically when building the documentation. - -class Series: - def plot(self): - """ - Generate a plot with the ``Series`` data. - - Examples - -------- - - .. plot:: - :context: close-figs - - >>> ser = pd.Series([1, 2, 3]) - >>> ser.plot() - """ - pass -Sharing docstrings -pandas has a system for sharing docstrings, with slight variations, between classes. This helps us keep docstrings consistent, while keeping things clear for the user reading. It comes at the cost of some complexity when writing. - -Each shared docstring will have a base template with variables, like {klass}. The variables filled in later on using the doc decorator. Finally, docstrings can also be appended to with the doc decorator. - -In this example, we’ll create a parent docstring normally (this is like pandas.core.generic.NDFrame). Then we’ll have two children (like pandas.Series and pandas.DataFrame). We’ll substitute the class names in this docstring. - -class Parent: - @doc(klass="Parent") - def my_function(self): - """Apply my function to {klass}.""" - ... - - -class ChildA(Parent): - @doc(Parent.my_function, klass="ChildA") - def my_function(self): - ... - - -class ChildB(Parent): - @doc(Parent.my_function, klass="ChildB") - def my_function(self): - ... -The resulting docstrings are - -print(Parent.my_function.__doc__) -Apply my function to Parent. -print(ChildA.my_function.__doc__) -Apply my function to ChildA. -print(ChildB.my_function.__doc__) -Apply my function to ChildB. -Notice: - -We “append” the parent docstring to the children docstrings, which are initially empty. - -Our files will often contain a module-level _shared_doc_kwargs with some common substitution values (things like klass, axes, etc). - -You can substitute and append in one shot with something like - -@doc(template, **_shared_doc_kwargs) -def my_function(self): - ... -where template may come from a module-level _shared_docs dictionary mapping function names to docstrings. Wherever possible, we prefer using doc, since the docstring-writing processes is slightly closer to normal. - -See pandas.core.generic.NDFrame.fillna for an example template, and pandas.Series.fillna and pandas.core.generic.frame.fillna for the filled versions. 
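To make the substitute-and-append pattern above concrete, here is a minimal sketch. It assumes the doc decorator is importable from pandas.util._decorators; the _shared_docs and _shared_doc_kwargs entries below follow the module-level convention described above but are illustrative stand-ins, not the real pandas templates.

from pandas.util._decorators import doc

# Module-level template and substitution values, mirroring the convention
# described above (illustrative entries only).
_shared_docs = {
    "my_function": "Apply my function to {klass}, operating along {axes}.",
}
_shared_doc_kwargs = {"klass": "DataFrame", "axes": "index and columns"}


class ExampleFrame:
    # Substitute and append in one shot: the shared template is formatted
    # with the shared keyword arguments and becomes this method's docstring.
    @doc(_shared_docs["my_function"], **_shared_doc_kwargs)
    def my_function(self):
        ...

The resulting docstring is

print(ExampleFrame.my_function.__doc__)
Apply my function to DataFrame, operating along index and columns.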
\ No newline at end of file diff --git a/.cursor/rules/pandas-documentation.mdc b/.cursor/rules/pandas-documentation.mdc deleted file mode 100644 index bf1d3fa21fe6a..0000000000000 --- a/.cursor/rules/pandas-documentation.mdc +++ /dev/null @@ -1,128 +0,0 @@ ---- -description: when ever task is about improving or changing or adding documentation to pandas -alwaysApply: false ---- -Contributing to the documentation -Contributing to the documentation benefits everyone who uses pandas. We encourage you to help us improve the documentation, and you don’t have to be an expert on pandas to do so! In fact, there are sections of the docs that are worse off after being written by experts. If something in the docs doesn’t make sense to you, updating the relevant section after you figure it out is a great way to ensure it will help the next person. Please visit the issues page for a full list of issues that are currently open regarding the pandas documentation. - -Documentation: - -About the pandas documentation - -Updating a pandas docstring - -How to build the pandas documentation - -Requirements - -Building the documentation - -Building main branch documentation - -Previewing changes - -About the pandas documentation -The documentation is written in reStructuredText, which is almost like writing in plain English, and built using Sphinx. The Sphinx Documentation has an excellent introduction to reST. Review the Sphinx docs to perform more complex changes to the documentation as well. - -Some other important things to know about the docs: - -The pandas documentation consists of two parts: the docstrings in the code itself and the docs in this folder doc/. - -The docstrings provide a clear explanation of the usage of the individual functions, while the documentation in this folder consists of tutorial-like overviews per topic together with some other information (what’s new, installation, etc). - -The docstrings follow a pandas convention, based on the Numpy Docstring Standard. Follow the pandas docstring guide for detailed instructions on how to write a correct docstring. - -pandas docstring guide -About docstrings and standards -Writing a docstring -Sharing docstrings -The tutorials make heavy use of the IPython directive sphinx extension. This directive lets you put code in the documentation which will be run during the doc build. For example: - -.. ipython:: python - - x = 2 - x**3 -will be rendered as: - -In [1]: x = 2 - -In [2]: x**3 -Out[2]: 8 -Almost all code examples in the docs are run (and the output saved) during the doc build. This approach means that code examples will always be up to date, but it does make the doc building a bit more complex. - -Our API documentation files in doc/source/reference house the auto-generated documentation from the docstrings. For classes, there are a few subtleties around controlling which methods and attributes have pages auto-generated. - -We have two autosummary templates for classes. - -_templates/autosummary/class.rst. Use this when you want to automatically generate a page for every public method and attribute on the class. The Attributes and Methods sections will be automatically added to the class’ rendered documentation by numpydoc. See DataFrame for an example. - -_templates/autosummary/class_without_autosummary. Use this when you want to pick a subset of methods / attributes to auto-generate pages for. When using this template, you should include an Attributes and Methods section in the class docstring. See CategoricalIndex for an example. 
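As a rough illustration of the second template, a class documented with class_without_autosummary would typically carry hand-written Attributes and Methods sections in its docstring, listing only the members that should get pages. The class below is hypothetical and not part of pandas; it only shows the docstring layout being described:

class Span:
    """
    Hypothetical container for a closed interval of integers.

    Attributes
    ----------
    left : int
        Left bound of the span.
    right : int
        Right bound of the span.

    Methods
    -------
    length
        Return the distance between ``right`` and ``left``.
    """

    def __init__(self, left, right):
        self.left = left
        self.right = right

    def length(self):
        """Return the distance between ``right`` and ``left``."""
        return self.right - self.left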
- -Every method should be included in a toctree in one of the documentation files in doc/source/reference, else Sphinx will emit a warning. - -The utility script scripts/validate_docstrings.py can be used to get a csv summary of the API documentation. And also validate common errors in the docstring of a specific class, function or method. The summary also compares the list of methods documented in the files in doc/source/reference (which is used to generate the API Reference page) and the actual public methods. This will identify methods documented in doc/source/reference that are not actually class methods, and existing methods that are not documented in doc/source/reference. - -Updating a pandas docstring -When improving a single function or method’s docstring, it is not necessarily needed to build the full documentation (see next section). However, there is a script that checks a docstring (for example for the DataFrame.mean method): - -python scripts/validate_docstrings.py pandas.DataFrame.mean -This script will indicate some formatting errors if present, and will also run and test the examples included in the docstring. Check the pandas docstring guide for a detailed guide on how to format the docstring. - -The examples in the docstring (‘doctests’) must be valid Python code, that in a deterministic way returns the presented output, and that can be copied and run by users. This can be checked with the script above, and is also tested on Travis. A failing doctest will be a blocker for merging a PR. Check the examples section in the docstring guide for some tips and tricks to get the doctests passing. - -When doing a PR with a docstring update, it is good to post the output of the validation script in a comment on github. - -How to build the pandas documentation -Requirements -First, you need to have a development environment to be able to build pandas (see the docs on creating a development environment). - -Building the documentation -So how do you build the docs? Navigate to your local doc/ directory in the console and run: - -python make.py html -Then you can find the HTML output in the folder doc/build/html/. - -The first time you build the docs, it will take quite a while because it has to run all the code examples and build all the generated docstring pages. In subsequent evocations, sphinx will try to only build the pages that have been modified. - -If you want to do a full clean build, do: - -python make.py clean -python make.py html -You can tell make.py to compile only a single section of the docs, greatly reducing the turn-around time for checking your changes. - -# omit autosummary and API section -python make.py clean -python make.py --no-api - -# compile the docs with only a single section, relative to the "source" folder. -# For example, compiling only this guide (doc/source/development/contributing.rst) -python make.py clean -python make.py --single development/contributing.rst - -# compile the reference docs for a single function -python make.py clean -python make.py --single pandas.DataFrame.join - -# compile whatsnew and API section (to resolve links in the whatsnew) -python make.py clean -python make.py --whatsnew -For comparison, a full documentation build may take 15 minutes, but a single section may take 15 seconds. Subsequent builds, which only process portions you have changed, will be faster. - -The build will automatically use the number of cores available on your machine to speed up the documentation build. 
You can override this: - -python make.py html --num-jobs 4 -Open the following file in a web browser to see the full documentation you just built doc/build/html/index.html. - -And you’ll have the satisfaction of seeing your new and improved documentation! - -Building main branch documentation -When pull requests are merged into the pandas main branch, the main parts of the documentation are also built by Travis-CI. These docs are then hosted here, see also the Continuous Integration section. - -Previewing changes -Once, the pull request is submitted, GitHub Actions will automatically build the documentation. To view the built site: - -Wait for the CI / Web and docs check to complete. - -Click Details next to it. - -From the Artifacts drop-down, click docs or website to download the site as a ZIP file. \ No newline at end of file diff --git a/.cursor/rules/pandas-maintenace.mdc b/.cursor/rules/pandas-maintenace.mdc deleted file mode 100644 index 03e5d6c2908c9..0000000000000 --- a/.cursor/rules/pandas-maintenace.mdc +++ /dev/null @@ -1,333 +0,0 @@ ---- -alwaysApply: true ---- -pandas maintenance -This guide is for pandas’ maintainers. It may also be interesting to contributors looking to understand the pandas development process and what steps are necessary to become a maintainer. - -The main contributing guide is available at Contributing to pandas. - -Roles -pandas uses two levels of permissions: triage and core team members. - -Triage members can label and close issues and pull requests. - -Core team members can label and close issues and pull request, and can merge pull requests. - -GitHub publishes the full list of permissions. - -Tasks -pandas is largely a volunteer project, so these tasks shouldn’t be read as “expectations” of triage and maintainers. Rather, they’re general descriptions of what it means to be a maintainer. - -Triage newly filed issues (see Issue triage) - -Review newly opened pull requests - -Respond to updates on existing issues and pull requests - -Drive discussion and decisions on stalled issues and pull requests - -Provide experience / wisdom on API design questions to ensure consistency and maintainability - -Project organization (run / attend developer meetings, represent pandas) - -https://matthewrocklin.com/blog/2019/05/18/maintainer may be interesting background reading. - -Issue triage -Triage is an important first step in addressing issues reported by the community, and even partial contributions are a great way to help maintain pandas. Only remove the “Needs Triage” tag once all of the steps below have been completed. - -Here’s a typical workflow for triaging a newly opened issue. - -Thank the reporter for opening an issue - -The issue tracker is many people’s first interaction with the pandas project itself, beyond just using the library. As such, we want it to be a welcoming, pleasant experience. - -Is the necessary information provided? - -Ideally reporters would fill out the issue template, but many don’t. If crucial information (like the version of pandas they used), is missing feel free to ask for that and label the issue with “Needs info”. The report should follow the guidelines in Bug reports and enhancement requests. You may want to link to that if they didn’t follow the template. - -Make sure that the title accurately reflects the issue. Edit it yourself if it’s not clear. - -Is this a duplicate issue? - -We have many open issues. 
If a new issue is clearly a duplicate, label the new issue as “Duplicate” and close the issue with a link to the original issue. Make sure to still thank the reporter, and encourage them to chime in on the original issue, and perhaps try to fix it. - -If the new issue provides relevant information, such as a better or slightly different example, add it to the original issue as a comment or an edit to the original post. - -Is the issue minimal and reproducible? - -For bug reports, we ask that the reporter provide a minimal reproducible example. See https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports for a good explanation. If the example is not reproducible, or if it’s clearly not minimal, feel free to ask the reporter if they can provide an example or simplify the provided one. Do acknowledge that writing minimal reproducible examples is hard work. If the reporter is struggling, you can try to write one yourself and we’ll edit the original post to include it. - -If a reproducible example can’t be provided, add the “Needs info” label. - -If a reproducible example is provided, but you see a simplification, edit the original post with your simpler reproducible example. - -If this is a regression report, post the result of a git bisect run. More info on this can be found in the Investigating regressions section. - -Ensure the issue exists on the main branch and that it has the “Needs Triage” tag until all steps have been completed. Add a comment to the issue once you have verified it exists on the main branch, so others know it has been confirmed. - -Is this a clearly defined feature request? - -Generally, pandas prefers to discuss and design new features in issues, before a pull request is made. Encourage the submitter to include a proposed API for the new feature. Having them write a full docstring is a good way to pin down specifics. - -Tag new feature requests with “Needs Discussion”, as we’ll need a discussion from several pandas maintainers before deciding whether the proposal is in scope for pandas. - -Is this a usage question? - -We prefer that usage questions are asked on StackOverflow with the pandas tag. https://stackoverflow.com/questions/tagged/pandas - -If it’s easy to answer, feel free to link to the relevant documentation section, let them know that in the future this kind of question should be on StackOverflow, and close the issue. - -What labels and milestones should I add? - -Apply the relevant labels. This is a bit of an art, and comes with experience. Look at similar issues to get a feel for how things are labeled. - -If the issue is clearly defined and the fix seems relatively straightforward, label the issue as “Good first issue”. - -If the issue is a regression report, add the “Regression” label and the next patch release milestone. - -Once you have completed the above, make sure to remove the “Needs Triage” label. - -Investigating regressions -Regressions are bugs that unintentionally break previously working code. The common way to investigate regressions is by using git bisect, which finds the first commit that introduced the bug. - -For example: a user reports that pd.Series([1, 1]).sum() returns 3 in pandas version 1.5.0 while in version 1.4.0 it returned 2. To begin, create a file t.py in your pandas directory, which contains - -import pandas as pd -assert pd.Series([1, 1]).sum() == 2 -and then run: - -git bisect start -git bisect good v1.4.0 -git bisect bad v1.5.0 -git bisect run bash -c "python -m pip install -ve . 
--no-build-isolation -Ceditable-verbose=true; python t.py" -This finds the first commit that changed the behavior. The C extensions have to be rebuilt at every step, so the search can take a while. - -Exit bisect and rebuild the current version: - -git bisect reset -python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true -Report your findings under the corresponding issue and ping the commit author to get their input. - -Note - -In the bisect run command above, commits are considered good if t.py exits with 0 and bad otherwise. When raising an exception is the desired behavior, wrap the code in an appropriate try/except statement. See GH 35685 for more examples. - -Closing issues -Be delicate here: many people interpret closing an issue as us saying that the conversation is over. It’s typically best to give the reporter some time to respond or self-close their issue if it’s determined that the behavior is not a bug, or the feature is out of scope. Sometimes reporters just go away though, and we’ll close the issue after the conversation has died. If you think an issue should be closed but are not completely sure, please apply the “closing candidate” label and wait for other maintainers to take a look. - -Reviewing pull requests -Anybody can review a pull request: regular contributors, triagers, or core-team members. But only core-team members can merge pull requests when they’re ready. - -Here are some things to check when reviewing a pull request. - -Tests should be in a sensible location: in the same file as closely related tests. - -New public APIs should be included somewhere in doc/source/reference/. - -New / changed API should use the versionadded or versionchanged directives in the docstring. - -User-facing changes should have a whatsnew in the appropriate file. - -Regression tests should reference the original GitHub issue number like # GH-1234. - -The pull request should be labeled and assigned the appropriate milestone (the next patch release for regression fixes and small bug fixes, the next minor milestone otherwise) - -Changes should comply with our Version policy. - -Backporting -pandas supports point releases (e.g. 1.4.3) that aim to: - -Fix bugs in new features introduced in the first minor version release. - -e.g. If a new feature was added in 1.4 and contains a bug, a fix can be applied in 1.4.3 - -Fix bugs that used to work in a few minor releases prior. There should be agreement between core team members that a backport is appropriate. - -e.g. If a feature worked in 1.2 and stopped working since 1.3, a fix can be applied in 1.4.3. - -Since pandas minor releases are based on GitHub branches (e.g. point release of 1.4 are based off the 1.4.x branch), “backporting” means merging a pull request fix to the main branch and correct minor branch associated with the next point release. - -By default, if a pull request is assigned to the next point release milestone within the GitHub interface, the backporting process should happen automatically by the @meeseeksdev bot once the pull request is merged. A new pull request will be made backporting the pull request to the correct version branch. Sometimes due to merge conflicts, a manual pull request will need to be made addressing the code conflict. 
- -If the bot does not automatically start the backporting process, you can also write a GitHub comment in the merged pull request to trigger the backport: - -@meeseeksdev backport version-branch -This will trigger a workflow which will backport a given change to a branch (e.g. @meeseeksdev backport 1.4.x) - -Cleaning up old issues -Every open issue in pandas has a cost. Open issues make finding duplicates harder, and can make it harder to know what needs to be done in pandas. That said, closing issues isn’t a goal on its own. Our goal is to make pandas the best it can be, and that’s best done by ensuring that the quality of our open issues is high. - -Occasionally, bugs are fixed but the issue isn’t linked to in the Pull Request. In these cases, comment that “This has been fixed, but could use a test.” and label the issue as “Good First Issue” and “Needs Test”. - -If an older issue doesn’t follow our issue template, edit the original post to include a minimal example, the actual output, and the expected output. Uniformity in issue reports is valuable. - -If an older issue lacks a reproducible example, label it as “Needs Info” and ask them to provide one (or write one yourself if possible). If one isn’t provide reasonably soon, close it according to the policies in Closing issues. - -Cleaning up old pull requests -Occasionally, contributors are unable to finish off a pull request. If some time has passed (two weeks, say) since the last review requesting changes, gently ask if they’re still interested in working on this. If another two weeks or so passes with no response, thank them for their work and then either: - -close the pull request; - -push to the contributor’s branch to push their work over the finish line (if you’re part of pandas-core). This can be helpful for pushing an important PR across the line, or for fixing a small merge conflict. - -If closing the pull request, then please comment on the original issue that “There’s a stalled PR at #1234 that may be helpful.”, and perhaps label the issue as “Good first issue” if the PR was relatively close to being accepted. - -Becoming a pandas maintainer -The full process is outlined in our governance documents. In summary, we’re happy to give triage permissions to anyone who shows interest by being helpful on the issue tracker. - -The required steps for adding a maintainer are: - -Contact the contributor and ask their interest to join. - -Add the contributor to the appropriate GitHub Team if accepted the invitation. - -pandas-core is for core team members - -pandas-triage is for pandas triage members - -If adding to pandas-core, there are two additional steps: - -Add the contributor to the pandas Google group. - -Create a pull request to add the contributor’s GitHub handle to pandas-dev/pandas/web/pandas/config.yml. - -The current list of core-team members is at pandas-dev/pandas - -Merging pull requests -Only core team members can merge pull requests. We have a few guidelines. - -You should typically not self-merge your own pull requests without approval. Exceptions include things like small changes to fix CI (e.g. pinning a package version). Self-merging with approval from other core team members is fine if the change is something you’re very confident about. - -You should not merge pull requests that have an active discussion, or pull requests that has any -1 votes from a core maintainer. pandas operates by consensus. - -For larger changes, it’s good to have a +1 from at least two core team members. 
- -In addition to the items listed in Closing issues, you should verify that the pull request is assigned the correct milestone. - -Pull requests merged with a patch-release milestone will typically be backported by our bot. Verify that the bot noticed the merge (it will leave a comment within a minute typically). If a manual backport is needed please do that, and remove the “Needs backport” label once you’ve done it manually. If you forget to assign a milestone before tagging, you can request the bot to backport it with: - -@Meeseeksdev backport -Release process -The release process makes a snapshot of pandas (a git commit) available to users with a particular version number. After the release the new pandas version will be available in the next places: - -Git repo with a new tag - -Source distribution in a GitHub release - -Pip packages in the PyPI - -Conda packages in conda-forge - -The process for releasing a new version of pandas is detailed next section. - -The instructions contain which needs to be replaced with the version to be released (e.g. 1.5.2). Also the branch to be released , which depends on whether the version being released is the release candidate of a new version, or any other version. Release candidates are released from main, while other versions are released from their branch (e.g. 1.5.x). - -Prerequisites -In order to be able to release a new pandas version, the next permissions are needed: - -Merge rights to the pandas and pandas-feedstock repositories. For the latter, open a PR adding your GitHub username to the conda-forge recipe. - -Permissions to push to main in the pandas repository, to push the new tags. - -Write permissions to PyPI. - -Access to our website / documentation server. Share your public key with the infrastructure committee to be added to the authorized_keys file of the main server user. - -Access to the social media accounts, to publish the announcements. - -Pre-release -Agree with the core team on the next topics: - -Release date (major/minor releases happen usually every 6 months, and patch releases monthly until x.x.5, just before the next major/minor) - -Blockers (issues and PRs that must be part of the release) - -Next version after the one being released - -Update and clean release notes for the version to be released, including: - -Set the final date of the release - -Remove any unused bullet point - -Make sure there are no formatting issues, typos, etc. - -Make sure the CI is green for the last commit of the branch being released. - -If not a release candidate, make sure all backporting pull requests to the branch being released are merged. - -Create a new issue and milestone for the version after the one being released. If the release was a release candidate, we would usually want to create issues and milestones for both the next major/minor, and the next patch release. In the milestone of a patch release, we add the description on-merge: backport to , so tagged PRs are automatically backported to the release branch by our bot. - -Change the milestone of all issues and PRs in the milestone being released to the next milestone. 
- -Release -Create an empty commit and a tag in the last commit of the branch to be released: - -git checkout -git pull --ff-only upstream -git clean -xdf -git commit --allow-empty --author="pandas Development Team " -m "RLS: " -git tag -a v -m "Version " # NOTE that the tag is v1.5.2 with "v" not 1.5.2 -git push upstream --follow-tags -The docs for the new version will be built and published automatically with the docs job in the CI, which will be triggered when the tag is pushed. - -Only if the release is a release candidate, we want to create a new branch for it, immediately after creating the tag. For example, if we are releasing pandas 1.4.0rc0, we would like to create the branch 1.4.x to backport commits to the 1.4 versions. As well as create a tag to mark the start of the development of 1.5.0 (assuming it is the next version): - -git checkout -b 1.4.x -git push upstream 1.4.x -git checkout main -git commit --allow-empty -m "Start 1.5.0" -git tag -a v1.5.0.dev0 -m "DEV: Start 1.5.0" -git push upstream main --follow-tags -Download the source distribution and wheels from the wheel staging area. Be careful to make sure that no wheels are missing (e.g. due to failed builds). - -Running scripts/download_wheels.sh with the version that you want to download wheels/the sdist for should do the trick. This script will make a dist folder inside your clone of pandas and put the downloaded wheels and sdist there: - -scripts/download_wheels.sh -Create a new GitHub release: - -Tag: - -Title: pandas - -Description: Copy the description of the last release of the same kind (release candidate, major/minor or patch release) - -Files: pandas-.tar.gz source distribution just generated - -Set as a pre-release: Only check for a release candidate - -Set as the latest release: Leave checked, unless releasing a patch release for an older version (e.g. releasing 1.4.5 after 1.5 has been released) - -Upload wheels to PyPI: - -twine upload pandas/dist/pandas-*.{whl,tar.gz} --skip-existing -The GitHub release will after some hours trigger an automated conda-forge PR. (If you don’t want to wait, you can open an issue titled @conda-forge-admin, please update version to trigger the bot.) Merge it once the CI is green, and it will generate the conda-forge packages. - -In case a manual PR needs to be done, the version, sha256 and build fields are the ones that usually need to be changed. If anything else in the recipe has changed since the last release, those changes should be available in ci/meta.yaml. - -Post-Release -Update symlinks to stable documentation by logging in to our web server, and editing /var/www/html/pandas-docs/stable to point to version/ for major and minor releases, or version/ to version/ for patch releases. The exact instructions are (replace the example version numbers by the appropriate ones for the version you are releasing): - -Log in to the server and use the correct user. - -cd /var/www/html/pandas-docs/ - -ln -sfn version/2.1 stable (for a major or minor release) - -ln -sfn version/2.0.3 version/2.0 (for a patch release) - -If releasing a major or minor release, open a PR in our source code to update web/pandas/versions.json, to have the desired versions in the documentation dropdown menu. - -Close the milestone and the issue for the released version. - -Create a new issue for the next release, with the estimated date of release. - -Open a PR with the placeholder for the release notes of the next version. See for example the PR for 1.5.3. 
Note that the template to use depends on whether it is a major, minor or patch release. - -Announce the new release in the official channels (use previous announcements for reference): - -The pandas-dev and pydata mailing lists - -X, Mastodon, Telegram and LinkedIn - -Update this release instructions to fix anything incorrect and to update about any change since the last release. \ No newline at end of file diff --git a/.cursor/rules/policies.mdc b/.cursor/rules/policies.mdc deleted file mode 100644 index 9c72057d7117d..0000000000000 --- a/.cursor/rules/policies.mdc +++ /dev/null @@ -1,34 +0,0 @@ ---- -alwaysApply: true ---- -Policies -Version policy -pandas uses a loose variant of semantic versioning (SemVer) to govern deprecations, API compatibility, and version numbering. - -A pandas release number is made up of MAJOR.MINOR.PATCH. - -API breaking changes should only occur in major releases. These changes will be documented, with clear guidance on what is changing, why it’s changing, and how to migrate existing code to the new behavior. - -Whenever possible, a deprecation path will be provided rather than an outright breaking change. - -pandas will introduce deprecations in minor releases. These deprecations will preserve the existing behavior while emitting a warning that provide guidance on: - -How to achieve similar behavior if an alternative is available - -The pandas version in which the deprecation will be enforced. - -We will not introduce new deprecations in patch releases. - -Deprecations will only be enforced in major releases. For example, if a behavior is deprecated in pandas 1.2.0, it will continue to work, with a warning, for all releases in the 1.x series. The behavior will change and the deprecation removed in the next major release (2.0.0). - -Note - -pandas will sometimes make behavior changing bug fixes, as part of minor or patch releases. Whether or not a change is a bug fix or an API-breaking change is a judgement call. We’ll do our best, and we invite you to participate in development discussion on the issue tracker or mailing list. - -These policies do not apply to features marked as experimental in the documentation. pandas may change the behavior of experimental features at any time. - -Python support -pandas mirrors the SPEC 0 guideline for Python support. - -Security policy -To report a security vulnerability to pandas, please go to pandas-dev/pandas and see the instructions there. \ No newline at end of file From 2b00b8f31024030ef09eb1c40cb85e0f205507d1 Mon Sep 17 00:00:00 2001 From: siryoos Date: Tue, 15 Jul 2025 09:35:25 +0330 Subject: [PATCH 5/6] DOC: Simplify footer by removing custom template and integrating sponsor information directly into copyright - Deleted the custom "pandas_footer.html" template. - Updated copyright statement in conf.py to include sponsor information. - Removed related CSS styles for the footer sponsors. --- doc/_templates/pandas_footer.html | 3 --- doc/source/_static/css/pandas.css | 4 ---- doc/source/conf.py | 6 +++--- 3 files changed, 3 insertions(+), 10 deletions(-) delete mode 100644 doc/_templates/pandas_footer.html diff --git a/doc/_templates/pandas_footer.html b/doc/_templates/pandas_footer.html deleted file mode 100644 index 8d781d909a2fe..0000000000000 --- a/doc/_templates/pandas_footer.html +++ /dev/null @@ -1,3 +0,0 @@ - - via NumFOCUS, Inc. Hosted by OVHcloud. 
- diff --git a/doc/source/_static/css/pandas.css b/doc/source/_static/css/pandas.css index 25b8e36ed214d..1c07679d7bdc9 100644 --- a/doc/source/_static/css/pandas.css +++ b/doc/source/_static/css/pandas.css @@ -57,7 +57,3 @@ table { align-items: center; gap: 0.5rem; } - -.footer-sponsors { - display: inline-block; -} diff --git a/doc/source/conf.py b/doc/source/conf.py index 8c06026df469e..95195e11d9c47 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -163,8 +163,8 @@ # General information about the project. project = "pandas" -# We have our custom "pandas_footer.html" template, using copyright for the current year -copyright = f"{datetime.now().year}, pandas" +# Updated copyright to include sponsor information, removing the need for custom template +copyright = f'{datetime.now().year}, pandas via NumFOCUS, Inc. Hosted by OVHcloud.' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -242,7 +242,7 @@ html_theme_options = { "external_links": [], - "footer_start": ["copyright", "pandas_footer", "sphinx-version"], + "footer_start": ["copyright", "sphinx-version"], "github_url": "https://github.com/pandas-dev/pandas", "analytics": { "plausible_analytics_domain": "pandas.pydata.org", From 139781a3ff40120a29e9f82b04464712905c4082 Mon Sep 17 00:00:00 2001 From: siryoos Date: Tue, 15 Jul 2025 11:47:22 +0330 Subject: [PATCH 6/6] DOC: Refine copyright statement in conf.py to enhance clarity and include sponsor information directly --- doc/source/conf.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 95195e11d9c47..93a6869d91a32 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -163,8 +163,12 @@ # General information about the project. project = "pandas" -# Updated copyright to include sponsor information, removing the need for custom template -copyright = f'{datetime.now().year}, pandas via NumFOCUS, Inc. Hosted by OVHcloud.' +# Updated copyright to include sponsor information, removing custom template +copyright = ( + f'{datetime.now().year}, pandas via ' + 'NumFOCUS, Inc. ' + 'Hosted by OVHcloud.' +) # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the