Add documentation to `pandas-vet` rules (#5629)

## Summary Completes all the documentation for the `pandas-vet` rules, except for `pandas-use-of-dot-read-table` as I am unclear of the rule's motivation (see #5628). Related to #2646. ## Test Plan `python scripts/check_docs_formatted.py && mkdocs serve`
2023-07-10 16:45:36 +01:00 · 2023-07-10 16:45:36 +01:00 · 4cac75bc27
parent ed872145fe
commit 4cac75bc27
5 changed files with 331 additions and 12 deletions
--- a/crates/ruff/src/rules/pandas_vet/rules/assignment_to_df.rs
+++ b/crates/ruff/src/rules/pandas_vet/rules/assignment_to_df.rs
@ -3,6 +3,29 @@ use rustpython_parser::ast::{self, Expr, Ranged};
 use ruff_diagnostics::{Diagnostic, Violation};
 use ruff_macros::{derive_message_formats, violation};

+/// ## What it does
+/// Checks for assignments to the variable `df`.
+///
+/// ## Why is this bad?
+/// Although `df` is a common variable name for a Pandas DataFrame, it's not a
+/// great variable name for production code, as it's non-descriptive and
+/// prone to name conflicts.
+///
+/// Instead, use a more descriptive variable name.
+///
+/// ## Example
+/// ```python
+/// import pandas as pd
+///
+/// df = pd.read_csv("animals.csv")
+/// ```
+///
+/// Use instead:
+/// ```python
+/// import pandas as pd
+///
+/// animals = pd.read_csv("animals.csv")
+/// ```
 #[violation]
 pub struct PandasDfVariableName;

--- a/crates/ruff/src/rules/pandas_vet/rules/attr.rs
+++ b/crates/ruff/src/rules/pandas_vet/rules/attr.rs
@ -8,6 +8,32 @@ use crate::checkers::ast::Checker;
 use crate::registry::Rule;
 use crate::rules::pandas_vet::helpers::{test_expression, Resolution};

+/// ## What it does
+/// Checks for uses of `.values` on Pandas Series and Index objects.
+///
+/// ## Why is this bad?
+/// The `.values` attribute is ambiguous as it's return type is unclear. As
+/// such, it is no longer recommended by the Pandas documentation.
+///
+/// Instead, use `.to_numpy()` to return a NumPy array, or `.array` to return a
+/// Pandas `ExtensionArray`.
+///
+/// ## Example
+/// ```python
+/// import pandas as pd
+///
+/// animals = pd.read_csv("animals.csv").values  # Ambiguous.
+/// ```
+///
+/// Use instead:
+/// ```python
+/// import pandas as pd
+///
+/// animals = pd.read_csv("animals.csv").to_numpy()  # Explicit.
+/// ```
+///
+/// ## References
+/// - [Pandas documentation: Accessing the values in a Series or Index](https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.24.0.html#accessing-the-values-in-a-series-or-index)
 #[violation]
 pub struct PandasUseOfDotValues;

@ -19,9 +45,10 @@ impl Violation for PandasUseOfDotValues {
 }

 pub(crate) fn attr(checker: &mut Checker, attr: &str, value: &Expr, attr_expr: &Expr) {
-    let rules = &checker.settings.rules;
    let violation: DiagnosticKind = match attr {
-        "values" if rules.enabled(Rule::PandasUseOfDotValues) => PandasUseOfDotValues.into(),
+        "values" if checker.settings.rules.enabled(Rule::PandasUseOfDotValues) => {
+            PandasUseOfDotValues.into()
+        }
        _ => return,
    };

--- a/crates/ruff/src/rules/pandas_vet/rules/call.rs
+++ b/crates/ruff/src/rules/pandas_vet/rules/call.rs
@ -8,6 +8,36 @@ use crate::checkers::ast::Checker;
 use crate::registry::Rule;
 use crate::rules::pandas_vet::helpers::{test_expression, Resolution};

+/// ## What it does
+/// Checks for uses of `.isnull` on Pandas objects.
+///
+/// ## Why is this bad?
+/// In the Pandas API, `.isna` and `.isnull` are equivalent. For consistency,
+/// prefer `.isna` over `.isnull`.
+///
+/// As a name, `.isna` more accurately reflects the behavior of the method,
+/// since these methods check for `NaN` and `NaT` values in addition to `None`
+/// values.
+///
+/// ## Example
+/// ```python
+/// import pandas as pd
+///
+/// animals_df = pd.read_csv("animals.csv")
+/// pd.isnull(animals_df)
+/// ```
+///
+/// Use instead:
+/// ```python
+/// import pandas as pd
+///
+/// animals_df = pd.read_csv("animals.csv")
+/// pd.isna(animals_df)
+/// ```
+///
+/// ## References
+/// - [Pandas documentation: `isnull`](https://pandas.pydata.org/docs/reference/api/pandas.isnull.html#pandas.isnull)
+/// - [Pandas documentation: `isna`](https://pandas.pydata.org/docs/reference/api/pandas.isna.html#pandas.isna)
 #[violation]
 pub struct PandasUseOfDotIsNull;

@ -18,6 +48,36 @@ impl Violation for PandasUseOfDotIsNull {
    }
 }

+/// ## What it does
+/// Checks for uses of `.notnull` on Pandas objects.
+///
+/// ## Why is this bad?
+/// In the Pandas API, `.notna` and `.notnull` are equivalent. For consistency,
+/// prefer `.notna` over `.notnull`.
+///
+/// As a name, `.notna` more accurately reflects the behavior of the method,
+/// since these methods check for `NaN` and `NaT` values in addition to `None`
+/// values.
+///
+/// ## Example
+/// ```python
+/// import pandas as pd
+///
+/// animals_df = pd.read_csv("animals.csv")
+/// pd.notnull(animals_df)
+/// ```
+///
+/// Use instead:
+/// ```python
+/// import pandas as pd
+///
+/// animals_df = pd.read_csv("animals.csv")
+/// pd.notna(animals_df)
+/// ```
+///
+/// ## References
+/// - [Pandas documentation: `notnull`](https://pandas.pydata.org/docs/reference/api/pandas.notnull.html#pandas.notnull)
+/// - [Pandas documentation: `notna`](https://pandas.pydata.org/docs/reference/api/pandas.notna.html#pandas.notna)
 #[violation]
 pub struct PandasUseOfDotNotNull;

@ -28,6 +88,32 @@ impl Violation for PandasUseOfDotNotNull {
    }
 }

+/// ## What it does
+/// Checks for uses of `.pivot` or `.unstack` on Pandas objects.
+///
+/// ## Why is this bad?
+/// Prefer `.pivot_table` to `.pivot` or `.unstack`. `.pivot_table` is more general
+/// and can be used to implement the same behavior as `.pivot` and `.unstack`.
+///
+/// ## Example
+/// ```python
+/// import pandas as pd
+///
+/// df = pd.read_csv("cities.csv")
+/// df.pivot(index="city", columns="year", values="population")
+/// ```
+///
+/// Use instead:
+/// ```python
+/// import pandas as pd
+///
+/// df = pd.read_csv("cities.csv")
+/// df.pivot_table(index="city", columns="year", values="population")
+/// ```
+///
+/// ## References
+/// - [Pandas documentation: Reshaping and pivot tables](https://pandas.pydata.org/docs/user_guide/reshaping.html)
+/// - [Pandas documentation: `pivot_table`](https://pandas.pydata.org/docs/reference/api/pandas.pivot_table.html#pandas.pivot_table)
 #[violation]
 pub struct PandasUseOfDotPivotOrUnstack;

@ -40,6 +126,8 @@ impl Violation for PandasUseOfDotPivotOrUnstack {
    }
 }

+// TODO(tjkuson): Add documentation for this rule once clarified.
+// https://github.com/astral-sh/ruff/issues/5628
 #[violation]
 pub struct PandasUseOfDotReadTable;

@ -50,6 +138,32 @@ impl Violation for PandasUseOfDotReadTable {
    }
 }

+/// ## What it does
+/// Checks for uses of `.stack` on Pandas objects.
+///
+/// ## Why is this bad?
+/// Prefer `.melt` to `.stack`, which has the same functionality but with
+/// support for direct column renaming and no dependence on `MultiIndex`.
+///
+/// ## Example
+/// ```python
+/// import pandas as pd
+///
+/// cities_df = pd.read_csv("cities.csv")
+/// cities_df.set_index("city").stack()
+/// ```
+///
+/// Use instead:
+/// ```python
+/// import pandas as pd
+///
+/// cities_df = pd.read_csv("cities.csv")
+/// cities_df.melt(id_vars="city")
+/// ```
+///
+/// ## References
+/// - [Pandas documentation: `melt`](https://pandas.pydata.org/docs/reference/api/pandas.melt.html)
+/// - [Pandas documentation: `stack`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.stack.html)
 #[violation]
 pub struct PandasUseOfDotStack;

@ -61,20 +175,35 @@ impl Violation for PandasUseOfDotStack {
 }

 pub(crate) fn call(checker: &mut Checker, func: &Expr) {
-    let rules = &checker.settings.rules;
    let Expr::Attribute(ast::ExprAttribute { value, attr, .. }) = func else {
        return;
    };
    let violation: DiagnosticKind = match attr.as_str() {
-        "isnull" if rules.enabled(Rule::PandasUseOfDotIsNull) => PandasUseOfDotIsNull.into(),
-        "notnull" if rules.enabled(Rule::PandasUseOfDotNotNull) => PandasUseOfDotNotNull.into(),
-        "pivot" | "unstack" if rules.enabled(Rule::PandasUseOfDotPivotOrUnstack) => {
+        "isnull" if checker.settings.rules.enabled(Rule::PandasUseOfDotIsNull) => {
+            PandasUseOfDotIsNull.into()
+        }
+        "notnull" if checker.settings.rules.enabled(Rule::PandasUseOfDotNotNull) => {
+            PandasUseOfDotNotNull.into()
+        }
+        "pivot" | "unstack"
+            if checker
+                .settings
+                .rules
+                .enabled(Rule::PandasUseOfDotPivotOrUnstack) =>
+        {
            PandasUseOfDotPivotOrUnstack.into()
        }
-        "read_table" if rules.enabled(Rule::PandasUseOfDotReadTable) => {
+        "read_table"
+            if checker
+                .settings
+                .rules
+                .enabled(Rule::PandasUseOfDotReadTable) =>
+        {
            PandasUseOfDotReadTable.into()
        }
-        "stack" if rules.enabled(Rule::PandasUseOfDotStack) => PandasUseOfDotStack.into(),
+        "stack" if checker.settings.rules.enabled(Rule::PandasUseOfDotStack) => {
+            PandasUseOfDotStack.into()
+        }
        _ => return,
    };

--- a/crates/ruff/src/rules/pandas_vet/rules/pd_merge.rs
+++ b/crates/ruff/src/rules/pandas_vet/rules/pd_merge.rs
@ -4,6 +4,42 @@ use crate::checkers::ast::Checker;
 use ruff_diagnostics::{Diagnostic, Violation};
 use ruff_macros::{derive_message_formats, violation};

+/// ## What it does
+/// Checks for uses of `pd.merge` on Pandas objects.
+///
+/// ## Why is this bad?
+/// In Pandas, the `.merge` method (exposed on, e.g., DataFrame objects) and
+/// the `pd.merge` function (exposed on the Pandas module) are equivalent.
+///
+/// For consistency, prefer calling `.merge` on an object over calling
+/// `pd.merge` on the Pandas module, as the former is more idiomatic.
+///
+/// Further, `pd.merge` is not a method, but a function, which prohibits it
+/// from being used in method chains, a common pattern in Pandas code.
+///
+/// ## Example
+/// ```python
+/// import pandas as pd
+///
+/// cats_df = pd.read_csv("cats.csv")
+/// dogs_df = pd.read_csv("dogs.csv")
+/// rabbits_df = pd.read_csv("rabbits.csv")
+/// pets_df = pd.merge(pd.merge(cats_df, dogs_df), rabbits_df)  # Hard to read.
+/// ```
+///
+/// Use instead:
+/// ```python
+/// import pandas as pd
+///
+/// cats_df = pd.read_csv("cats.csv")
+/// dogs_df = pd.read_csv("dogs.csv")
+/// rabbits_df = pd.read_csv("rabbits.csv")
+/// pets_df = cats_df.merge(dogs_df).merge(rabbits_df)
+/// ```
+///
+/// ## References
+/// - [Pandas documentation: `merge`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html#pandas.DataFrame.merge)
+/// - [Pandas documentation: `pd.merge`](https://pandas.pydata.org/docs/reference/api/pandas.merge.html#pandas.merge)
 #[violation]
 pub struct PandasUseOfPdMerge;

--- a/crates/ruff/src/rules/pandas_vet/rules/subscript.rs
+++ b/crates/ruff/src/rules/pandas_vet/rules/subscript.rs
@ -8,6 +8,36 @@ use crate::checkers::ast::Checker;
 use crate::registry::Rule;
 use crate::rules::pandas_vet::helpers::{test_expression, Resolution};

+/// ## What it does
+/// Checks for uses of `.ix` on Pandas objects.
+///
+/// ## Why is this bad?
+/// The `.ix` method is deprecated as its behavior is ambiguous. Specifically,
+/// it's often unclear whether `.ix` is indexing by label or by ordinal position.
+///
+/// Instead, prefer the `.loc` method for label-based indexing, and `.iloc` for
+/// ordinal indexing.
+///
+/// ## Example
+/// ```python
+/// import pandas as pd
+///
+/// students_df = pd.read_csv("students.csv")
+/// students_df.ix[0]  # 0th row or row with label 0?
+/// ```
+///
+/// Use instead:
+/// ```python
+/// import pandas as pd
+///
+/// students_df = pd.read_csv("students.csv")
+/// students_df.iloc[0]  # 0th row.
+/// ```
+///
+/// ## References
+/// - [Pandas release notes: Deprecate `.ix`](https://pandas.pydata.org/pandas-docs/version/0.20/whatsnew.html#deprecate-ix)
+/// - [Pandas documentation: `loc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html)
+/// - [Pandas documentation: `iloc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html)
 #[violation]
 pub struct PandasUseOfDotIx;

@ -18,6 +48,38 @@ impl Violation for PandasUseOfDotIx {
    }
 }

+/// ## What it does
+/// Checks for uses of `.at` on Pandas objects.
+///
+/// ## Why is this bad?
+/// The `.at` method selects a single value from a DataFrame or Series based on
+/// a label index, and is slightly faster than using `.loc`. However, `.loc` is
+/// more idiomatic and versatile, as it can be used to select multiple values at
+/// once.
+///
+/// If performance is an important consideration, convert the object to a NumPy
+/// array, which will provide a much greater performance boost than using `.at`
+/// over `.loc`.
+///
+/// ## Example
+/// ```python
+/// import pandas as pd
+///
+/// students_df = pd.read_csv("students.csv")
+/// students_df.at["Maria"]
+/// ```
+///
+/// Use instead:
+/// ```python
+/// import pandas as pd
+///
+/// students_df = pd.read_csv("students.csv")
+/// students_df.loc["Maria"]
+/// ```
+///
+/// ## References
+/// - [Pandas documentation: `loc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html)
+/// - [Pandas documentation: `at`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.at.html)
 #[violation]
 pub struct PandasUseOfDotAt;

@ -28,6 +90,47 @@ impl Violation for PandasUseOfDotAt {
    }
 }

+/// ## What it does
+/// Checks for uses of `.iat` on Pandas objects.
+///
+/// ## Why is this bad?
+/// The `.iat` method selects a single value from a DataFrame or Series based
+/// on an ordinal index, and is slightly faster than using `.iloc`. However,
+/// `.iloc` is more idiomatic and versatile, as it can be used to select
+/// multiple values at once.
+///
+/// If performance is an important consideration, convert the object to a NumPy
+/// array, which will provide a much greater performance boost than using `.iat`
+/// over `.iloc`.
+///
+/// ## Example
+/// ```python
+/// import pandas as pd
+///
+/// students_df = pd.read_csv("students.csv")
+/// students_df.iat[0]
+/// ```
+///
+/// Use instead:
+/// ```python
+/// import pandas as pd
+///
+/// students_df = pd.read_csv("students.csv")
+/// students_df.iloc[0]
+/// ```
+///
+/// Or, using NumPy:
+/// ```python
+/// import numpy as np
+/// import pandas as pd
+///
+/// students_df = pd.read_csv("students.csv")
+/// students_df.to_numpy()[0]
+/// ```
+///
+/// ## References
+/// - [Pandas documentation: `iloc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html)
+/// - [Pandas documentation: `iat`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iat.html)
 #[violation]
 pub struct PandasUseOfDotIat;

@ -43,11 +146,12 @@ pub(crate) fn subscript(checker: &mut Checker, value: &Expr, expr: &Expr) {
        return;
    };

-    let rules = &checker.settings.rules;
    let violation: DiagnosticKind = match attr.as_str() {
-        "ix" if rules.enabled(Rule::PandasUseOfDotIx) => PandasUseOfDotIx.into(),
-        "at" if rules.enabled(Rule::PandasUseOfDotAt) => PandasUseOfDotAt.into(),
-        "iat" if rules.enabled(Rule::PandasUseOfDotIat) => PandasUseOfDotIat.into(),
+        "ix" if checker.settings.rules.enabled(Rule::PandasUseOfDotIx) => PandasUseOfDotIx.into(),
+        "at" if checker.settings.rules.enabled(Rule::PandasUseOfDotAt) => PandasUseOfDotAt.into(),
+        "iat" if checker.settings.rules.enabled(Rule::PandasUseOfDotIat) => {
+            PandasUseOfDotIat.into()
+        }
        _ => return,
    };