Add documentation to `pandas-vet` rules (#5629)

## Summary

Completes all the documentation for the `pandas-vet` rules, except for
`pandas-use-of-dot-read-table` as I am unclear of the rule's motivation
(see #5628).

Related to #2646.

## Test Plan

`python scripts/check_docs_formatted.py && mkdocs serve`
This commit is contained in:
Tom Kuson 2023-07-10 16:45:36 +01:00 committed by GitHub
parent ed872145fe
commit 4cac75bc27
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 331 additions and 12 deletions

View File

@ -3,6 +3,29 @@ use rustpython_parser::ast::{self, Expr, Ranged};
use ruff_diagnostics::{Diagnostic, Violation}; use ruff_diagnostics::{Diagnostic, Violation};
use ruff_macros::{derive_message_formats, violation}; use ruff_macros::{derive_message_formats, violation};
/// ## What it does
/// Checks for assignments to the variable `df`.
///
/// ## Why is this bad?
/// Although `df` is a common variable name for a Pandas DataFrame, it's not a
/// great variable name for production code, as it's non-descriptive and
/// prone to name conflicts.
///
/// Instead, use a more descriptive variable name.
///
/// ## Example
/// ```python
/// import pandas as pd
///
/// df = pd.read_csv("animals.csv")
/// ```
///
/// Use instead:
/// ```python
/// import pandas as pd
///
/// animals = pd.read_csv("animals.csv")
/// ```
#[violation] #[violation]
pub struct PandasDfVariableName; pub struct PandasDfVariableName;

View File

@ -8,6 +8,32 @@ use crate::checkers::ast::Checker;
use crate::registry::Rule; use crate::registry::Rule;
use crate::rules::pandas_vet::helpers::{test_expression, Resolution}; use crate::rules::pandas_vet::helpers::{test_expression, Resolution};
/// ## What it does
/// Checks for uses of `.values` on Pandas Series and Index objects.
///
/// ## Why is this bad?
/// The `.values` attribute is ambiguous as it's return type is unclear. As
/// such, it is no longer recommended by the Pandas documentation.
///
/// Instead, use `.to_numpy()` to return a NumPy array, or `.array` to return a
/// Pandas `ExtensionArray`.
///
/// ## Example
/// ```python
/// import pandas as pd
///
/// animals = pd.read_csv("animals.csv").values # Ambiguous.
/// ```
///
/// Use instead:
/// ```python
/// import pandas as pd
///
/// animals = pd.read_csv("animals.csv").to_numpy() # Explicit.
/// ```
///
/// ## References
/// - [Pandas documentation: Accessing the values in a Series or Index](https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.24.0.html#accessing-the-values-in-a-series-or-index)
#[violation] #[violation]
pub struct PandasUseOfDotValues; pub struct PandasUseOfDotValues;
@ -19,9 +45,10 @@ impl Violation for PandasUseOfDotValues {
} }
pub(crate) fn attr(checker: &mut Checker, attr: &str, value: &Expr, attr_expr: &Expr) { pub(crate) fn attr(checker: &mut Checker, attr: &str, value: &Expr, attr_expr: &Expr) {
let rules = &checker.settings.rules;
let violation: DiagnosticKind = match attr { let violation: DiagnosticKind = match attr {
"values" if rules.enabled(Rule::PandasUseOfDotValues) => PandasUseOfDotValues.into(), "values" if checker.settings.rules.enabled(Rule::PandasUseOfDotValues) => {
PandasUseOfDotValues.into()
}
_ => return, _ => return,
}; };

View File

@ -8,6 +8,36 @@ use crate::checkers::ast::Checker;
use crate::registry::Rule; use crate::registry::Rule;
use crate::rules::pandas_vet::helpers::{test_expression, Resolution}; use crate::rules::pandas_vet::helpers::{test_expression, Resolution};
/// ## What it does
/// Checks for uses of `.isnull` on Pandas objects.
///
/// ## Why is this bad?
/// In the Pandas API, `.isna` and `.isnull` are equivalent. For consistency,
/// prefer `.isna` over `.isnull`.
///
/// As a name, `.isna` more accurately reflects the behavior of the method,
/// since these methods check for `NaN` and `NaT` values in addition to `None`
/// values.
///
/// ## Example
/// ```python
/// import pandas as pd
///
/// animals_df = pd.read_csv("animals.csv")
/// pd.isnull(animals_df)
/// ```
///
/// Use instead:
/// ```python
/// import pandas as pd
///
/// animals_df = pd.read_csv("animals.csv")
/// pd.isna(animals_df)
/// ```
///
/// ## References
/// - [Pandas documentation: `isnull`](https://pandas.pydata.org/docs/reference/api/pandas.isnull.html#pandas.isnull)
/// - [Pandas documentation: `isna`](https://pandas.pydata.org/docs/reference/api/pandas.isna.html#pandas.isna)
#[violation] #[violation]
pub struct PandasUseOfDotIsNull; pub struct PandasUseOfDotIsNull;
@ -18,6 +48,36 @@ impl Violation for PandasUseOfDotIsNull {
} }
} }
/// ## What it does
/// Checks for uses of `.notnull` on Pandas objects.
///
/// ## Why is this bad?
/// In the Pandas API, `.notna` and `.notnull` are equivalent. For consistency,
/// prefer `.notna` over `.notnull`.
///
/// As a name, `.notna` more accurately reflects the behavior of the method,
/// since these methods check for `NaN` and `NaT` values in addition to `None`
/// values.
///
/// ## Example
/// ```python
/// import pandas as pd
///
/// animals_df = pd.read_csv("animals.csv")
/// pd.notnull(animals_df)
/// ```
///
/// Use instead:
/// ```python
/// import pandas as pd
///
/// animals_df = pd.read_csv("animals.csv")
/// pd.notna(animals_df)
/// ```
///
/// ## References
/// - [Pandas documentation: `notnull`](https://pandas.pydata.org/docs/reference/api/pandas.notnull.html#pandas.notnull)
/// - [Pandas documentation: `notna`](https://pandas.pydata.org/docs/reference/api/pandas.notna.html#pandas.notna)
#[violation] #[violation]
pub struct PandasUseOfDotNotNull; pub struct PandasUseOfDotNotNull;
@ -28,6 +88,32 @@ impl Violation for PandasUseOfDotNotNull {
} }
} }
/// ## What it does
/// Checks for uses of `.pivot` or `.unstack` on Pandas objects.
///
/// ## Why is this bad?
/// Prefer `.pivot_table` to `.pivot` or `.unstack`. `.pivot_table` is more general
/// and can be used to implement the same behavior as `.pivot` and `.unstack`.
///
/// ## Example
/// ```python
/// import pandas as pd
///
/// df = pd.read_csv("cities.csv")
/// df.pivot(index="city", columns="year", values="population")
/// ```
///
/// Use instead:
/// ```python
/// import pandas as pd
///
/// df = pd.read_csv("cities.csv")
/// df.pivot_table(index="city", columns="year", values="population")
/// ```
///
/// ## References
/// - [Pandas documentation: Reshaping and pivot tables](https://pandas.pydata.org/docs/user_guide/reshaping.html)
/// - [Pandas documentation: `pivot_table`](https://pandas.pydata.org/docs/reference/api/pandas.pivot_table.html#pandas.pivot_table)
#[violation] #[violation]
pub struct PandasUseOfDotPivotOrUnstack; pub struct PandasUseOfDotPivotOrUnstack;
@ -40,6 +126,8 @@ impl Violation for PandasUseOfDotPivotOrUnstack {
} }
} }
// TODO(tjkuson): Add documentation for this rule once clarified.
// https://github.com/astral-sh/ruff/issues/5628
#[violation] #[violation]
pub struct PandasUseOfDotReadTable; pub struct PandasUseOfDotReadTable;
@ -50,6 +138,32 @@ impl Violation for PandasUseOfDotReadTable {
} }
} }
/// ## What it does
/// Checks for uses of `.stack` on Pandas objects.
///
/// ## Why is this bad?
/// Prefer `.melt` to `.stack`, which has the same functionality but with
/// support for direct column renaming and no dependence on `MultiIndex`.
///
/// ## Example
/// ```python
/// import pandas as pd
///
/// cities_df = pd.read_csv("cities.csv")
/// cities_df.set_index("city").stack()
/// ```
///
/// Use instead:
/// ```python
/// import pandas as pd
///
/// cities_df = pd.read_csv("cities.csv")
/// cities_df.melt(id_vars="city")
/// ```
///
/// ## References
/// - [Pandas documentation: `melt`](https://pandas.pydata.org/docs/reference/api/pandas.melt.html)
/// - [Pandas documentation: `stack`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.stack.html)
#[violation] #[violation]
pub struct PandasUseOfDotStack; pub struct PandasUseOfDotStack;
@ -61,20 +175,35 @@ impl Violation for PandasUseOfDotStack {
} }
pub(crate) fn call(checker: &mut Checker, func: &Expr) { pub(crate) fn call(checker: &mut Checker, func: &Expr) {
let rules = &checker.settings.rules;
let Expr::Attribute(ast::ExprAttribute { value, attr, .. }) = func else { let Expr::Attribute(ast::ExprAttribute { value, attr, .. }) = func else {
return; return;
}; };
let violation: DiagnosticKind = match attr.as_str() { let violation: DiagnosticKind = match attr.as_str() {
"isnull" if rules.enabled(Rule::PandasUseOfDotIsNull) => PandasUseOfDotIsNull.into(), "isnull" if checker.settings.rules.enabled(Rule::PandasUseOfDotIsNull) => {
"notnull" if rules.enabled(Rule::PandasUseOfDotNotNull) => PandasUseOfDotNotNull.into(), PandasUseOfDotIsNull.into()
"pivot" | "unstack" if rules.enabled(Rule::PandasUseOfDotPivotOrUnstack) => { }
"notnull" if checker.settings.rules.enabled(Rule::PandasUseOfDotNotNull) => {
PandasUseOfDotNotNull.into()
}
"pivot" | "unstack"
if checker
.settings
.rules
.enabled(Rule::PandasUseOfDotPivotOrUnstack) =>
{
PandasUseOfDotPivotOrUnstack.into() PandasUseOfDotPivotOrUnstack.into()
} }
"read_table" if rules.enabled(Rule::PandasUseOfDotReadTable) => { "read_table"
if checker
.settings
.rules
.enabled(Rule::PandasUseOfDotReadTable) =>
{
PandasUseOfDotReadTable.into() PandasUseOfDotReadTable.into()
} }
"stack" if rules.enabled(Rule::PandasUseOfDotStack) => PandasUseOfDotStack.into(), "stack" if checker.settings.rules.enabled(Rule::PandasUseOfDotStack) => {
PandasUseOfDotStack.into()
}
_ => return, _ => return,
}; };

View File

@ -4,6 +4,42 @@ use crate::checkers::ast::Checker;
use ruff_diagnostics::{Diagnostic, Violation}; use ruff_diagnostics::{Diagnostic, Violation};
use ruff_macros::{derive_message_formats, violation}; use ruff_macros::{derive_message_formats, violation};
/// ## What it does
/// Checks for uses of `pd.merge` on Pandas objects.
///
/// ## Why is this bad?
/// In Pandas, the `.merge` method (exposed on, e.g., DataFrame objects) and
/// the `pd.merge` function (exposed on the Pandas module) are equivalent.
///
/// For consistency, prefer calling `.merge` on an object over calling
/// `pd.merge` on the Pandas module, as the former is more idiomatic.
///
/// Further, `pd.merge` is not a method, but a function, which prohibits it
/// from being used in method chains, a common pattern in Pandas code.
///
/// ## Example
/// ```python
/// import pandas as pd
///
/// cats_df = pd.read_csv("cats.csv")
/// dogs_df = pd.read_csv("dogs.csv")
/// rabbits_df = pd.read_csv("rabbits.csv")
/// pets_df = pd.merge(pd.merge(cats_df, dogs_df), rabbits_df) # Hard to read.
/// ```
///
/// Use instead:
/// ```python
/// import pandas as pd
///
/// cats_df = pd.read_csv("cats.csv")
/// dogs_df = pd.read_csv("dogs.csv")
/// rabbits_df = pd.read_csv("rabbits.csv")
/// pets_df = cats_df.merge(dogs_df).merge(rabbits_df)
/// ```
///
/// ## References
/// - [Pandas documentation: `merge`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html#pandas.DataFrame.merge)
/// - [Pandas documentation: `pd.merge`](https://pandas.pydata.org/docs/reference/api/pandas.merge.html#pandas.merge)
#[violation] #[violation]
pub struct PandasUseOfPdMerge; pub struct PandasUseOfPdMerge;

View File

@ -8,6 +8,36 @@ use crate::checkers::ast::Checker;
use crate::registry::Rule; use crate::registry::Rule;
use crate::rules::pandas_vet::helpers::{test_expression, Resolution}; use crate::rules::pandas_vet::helpers::{test_expression, Resolution};
/// ## What it does
/// Checks for uses of `.ix` on Pandas objects.
///
/// ## Why is this bad?
/// The `.ix` method is deprecated as its behavior is ambiguous. Specifically,
/// it's often unclear whether `.ix` is indexing by label or by ordinal position.
///
/// Instead, prefer the `.loc` method for label-based indexing, and `.iloc` for
/// ordinal indexing.
///
/// ## Example
/// ```python
/// import pandas as pd
///
/// students_df = pd.read_csv("students.csv")
/// students_df.ix[0] # 0th row or row with label 0?
/// ```
///
/// Use instead:
/// ```python
/// import pandas as pd
///
/// students_df = pd.read_csv("students.csv")
/// students_df.iloc[0] # 0th row.
/// ```
///
/// ## References
/// - [Pandas release notes: Deprecate `.ix`](https://pandas.pydata.org/pandas-docs/version/0.20/whatsnew.html#deprecate-ix)
/// - [Pandas documentation: `loc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html)
/// - [Pandas documentation: `iloc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html)
#[violation] #[violation]
pub struct PandasUseOfDotIx; pub struct PandasUseOfDotIx;
@ -18,6 +48,38 @@ impl Violation for PandasUseOfDotIx {
} }
} }
/// ## What it does
/// Checks for uses of `.at` on Pandas objects.
///
/// ## Why is this bad?
/// The `.at` method selects a single value from a DataFrame or Series based on
/// a label index, and is slightly faster than using `.loc`. However, `.loc` is
/// more idiomatic and versatile, as it can be used to select multiple values at
/// once.
///
/// If performance is an important consideration, convert the object to a NumPy
/// array, which will provide a much greater performance boost than using `.at`
/// over `.loc`.
///
/// ## Example
/// ```python
/// import pandas as pd
///
/// students_df = pd.read_csv("students.csv")
/// students_df.at["Maria"]
/// ```
///
/// Use instead:
/// ```python
/// import pandas as pd
///
/// students_df = pd.read_csv("students.csv")
/// students_df.loc["Maria"]
/// ```
///
/// ## References
/// - [Pandas documentation: `loc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html)
/// - [Pandas documentation: `at`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.at.html)
#[violation] #[violation]
pub struct PandasUseOfDotAt; pub struct PandasUseOfDotAt;
@ -28,6 +90,47 @@ impl Violation for PandasUseOfDotAt {
} }
} }
/// ## What it does
/// Checks for uses of `.iat` on Pandas objects.
///
/// ## Why is this bad?
/// The `.iat` method selects a single value from a DataFrame or Series based
/// on an ordinal index, and is slightly faster than using `.iloc`. However,
/// `.iloc` is more idiomatic and versatile, as it can be used to select
/// multiple values at once.
///
/// If performance is an important consideration, convert the object to a NumPy
/// array, which will provide a much greater performance boost than using `.iat`
/// over `.iloc`.
///
/// ## Example
/// ```python
/// import pandas as pd
///
/// students_df = pd.read_csv("students.csv")
/// students_df.iat[0]
/// ```
///
/// Use instead:
/// ```python
/// import pandas as pd
///
/// students_df = pd.read_csv("students.csv")
/// students_df.iloc[0]
/// ```
///
/// Or, using NumPy:
/// ```python
/// import numpy as np
/// import pandas as pd
///
/// students_df = pd.read_csv("students.csv")
/// students_df.to_numpy()[0]
/// ```
///
/// ## References
/// - [Pandas documentation: `iloc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html)
/// - [Pandas documentation: `iat`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iat.html)
#[violation] #[violation]
pub struct PandasUseOfDotIat; pub struct PandasUseOfDotIat;
@ -43,11 +146,12 @@ pub(crate) fn subscript(checker: &mut Checker, value: &Expr, expr: &Expr) {
return; return;
}; };
let rules = &checker.settings.rules;
let violation: DiagnosticKind = match attr.as_str() { let violation: DiagnosticKind = match attr.as_str() {
"ix" if rules.enabled(Rule::PandasUseOfDotIx) => PandasUseOfDotIx.into(), "ix" if checker.settings.rules.enabled(Rule::PandasUseOfDotIx) => PandasUseOfDotIx.into(),
"at" if rules.enabled(Rule::PandasUseOfDotAt) => PandasUseOfDotAt.into(), "at" if checker.settings.rules.enabled(Rule::PandasUseOfDotAt) => PandasUseOfDotAt.into(),
"iat" if rules.enabled(Rule::PandasUseOfDotIat) => PandasUseOfDotIat.into(), "iat" if checker.settings.rules.enabled(Rule::PandasUseOfDotIat) => {
PandasUseOfDotIat.into()
}
_ => return, _ => return,
}; };