mirror of https://github.com/astral-sh/ruff
Add documentation to `pandas-vet` rules (#5629)
## Summary Completes all the documentation for the `pandas-vet` rules, except for `pandas-use-of-dot-read-table` as I am unclear of the rule's motivation (see #5628). Related to #2646. ## Test Plan `python scripts/check_docs_formatted.py && mkdocs serve`
This commit is contained in:
parent
ed872145fe
commit
4cac75bc27
|
|
@ -3,6 +3,29 @@ use rustpython_parser::ast::{self, Expr, Ranged};
|
|||
use ruff_diagnostics::{Diagnostic, Violation};
|
||||
use ruff_macros::{derive_message_formats, violation};
|
||||
|
||||
/// ## What it does
|
||||
/// Checks for assignments to the variable `df`.
|
||||
///
|
||||
/// ## Why is this bad?
|
||||
/// Although `df` is a common variable name for a Pandas DataFrame, it's not a
|
||||
/// great variable name for production code, as it's non-descriptive and
|
||||
/// prone to name conflicts.
|
||||
///
|
||||
/// Instead, use a more descriptive variable name.
|
||||
///
|
||||
/// ## Example
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// df = pd.read_csv("animals.csv")
|
||||
/// ```
|
||||
///
|
||||
/// Use instead:
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// animals = pd.read_csv("animals.csv")
|
||||
/// ```
|
||||
#[violation]
|
||||
pub struct PandasDfVariableName;
|
||||
|
||||
|
|
|
|||
|
|
@ -8,6 +8,32 @@ use crate::checkers::ast::Checker;
|
|||
use crate::registry::Rule;
|
||||
use crate::rules::pandas_vet::helpers::{test_expression, Resolution};
|
||||
|
||||
/// ## What it does
|
||||
/// Checks for uses of `.values` on Pandas Series and Index objects.
|
||||
///
|
||||
/// ## Why is this bad?
|
||||
/// The `.values` attribute is ambiguous as it's return type is unclear. As
|
||||
/// such, it is no longer recommended by the Pandas documentation.
|
||||
///
|
||||
/// Instead, use `.to_numpy()` to return a NumPy array, or `.array` to return a
|
||||
/// Pandas `ExtensionArray`.
|
||||
///
|
||||
/// ## Example
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// animals = pd.read_csv("animals.csv").values # Ambiguous.
|
||||
/// ```
|
||||
///
|
||||
/// Use instead:
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// animals = pd.read_csv("animals.csv").to_numpy() # Explicit.
|
||||
/// ```
|
||||
///
|
||||
/// ## References
|
||||
/// - [Pandas documentation: Accessing the values in a Series or Index](https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.24.0.html#accessing-the-values-in-a-series-or-index)
|
||||
#[violation]
|
||||
pub struct PandasUseOfDotValues;
|
||||
|
||||
|
|
@ -19,9 +45,10 @@ impl Violation for PandasUseOfDotValues {
|
|||
}
|
||||
|
||||
pub(crate) fn attr(checker: &mut Checker, attr: &str, value: &Expr, attr_expr: &Expr) {
|
||||
let rules = &checker.settings.rules;
|
||||
let violation: DiagnosticKind = match attr {
|
||||
"values" if rules.enabled(Rule::PandasUseOfDotValues) => PandasUseOfDotValues.into(),
|
||||
"values" if checker.settings.rules.enabled(Rule::PandasUseOfDotValues) => {
|
||||
PandasUseOfDotValues.into()
|
||||
}
|
||||
_ => return,
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -8,6 +8,36 @@ use crate::checkers::ast::Checker;
|
|||
use crate::registry::Rule;
|
||||
use crate::rules::pandas_vet::helpers::{test_expression, Resolution};
|
||||
|
||||
/// ## What it does
|
||||
/// Checks for uses of `.isnull` on Pandas objects.
|
||||
///
|
||||
/// ## Why is this bad?
|
||||
/// In the Pandas API, `.isna` and `.isnull` are equivalent. For consistency,
|
||||
/// prefer `.isna` over `.isnull`.
|
||||
///
|
||||
/// As a name, `.isna` more accurately reflects the behavior of the method,
|
||||
/// since these methods check for `NaN` and `NaT` values in addition to `None`
|
||||
/// values.
|
||||
///
|
||||
/// ## Example
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// animals_df = pd.read_csv("animals.csv")
|
||||
/// pd.isnull(animals_df)
|
||||
/// ```
|
||||
///
|
||||
/// Use instead:
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// animals_df = pd.read_csv("animals.csv")
|
||||
/// pd.isna(animals_df)
|
||||
/// ```
|
||||
///
|
||||
/// ## References
|
||||
/// - [Pandas documentation: `isnull`](https://pandas.pydata.org/docs/reference/api/pandas.isnull.html#pandas.isnull)
|
||||
/// - [Pandas documentation: `isna`](https://pandas.pydata.org/docs/reference/api/pandas.isna.html#pandas.isna)
|
||||
#[violation]
|
||||
pub struct PandasUseOfDotIsNull;
|
||||
|
||||
|
|
@ -18,6 +48,36 @@ impl Violation for PandasUseOfDotIsNull {
|
|||
}
|
||||
}
|
||||
|
||||
/// ## What it does
|
||||
/// Checks for uses of `.notnull` on Pandas objects.
|
||||
///
|
||||
/// ## Why is this bad?
|
||||
/// In the Pandas API, `.notna` and `.notnull` are equivalent. For consistency,
|
||||
/// prefer `.notna` over `.notnull`.
|
||||
///
|
||||
/// As a name, `.notna` more accurately reflects the behavior of the method,
|
||||
/// since these methods check for `NaN` and `NaT` values in addition to `None`
|
||||
/// values.
|
||||
///
|
||||
/// ## Example
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// animals_df = pd.read_csv("animals.csv")
|
||||
/// pd.notnull(animals_df)
|
||||
/// ```
|
||||
///
|
||||
/// Use instead:
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// animals_df = pd.read_csv("animals.csv")
|
||||
/// pd.notna(animals_df)
|
||||
/// ```
|
||||
///
|
||||
/// ## References
|
||||
/// - [Pandas documentation: `notnull`](https://pandas.pydata.org/docs/reference/api/pandas.notnull.html#pandas.notnull)
|
||||
/// - [Pandas documentation: `notna`](https://pandas.pydata.org/docs/reference/api/pandas.notna.html#pandas.notna)
|
||||
#[violation]
|
||||
pub struct PandasUseOfDotNotNull;
|
||||
|
||||
|
|
@ -28,6 +88,32 @@ impl Violation for PandasUseOfDotNotNull {
|
|||
}
|
||||
}
|
||||
|
||||
/// ## What it does
|
||||
/// Checks for uses of `.pivot` or `.unstack` on Pandas objects.
|
||||
///
|
||||
/// ## Why is this bad?
|
||||
/// Prefer `.pivot_table` to `.pivot` or `.unstack`. `.pivot_table` is more general
|
||||
/// and can be used to implement the same behavior as `.pivot` and `.unstack`.
|
||||
///
|
||||
/// ## Example
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// df = pd.read_csv("cities.csv")
|
||||
/// df.pivot(index="city", columns="year", values="population")
|
||||
/// ```
|
||||
///
|
||||
/// Use instead:
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// df = pd.read_csv("cities.csv")
|
||||
/// df.pivot_table(index="city", columns="year", values="population")
|
||||
/// ```
|
||||
///
|
||||
/// ## References
|
||||
/// - [Pandas documentation: Reshaping and pivot tables](https://pandas.pydata.org/docs/user_guide/reshaping.html)
|
||||
/// - [Pandas documentation: `pivot_table`](https://pandas.pydata.org/docs/reference/api/pandas.pivot_table.html#pandas.pivot_table)
|
||||
#[violation]
|
||||
pub struct PandasUseOfDotPivotOrUnstack;
|
||||
|
||||
|
|
@ -40,6 +126,8 @@ impl Violation for PandasUseOfDotPivotOrUnstack {
|
|||
}
|
||||
}
|
||||
|
||||
// TODO(tjkuson): Add documentation for this rule once clarified.
|
||||
// https://github.com/astral-sh/ruff/issues/5628
|
||||
#[violation]
|
||||
pub struct PandasUseOfDotReadTable;
|
||||
|
||||
|
|
@ -50,6 +138,32 @@ impl Violation for PandasUseOfDotReadTable {
|
|||
}
|
||||
}
|
||||
|
||||
/// ## What it does
|
||||
/// Checks for uses of `.stack` on Pandas objects.
|
||||
///
|
||||
/// ## Why is this bad?
|
||||
/// Prefer `.melt` to `.stack`, which has the same functionality but with
|
||||
/// support for direct column renaming and no dependence on `MultiIndex`.
|
||||
///
|
||||
/// ## Example
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// cities_df = pd.read_csv("cities.csv")
|
||||
/// cities_df.set_index("city").stack()
|
||||
/// ```
|
||||
///
|
||||
/// Use instead:
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// cities_df = pd.read_csv("cities.csv")
|
||||
/// cities_df.melt(id_vars="city")
|
||||
/// ```
|
||||
///
|
||||
/// ## References
|
||||
/// - [Pandas documentation: `melt`](https://pandas.pydata.org/docs/reference/api/pandas.melt.html)
|
||||
/// - [Pandas documentation: `stack`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.stack.html)
|
||||
#[violation]
|
||||
pub struct PandasUseOfDotStack;
|
||||
|
||||
|
|
@ -61,20 +175,35 @@ impl Violation for PandasUseOfDotStack {
|
|||
}
|
||||
|
||||
pub(crate) fn call(checker: &mut Checker, func: &Expr) {
|
||||
let rules = &checker.settings.rules;
|
||||
let Expr::Attribute(ast::ExprAttribute { value, attr, .. }) = func else {
|
||||
return;
|
||||
};
|
||||
let violation: DiagnosticKind = match attr.as_str() {
|
||||
"isnull" if rules.enabled(Rule::PandasUseOfDotIsNull) => PandasUseOfDotIsNull.into(),
|
||||
"notnull" if rules.enabled(Rule::PandasUseOfDotNotNull) => PandasUseOfDotNotNull.into(),
|
||||
"pivot" | "unstack" if rules.enabled(Rule::PandasUseOfDotPivotOrUnstack) => {
|
||||
"isnull" if checker.settings.rules.enabled(Rule::PandasUseOfDotIsNull) => {
|
||||
PandasUseOfDotIsNull.into()
|
||||
}
|
||||
"notnull" if checker.settings.rules.enabled(Rule::PandasUseOfDotNotNull) => {
|
||||
PandasUseOfDotNotNull.into()
|
||||
}
|
||||
"pivot" | "unstack"
|
||||
if checker
|
||||
.settings
|
||||
.rules
|
||||
.enabled(Rule::PandasUseOfDotPivotOrUnstack) =>
|
||||
{
|
||||
PandasUseOfDotPivotOrUnstack.into()
|
||||
}
|
||||
"read_table" if rules.enabled(Rule::PandasUseOfDotReadTable) => {
|
||||
"read_table"
|
||||
if checker
|
||||
.settings
|
||||
.rules
|
||||
.enabled(Rule::PandasUseOfDotReadTable) =>
|
||||
{
|
||||
PandasUseOfDotReadTable.into()
|
||||
}
|
||||
"stack" if rules.enabled(Rule::PandasUseOfDotStack) => PandasUseOfDotStack.into(),
|
||||
"stack" if checker.settings.rules.enabled(Rule::PandasUseOfDotStack) => {
|
||||
PandasUseOfDotStack.into()
|
||||
}
|
||||
_ => return,
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -4,6 +4,42 @@ use crate::checkers::ast::Checker;
|
|||
use ruff_diagnostics::{Diagnostic, Violation};
|
||||
use ruff_macros::{derive_message_formats, violation};
|
||||
|
||||
/// ## What it does
|
||||
/// Checks for uses of `pd.merge` on Pandas objects.
|
||||
///
|
||||
/// ## Why is this bad?
|
||||
/// In Pandas, the `.merge` method (exposed on, e.g., DataFrame objects) and
|
||||
/// the `pd.merge` function (exposed on the Pandas module) are equivalent.
|
||||
///
|
||||
/// For consistency, prefer calling `.merge` on an object over calling
|
||||
/// `pd.merge` on the Pandas module, as the former is more idiomatic.
|
||||
///
|
||||
/// Further, `pd.merge` is not a method, but a function, which prohibits it
|
||||
/// from being used in method chains, a common pattern in Pandas code.
|
||||
///
|
||||
/// ## Example
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// cats_df = pd.read_csv("cats.csv")
|
||||
/// dogs_df = pd.read_csv("dogs.csv")
|
||||
/// rabbits_df = pd.read_csv("rabbits.csv")
|
||||
/// pets_df = pd.merge(pd.merge(cats_df, dogs_df), rabbits_df) # Hard to read.
|
||||
/// ```
|
||||
///
|
||||
/// Use instead:
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// cats_df = pd.read_csv("cats.csv")
|
||||
/// dogs_df = pd.read_csv("dogs.csv")
|
||||
/// rabbits_df = pd.read_csv("rabbits.csv")
|
||||
/// pets_df = cats_df.merge(dogs_df).merge(rabbits_df)
|
||||
/// ```
|
||||
///
|
||||
/// ## References
|
||||
/// - [Pandas documentation: `merge`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html#pandas.DataFrame.merge)
|
||||
/// - [Pandas documentation: `pd.merge`](https://pandas.pydata.org/docs/reference/api/pandas.merge.html#pandas.merge)
|
||||
#[violation]
|
||||
pub struct PandasUseOfPdMerge;
|
||||
|
||||
|
|
|
|||
|
|
@ -8,6 +8,36 @@ use crate::checkers::ast::Checker;
|
|||
use crate::registry::Rule;
|
||||
use crate::rules::pandas_vet::helpers::{test_expression, Resolution};
|
||||
|
||||
/// ## What it does
|
||||
/// Checks for uses of `.ix` on Pandas objects.
|
||||
///
|
||||
/// ## Why is this bad?
|
||||
/// The `.ix` method is deprecated as its behavior is ambiguous. Specifically,
|
||||
/// it's often unclear whether `.ix` is indexing by label or by ordinal position.
|
||||
///
|
||||
/// Instead, prefer the `.loc` method for label-based indexing, and `.iloc` for
|
||||
/// ordinal indexing.
|
||||
///
|
||||
/// ## Example
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// students_df = pd.read_csv("students.csv")
|
||||
/// students_df.ix[0] # 0th row or row with label 0?
|
||||
/// ```
|
||||
///
|
||||
/// Use instead:
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// students_df = pd.read_csv("students.csv")
|
||||
/// students_df.iloc[0] # 0th row.
|
||||
/// ```
|
||||
///
|
||||
/// ## References
|
||||
/// - [Pandas release notes: Deprecate `.ix`](https://pandas.pydata.org/pandas-docs/version/0.20/whatsnew.html#deprecate-ix)
|
||||
/// - [Pandas documentation: `loc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html)
|
||||
/// - [Pandas documentation: `iloc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html)
|
||||
#[violation]
|
||||
pub struct PandasUseOfDotIx;
|
||||
|
||||
|
|
@ -18,6 +48,38 @@ impl Violation for PandasUseOfDotIx {
|
|||
}
|
||||
}
|
||||
|
||||
/// ## What it does
|
||||
/// Checks for uses of `.at` on Pandas objects.
|
||||
///
|
||||
/// ## Why is this bad?
|
||||
/// The `.at` method selects a single value from a DataFrame or Series based on
|
||||
/// a label index, and is slightly faster than using `.loc`. However, `.loc` is
|
||||
/// more idiomatic and versatile, as it can be used to select multiple values at
|
||||
/// once.
|
||||
///
|
||||
/// If performance is an important consideration, convert the object to a NumPy
|
||||
/// array, which will provide a much greater performance boost than using `.at`
|
||||
/// over `.loc`.
|
||||
///
|
||||
/// ## Example
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// students_df = pd.read_csv("students.csv")
|
||||
/// students_df.at["Maria"]
|
||||
/// ```
|
||||
///
|
||||
/// Use instead:
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// students_df = pd.read_csv("students.csv")
|
||||
/// students_df.loc["Maria"]
|
||||
/// ```
|
||||
///
|
||||
/// ## References
|
||||
/// - [Pandas documentation: `loc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html)
|
||||
/// - [Pandas documentation: `at`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.at.html)
|
||||
#[violation]
|
||||
pub struct PandasUseOfDotAt;
|
||||
|
||||
|
|
@ -28,6 +90,47 @@ impl Violation for PandasUseOfDotAt {
|
|||
}
|
||||
}
|
||||
|
||||
/// ## What it does
|
||||
/// Checks for uses of `.iat` on Pandas objects.
|
||||
///
|
||||
/// ## Why is this bad?
|
||||
/// The `.iat` method selects a single value from a DataFrame or Series based
|
||||
/// on an ordinal index, and is slightly faster than using `.iloc`. However,
|
||||
/// `.iloc` is more idiomatic and versatile, as it can be used to select
|
||||
/// multiple values at once.
|
||||
///
|
||||
/// If performance is an important consideration, convert the object to a NumPy
|
||||
/// array, which will provide a much greater performance boost than using `.iat`
|
||||
/// over `.iloc`.
|
||||
///
|
||||
/// ## Example
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// students_df = pd.read_csv("students.csv")
|
||||
/// students_df.iat[0]
|
||||
/// ```
|
||||
///
|
||||
/// Use instead:
|
||||
/// ```python
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// students_df = pd.read_csv("students.csv")
|
||||
/// students_df.iloc[0]
|
||||
/// ```
|
||||
///
|
||||
/// Or, using NumPy:
|
||||
/// ```python
|
||||
/// import numpy as np
|
||||
/// import pandas as pd
|
||||
///
|
||||
/// students_df = pd.read_csv("students.csv")
|
||||
/// students_df.to_numpy()[0]
|
||||
/// ```
|
||||
///
|
||||
/// ## References
|
||||
/// - [Pandas documentation: `iloc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html)
|
||||
/// - [Pandas documentation: `iat`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iat.html)
|
||||
#[violation]
|
||||
pub struct PandasUseOfDotIat;
|
||||
|
||||
|
|
@ -43,11 +146,12 @@ pub(crate) fn subscript(checker: &mut Checker, value: &Expr, expr: &Expr) {
|
|||
return;
|
||||
};
|
||||
|
||||
let rules = &checker.settings.rules;
|
||||
let violation: DiagnosticKind = match attr.as_str() {
|
||||
"ix" if rules.enabled(Rule::PandasUseOfDotIx) => PandasUseOfDotIx.into(),
|
||||
"at" if rules.enabled(Rule::PandasUseOfDotAt) => PandasUseOfDotAt.into(),
|
||||
"iat" if rules.enabled(Rule::PandasUseOfDotIat) => PandasUseOfDotIat.into(),
|
||||
"ix" if checker.settings.rules.enabled(Rule::PandasUseOfDotIx) => PandasUseOfDotIx.into(),
|
||||
"at" if checker.settings.rules.enabled(Rule::PandasUseOfDotAt) => PandasUseOfDotAt.into(),
|
||||
"iat" if checker.settings.rules.enabled(Rule::PandasUseOfDotIat) => {
|
||||
PandasUseOfDotIat.into()
|
||||
}
|
||||
_ => return,
|
||||
};
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue