diff --git a/crates/ruff/src/rules/pandas_vet/rules/assignment_to_df.rs b/crates/ruff/src/rules/pandas_vet/rules/assignment_to_df.rs index 6473d4a7a9..ba66f2950c 100644 --- a/crates/ruff/src/rules/pandas_vet/rules/assignment_to_df.rs +++ b/crates/ruff/src/rules/pandas_vet/rules/assignment_to_df.rs @@ -3,6 +3,29 @@ use rustpython_parser::ast::{self, Expr, Ranged}; use ruff_diagnostics::{Diagnostic, Violation}; use ruff_macros::{derive_message_formats, violation}; +/// ## What it does +/// Checks for assignments to the variable `df`. +/// +/// ## Why is this bad? +/// Although `df` is a common variable name for a Pandas DataFrame, it's not a +/// great variable name for production code, as it's non-descriptive and +/// prone to name conflicts. +/// +/// Instead, use a more descriptive variable name. +/// +/// ## Example +/// ```python +/// import pandas as pd +/// +/// df = pd.read_csv("animals.csv") +/// ``` +/// +/// Use instead: +/// ```python +/// import pandas as pd +/// +/// animals = pd.read_csv("animals.csv") +/// ``` #[violation] pub struct PandasDfVariableName; diff --git a/crates/ruff/src/rules/pandas_vet/rules/attr.rs b/crates/ruff/src/rules/pandas_vet/rules/attr.rs index fd0de36a96..cb2ec2398d 100644 --- a/crates/ruff/src/rules/pandas_vet/rules/attr.rs +++ b/crates/ruff/src/rules/pandas_vet/rules/attr.rs @@ -8,6 +8,32 @@ use crate::checkers::ast::Checker; use crate::registry::Rule; use crate::rules::pandas_vet::helpers::{test_expression, Resolution}; +/// ## What it does +/// Checks for uses of `.values` on Pandas Series and Index objects. +/// +/// ## Why is this bad? +/// The `.values` attribute is ambiguous as it's return type is unclear. As +/// such, it is no longer recommended by the Pandas documentation. +/// +/// Instead, use `.to_numpy()` to return a NumPy array, or `.array` to return a +/// Pandas `ExtensionArray`. +/// +/// ## Example +/// ```python +/// import pandas as pd +/// +/// animals = pd.read_csv("animals.csv").values # Ambiguous. +/// ``` +/// +/// Use instead: +/// ```python +/// import pandas as pd +/// +/// animals = pd.read_csv("animals.csv").to_numpy() # Explicit. +/// ``` +/// +/// ## References +/// - [Pandas documentation: Accessing the values in a Series or Index](https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.24.0.html#accessing-the-values-in-a-series-or-index) #[violation] pub struct PandasUseOfDotValues; @@ -19,9 +45,10 @@ impl Violation for PandasUseOfDotValues { } pub(crate) fn attr(checker: &mut Checker, attr: &str, value: &Expr, attr_expr: &Expr) { - let rules = &checker.settings.rules; let violation: DiagnosticKind = match attr { - "values" if rules.enabled(Rule::PandasUseOfDotValues) => PandasUseOfDotValues.into(), + "values" if checker.settings.rules.enabled(Rule::PandasUseOfDotValues) => { + PandasUseOfDotValues.into() + } _ => return, }; diff --git a/crates/ruff/src/rules/pandas_vet/rules/call.rs b/crates/ruff/src/rules/pandas_vet/rules/call.rs index a0ab67ca2c..74bf8de249 100644 --- a/crates/ruff/src/rules/pandas_vet/rules/call.rs +++ b/crates/ruff/src/rules/pandas_vet/rules/call.rs @@ -8,6 +8,36 @@ use crate::checkers::ast::Checker; use crate::registry::Rule; use crate::rules::pandas_vet::helpers::{test_expression, Resolution}; +/// ## What it does +/// Checks for uses of `.isnull` on Pandas objects. +/// +/// ## Why is this bad? +/// In the Pandas API, `.isna` and `.isnull` are equivalent. For consistency, +/// prefer `.isna` over `.isnull`. +/// +/// As a name, `.isna` more accurately reflects the behavior of the method, +/// since these methods check for `NaN` and `NaT` values in addition to `None` +/// values. +/// +/// ## Example +/// ```python +/// import pandas as pd +/// +/// animals_df = pd.read_csv("animals.csv") +/// pd.isnull(animals_df) +/// ``` +/// +/// Use instead: +/// ```python +/// import pandas as pd +/// +/// animals_df = pd.read_csv("animals.csv") +/// pd.isna(animals_df) +/// ``` +/// +/// ## References +/// - [Pandas documentation: `isnull`](https://pandas.pydata.org/docs/reference/api/pandas.isnull.html#pandas.isnull) +/// - [Pandas documentation: `isna`](https://pandas.pydata.org/docs/reference/api/pandas.isna.html#pandas.isna) #[violation] pub struct PandasUseOfDotIsNull; @@ -18,6 +48,36 @@ impl Violation for PandasUseOfDotIsNull { } } +/// ## What it does +/// Checks for uses of `.notnull` on Pandas objects. +/// +/// ## Why is this bad? +/// In the Pandas API, `.notna` and `.notnull` are equivalent. For consistency, +/// prefer `.notna` over `.notnull`. +/// +/// As a name, `.notna` more accurately reflects the behavior of the method, +/// since these methods check for `NaN` and `NaT` values in addition to `None` +/// values. +/// +/// ## Example +/// ```python +/// import pandas as pd +/// +/// animals_df = pd.read_csv("animals.csv") +/// pd.notnull(animals_df) +/// ``` +/// +/// Use instead: +/// ```python +/// import pandas as pd +/// +/// animals_df = pd.read_csv("animals.csv") +/// pd.notna(animals_df) +/// ``` +/// +/// ## References +/// - [Pandas documentation: `notnull`](https://pandas.pydata.org/docs/reference/api/pandas.notnull.html#pandas.notnull) +/// - [Pandas documentation: `notna`](https://pandas.pydata.org/docs/reference/api/pandas.notna.html#pandas.notna) #[violation] pub struct PandasUseOfDotNotNull; @@ -28,6 +88,32 @@ impl Violation for PandasUseOfDotNotNull { } } +/// ## What it does +/// Checks for uses of `.pivot` or `.unstack` on Pandas objects. +/// +/// ## Why is this bad? +/// Prefer `.pivot_table` to `.pivot` or `.unstack`. `.pivot_table` is more general +/// and can be used to implement the same behavior as `.pivot` and `.unstack`. +/// +/// ## Example +/// ```python +/// import pandas as pd +/// +/// df = pd.read_csv("cities.csv") +/// df.pivot(index="city", columns="year", values="population") +/// ``` +/// +/// Use instead: +/// ```python +/// import pandas as pd +/// +/// df = pd.read_csv("cities.csv") +/// df.pivot_table(index="city", columns="year", values="population") +/// ``` +/// +/// ## References +/// - [Pandas documentation: Reshaping and pivot tables](https://pandas.pydata.org/docs/user_guide/reshaping.html) +/// - [Pandas documentation: `pivot_table`](https://pandas.pydata.org/docs/reference/api/pandas.pivot_table.html#pandas.pivot_table) #[violation] pub struct PandasUseOfDotPivotOrUnstack; @@ -40,6 +126,8 @@ impl Violation for PandasUseOfDotPivotOrUnstack { } } +// TODO(tjkuson): Add documentation for this rule once clarified. +// https://github.com/astral-sh/ruff/issues/5628 #[violation] pub struct PandasUseOfDotReadTable; @@ -50,6 +138,32 @@ impl Violation for PandasUseOfDotReadTable { } } +/// ## What it does +/// Checks for uses of `.stack` on Pandas objects. +/// +/// ## Why is this bad? +/// Prefer `.melt` to `.stack`, which has the same functionality but with +/// support for direct column renaming and no dependence on `MultiIndex`. +/// +/// ## Example +/// ```python +/// import pandas as pd +/// +/// cities_df = pd.read_csv("cities.csv") +/// cities_df.set_index("city").stack() +/// ``` +/// +/// Use instead: +/// ```python +/// import pandas as pd +/// +/// cities_df = pd.read_csv("cities.csv") +/// cities_df.melt(id_vars="city") +/// ``` +/// +/// ## References +/// - [Pandas documentation: `melt`](https://pandas.pydata.org/docs/reference/api/pandas.melt.html) +/// - [Pandas documentation: `stack`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.stack.html) #[violation] pub struct PandasUseOfDotStack; @@ -61,20 +175,35 @@ impl Violation for PandasUseOfDotStack { } pub(crate) fn call(checker: &mut Checker, func: &Expr) { - let rules = &checker.settings.rules; let Expr::Attribute(ast::ExprAttribute { value, attr, .. }) = func else { return; }; let violation: DiagnosticKind = match attr.as_str() { - "isnull" if rules.enabled(Rule::PandasUseOfDotIsNull) => PandasUseOfDotIsNull.into(), - "notnull" if rules.enabled(Rule::PandasUseOfDotNotNull) => PandasUseOfDotNotNull.into(), - "pivot" | "unstack" if rules.enabled(Rule::PandasUseOfDotPivotOrUnstack) => { + "isnull" if checker.settings.rules.enabled(Rule::PandasUseOfDotIsNull) => { + PandasUseOfDotIsNull.into() + } + "notnull" if checker.settings.rules.enabled(Rule::PandasUseOfDotNotNull) => { + PandasUseOfDotNotNull.into() + } + "pivot" | "unstack" + if checker + .settings + .rules + .enabled(Rule::PandasUseOfDotPivotOrUnstack) => + { PandasUseOfDotPivotOrUnstack.into() } - "read_table" if rules.enabled(Rule::PandasUseOfDotReadTable) => { + "read_table" + if checker + .settings + .rules + .enabled(Rule::PandasUseOfDotReadTable) => + { PandasUseOfDotReadTable.into() } - "stack" if rules.enabled(Rule::PandasUseOfDotStack) => PandasUseOfDotStack.into(), + "stack" if checker.settings.rules.enabled(Rule::PandasUseOfDotStack) => { + PandasUseOfDotStack.into() + } _ => return, }; diff --git a/crates/ruff/src/rules/pandas_vet/rules/pd_merge.rs b/crates/ruff/src/rules/pandas_vet/rules/pd_merge.rs index 873d5c7f68..a515cd3639 100644 --- a/crates/ruff/src/rules/pandas_vet/rules/pd_merge.rs +++ b/crates/ruff/src/rules/pandas_vet/rules/pd_merge.rs @@ -4,6 +4,42 @@ use crate::checkers::ast::Checker; use ruff_diagnostics::{Diagnostic, Violation}; use ruff_macros::{derive_message_formats, violation}; +/// ## What it does +/// Checks for uses of `pd.merge` on Pandas objects. +/// +/// ## Why is this bad? +/// In Pandas, the `.merge` method (exposed on, e.g., DataFrame objects) and +/// the `pd.merge` function (exposed on the Pandas module) are equivalent. +/// +/// For consistency, prefer calling `.merge` on an object over calling +/// `pd.merge` on the Pandas module, as the former is more idiomatic. +/// +/// Further, `pd.merge` is not a method, but a function, which prohibits it +/// from being used in method chains, a common pattern in Pandas code. +/// +/// ## Example +/// ```python +/// import pandas as pd +/// +/// cats_df = pd.read_csv("cats.csv") +/// dogs_df = pd.read_csv("dogs.csv") +/// rabbits_df = pd.read_csv("rabbits.csv") +/// pets_df = pd.merge(pd.merge(cats_df, dogs_df), rabbits_df) # Hard to read. +/// ``` +/// +/// Use instead: +/// ```python +/// import pandas as pd +/// +/// cats_df = pd.read_csv("cats.csv") +/// dogs_df = pd.read_csv("dogs.csv") +/// rabbits_df = pd.read_csv("rabbits.csv") +/// pets_df = cats_df.merge(dogs_df).merge(rabbits_df) +/// ``` +/// +/// ## References +/// - [Pandas documentation: `merge`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html#pandas.DataFrame.merge) +/// - [Pandas documentation: `pd.merge`](https://pandas.pydata.org/docs/reference/api/pandas.merge.html#pandas.merge) #[violation] pub struct PandasUseOfPdMerge; diff --git a/crates/ruff/src/rules/pandas_vet/rules/subscript.rs b/crates/ruff/src/rules/pandas_vet/rules/subscript.rs index 6bcbb8298b..8916c7d29a 100644 --- a/crates/ruff/src/rules/pandas_vet/rules/subscript.rs +++ b/crates/ruff/src/rules/pandas_vet/rules/subscript.rs @@ -8,6 +8,36 @@ use crate::checkers::ast::Checker; use crate::registry::Rule; use crate::rules::pandas_vet::helpers::{test_expression, Resolution}; +/// ## What it does +/// Checks for uses of `.ix` on Pandas objects. +/// +/// ## Why is this bad? +/// The `.ix` method is deprecated as its behavior is ambiguous. Specifically, +/// it's often unclear whether `.ix` is indexing by label or by ordinal position. +/// +/// Instead, prefer the `.loc` method for label-based indexing, and `.iloc` for +/// ordinal indexing. +/// +/// ## Example +/// ```python +/// import pandas as pd +/// +/// students_df = pd.read_csv("students.csv") +/// students_df.ix[0] # 0th row or row with label 0? +/// ``` +/// +/// Use instead: +/// ```python +/// import pandas as pd +/// +/// students_df = pd.read_csv("students.csv") +/// students_df.iloc[0] # 0th row. +/// ``` +/// +/// ## References +/// - [Pandas release notes: Deprecate `.ix`](https://pandas.pydata.org/pandas-docs/version/0.20/whatsnew.html#deprecate-ix) +/// - [Pandas documentation: `loc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html) +/// - [Pandas documentation: `iloc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html) #[violation] pub struct PandasUseOfDotIx; @@ -18,6 +48,38 @@ impl Violation for PandasUseOfDotIx { } } +/// ## What it does +/// Checks for uses of `.at` on Pandas objects. +/// +/// ## Why is this bad? +/// The `.at` method selects a single value from a DataFrame or Series based on +/// a label index, and is slightly faster than using `.loc`. However, `.loc` is +/// more idiomatic and versatile, as it can be used to select multiple values at +/// once. +/// +/// If performance is an important consideration, convert the object to a NumPy +/// array, which will provide a much greater performance boost than using `.at` +/// over `.loc`. +/// +/// ## Example +/// ```python +/// import pandas as pd +/// +/// students_df = pd.read_csv("students.csv") +/// students_df.at["Maria"] +/// ``` +/// +/// Use instead: +/// ```python +/// import pandas as pd +/// +/// students_df = pd.read_csv("students.csv") +/// students_df.loc["Maria"] +/// ``` +/// +/// ## References +/// - [Pandas documentation: `loc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html) +/// - [Pandas documentation: `at`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.at.html) #[violation] pub struct PandasUseOfDotAt; @@ -28,6 +90,47 @@ impl Violation for PandasUseOfDotAt { } } +/// ## What it does +/// Checks for uses of `.iat` on Pandas objects. +/// +/// ## Why is this bad? +/// The `.iat` method selects a single value from a DataFrame or Series based +/// on an ordinal index, and is slightly faster than using `.iloc`. However, +/// `.iloc` is more idiomatic and versatile, as it can be used to select +/// multiple values at once. +/// +/// If performance is an important consideration, convert the object to a NumPy +/// array, which will provide a much greater performance boost than using `.iat` +/// over `.iloc`. +/// +/// ## Example +/// ```python +/// import pandas as pd +/// +/// students_df = pd.read_csv("students.csv") +/// students_df.iat[0] +/// ``` +/// +/// Use instead: +/// ```python +/// import pandas as pd +/// +/// students_df = pd.read_csv("students.csv") +/// students_df.iloc[0] +/// ``` +/// +/// Or, using NumPy: +/// ```python +/// import numpy as np +/// import pandas as pd +/// +/// students_df = pd.read_csv("students.csv") +/// students_df.to_numpy()[0] +/// ``` +/// +/// ## References +/// - [Pandas documentation: `iloc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html) +/// - [Pandas documentation: `iat`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iat.html) #[violation] pub struct PandasUseOfDotIat; @@ -43,11 +146,12 @@ pub(crate) fn subscript(checker: &mut Checker, value: &Expr, expr: &Expr) { return; }; - let rules = &checker.settings.rules; let violation: DiagnosticKind = match attr.as_str() { - "ix" if rules.enabled(Rule::PandasUseOfDotIx) => PandasUseOfDotIx.into(), - "at" if rules.enabled(Rule::PandasUseOfDotAt) => PandasUseOfDotAt.into(), - "iat" if rules.enabled(Rule::PandasUseOfDotIat) => PandasUseOfDotIat.into(), + "ix" if checker.settings.rules.enabled(Rule::PandasUseOfDotIx) => PandasUseOfDotIx.into(), + "at" if checker.settings.rules.enabled(Rule::PandasUseOfDotAt) => PandasUseOfDotAt.into(), + "iat" if checker.settings.rules.enabled(Rule::PandasUseOfDotIat) => { + PandasUseOfDotIat.into() + } _ => return, };