Source code for moderndive.correlation

"""Correlation and population-spread helpers mirroring the R ``moderndive`` package.

- :func:`get_correlation` ~ ``moderndive::get_correlation`` (tidy 1-row ``cor`` frame)
- :func:`pop_sd`          ~ ``moderndive::pop_sd`` (population standard deviation)
"""

from __future__ import annotations

import numpy as np
import polars as pl

__all__ = ["get_correlation", "pop_sd"]


def _parse_pair(formula: str | None, x: str | None, y: str | None) -> tuple[str, str]:
    """Resolve the (y, x) column pair from a ``"y ~ x"`` formula or x=/y= kwargs."""
    if formula is not None:
        if x is not None or y is not None:
            raise ValueError("Pass either formula or x=/y=, not both.")
        if "~" not in formula:
            raise ValueError(f"formula must look like 'y ~ x', got {formula!r}.")
        lhs, rhs = (part.strip() for part in formula.split("~", 1))
        if not lhs or not rhs:
            raise ValueError(f"formula must look like 'y ~ x', got {formula!r}.")
        return lhs, rhs
    if x is None or y is None:
        raise ValueError("Provide a formula 'y ~ x' or both x= and y=.")
    return y, x


[docs] def get_correlation( data, formula: str | None = None, *, x: str | None = None, y: str | None = None, ) -> pl.DataFrame: """Pearson correlation as a tidy 1-row frame with a ``cor`` column. Mirrors ``moderndive::get_correlation(data, y ~ x)``. Specify the variable pair either as a formula string (``"y ~ x"``) or via the ``x=`` and ``y=`` keyword arguments. Rows with a null in either column are dropped. """ df = data if isinstance(data, pl.DataFrame) else pl.from_pandas(data) y_col, x_col = _parse_pair(formula, x, y) for col in (y_col, x_col): if col not in df.columns: raise ValueError(f"Column {col!r} is not in the data.") pair = df.select(x_col, y_col).drop_nulls() value = float(np.corrcoef(pair[x_col].to_numpy(), pair[y_col].to_numpy())[0, 1]) return pl.DataFrame({"cor": [value]})
[docs] def pop_sd(x) -> float: """Population standard deviation (divides by ``n``, not ``n - 1``). Mirrors ``moderndive::pop_sd``. Accepts a polars Series, list, numpy array, or any sequence; nulls/NaNs are dropped before computing. """ if isinstance(x, pl.Series): values = x.drop_nulls().to_numpy() else: values = np.asarray(list(x), dtype=float) values = values[~np.isnan(values)] return float(np.std(values, ddof=0))