Source code for moderndive.modeling

"""Regression + summary helpers mirroring the R ``moderndive`` package.

Built on ``statsmodels`` (which, unlike scikit-learn, reports standard errors,
test statistics, p-values, and confidence intervals — the inferential output the
book teaches). Inputs may be polars or pandas frames; outputs are polars frames.

- :func:`get_regression_table`  ~ ``broom::tidy`` + CIs
- :func:`get_regression_points` ~ ``broom::augment`` (fitted values + residuals)
- :func:`tidy_summary`          ~ per-variable summary statistics
"""

from __future__ import annotations

import re

import numpy as np
import polars as pl

__all__ = [
    "get_regression_table",
    "get_regression_points",
    "get_regression_summaries",
    "tidy_summary",
    "count_missing",
]


def _to_pandas(data):
    if isinstance(data, pl.DataFrame):
        return data.to_pandas()
    return data


def _clean_term(term: str) -> str:
    """Turn patsy term labels into the tidy names moderndive uses.

    Examples:
        ``Intercept``                         -> ``intercept``
        ``income[T.Lower middle income]``     -> ``income: Lower middle income``
        ``life_exp:income[T.High income]``    -> ``life_exp:income: High income``
        ``C(income, levels=[...])[T.High income]`` -> ``income: High income``
    """
    term = term.replace("Intercept", "intercept")
    # Drop any C(col, ...) wrapper down to just the column name.
    term = re.sub(r"C\(\s*([A-Za-z_]\w*)[^)]*\)", r"\1", term)
    # Turn a categorical level marker [T.level] into ": level".
    term = re.sub(r"\[T\.(.*?)\]", r": \1", term)
    return term



[docs]
def get_regression_table(model, digits: int = 3, conf_level: float = 0.95) -> pl.DataFrame:
    """Tidy regression table: term, estimate, std_error, statistic, p_value, lower/upper_ci.

    ``model`` is a fitted ``statsmodels`` results object (e.g. from
    ``statsmodels.formula.api.ols("y ~ x", data).fit()``).
    """
    conf = model.conf_int(alpha=1 - conf_level)
    terms = [_clean_term(t) for t in model.params.index]
    table = pl.DataFrame(
        {
            "term": terms,
            "estimate": model.params.to_numpy(),
            "std_error": model.bse.to_numpy(),
            "statistic": model.tvalues.to_numpy(),
            "p_value": model.pvalues.to_numpy(),
            "lower_ci": np.asarray(conf)[:, 0],
            "upper_ci": np.asarray(conf)[:, 1],
        }
    )
    numeric = [c for c in table.columns if c != "term"]
    return table.with_columns(pl.col(numeric).round(digits))




[docs]
def get_regression_points(model, digits: int = 3) -> pl.DataFrame:
    """Fitted values + residuals per observation (~ ``broom::augment``).

    Columns: ``ID``, the response, each explanatory term, ``<response>_hat``,
    ``residual``.
    """
    endog_name = model.model.endog_names
    exog_names = [n for n in model.model.exog_names if n != "Intercept"]
    frame = _to_pandas(model.model.data.frame)

    out = {
        "ID": np.arange(1, len(frame) + 1, dtype=np.int64),
        endog_name: np.asarray(frame[endog_name]),
    }
    for name in exog_names:
        if name in frame.columns:
            out[name] = np.asarray(frame[name])
    out[f"{endog_name}_hat"] = np.asarray(model.fittedvalues)
    out["residual"] = np.asarray(model.resid)

    df = pl.DataFrame(out)
    round_cols = [endog_name, f"{endog_name}_hat", "residual"]
    round_cols = [c for c in round_cols if df.schema[c].is_numeric()]
    return df.with_columns(pl.col(round_cols).round(digits))




[docs]
def get_regression_summaries(model, digits: int = 3) -> pl.DataFrame:
    """Model-fit summaries as a tidy 1-row frame (~ ``moderndive::get_regression_summaries``).

    Columns: ``r_squared``, ``adj_r_squared``, ``mse``, ``rmse``, ``sigma``,
    ``statistic`` (overall F), ``p_value``, ``df`` (model degrees of freedom),
    ``nobs``. ``model`` is a fitted ``statsmodels`` results object.

    ``mse`` is the mean squared residual using ``n`` in the denominator (so
    ``rmse = sqrt(mse)``), while ``sigma`` is the residual standard error using
    ``n - p`` — matching the R package.
    """
    nobs = int(model.nobs)
    mse = float(model.ssr) / nobs
    table = pl.DataFrame(
        {
            "r_squared": [float(model.rsquared)],
            "adj_r_squared": [float(model.rsquared_adj)],
            "mse": [mse],
            "rmse": [float(np.sqrt(mse))],
            "sigma": [float(np.sqrt(model.mse_resid))],
            "statistic": [float(model.fvalue)],
            "p_value": [float(model.f_pvalue)],
            "df": [int(model.df_model)],
            "nobs": [nobs],
        }
    )
    float_cols = [c for c, dt in table.schema.items() if dt.is_float()]
    return table.with_columns(pl.col(float_cols).round(digits))




[docs]
def tidy_summary(data, columns: list[str] | None = None, digits: int = 3) -> pl.DataFrame:
    """Per-variable summary statistics for the selected columns.

    Mirrors the R ``moderndive::tidy_summary`` column layout:
    ``column, n, group, type, min, Q1, mean, median, Q3, max, sd``.
    Numeric columns get the five-number summary + mean/sd; non-numeric columns
    report ``n`` and ``type`` with the numeric fields left null.
    """
    df = data if isinstance(data, pl.DataFrame) else pl.from_pandas(data)
    columns = columns or df.columns

    rows = []
    for col in columns:
        series = df[col]
        is_num = series.dtype.is_numeric()
        n = int(series.len() - series.null_count())
        row = {
            "column": col,
            "n": n,
            "group": None,
            "type": "numeric" if is_num else "categorical",
            "min": None,
            "Q1": None,
            "mean": None,
            "median": None,
            "Q3": None,
            "max": None,
            "sd": None,
        }
        if is_num:
            s = series.drop_nulls()
            row.update(
                min=round(float(s.min()), digits),
                Q1=round(float(s.quantile(0.25)), digits),
                mean=round(float(s.mean()), digits),
                median=round(float(s.median()), digits),
                Q3=round(float(s.quantile(0.75)), digits),
                max=round(float(s.max()), digits),
                sd=round(float(s.std()), digits),
            )
        rows.append(row)

    schema = {
        "column": pl.Utf8,
        "n": pl.Int64,
        "group": pl.Utf8,
        "type": pl.Utf8,
        "min": pl.Float64,
        "Q1": pl.Float64,
        "mean": pl.Float64,
        "median": pl.Float64,
        "Q3": pl.Float64,
        "max": pl.Float64,
        "sd": pl.Float64,
    }
    return pl.DataFrame(rows, schema=schema)




[docs]
def count_missing(data, columns: list[str] | None = None) -> pl.DataFrame:
    """Count missing (``null``) values in each column.

    A beginner-friendly alternative to ``df.select(pl.all().is_null().sum())``:
    it returns a tidy two-column data frame with one row per column
    (``column``, ``n_missing``), sorted from most to fewest missing values so the
    columns needing attention surface first.

    Parameters
    ----------
    data:
        A polars (or pandas) data frame.
    columns:
        Optional list of column names to check; defaults to every column.
    """
    df = data if isinstance(data, pl.DataFrame) else pl.from_pandas(data)
    columns = columns or df.columns
    return pl.DataFrame(
        {
            "column": columns,
            "n_missing": [int(df[col].null_count()) for col in columns],
        },
        schema={"column": pl.Utf8, "n_missing": pl.Int64},
    ).sort("n_missing", descending=True)