# Source code for moderndive.sampling

"""Repeated sampling helpers for the sampling activities (Chapter 7).

Mirrors the R ``moderndive`` functions ``rep_slice_sample()`` /
``rep_sample_n()``: draw ``reps`` samples from a data frame and stack them into
one long data frame with a ``replicate`` column (1..reps), so a grouped summary
computes one statistic per virtual sample.
"""

from __future__ import annotations

import numpy as np
import polars as pl

from ._messaging import helpful_error

__all__ = ["rep_slice_sample", "rep_sample_n"]


def _weights(data: pl.DataFrame, weight_by) -> np.ndarray | None:
    """Normalize ``weight_by`` (a column name or a sequence) into probabilities."""
    if weight_by is None:
        return None
    w = data[weight_by].to_numpy() if isinstance(weight_by, str) else np.asarray(weight_by, float)
    total = w.sum()
    if total <= 0:
        raise ValueError(helpful_error("weight_by must contain positive weights that sum to > 0."))
    return w / total


def rep_slice_sample(
    data: pl.DataFrame,
    n: int | None = None,
    *,
    prop: float | None = None,
    reps: int = 1,
    replace: bool = False,
    weight_by=None,
    seed: int | None = None,
) -> pl.DataFrame:
    """Take ``reps`` samples from ``data``.

    Give the sample size as either ``n`` (a count) or ``prop`` (a fraction of
    the rows, e.g. ``prop=0.5``); ``prop`` is converted to a count by rounding
    ``prop * data.height``.

    Returns a polars DataFrame with a leading ``replicate`` column (Int64,
    numbered 1..reps) identifying which sample each row belongs to, followed
    by the columns of ``data`` in their original order.

    Set ``replace=True`` for sampling with replacement (bootstrap-style).
    ``weight_by`` gives unequal selection probabilities — a column name or a
    sequence of weights. Pass ``seed`` for reproducibility; a single random
    generator seeded with it is shared by all replicates.

    Raises ``ValueError`` when neither or both of ``n`` / ``prop`` are given,
    when ``reps`` is less than 1, when the requested size is negative, or when
    the size exceeds the number of rows while ``replace`` is false.
    """
    if (n is None) == (prop is None):
        raise ValueError(
            helpful_error(
                "Specify exactly one of n= (a count) or prop= (a fraction).",
                "e.g. rep_slice_sample(df, n=50) or rep_slice_sample(df, prop=0.5).",
            )
        )
    # With no replicates there is nothing to concatenate; fail here with a
    # message that names the offending argument.
    if reps < 1:
        raise ValueError(f"reps= must be at least 1, got {reps}")
    # Check the raw request rather than the rounded size: a small negative
    # prop would otherwise round to 0 and be silently accepted.
    requested = n if n is not None else prop
    if requested < 0:
        arg_name = "n" if n is not None else "prop"
        raise ValueError(f"{arg_name}= must be non-negative, got {requested}")

    n_rows = data.height
    size = n if n is not None else int(round(prop * n_rows))
    if not replace and size > n_rows:
        raise ValueError(
            f"cannot take a sample of size {size} without replacement from {n_rows} rows"
        )

    probs = _weights(data, weight_by)
    rng = np.random.default_rng(seed)
    samples = []
    for replicate in range(1, reps + 1):
        # Draw row positions, then tag the selected rows with their replicate id.
        idx = rng.choice(n_rows, size=size, replace=replace, p=probs)
        sample = data[idx.tolist()].with_columns(
            pl.lit(replicate, dtype=pl.Int64).alias("replicate")
        )
        samples.append(sample)
    combined = pl.concat(samples)
    # with_columns appended "replicate" last; move it to the front.
    return combined.select(["replicate", *data.columns])


def rep_sample_n(
    data: pl.DataFrame,
    n: int,
    *,
    reps: int = 1,
    replace: bool = False,
    prob=None,
    seed: int | None = None,
) -> pl.DataFrame:
    """Take ``reps`` samples of size ``n`` (older moderndive name).

    Like :func:`rep_slice_sample`, but the sample size is always the count
    ``n`` and unequal selection weights are passed as ``prob`` (a column name
    or a sequence), matching the R ``rep_sample_n`` signature.

    All arguments are forwarded to :func:`rep_slice_sample`, with ``prob``
    passed as its ``weight_by``; the return value and any exceptions are
    whatever that call produces.
    """
    return rep_slice_sample(
        data,
        n=n,
        reps=reps,
        replace=replace,
        weight_by=prob,
        seed=seed,
    )