# Source code for moderndive.sampling
"""Repeated sampling helpers for the sampling activities (Chapter 7).
Mirrors the R ``moderndive`` functions ``rep_slice_sample()`` /
``rep_sample_n()``: draw ``reps`` samples from a data frame and stack them into
one long data frame with a ``replicate`` column (1..reps), so a grouped summary
computes one statistic per virtual sample.
"""
from __future__ import annotations
import numpy as np
import polars as pl
from ._messaging import helpful_error
__all__ = ["rep_slice_sample", "rep_sample_n"]
def _weights(data: pl.DataFrame, weight_by) -> np.ndarray | None:
    """Normalize ``weight_by`` (a column name or a sequence) into probabilities.

    ``weight_by`` may be ``None`` (no weighting; ``None`` is returned), the name
    of a column of ``data``, or a sequence of weights. ``data`` is only read
    when a column name is given. The weights are converted to floats and divided
    by their sum, so the returned array sums to 1.

    Raises ``ValueError`` if any weight is negative or not finite (NaN / inf),
    or if the weights do not sum to a value greater than zero.
    """
    if weight_by is None:
        return None
    raw = data[weight_by].to_numpy() if isinstance(weight_by, str) else weight_by
    # Cast both branches to float so integer columns and plain sequences are
    # validated and normalized the same way.
    w = np.asarray(raw, dtype=float)
    total = w.sum()
    # Checking only the total is not enough: [3, -1] sums to 2 but would yield a
    # negative "probability", and a NaN total makes ``total <= 0`` False. Reject
    # those here instead of letting the sampler fail later with an unrelated
    # message.
    if not np.isfinite(w).all() or (w < 0).any() or total <= 0:
        raise ValueError(helpful_error("weight_by must contain positive weights that sum to > 0."))
    return w / total
# [docs]
def rep_slice_sample(
    data: pl.DataFrame,
    n: int | None = None,
    *,
    prop: float | None = None,
    reps: int = 1,
    replace: bool = False,
    weight_by=None,
    seed: int | None = None,
) -> pl.DataFrame:
    """Take ``reps`` samples from ``data``.

    Give the sample size as either ``n`` (a count) or ``prop`` (a fraction of the
    rows, e.g. ``prop=0.5``; the fraction is turned into a count with
    ``round``). Returns a polars DataFrame with a leading ``replicate`` column
    (1..reps, Int64) identifying which sample each row belongs to, followed by
    the columns of ``data``. Set ``replace=True`` for sampling with replacement
    (bootstrap-style). ``weight_by`` gives unequal selection probabilities — a
    column name or a sequence of weights. Pass ``seed`` for reproducibility; a
    single generator seeded once drives every replicate.

    Raises ``ValueError`` when neither or both of ``n`` / ``prop`` are given,
    when ``reps`` is less than 1, when the sample size is negative, when
    ``data`` already has a ``replicate`` column, when sampling without
    replacement is asked for more rows than ``data`` has, or when the weights
    are invalid.
    """
    if (n is None) == (prop is None):
        raise ValueError(
            helpful_error(
                "Specify exactly one of n= (a count) or prop= (a fraction).",
                "e.g. rep_slice_sample(df, n=50) or rep_slice_sample(df, prop=0.5).",
            )
        )
    # With no replicates there is nothing to concatenate; fail here with a clear
    # message rather than inside pl.concat.
    if reps < 1:
        raise ValueError(
            helpful_error(
                "reps must be a positive integer.",
                f"got reps={reps!r}; e.g. rep_slice_sample(df, n=50, reps=1000).",
            )
        )
    # The output adds its own ``replicate`` column; an existing one would be
    # overwritten and then selected twice below.
    if "replicate" in data.columns:
        raise ValueError(
            helpful_error(
                "data already has a column named 'replicate'.",
                "Rename or drop that column before sampling.",
            )
        )

    n_rows = data.height
    size = n if n is not None else int(round(prop * n_rows))
    if size < 0:
        raise ValueError(
            helpful_error(
                "The sample size cannot be negative.",
                "Use n >= 0 or prop >= 0.",
            )
        )
    if not replace and size > n_rows:
        raise ValueError(
            f"cannot take a sample of size {size} without replacement from {n_rows} rows"
        )

    probs = _weights(data, weight_by)
    rng = np.random.default_rng(seed)
    # One draw of row indices per replicate, each tagged with its replicate number.
    samples = [
        data[rng.choice(n_rows, size=size, replace=replace, p=probs).tolist()].with_columns(
            pl.lit(replicate, dtype=pl.Int64).alias("replicate")
        )
        for replicate in range(1, reps + 1)
    ]
    # Stack the replicates and move ``replicate`` to the front.
    return pl.concat(samples).select(["replicate", *data.columns])
# [docs]
def rep_sample_n(
    data: pl.DataFrame,
    n: int,
    *,
    reps: int = 1,
    replace: bool = False,
    prob=None,
    seed: int | None = None,
) -> pl.DataFrame:
    """Draw ``reps`` samples of ``n`` rows each (the older moderndive spelling).

    A thin wrapper over :func:`rep_slice_sample`: the sample size is always the
    count ``n`` (there is no ``prop``), and unequal selection weights are given
    as ``prob`` — a column name or a sequence — matching the R ``rep_sample_n``
    signature. All other arguments are passed through unchanged.
    """
    # ``prob`` is this function's name for what rep_slice_sample calls ``weight_by``.
    forwarded = {
        "n": n,
        "reps": reps,
        "replace": replace,
        "weight_by": prob,
        "seed": seed,
    }
    return rep_slice_sample(data, **forwarded)