# Source code for moderndive.sampling

"""Repeated sampling helpers for the sampling activities (Chapter 7).

Mirrors the R ``moderndive`` functions ``rep_slice_sample()`` /
``rep_sample_n()``: draw ``reps`` samples of size ``n`` from a data frame and
stack them into one long data frame with a ``replicate`` column (1..reps), so a
grouped summary computes one statistic per virtual sample.
"""

from __future__ import annotations

import numpy as np
import polars as pl

__all__ = ["rep_slice_sample", "rep_sample_n"]


def rep_slice_sample(
    data: pl.DataFrame,
    n: int,
    reps: int = 1,
    replace: bool = False,
    seed: int | None = None,
) -> pl.DataFrame:
    """Take ``reps`` samples of size ``n`` from ``data``.

    Row positions for each sample are drawn with
    ``numpy.random.Generator.choice`` from a generator created by
    ``numpy.random.default_rng(seed)``. The samples are stacked in draw
    order into one long frame, each row labelled with the 1-based number
    of the sample it belongs to.

    :param data: Frame to sample rows from. It should not already contain
        a column named ``replicate``: that name is used for the sample
        label and would then appear twice in the final column selection.
    :param n: Number of rows per sample. Must be non-negative and, unless
        ``replace`` is true, no larger than ``data.height``.
    :param reps: Number of samples to draw. Must be at least 1.
    :param replace: Sample with replacement (e.g. bootstrap-style) when
        true; without replacement otherwise.
    :param seed: Seed for the random generator. Pass a value for
        reproducibility; ``None`` leaves the generator unseeded.
    :returns: A polars DataFrame with a leading ``replicate`` column
        (``Int64``, values ``1..reps``) followed by the columns of
        ``data`` in their original order.
    :raises ValueError: If ``n`` is negative, if ``reps`` is less than 1,
        if ``n`` exceeds the number of rows while sampling without
        replacement, or if ``n`` is positive and ``data`` has no rows.
    """
    # Validate the cheap scalar arguments first so that bad input fails with
    # a message that names the offending argument.
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if reps < 1:
        # Zero samples would leave nothing to stack further down.
        raise ValueError(f"reps must be at least 1, got {reps}")

    n_rows = data.height
    if not replace and n > n_rows:
        raise ValueError(
            f"cannot take a sample of size {n} without replacement from {n_rows} rows"
        )
    if n > 0 and n_rows == 0:
        # Only reachable with replace=True: there is no row to repeat.
        raise ValueError(
            f"cannot take a sample of size {n} from a data frame with no rows"
        )

    rng = np.random.default_rng(seed)
    samples = []
    for replicate in range(1, reps + 1):
        idx = rng.choice(n_rows, size=n, replace=replace)
        sample = data[idx.tolist()].with_columns(
            pl.lit(replicate, dtype=pl.Int64).alias("replicate")
        )
        samples.append(sample)

    # `with_columns` adds the label after the data columns; put it first.
    return pl.concat(samples).select(["replicate", *data.columns])
# The older moderndive name is a thin alias with the same behavior.
def rep_sample_n(
    data: pl.DataFrame,
    n: int,
    reps: int = 1,
    replace: bool = False,
    seed: int | None = None,
) -> pl.DataFrame:
    """Draw repeated samples under the older moderndive function name.

    Delegates to :func:`rep_slice_sample`, forwarding every argument
    unchanged and returning its result as-is.
    """
    # Forward by keyword so each option binds to the parameter of the same
    # name on the delegate.
    forwarded = {"n": n, "reps": reps, "replace": replace, "seed": seed}
    return rep_slice_sample(data, **forwarded)