"""Core classes for the infer grammar: specify / hypothesize / generate / calculate / fit.
Mirrors the R ``infer`` pipeline. Because polars DataFrames have no ``.specify``
method, the pipeline entry point is the module-level :func:`specify` function:
bootstrap_means = (
specify(almonds_sample_100, response="weight")
.generate(reps=1000, type="bootstrap")
.calculate(stat="mean")
)
null = (
specify(spotify, formula="popular_or_not ~ track_genre", success="popular")
.hypothesize(null="independence")
.generate(reps=1000, type="permute")
.calculate(stat="diff in props", order=("metal", "deep-house"))
)
# Multiple regression: fit() instead of calculate()
boot_fits = (
specify(coffee, formula="total_cup_points ~ aroma + flavor")
.generate(reps=1000, type="bootstrap")
.fit()
)
"""
from __future__ import annotations
from dataclasses import dataclass, field
import numpy as np
import polars as pl
from . import resample as _resample
from . import statistics as _stats
def _parse_formula(formula: str) -> tuple[str, str | None]:
"""Parse ``"y ~ x"`` → (response, explanatory). ``x`` may be ``NULL``/``None``.
For multiple terms (``"y ~ a + b"``) the explanatory is returned as the whole
right-hand side string; single-variable stats use it as a column name, while
:meth:`GeneratedReplicates.fit` uses the full formula instead.
"""
if "~" not in formula:
raise ValueError(f"formula {formula!r} must contain '~' (e.g. 'y ~ x')")
lhs, rhs = (side.strip() for side in formula.split("~", 1))
explanatory = None if rhs in ("", "NULL", "None", "1", ".") else rhs
return lhs, explanatory
[docs]
@dataclass(frozen=True)
class Specification:
"""Result of :func:`specify`: the chosen response (+ optional explanatory)."""
data: pl.DataFrame
response: str
explanatory: str | None = None
success: object | None = None
formula: str | None = None
@property
def _response_values(self) -> np.ndarray:
return self.data[self.response].to_numpy()
@property
def _explanatory_values(self) -> np.ndarray | None:
# Single-column explanatory only (used by single-variable statistics).
if self.explanatory is None or self.explanatory not in self.data.columns:
return None
return self.data[self.explanatory].to_numpy()
def _full_formula(self) -> str:
if self.formula is not None:
return self.formula
if self.explanatory is None:
return f"{self.response} ~ 1"
return f"{self.response} ~ {self.explanatory}"
def hypothesize(
self,
null: str,
*,
mu: float | None = None,
p: float | None = None,
sigma: float | None = None,
) -> Hypothesis:
if null not in ("point", "independence", "paired independence"):
raise ValueError("null must be 'point', 'independence', or 'paired independence'")
return Hypothesis(spec=self, null=null, mu=mu, p=p, sigma=sigma)
def generate(
self, reps: int, type: str = "bootstrap", *, seed: int | None = None
) -> GeneratedReplicates:
return _generate(self, hypothesis=None, reps=reps, type=type, seed=seed)
[docs]
def calculate(
self,
stat,
*,
order: tuple[object, object] | None = None,
mu: float | None = None,
p: float | None = None,
sigma: float | None = None,
) -> ObservedStatistic:
"""Compute the observed statistic (no resampling)."""
value = _stats.compute_statistic(
self._response_values,
self._explanatory_values,
stat,
success=self.success,
order=order,
mu=mu,
p=p,
sigma=sigma,
)
label = stat if isinstance(stat, str) else "stat"
return ObservedStatistic(value=value, stat=label)
[docs]
def assume(self, distribution: str, df: object | None = None):
"""Set a theoretical sampling distribution (see :func:`assume`)."""
from .theoretical import assume as _assume
return _assume(distribution, df=df)
# British-spelling alias (infer parity).
def hypothesise(self, null: str, *, mu=None, p=None, sigma=None):
return self.hypothesize(null, mu=mu, p=p, sigma=sigma)
[docs]
def fit(self) -> FitResult:
"""Fit the observed regression (ordinary least squares) for the formula."""
import statsmodels.formula.api as smf
from ..modeling import _clean_term
model = smf.ols(self._full_formula(), data=self.data.to_pandas()).fit()
terms = [_clean_term(t) for t in model.params.index]
df = pl.DataFrame({"term": terms, "estimate": model.params.to_numpy()})
return FitResult(data=df)
[docs]
@dataclass(frozen=True)
class Hypothesis:
spec: Specification
null: str
mu: float | None = None
p: float | None = None
sigma: float | None = None
def generate(
self, reps: int, type: str | None = None, *, seed: int | None = None
) -> GeneratedReplicates:
if type is None:
type = "bootstrap" if self.null == "point" else "permute"
return _generate(self.spec, hypothesis=self, reps=reps, type=type, seed=seed)
[docs]
def calculate(
self,
stat,
*,
order: tuple[object, object] | None = None,
) -> ObservedStatistic:
"""Observed statistic that needs the hypothesized value (e.g. stat='t'/'z')."""
return self.spec.calculate(stat, order=order, mu=self.mu, p=self.p, sigma=self.sigma)
[docs]
@dataclass(frozen=True)
class GeneratedReplicates:
"""Materialized resampling plan; supports both ``calculate()`` and ``fit()``.
``plans`` holds one numpy array per replicate:
- bootstrap: row indices (with replacement) into the data,
- permute: a permutation of the row positions (used to shuffle a column),
- draw: a simulated response array under a point null proportion.
"""
spec: Specification
type: str
null: str | None
plans: list[np.ndarray] = field(repr=False)
shifted_response: np.ndarray | None = field(default=None, repr=False)
hyp_mu: float | None = None
hyp_p: float | None = None
hyp_sigma: float | None = None
# --- single-variable / two-group statistics ---------------------------
def calculate(self, stat, *, order: tuple[object, object] | None = None) -> Distribution:
spec = self.spec
resp_full = self.shifted_response
if resp_full is None:
resp_full = spec._response_values
expl_full = spec._explanatory_values
paired = self.null == "paired independence"
values = []
for plan in self.plans:
if self.type == "bootstrap":
resp = resp_full[plan]
expl = None if expl_full is None else expl_full[plan]
elif self.type == "permute" and paired:
resp = resp_full * plan # randomly flip the sign of each difference
expl = None
elif self.type == "permute":
resp = resp_full
expl = None if expl_full is None else expl_full[plan]
else: # draw
resp = plan # the simulated response array
expl = None
values.append(
_stats.compute_statistic(
resp,
expl,
stat,
success=spec.success,
order=order,
mu=self.hyp_mu,
p=self.hyp_p,
sigma=self.hyp_sigma,
)
)
df = pl.DataFrame(
{
"replicate": np.arange(1, len(values) + 1, dtype=np.int64),
"stat": np.asarray(values, dtype=float),
}
)
label = stat if isinstance(stat, str) else "stat"
return Distribution(data=df, stat=label, null=self.null, type=self.type)
# --- multiple regression ---------------------------------------------
[docs]
def fit(self) -> FitResult:
"""Fit OLS on each replicate, returning per-term estimates (long form)."""
import statsmodels.formula.api as smf
from ..modeling import _clean_term
spec = self.spec
formula = spec._full_formula()
base = spec.data
response = spec.response
rows_replicate, rows_term, rows_estimate = [], [], []
for i, plan in enumerate(self.plans, start=1):
if self.type == "bootstrap":
rep_data = base[plan.tolist()]
elif self.type == "permute":
# Shuffle the response column to break association under the null.
shuffled = base[response].to_numpy()[plan]
rep_data = base.with_columns(pl.Series(response, shuffled))
else:
raise ValueError("fit() supports type='bootstrap' or 'permute'")
model = smf.ols(formula, data=rep_data.to_pandas()).fit()
for term, est in zip(model.params.index, model.params.to_numpy(), strict=False):
rows_replicate.append(i)
rows_term.append(_clean_term(term))
rows_estimate.append(float(est))
df = pl.DataFrame(
{"replicate": rows_replicate, "term": rows_term, "estimate": rows_estimate}
)
return FitResult(data=df, null=self.null, type=self.type)
def _generate(
spec: Specification,
*,
hypothesis: Hypothesis | None,
reps: int,
type: str,
seed: int | None,
) -> GeneratedReplicates:
if type == "simulate": # infer accepts "simulate" as an alias for "draw"
type = "draw"
if type not in ("bootstrap", "permute", "draw"):
raise ValueError("type must be 'bootstrap', 'permute', 'draw', or 'simulate'")
rng = _resample.make_rng(seed)
n = spec.data.height
null = None if hypothesis is None else hypothesis.null
hyp_mu = None if hypothesis is None else hypothesis.mu
hyp_p = None if hypothesis is None else hypothesis.p
hyp_sigma = None if hypothesis is None else hypothesis.sigma
shifted = None
plans: list[np.ndarray] = []
if type == "permute" and null == "paired independence":
# Randomly flip the sign of each paired difference.
for _ in range(reps):
plans.append(rng.choice([-1.0, 1.0], size=n))
elif type == "permute":
if spec.explanatory is None and spec.formula is None:
raise ValueError("type='permute' requires an explanatory variable")
for _ in range(reps):
plans.append(rng.permutation(n))
elif type == "draw":
if hypothesis is None or hypothesis.p is None:
raise ValueError("type='draw' requires hypothesize(null='point', p=...)")
success = spec.success
nonsuccess = "\x00not_success"
p0 = hypothesis.p
for _ in range(reps):
draws = rng.random(n) < p0
plans.append(np.where(draws, success, nonsuccess).astype(object))
else: # bootstrap
if hypothesis is not None and hypothesis.null == "point" and hypothesis.mu is not None:
shifted = _resample.shift_for_point_null(
spec._response_values, stat="mean", mu=hypothesis.mu, p=None
)
for _ in range(reps):
plans.append(rng.integers(0, n, size=n))
return GeneratedReplicates(
spec=spec,
type=type,
null=null,
plans=plans,
shifted_response=shifted,
hyp_mu=hyp_mu,
hyp_p=hyp_p,
hyp_sigma=hyp_sigma,
)
@dataclass(frozen=True)
class ObservedStatistic:
"""A single observed statistic. Behaves like a float and prints like infer."""
value: float
stat: str
def __float__(self) -> float:
return float(self.value)
def to_frame(self) -> pl.DataFrame:
return pl.DataFrame({"stat": [self.value]})
def __repr__(self) -> str:
return f"ObservedStatistic(stat={self.stat!r}, value={self.value:.6g})"
[docs]
@dataclass(frozen=True)
class Distribution:
"""A simulated distribution of statistics (one row per replicate)."""
data: pl.DataFrame
stat: str
null: str | None = None
type: str = "bootstrap"
@property
def stats(self) -> np.ndarray:
return self.data["stat"].to_numpy()
def __len__(self) -> int:
return self.data.height
def get_confidence_interval(
self,
level: float = 0.95,
type: str = "percentile",
*,
point_estimate: float | None = None,
) -> pl.DataFrame:
from .intervals import get_confidence_interval
return get_confidence_interval(self, level=level, type=type, point_estimate=point_estimate)
def get_p_value(self, obs_stat, direction: str) -> pl.DataFrame:
from .pvalue import get_p_value
return get_p_value(self, obs_stat=obs_stat, direction=direction)
def visualize(self, bins: int = 20, **kwargs):
from .viz import visualize
return visualize(self, bins=bins, **kwargs)
# infer-parity aliases.
def get_pvalue(self, obs_stat, direction: str) -> pl.DataFrame:
return self.get_p_value(obs_stat, direction)
def get_ci(self, level: float = 0.95, type: str = "percentile", **kw) -> pl.DataFrame:
return self.get_confidence_interval(level=level, type=type, **kw)
def visualise(self, bins: int = 20, **kwargs):
return self.visualize(bins=bins, **kwargs)
[docs]
@dataclass(frozen=True)
class FitResult:
"""Regression coefficients: observed (one row per term) or a distribution of
them across replicates (``replicate``, ``term``, ``estimate``)."""
data: pl.DataFrame
null: str | None = None
type: str = "bootstrap"
@property
def is_distribution(self) -> bool:
return "replicate" in self.data.columns
# Display niceties so a FitResult shows its table in notebooks/Quarto.
def head(self, n: int = 5) -> pl.DataFrame:
return self.data.head(n)
def _repr_html_(self) -> str:
return self.data._repr_html_()
def __repr__(self) -> str:
return repr(self.data)
def estimate_for(self, term: str) -> float:
row = self.data.filter(pl.col("term") == term)
return float(row["estimate"][0])
def get_confidence_interval(self, level: float = 0.95) -> pl.DataFrame:
from .intervals import get_fit_confidence_interval
return get_fit_confidence_interval(self, level=level)
def get_p_value(self, obs_stat: FitResult, direction: str = "two-sided") -> pl.DataFrame:
from .pvalue import get_fit_p_value
return get_fit_p_value(self, obs_stat=obs_stat, direction=direction)
def visualize(
self, bins: int = 20, *, engine: str = "plotly", shade_pvalue=None, shade_ci=None
):
from .viz import visualize_fit
return visualize_fit(
self, bins=bins, engine=engine, shade_pvalue=shade_pvalue, shade_ci=shade_ci
)
[docs]
def specify(
data: pl.DataFrame,
*,
response: str | None = None,
explanatory: str | None = None,
formula: str | None = None,
success: object | None = None,
) -> Specification:
"""Specify the response (and optional explanatory) variable(s) for inference.
Two equivalent forms, mirroring R ``infer``:
- ``specify(df, response="weight")``
- ``specify(df, formula="popular_or_not ~ track_genre", success="popular")``
Multi-term formulas (``"y ~ a + b"``) are supported for :meth:`fit`.
"""
full_formula = formula
if formula is not None:
if response is not None or explanatory is not None:
raise ValueError("pass either `formula` or `response`/`explanatory`, not both")
response, explanatory = _parse_formula(formula)
if response is None:
raise ValueError("specify() needs a `response` or a `formula`")
if response not in data.columns:
raise ValueError(f"column {response!r} not found in data")
# Validate single-column explanatory only (multi-term handled by fit()).
if (
explanatory is not None
and "+" not in explanatory
and "*" not in explanatory
and explanatory not in data.columns
):
raise ValueError(f"column {explanatory!r} not found in data")
return Specification(
data=data,
response=response,
explanatory=explanatory,
success=success,
formula=full_formula,
)
[docs]
def observe(
data: pl.DataFrame,
*,
response: str | None = None,
explanatory: str | None = None,
formula: str | None = None,
success: object | None = None,
stat: str = "mean",
order: tuple[object, object] | None = None,
null: str | None = None,
mu: float | None = None,
p: float | None = None,
sigma: float | None = None,
) -> ObservedStatistic:
"""Shortcut for ``specify() |> [hypothesize()] |> calculate()``.
Mirrors R ``infer::observe()`` — compute an observed statistic in one call.
"""
spec = specify(
data, response=response, explanatory=explanatory, formula=formula, success=success
)
if null is not None:
return spec.hypothesize(null=null, mu=mu, p=p, sigma=sigma).calculate(stat, order=order)
return spec.calculate(stat, order=order, mu=mu, p=p, sigma=sigma)
def _dataframe_specify(self, *, response=None, explanatory=None, formula=None, success=None):
"""Start an inference pipeline from this DataFrame (see :func:`specify`)."""
return specify(
self, response=response, explanatory=explanatory, formula=formula, success=success
)
def register_dataframe_accessor() -> None:
"""Attach a ``.specify()`` method to polars and pandas DataFrames.
Lets you write ``df.specify(response="y")`` — mirroring the R
``df %>% specify(...)`` flow — instead of ``specify(df, response="y")``.
Called once on import; skips a frame library that is not installed and never
overwrites an existing ``specify`` attribute.
"""
import polars as pl
if not hasattr(pl.DataFrame, "specify"):
pl.DataFrame.specify = _dataframe_specify
try:
import pandas as pd
except ImportError: # pragma: no cover - pandas is a dependency, always present
return
if not hasattr(pd.DataFrame, "specify"):
pd.DataFrame.specify = _dataframe_specify