# Source code for moderndive.theory

"""Theory-based inference wrappers (scipy.stats).

The book deliberately teaches simulation-based inference first, then ties results
back to the traditional theory-based methods (t-distribution CIs, the two-sample
test, normal approximations). These helpers provide those theory-based companions
so the chapters can draw the simulation-vs-theory comparison.

All functions return small polars frames with tidy column names.
"""

from __future__ import annotations

import numpy as np
import polars as pl

__all__ = [
    "t_test_one_sample",
    "t_test_two_sample",
    "t_confidence_interval",
    "prop_test_two_sample",
]


def _arr(x) -> np.ndarray:
    """Coerce ``x`` to a float NumPy array for the numpy/scipy routines.

    A polars ``Series`` has its nulls dropped before conversion; any other
    input is handed directly to ``np.asarray``. Both paths cast to ``float``
    so callers get the same dtype whatever container they passed in.
    """
    if isinstance(x, pl.Series):
        # Missing entries have to be removed before the float cast below.
        x = x.drop_nulls().to_numpy()
    return np.asarray(x, dtype=float)



# [docs]
def t_test_one_sample(x, mu: float = 0.0, alternative: str = "two-sided") -> pl.DataFrame:
    """One-sample t-test of H0: mean == ``mu``.

    Parameters
    ----------
    x
        Sample values; converted with ``_arr`` before testing.
    mu
        Hypothesised mean under H0.
    alternative
        ``"two-sided"``, ``"greater"`` or ``"less"``. ``"right"`` and
        ``"left"`` are accepted as aliases for ``"greater"`` and ``"less"``,
        matching ``prop_test_two_sample``. Any other value is passed to
        ``scipy.stats.ttest_1samp`` unchanged.

    Returns
    -------
    pl.DataFrame
        One row with columns ``statistic``, ``df`` (sample size minus one)
        and ``p_value``.
    """
    from scipy import stats

    # scipy does not understand the directional aliases that
    # prop_test_two_sample accepts, so translate them here.
    alternative = {"right": "greater", "left": "less"}.get(alternative, alternative)

    a = _arr(x)
    res = stats.ttest_1samp(a, popmean=mu, alternative=alternative)
    return pl.DataFrame(
        {
            "statistic": [float(res.statistic)],
            "df": [float(a.size - 1)],
            "p_value": [float(res.pvalue)],
        }
    )




# [docs]
def t_confidence_interval(x, level: float = 0.95) -> pl.DataFrame:
    """Theory-based t confidence interval for a single mean.

    Parameters
    ----------
    x
        Sample values; converted with ``_arr`` before use.
    level
        Confidence level as a proportion strictly between 0 and 1
        (``0.95`` for a 95% interval).

    Returns
    -------
    pl.DataFrame
        One row with columns ``lower_ci`` and ``upper_ci``, computed as
        ``mean -/+ t_crit * s / sqrt(n)`` with ``n - 1`` degrees of freedom.

    Raises
    ------
    ValueError
        If ``level`` is not strictly between 0 and 1.
    """
    from scipy import stats

    # A percentage such as 95 would push the quantile outside [0, 1] and
    # silently produce NaN bounds; reject it up front instead.
    if not 0.0 < level < 1.0:
        raise ValueError(f"level must be strictly between 0 and 1, got {level!r}")

    a = _arr(x)
    n = a.size
    mean = float(a.mean())
    # Standard error of the mean, using the sample (ddof=1) standard deviation.
    se = float(a.std(ddof=1) / np.sqrt(n))
    # Upper-tail critical value for a two-sided interval.
    tcrit = float(stats.t.ppf(1 - (1 - level) / 2, df=n - 1))
    return pl.DataFrame({"lower_ci": [mean - tcrit * se], "upper_ci": [mean + tcrit * se]})




# [docs]
def t_test_two_sample(
    x, y, alternative: str = "two-sided", equal_var: bool = False
) -> pl.DataFrame:
    """Two-sample (Welch by default) t-test of equal means.

    Parameters
    ----------
    x, y
        The two samples; each is converted with ``_arr`` before testing.
    alternative
        ``"two-sided"``, ``"greater"`` or ``"less"``. ``"right"`` and
        ``"left"`` are accepted as aliases for ``"greater"`` and ``"less"``,
        matching ``prop_test_two_sample``. Any other value is passed to
        ``scipy.stats.ttest_ind`` unchanged.
    equal_var
        Forwarded to ``scipy.stats.ttest_ind``; ``False`` (the default)
        requests the Welch test, ``True`` the pooled-variance test.

    Returns
    -------
    pl.DataFrame
        One row with columns ``statistic`` and ``p_value``.
    """
    from scipy import stats

    # scipy does not understand the directional aliases that
    # prop_test_two_sample accepts, so translate them here.
    alternative = {"right": "greater", "left": "less"}.get(alternative, alternative)

    a, b = _arr(x), _arr(y)
    res = stats.ttest_ind(a, b, equal_var=equal_var, alternative=alternative)
    return pl.DataFrame({"statistic": [float(res.statistic)], "p_value": [float(res.pvalue)]})




# [docs]
def prop_test_two_sample(
    successes: tuple[int, int], totals: tuple[int, int], alternative: str = "two-sided"
) -> pl.DataFrame:
    """Two-sample z-test for a difference in proportions (normal approximation).

    Parameters
    ----------
    successes
        Number of successes in group 1 and group 2.
    totals
        Number of trials in group 1 and group 2.
    alternative
        ``"greater"``/``"right"``, ``"less"``/``"left"``, or a two-sided
        spelling (``"two-sided"``, ``"two_sided"``, ``"two sided"``,
        ``"two.sided"``, ``"both"``).

    Returns
    -------
    pl.DataFrame
        One row with columns ``estimate`` (``p1 - p2``), ``statistic`` (the z
        score computed with the pooled standard error) and ``p_value``.

    Raises
    ------
    ValueError
        If a total is not positive, a success count lies outside
        ``[0, total]``, or ``alternative`` is not recognised.
    """
    from scipy import stats

    x1, x2 = successes
    n1, n2 = totals
    if n1 <= 0 or n2 <= 0:
        raise ValueError(f"totals must both be positive, got {totals!r}")
    if not (0 <= x1 <= n1 and 0 <= x2 <= n2):
        raise ValueError(
            f"successes must lie between 0 and the matching total, "
            f"got successes={successes!r}, totals={totals!r}"
        )

    p1, p2 = x1 / n1, x2 / n2
    # Under H0 both groups share one proportion, so the standard error uses
    # the pooled estimate rather than the per-group proportions.
    p_pool = (x1 + x2) / (n1 + n2)
    se = np.sqrt(p_pool * (1 - p_pool) * (1 / n1 + 1 / n2))
    z = (p1 - p2) / se

    if alternative in ("greater", "right"):
        p = float(stats.norm.sf(z))
    elif alternative in ("less", "left"):
        p = float(stats.norm.cdf(z))
    elif alternative in ("two-sided", "two_sided", "two sided", "two.sided", "both"):
        p = float(2 * stats.norm.sf(abs(z)))
    else:
        # A misspelt direction must not silently fall back to a two-sided test.
        raise ValueError(
            f"alternative must be 'two-sided', 'greater' or 'less', got {alternative!r}"
        )
    return pl.DataFrame({"estimate": [p1 - p2], "statistic": [float(z)], "p_value": [p]})