# tso-paper-eval/analysis/values/tests.py

"""Pairwise hypothesis tests across solutions.

Means: Welch's t (two-sided + one-sided) and Mann-Whitney U.
Variance: Brown-Forsythe (Levene, median-centered) and Fligner-Killeen.
"""
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats

# Solutions in the order used to name comparisons; the earlier-listed
# solution is always the first element ("a") of a pair.
_SOLUTIONS = ("no-tso", "tso", "tso-pacing", "cake")

# Every unordered pair of solutions, enumerated row by row:
# no-tso vs. the rest, then tso vs. the rest, then tso-pacing vs. cake.
PAIRS = [
    (first, second)
    for idx, first in enumerate(_SOLUTIONS)
    for second in _SOLUTIONS[idx + 1:]
]

# Uncorrected significance level; the Bonferroni thresholds reported by
# compute() are this value divided by the number of comparisons in a family.
ALPHA = 0.05


def _fmt(x: float) -> str:
    return f"{x:.4g}"


def _mean_tests(out: dict, base: str, a: np.ndarray, b: np.ndarray) -> None:
    """Store location-test p-values for samples *a* and *b* under ``base``.

    Writes four formatted p-values into *out*: Welch's t-test (two-sided,
    "less" and "greater" alternatives) and the two-sided Mann-Whitney U test.
    """
    # Result-key suffix -> alternative hypothesis passed to Welch's t-test.
    welch_alternatives = {
        "welch-p": "two-sided",
        "welch-less-p": "less",
        "welch-greater-p": "greater",
    }

    # Run every test before touching `out`, so a failing test leaves it as is.
    pvalues = {
        suffix: stats.ttest_ind(a, b, equal_var=False, alternative=alt).pvalue
        for suffix, alt in welch_alternatives.items()
    }
    pvalues["mwu-p"] = stats.mannwhitneyu(a, b, alternative="two-sided").pvalue

    for suffix, pvalue in pvalues.items():
        out[f"{base}/{suffix}"] = _fmt(pvalue)


def _variance_tests(out: dict, base: str, a: np.ndarray, b: np.ndarray) -> None:
    """Store spread-test p-values for samples *a* and *b* under ``base``.

    Writes the Brown-Forsythe p-value (Levene's test centered on the median)
    and the Fligner-Killeen p-value into *out*.
    """
    # Both tests run before anything is written to `out`.
    results = (
        ("bf-p", stats.levene(a, b, center="median")),
        ("fligner-p", stats.fligner(a, b)),
    )
    for suffix, result in results:
        out[f"{base}/{suffix}"] = _fmt(result.pvalue)


def _pairwise_tests(out: dict, prefix: str, vals: dict, *, variance: bool) -> int:
    """Run the hypothesis tests for every pair in PAIRS that *vals* covers.

    Results are stored in *out* under ``{prefix}/test/{a}-vs-{b}/...``.
    Mean tests always run; variance tests run only when *variance* is true.
    Pairs with a solution missing from *vals* are skipped.

    Returns the number of pairs that were actually compared.
    """
    compared = 0
    for a, b in PAIRS:
        if a not in vals or b not in vals:
            continue
        base = f"{prefix}/test/{a}-vs-{b}"
        _mean_tests(out, base, vals[a], vals[b])
        if variance:
            _variance_tests(out, base, vals[a], vals[b])
        compared += 1
    return compared


def compute(derived: Path) -> tuple[dict[str, str], list[Path]]:
    """Compute pairwise test p-values for every experiment under *derived*.

    Each immediate subdirectory of *derived* is treated as one experiment and
    processed in sorted order. Within an experiment, whichever of these files
    exist are read, with samples grouped by their ``solution`` column:

    * ``runs.csv``: columns ``cpu_sender`` and ``cpu_receiver`` (mean tests).
    * ``rtts.csv``: column ``rtt_us``, divided by 1000 (mean + variance tests).
    * ``idts.csv``: column ``idt_us`` (mean + variance tests).

    Finally, Bonferroni-corrected significance thresholds are added per test
    family, together with the uncorrected ALPHA.

    Returns:
        A tuple ``(out, sources)``: *out* maps result keys to formatted
        values, *sources* lists the CSV files that were read, in read order.
    """
    out: dict[str, str] = {}
    sources: list[Path] = []
    # Number of pairwise comparisons performed per data kind, accumulated
    # over all experiments; these drive the Bonferroni correction below.
    n_cpu = n_rtt = n_idt = 0

    for exp_dir in sorted(p for p in derived.iterdir() if p.is_dir()):
        exp = exp_dir.name

        runs_path = exp_dir / "runs.csv"
        if runs_path.exists():
            sources.append(runs_path)
            # Group once; the same grouping serves both CPU metrics.
            by_solution = pd.read_csv(runs_path).groupby("solution")
            for metric, col in (("sender-cpu", "cpu_sender"),
                                ("receiver-cpu", "cpu_receiver")):
                vals = {sol: sub[col].to_numpy() for sol, sub in by_solution}
                n_cpu += _pairwise_tests(out, f"{exp}/{metric}", vals,
                                         variance=False)

        rtts_path = exp_dir / "rtts.csv"
        if rtts_path.exists():
            sources.append(rtts_path)
            rtts = pd.read_csv(rtts_path)
            vals = {sol: sub["rtt_us"].to_numpy() / 1000.0
                    for sol, sub in rtts.groupby("solution")}
            n_rtt += _pairwise_tests(out, f"{exp}/rtt", vals, variance=True)

        idts_path = exp_dir / "idts.csv"
        if idts_path.exists():
            sources.append(idts_path)
            idts = pd.read_csv(idts_path, usecols=["solution", "idt_us"])
            vals = {sol: sub["idt_us"].to_numpy()
                    for sol, sub in idts.groupby("solution")}
            n_idt += _pairwise_tests(out, f"{exp}/idt", vals, variance=True)

    # Bonferroni-corrected thresholds. Each family is sized by the number of
    # pairwise comparisons performed, not by the number of test statistics
    # stored per comparison. Mean and variance families of the same data kind
    # share a count because both test sets run on every compared pair.
    families = {
        "cpu":          n_cpu,
        "rtt-mean":     n_rtt,
        "rtt-variance": n_rtt,
        "idt-mean":     n_idt,
        "idt-variance": n_idt,
    }
    for name, n in families.items():
        if n > 0:
            out[f"bonferroni/{name}-n"]     = str(n)
            out[f"bonferroni/{name}-alpha"] = _fmt(ALPHA / n)
    out["bonferroni/alpha-uncorrected"] = _fmt(ALPHA)

    return out, sources