Source code for ergodic_insurance.statistical_tests

"""Statistical hypothesis testing utilities for simulation results.

This module provides bootstrap-based hypothesis testing functions for
comparing strategies, validating performance differences, and assessing
statistical significance of simulation outcomes.

Example:
    >>> from ergodic_insurance.statistical_tests import difference_in_means_test
    >>> import numpy as np

    >>> # Compare two strategies
    >>> strategy_a_returns = np.random.normal(0.08, 0.02, 1000)
    >>> strategy_b_returns = np.random.normal(0.10, 0.03, 1000)

    >>> result = difference_in_means_test(
    ...     strategy_a_returns,
    ...     strategy_b_returns,
    ...     alternative='less'
    ... )
    >>> print(f"P-value: {result.p_value:.4f}")
    >>> print(f"Strategy B is better: {result.reject_null}")

Attributes:
    DEFAULT_N_BOOTSTRAP (int): Default bootstrap iterations for tests (10000).
    DEFAULT_ALPHA (float): Default significance level (0.05).
"""

from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Tuple

import numpy as np
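
# Module-level defaults documented in the module docstring's Attributes section;
# the functions below pass these values as literal keyword defaults.
DEFAULT_N_BOOTSTRAP = 10000
DEFAULT_ALPHA = 0.05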


def _calculate_p_value(
    bootstrap_vals: np.ndarray,
    observed_val: float,
    alternative: str,
    null_val: float = 0.0,
) -> float:
    """Calculate p-value for bootstrap hypothesis test.

    Args:
        bootstrap_vals: Bootstrap distribution values.
        observed_val: Observed test statistic.
        alternative: Type of alternative hypothesis.
        null_val: Null hypothesis value.

    Returns:
        P-value for the test.
    """
    if alternative == "two-sided":
        return float(np.mean(np.abs(bootstrap_vals - null_val) >= np.abs(observed_val - null_val)))
    if alternative == "less":
        return float(np.mean(bootstrap_vals <= observed_val))
    # greater
    return float(np.mean(bootstrap_vals >= observed_val))
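

# Illustrative sketch (not part of the public API): how the p-value rules above
# behave on a tiny, hand-made bootstrap distribution. The numbers are made up.
#
#   >>> boot = np.array([-0.2, -0.1, 0.0, 0.1, 0.3])
#   >>> _calculate_p_value(boot, 0.25, "two-sided")
#   0.2    # only |0.3| >= |0.25|, i.e. 1 of 5 values
#   >>> _calculate_p_value(boot, 0.25, "greater")
#   0.2    # only 0.3 >= 0.25
#   >>> _calculate_p_value(boot, 0.25, "less")
#   0.8    # four of five values are <= 0.25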


def _bootstrap_confidence_interval(
    data: np.ndarray,
    statistic: Callable[[np.ndarray], float],
    n_bootstrap: int,
    alpha: float,
    rng: np.random.Generator,
) -> Tuple[float, float]:
    """Calculate bootstrap confidence interval.

    Args:
        data: Input data.
        statistic: Statistic to compute.
        n_bootstrap: Number of bootstrap iterations.
        alpha: Significance level.
        rng: Random number generator.

    Returns:
        Confidence interval tuple.
    """
    bootstrap_stats = np.zeros(n_bootstrap)
    n = len(data)

    for i in range(n_bootstrap):
        indices = rng.choice(n, size=n, replace=True)
        bootstrap_stats[i] = statistic(data[indices])

    percentiles = [(alpha / 2) * 100, (1 - alpha / 2) * 100]
    ci = np.percentile(bootstrap_stats, percentiles)
    return float(ci[0]), float(ci[1])
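

# Usage sketch (illustrative data and parameters): a 95% percentile CI for the
# mean of a synthetic sample.
#
#   >>> rng = np.random.default_rng(42)
#   >>> data = rng.normal(0.08, 0.02, size=500)
#   >>> lo, hi = _bootstrap_confidence_interval(data, np.mean, 1000, 0.05, rng)
#   >>> lo < float(np.mean(data)) < hi
#   True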


def _bootstrap_ratio_distribution(
    sample1: np.ndarray,
    sample2: np.ndarray,
    statistic: Callable[[np.ndarray], float],
    n_bootstrap: int,
    rng: np.random.Generator,
) -> np.ndarray:
    """Generate bootstrap distribution of ratios.

    Args:
        sample1: First sample.
        sample2: Second sample.
        statistic: Function to compute on each sample.
        n_bootstrap: Number of bootstrap iterations.
        rng: Random number generator.

    Returns:
        Array of valid bootstrap ratios.
    """
    n1, n2 = len(sample1), len(sample2)
    bootstrap_ratios = []

    for _i in range(n_bootstrap):
        idx1 = rng.choice(n1, size=n1, replace=True)
        idx2 = rng.choice(n2, size=n2, replace=True)

        boot_stat1 = statistic(sample1[idx1])
        boot_stat2 = statistic(sample2[idx2])

        if boot_stat2 != 0:
            bootstrap_ratios.append(boot_stat1 / boot_stat2)

    return np.array(bootstrap_ratios)
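

# Note: resamples whose denominator statistic is zero are silently dropped, so the
# returned array can be shorter than n_bootstrap. Illustrative usage (made-up data):
#
#   >>> rng = np.random.default_rng(0)
#   >>> a = rng.normal(1.2, 0.1, size=200)
#   >>> b = rng.normal(1.0, 0.1, size=200)
#   >>> ratios = _bootstrap_ratio_distribution(a, b, np.mean, 500, rng)
#   >>> ratios.shape[0] <= 500
#   True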


def _permutation_bootstrap(
    combined: np.ndarray,
    n1: int,
    n_bootstrap: int,
    rng: np.random.Generator,
) -> np.ndarray:
    """Generate bootstrap distribution via permutation.

    Args:
        combined: Combined data array.
        n1: Size of first sample.
        n_bootstrap: Number of bootstrap iterations.
        rng: Random number generator.

    Returns:
        Bootstrap distribution array.
    """
    bootstrap_diffs = np.zeros(n_bootstrap)

    for i in range(n_bootstrap):
        permuted = rng.permutation(combined)
        perm_sample1 = permuted[:n1]
        perm_sample2 = permuted[n1:]
        bootstrap_diffs[i] = np.mean(perm_sample1) - np.mean(perm_sample2)

    return bootstrap_diffs
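

# The permutation scheme pools both samples and repeatedly re-splits them at the
# original sample sizes, approximating the distribution of the difference in means
# under the null of identical populations. Illustrative usage (made-up data):
#
#   >>> rng = np.random.default_rng(7)
#   >>> pooled = np.concatenate([rng.normal(0, 1, 50), rng.normal(0, 1, 80)])
#   >>> diffs = _permutation_bootstrap(pooled, 50, 2000, rng)
#   >>> abs(float(np.mean(diffs))) < 0.1
#   True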


@dataclass
class HypothesisTestResult:
    """Container for hypothesis test results."""

    test_statistic: float
    p_value: float
    reject_null: bool
    confidence_interval: Tuple[float, float]
    null_hypothesis: str
    alternative: str
    alpha: float
    method: str
    bootstrap_distribution: Optional[np.ndarray] = None
    metadata: Optional[Dict[str, Any]] = None

    def summary(self) -> str:
        """Generate human-readable summary of test results.

        Returns:
            Formatted string with test results and interpretation.
        """
        summary = [
            "Hypothesis Test Results",
            f"{'=' * 40}",
            f"Null Hypothesis: {self.null_hypothesis}",
            f"Alternative: {self.alternative}",
            f"Test Statistic: {self.test_statistic:.6f}",
            f"P-value: {self.p_value:.4f}",
            f"Significance Level: {self.alpha:.3f}",
            f"Reject Null: {'Yes' if self.reject_null else 'No'}",
            f"{(1-self.alpha):.1%} Confidence Interval:",
            f"  [{self.confidence_interval[0]:.6f}, {self.confidence_interval[1]:.6f}]",
            f"Method: {self.method}",
        ]

        # Add interpretation
        if self.reject_null:
            summary.append(f"\nConclusion: Significant difference detected (p < {self.alpha})")
        else:
            summary.append(f"\nConclusion: No significant difference (p >= {self.alpha})")

        return "\n".join(summary)


def difference_in_means_test(
    sample1: np.ndarray,
    sample2: np.ndarray,
    alternative: str = "two-sided",
    alpha: float = 0.05,
    n_bootstrap: int = 10000,
    seed: Optional[int] = None,
) -> HypothesisTestResult:
    """Test difference in means between two samples using bootstrap.

    Tests the null hypothesis that the means of two populations are equal
    against various alternatives using bootstrap resampling.

    Args:
        sample1: First sample array.
        sample2: Second sample array.
        alternative: Type of alternative hypothesis:
            - 'two-sided': means are different
            - 'less': mean1 < mean2
            - 'greater': mean1 > mean2
        alpha: Significance level (default 0.05).
        n_bootstrap: Number of bootstrap iterations (default 10000).
        seed: Random seed for reproducibility.

    Returns:
        HypothesisTestResult containing test statistics and decision.

    Raises:
        ValueError: If alternative is not valid.

    Example:
        >>> # Test if Strategy A has lower returns than Strategy B
        >>> result = difference_in_means_test(
        ...     returns_a, returns_b, alternative='less'
        ... )
        >>> if result.reject_null:
        ...     print("Strategy B significantly outperforms Strategy A")
    """
    if alternative not in ["two-sided", "less", "greater"]:
        raise ValueError(
            f"Alternative must be 'two-sided', 'less', or 'greater', got {alternative}"
        )

    # Calculate observed difference and setup
    observed_diff = np.mean(sample1) - np.mean(sample2)
    n1, n2 = len(sample1), len(sample2)
    rng = np.random.default_rng(seed)

    # Bootstrap under null hypothesis (permutation)
    combined = np.concatenate([sample1, sample2])
    bootstrap_diffs = _permutation_bootstrap(combined, n1, n_bootstrap, rng)

    # Calculate p-value based on alternative
    p_value = _calculate_p_value(bootstrap_diffs, observed_diff, alternative)

    # Confidence interval for the difference. The helper resamples a dummy index
    # array; the statistic below ignores that argument and resamples both
    # samples directly.
    def diff_statistic(_resampled: np.ndarray) -> float:
        """Calculate difference in means for one bootstrap resample."""
        s1_idx = rng.choice(n1, size=n1, replace=True)
        s2_idx = rng.choice(n2, size=n2, replace=True)
        return float(np.mean(sample1[s1_idx]) - np.mean(sample2[s2_idx]))

    ci = _bootstrap_confidence_interval(
        np.arange(n1 + n2),  # Dummy array for indexing
        diff_statistic,
        n_bootstrap,
        alpha,
        rng,
    )

    return HypothesisTestResult(
        test_statistic=observed_diff,
        p_value=p_value,
        reject_null=p_value < alpha,
        confidence_interval=ci,
        null_hypothesis="mean1 = mean2",
        alternative=alternative,
        alpha=alpha,
        method="bootstrap permutation test",
        bootstrap_distribution=bootstrap_diffs,
        metadata={
            "mean1": np.mean(sample1),
            "mean2": np.mean(sample2),
            "n1": n1,
            "n2": n2,
            "n_bootstrap": n_bootstrap,
        },
    )


def ratio_of_metrics_test(
    sample1: np.ndarray,
    sample2: np.ndarray,
    statistic: Callable[[np.ndarray], float] = np.mean,
    null_ratio: float = 1.0,
    alternative: str = "two-sided",
    alpha: float = 0.05,
    n_bootstrap: int = 10000,
    seed: Optional[int] = None,
) -> HypothesisTestResult:
    """Test ratio of metrics between two samples using bootstrap.

    Tests whether the ratio of a statistic (e.g., mean, median) between
    two samples equals a specified value (typically 1.0).

    Args:
        sample1: First sample array.
        sample2: Second sample array.
        statistic: Function to compute on each sample (default: mean).
        null_ratio: Null hypothesis ratio value (default: 1.0).
        alternative: Alternative hypothesis type.
        alpha: Significance level.
        n_bootstrap: Number of bootstrap iterations.
        seed: Random seed.

    Returns:
        HypothesisTestResult for the ratio test.

    Example:
        >>> # Test if ROE ratio differs from 1.0
        >>> result = ratio_of_metrics_test(
        ...     roe_strategy_a,
        ...     roe_strategy_b,
        ...     statistic=np.median,
        ...     null_ratio=1.0
        ... )
    """
    if alternative not in ["two-sided", "less", "greater"]:
        raise ValueError("Alternative must be 'two-sided', 'less', or 'greater'")

    # Calculate observed ratio
    stat1, stat2 = statistic(sample1), statistic(sample2)

    if stat2 == 0:
        # Degenerate case: the denominator statistic is zero, so the ratio is
        # undefined. Return a non-significant result with an empty interval.
        return HypothesisTestResult(
            test_statistic=np.inf,
            p_value=1.0,
            reject_null=False,
            confidence_interval=(np.nan, np.nan),
            null_hypothesis=f"ratio = {null_ratio}",
            alternative=alternative,
            alpha=alpha,
            method="bootstrap ratio test",
        )

    observed_ratio = stat1 / stat2

    # Bootstrap distribution of the ratio
    rng = np.random.default_rng(seed)
    bootstrap_ratios_array = _bootstrap_ratio_distribution(
        sample1, sample2, statistic, n_bootstrap, rng
    )

    # Calculate p-value (center around the null ratio for the hypothesis test)
    p_value = _calculate_p_value(
        bootstrap_ratios_array - np.mean(bootstrap_ratios_array) + null_ratio,
        observed_ratio,
        alternative,
        null_ratio,
    )

    # Confidence interval for the ratio
    ci = np.percentile(bootstrap_ratios_array, [(alpha / 2) * 100, (1 - alpha / 2) * 100])

    return HypothesisTestResult(
        test_statistic=observed_ratio,
        p_value=float(p_value),
        reject_null=p_value < alpha,
        confidence_interval=(float(ci[0]), float(ci[1])),
        null_hypothesis=f"ratio = {null_ratio}",
        alternative=alternative,
        alpha=alpha,
        method="bootstrap ratio test",
        bootstrap_distribution=bootstrap_ratios_array,
        metadata={
            "stat1": stat1,
            "stat2": stat2,
            "null_ratio": null_ratio,
            "n_valid_bootstraps": len(bootstrap_ratios_array),
        },
    )


def paired_comparison_test(
    paired_differences: np.ndarray,
    null_value: float = 0.0,
    alternative: str = "two-sided",
    alpha: float = 0.05,
    n_bootstrap: int = 10000,
    seed: Optional[int] = None,
) -> HypothesisTestResult:
    """Test paired differences using bootstrap.

    Tests whether paired differences (e.g., from matched scenarios)
    have a mean equal to a specified value (typically 0).

    Args:
        paired_differences: Array of paired differences.
        null_value: Null hypothesis value for mean difference (default: 0).
        alternative: Alternative hypothesis type.
        alpha: Significance level.
        n_bootstrap: Number of bootstrap iterations.
        seed: Random seed.

    Returns:
        HypothesisTestResult for the paired test.

    Example:
        >>> # Test if insurance improves outcomes
        >>> differences = outcomes_with_insurance - outcomes_without_insurance
        >>> result = paired_comparison_test(differences, alternative='greater')
    """
    if alternative not in ["two-sided", "less", "greater"]:
        raise ValueError("Alternative must be 'two-sided', 'less', or 'greater'")

    # Calculate observed mean difference
    observed_mean = np.mean(paired_differences)
    n = len(paired_differences)

    # Center differences around null value for bootstrap
    centered_diffs = paired_differences - observed_mean + null_value

    # Bootstrap distribution under null
    rng = np.random.default_rng(seed)
    bootstrap_means = np.zeros(n_bootstrap)

    for i in range(n_bootstrap):
        indices = rng.choice(n, size=n, replace=True)
        bootstrap_means[i] = np.mean(centered_diffs[indices])

    # Calculate p-value
    p_value = _calculate_p_value(bootstrap_means, observed_mean, alternative, null_value)

    # Confidence interval for mean difference
    ci = _bootstrap_confidence_interval(
        paired_differences,
        np.mean,
        n_bootstrap,
        alpha,
        rng,
    )

    return HypothesisTestResult(
        test_statistic=observed_mean,
        p_value=p_value,
        reject_null=p_value < alpha,
        confidence_interval=ci,
        null_hypothesis=f"mean difference = {null_value}",
        alternative=alternative,
        alpha=alpha,
        method="bootstrap paired test",
        bootstrap_distribution=bootstrap_means,
        metadata={
            "n_pairs": n,
            "std_difference": np.std(paired_differences),
            "null_value": null_value,
        },
    )


def _bootstrap_null_distribution(
    data: np.ndarray,
    test_statistic: Callable[[np.ndarray], float],
    n_bootstrap: int,
    rng: np.random.Generator,
) -> np.ndarray:
    """Generate bootstrap distribution under null.

    Args:
        data: Data array (already transformed for null).
        test_statistic: Function to compute test statistic.
        n_bootstrap: Number of bootstrap iterations.
        rng: Random number generator.

    Returns:
        Bootstrap distribution array.
    """
    n = len(data)
    bootstrap_stats = np.zeros(n_bootstrap)

    for i in range(n_bootstrap):
        indices = rng.choice(n, size=n, replace=True)
        bootstrap_sample = data[indices]
        bootstrap_stats[i] = test_statistic(bootstrap_sample)

    return bootstrap_stats


def bootstrap_hypothesis_test(
    data: np.ndarray,
    null_hypothesis: Callable[[np.ndarray], np.ndarray],
    test_statistic: Callable[[np.ndarray], float],
    alternative: str = "two-sided",
    alpha: float = 0.05,
    n_bootstrap: int = 10000,
    seed: Optional[int] = None,
) -> HypothesisTestResult:
    """General bootstrap hypothesis testing framework.

    Allows testing of custom hypotheses using any test statistic.

    Args:
        data: Input data array.
        null_hypothesis: Function that transforms data to satisfy the null.
        test_statistic: Function to compute the test statistic.
        alternative: Alternative hypothesis type.
        alpha: Significance level.
        n_bootstrap: Number of bootstrap iterations.
        seed: Random seed.

    Returns:
        HypothesisTestResult for the custom test.

    Example:
        >>> # Test if variance exceeds threshold
        >>> def null_transform(x):
        ...     return x * np.sqrt(threshold_var / np.var(x))
        >>> result = bootstrap_hypothesis_test(
        ...     data, null_transform, np.var, alternative='greater'
        ... )
    """
    if alternative not in ["two-sided", "less", "greater"]:
        raise ValueError("Alternative must be 'two-sided', 'less', or 'greater'")

    # Calculate observed test statistic
    observed_stat = test_statistic(data)

    # Transform data to satisfy the null hypothesis; null_hypothesis is expected
    # to return an ndarray.
    null_data = null_hypothesis(data)

    # Bootstrap distribution under the null
    rng = np.random.default_rng(seed)
    bootstrap_stats = _bootstrap_null_distribution(null_data, test_statistic, n_bootstrap, rng)

    # Calculate p-value
    null_stat = test_statistic(null_data)
    p_value = _calculate_p_value(bootstrap_stats, observed_stat, alternative, null_stat)

    # Confidence interval for the test statistic
    ci = _bootstrap_confidence_interval(
        data,
        test_statistic,
        n_bootstrap,
        alpha,
        rng,
    )

    return HypothesisTestResult(
        test_statistic=observed_stat,
        p_value=p_value,
        reject_null=p_value < alpha,
        confidence_interval=ci,
        null_hypothesis="Custom null hypothesis",
        alternative=alternative,
        alpha=alpha,
        method="bootstrap hypothesis test",
        bootstrap_distribution=bootstrap_stats,
    )


def multiple_comparison_correction(
    p_values: List[float],
    method: str = "bonferroni",
    alpha: float = 0.05,
) -> Tuple[np.ndarray, np.ndarray]:
    """Apply multiple comparison correction to p-values.

    Adjusts p-values when multiple hypothesis tests are performed
    to control family-wise error rate or false discovery rate.

    Args:
        p_values: List of p-values from multiple tests.
        method: Correction method:
            - 'bonferroni': Bonferroni correction
            - 'holm': Holm-Bonferroni method
            - 'fdr': Benjamini-Hochberg FDR
        alpha: Overall significance level.

    Returns:
        Tuple of (adjusted_p_values, reject_decisions).

    Example:
        >>> p_vals = [0.01, 0.04, 0.03, 0.20]
        >>> adj_p, reject = multiple_comparison_correction(p_vals)
        >>> print(f"Significant tests: {np.sum(reject)}")
    """
    p_values_array: np.ndarray = np.array(p_values)
    n_tests = len(p_values_array)

    if method == "bonferroni":
        # Simple Bonferroni correction
        adjusted_p = np.minimum(p_values_array * n_tests, 1.0)
        reject = adjusted_p < alpha

    elif method == "holm":
        # Holm-Bonferroni step-down method
        sorted_idx = np.argsort(p_values_array)
        sorted_p = p_values_array[sorted_idx]

        adjusted_p = np.zeros(n_tests)
        reject = np.zeros(n_tests, dtype=bool)

        still_rejecting = True
        for i in range(n_tests):
            adj_p = sorted_p[i] * (n_tests - i)
            adjusted_p[sorted_idx[i]] = min(adj_p, 1.0)
            if still_rejecting and adj_p < alpha:
                reject[sorted_idx[i]] = True
            else:
                # Once a test fails to reject, all remaining (larger) p-values also
                # fail, but their adjusted p-values are still computed and reported.
                still_rejecting = False

    elif method == "fdr":
        # Benjamini-Hochberg FDR control
        sorted_idx = np.argsort(p_values_array)
        sorted_p = p_values_array[sorted_idx]

        adjusted_p = np.zeros(n_tests)
        reject = np.zeros(n_tests, dtype=bool)

        # Find largest i where P(i) <= (i/m) * alpha
        for i in range(n_tests - 1, -1, -1):
            threshold = ((i + 1) / n_tests) * alpha
            if sorted_p[i] <= threshold:
                # Reject all hypotheses up to and including i
                for j in range(i + 1):
                    reject[sorted_idx[j]] = True
                break

        # Adjust p-values
        for i in range(n_tests):
            adj_p = sorted_p[i] * n_tests / (i + 1)
            adjusted_p[sorted_idx[i]] = min(adj_p, 1.0)

    else:
        raise ValueError(f"Method must be 'bonferroni', 'holm', or 'fdr', got {method}")

    return adjusted_p, reject
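

# ---------------------------------------------------------------------------
# Minimal demo sketch (not part of the public API): exercises the public
# functions on synthetic data. The strategies, sample sizes, and p-values used
# here are illustrative assumptions, not results from the project.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    demo_rng = np.random.default_rng(2024)

    # Two synthetic "strategies" with different mean returns.
    returns_a = demo_rng.normal(0.08, 0.02, size=1000)
    returns_b = demo_rng.normal(0.10, 0.03, size=1000)

    # Is strategy A's mean significantly lower than strategy B's?
    demo_result = difference_in_means_test(returns_a, returns_b, alternative="less", seed=1)
    print(demo_result.summary())

    # Correct a small set of p-values for multiple comparisons.
    adjusted, decisions = multiple_comparison_correction(
        [demo_result.p_value, 0.04, 0.20], method="holm"
    )
    print("Adjusted p-values:", np.round(adjusted, 4))
    print("Reject decisions:", decisions)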