Validate

ValidationTest protocol, Bootstrap, OutOfSample, WalkForward

ValidationTest base

python
class ValidationTest(ABC):
    name: str = ""

    def __init__(self, thresholds: dict[str, float] | None = None):
        self.thresholds = thresholds or {}

    @abstractmethod
    def run(self, **kwargs) -> ValidationResult:
        """Execute the test."""

Subclasses accept their own specific kwargs (e.g., returns= for Bootstrap, strategy= and backtest= for OutOfSample).

ValidationResult base

python
@dataclass
class ValidationResult:
    test_name: str = ""
    thresholds: dict[str, float] = field(default_factory=dict)
    threshold_checks: dict[str, bool] = field(default_factory=dict)
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def passed: bool          # True iff all user thresholds pass

    def to_dict(self) -> dict
    def to_json(self, indent: int = 2) -> str
    def save(self, path: str | Path) -> None

Bootstrap (real)

python
from horizon.validate import Bootstrap

Bootstrap(
    metrics: list[str] | None = None,    # default: ["sharpe","sortino","max_drawdown","cagr"]
    n_samples: int = 1000,
    method: str = "block",                # "block" | "iid"
    block_size: int = 20,
    seed: int | None = None,
    thresholds: dict[str, float] | None = None,
)

.run(returns=...)

python
result = Bootstrap(metrics=["sharpe"], n_samples=1000, seed=42).run(
    returns=[0.001, -0.002, 0.003, ...],
)

BootstrapResult

python
@dataclass
class BootstrapResult(ValidationResult):
    samples: dict[str, list[float]]
    point_estimates: dict[str, float]
    n_samples: int
    method: str
    block_size: int

    def ci(self, metric: str, conf: float = 0.95) -> tuple[float, float]
    def distribution(self, metric: str) -> list[float]
    def median(self, metric: str) -> float

Threshold keys

  • {metric}_ci_lo_min: lower 95% CI bound must exceed this value
  • {metric}_median_min: bootstrap median must exceed this value
python
bs = Bootstrap(
    metrics=["sharpe"],
    n_samples=1000,
    thresholds={"sharpe_ci_lo_min": 0.5},
)
result = bs.run(returns=my_returns)
assert result.passed

OutOfSample

python
from horizon.validate import OutOfSample

OutOfSample(
    train_pct: float = 0.7,
    thresholds: dict[str, float] | None = None,
)

.run(strategy, backtest, universe, asset_classes)

python
result = OutOfSample(train_pct=0.7).run(
    strategy=MyStrategy,
    backtest=BacktestConfig(initial_cash_usd=100_000),
    universe=my_universe,
    asset_classes=[Equity],
)

OutOfSampleResult

python
@dataclass
class OutOfSampleResult(ValidationResult):
    is_metrics: dict[str, float]
    oos_metrics: dict[str, float]
    is_equity_curve: Any
    oos_equity_curve: Any
    degradation: dict[str, float]

    @property
    def is_oos_gap_sharpe: float

Threshold keys

  • oos_sharpe_min: OOS Sharpe must be at least this value
  • is_oos_sharpe_ratio_max: IS/OOS Sharpe ratio must not exceed this value

WalkForward

python
from horizon.validate import WalkForward

WalkForward(
    train: str = "2y",
    test: str = "3m",
    step: str = "3m",
    retune_params: list[str] | None = None,
    tuner: Any = None,
    thresholds: dict[str, float] | None = None,
)

.run(strategy, backtest, universe, asset_classes)

python
wf = WalkForward(train="2y", test="3m", step="3m")
result = wf.run(
    strategy=MyStrategy,
    backtest=my_bt_config,
    universe=my_universe,
    asset_classes=[Equity],
)

WalkForwardResult

python
@dataclass
class WalkForwardWindow:
    train_start: Any
    train_end: Any
    test_start: Any
    test_end: Any
    sharpe: float
    sortino: float
    cagr: float
    drawdown: float
    params: dict[str, Any]

@dataclass
class WalkForwardResult(ValidationResult):
    windows: list[WalkForwardWindow]
    aggregate_sharpe: float
    aggregate_drawdown: float
    aggregate_cagr: float
    aggregate_equity_curve: Any
    param_evolution: dict[str, list[float]]

    @property
    def per_window_sharpe: list[float]
    @property
    def per_window_drawdown: list[float]

    def worst_window_sharpe(self) -> float

Threshold keys

  • aggregate_sharpe_min: stitched OOS Sharpe must exceed this value
  • min_window_sharpe: worst per-window Sharpe must exceed this value

User-driven design

The framework does not provide a “validation suite”; you compose tests manually:

python
from horizon.validate import Bootstrap, OutOfSample, WalkForward

bs_result = Bootstrap(metrics=["sharpe"], n_samples=1000).run(returns=rets)
oos_result = OutOfSample(train_pct=0.7).run(strategy=MyStrategy, backtest=bt)
wf_result = WalkForward(train="2y", test="3m").run(strategy=MyStrategy, backtest=bt)

# Your own decision logic
deploy_worthy = (
    bs_result.ci("sharpe", 0.95)[0] > 0.3
    and oos_result.oos_metrics.get("sharpe", 0) > 0.5
    and wf_result.worst_window_sharpe() > 0.0
)

No black-box verdict. You interpret the numbers.

Next