Validate
ValidationTest protocol, Bootstrap, OutOfSample, WalkForward
ValidationTest base
python
class ValidationTest(ABC):
name: str = ""
def __init__(self, thresholds: dict[str, float] | None = None):
self.thresholds = thresholds or {}
@abstractmethod
def run(self, **kwargs) -> ValidationResult:
"""Execute the test."""
Subclasses accept their own specific kwargs (e.g., returns= for Bootstrap, strategy= and backtest= for OutOfSample).
ValidationResult base
python
@dataclass
class ValidationResult:
test_name: str = ""
thresholds: dict[str, float] = field(default_factory=dict)
threshold_checks: dict[str, bool] = field(default_factory=dict)
metadata: dict[str, Any] = field(default_factory=dict)
@property
def passed(self) -> bool  # True iff all user thresholds pass
def to_dict(self) -> dict
def to_json(self, indent: int = 2) -> str
def save(self, path: str | Path) -> None
Bootstrap (real)
python
from horizon.validate import Bootstrap
Bootstrap(
metrics: list[str] | None = None, # default: ["sharpe","sortino","max_drawdown","cagr"]
n_samples: int = 1000,
method: str = "block", # "block" | "iid"
block_size: int = 20,
seed: int | None = None,
thresholds: dict[str, float] | None = None,
)
.run(returns=...)
python
result = Bootstrap(metrics=["sharpe"], n_samples=1000, seed=42).run(
returns=[0.001, -0.002, 0.003, ...],
)
BootstrapResult
python
@dataclass
class BootstrapResult(ValidationResult):
samples: dict[str, list[float]]
point_estimates: dict[str, float]
n_samples: int
method: str
block_size: int
def ci(self, metric: str, conf: float = 0.95) -> tuple[float, float]
def distribution(self, metric: str) -> list[float]
def median(self, metric: str) -> float
Threshold keys
- {metric}_ci_lo_min: lower 95% CI bound must exceed this value
- {metric}_median_min: bootstrap median must exceed this value
python
bs = Bootstrap(
metrics=["sharpe"],
n_samples=1000,
thresholds={"sharpe_ci_lo_min": 0.5},
)
result = bs.run(returns=my_returns)
assert result.passed
OutOfSample
python
from horizon.validate import OutOfSample
OutOfSample(
train_pct: float = 0.7,
thresholds: dict[str, float] | None = None,
)
.run(strategy, backtest, universe, asset_classes)
python
result = OutOfSample(train_pct=0.7).run(
strategy=MyStrategy,
backtest=BacktestConfig(initial_cash_usd=100_000),
universe=my_universe,
asset_classes=[Equity],
)
OutOfSampleResult
python
@dataclass
class OutOfSampleResult(ValidationResult):
is_metrics: dict[str, float]
oos_metrics: dict[str, float]
is_equity_curve: Any
oos_equity_curve: Any
degradation: dict[str, float]
@property
def is_oos_gap_sharpe(self) -> float
Threshold keys
- oos_sharpe_min: OOS Sharpe must be at least this value
- is_oos_sharpe_ratio_max: IS/OOS Sharpe ratio must not exceed this value
WalkForward
python
from horizon.validate import WalkForward
WalkForward(
train: str = "2y",
test: str = "3m",
step: str = "3m",
retune_params: list[str] | None = None,
tuner: Any = None,
thresholds: dict[str, float] | None = None,
)
.run(strategy, backtest, universe, asset_classes)
python
wf = WalkForward(train="2y", test="3m", step="3m")
result = wf.run(
strategy=MyStrategy,
backtest=my_bt_config,
universe=my_universe,
asset_classes=[Equity],
)
WalkForwardResult
python
@dataclass
class WalkForwardWindow:
train_start: Any
train_end: Any
test_start: Any
test_end: Any
sharpe: float
sortino: float
cagr: float
drawdown: float
params: dict[str, Any]
@dataclass
class WalkForwardResult(ValidationResult):
windows: list[WalkForwardWindow]
aggregate_sharpe: float
aggregate_drawdown: float
aggregate_cagr: float
aggregate_equity_curve: Any
param_evolution: dict[str, list[float]]
@property
def per_window_sharpe(self) -> list[float]
@property
def per_window_drawdown(self) -> list[float]
def worst_window_sharpe(self) -> float
Threshold keys
- aggregate_sharpe_min: stitched OOS Sharpe must exceed this value
- min_window_sharpe: worst per-window Sharpe must exceed this value
User-driven design
The framework does not provide a “validation suite”; you compose tests manually:
python
from horizon.validate import Bootstrap, OutOfSample, WalkForward
bs_result = Bootstrap(metrics=["sharpe"], n_samples=1000).run(returns=rets)
oos_result = OutOfSample(train_pct=0.7).run(strategy=MyStrategy, backtest=bt)
wf_result = WalkForward(train="2y", test="3m").run(strategy=MyStrategy, backtest=bt)
# Your own decision logic
deploy_worthy = (
bs_result.ci("sharpe", 0.95)[0] > 0.3
and oos_result.oos_metrics.get("sharpe", 0) > 0.5
and wf_result.worst_window_sharpe() > 0.0
)
No black-box verdict. You interpret the numbers.