Data Sources
Bar, DataSource protocol, SyntheticGBM, SyntheticRegimes, DictSource
Bar
@dataclass(frozen=True)
class Bar:
market_id: str
timestamp: datetime
price: float
open: float | None = None
high: float | None = None
low: float | None = None
close: float | None = None
volume: float | None = None
bid: float | None = None
ask: float | None = None
DataSource protocol
from typing import Protocol, runtime_checkable, Iterator
@runtime_checkable
class DataSource(Protocol):
def markets(self) -> list[str]:
"""List the market ids this source has data for."""
def iter_bars(self) -> Iterator[Bar]:
"""Yield bars in chronological order across all markets."""
Implementations yield bars in chronological (non-decreasing timestamp) order across all markets. Bars for different markets may share a timestamp; the backtest loop buffers bars with the same timestamp and flushes them as a single tick.
SyntheticGBM
Seeded geometric Brownian motion. Deterministic.
from horizon.data import SyntheticGBM
data = SyntheticGBM(
market_ids: list[str],
start: datetime = datetime(2023, 1, 2),
n_bars: int = 252,
mu: float = 0.08, # annualized drift
sigma: float = 0.20, # annualized vol
periods_per_year: float = 252.0,
seed: int = 42,
initial_price: float = 100.0,
step: timedelta = timedelta(days=1),
)
Each market has a deterministic path driven by seed + i for the i-th market. Same seed → bit-identical price paths.
Example
data = SyntheticGBM(
market_ids=["AAPL", "MSFT", "NVDA"],
n_bars=100,
mu=0.12,
sigma=0.22,
seed=1,
)
for bar in data.iter_bars():
print(bar.market_id, bar.timestamp, bar.price)
SyntheticRegimes
Programmable regime shifts, useful for stress-testing risk enforcement.
from horizon.data import SyntheticRegimes
data = SyntheticRegimes(
market_ids: list[str],
start: datetime = datetime(2023, 1, 2),
n_bars: int = 252,
regimes: list[tuple[float, float, float]] = [
(0.4, 0.20, 0.15), # (fraction, mu, sigma)
(0.3, 0.0, 0.30),
(0.3, -0.30, 0.40),
],
periods_per_year: float = 252.0,
seed: int = 42,
initial_price: float = 100.0,
step: timedelta = timedelta(days=1),
)
Each regime tuple is (bar_fraction, mu, sigma). Fractions should sum to ≈1.0.
Example: pure crash
data = SyntheticRegimes(
market_ids=["A"],
n_bars=200,
regimes=[(1.0, -0.50, 0.35)],
seed=7,
)
Useful for verifying that stops + drawdown guards bound losses under adverse conditions.
DictSource
In-memory bars from a plain Python dict.
from datetime import datetime
from horizon.data import DictSource
data = DictSource({
"AAPL": [
(datetime(2024, 1, 1), 100.0),
(datetime(2024, 1, 2), 101.5),
(datetime(2024, 1, 3), 99.8),
],
"MSFT": [
(datetime(2024, 1, 1), 400.0),
(datetime(2024, 1, 2), 402.0),
(datetime(2024, 1, 3), 405.0),
],
})
Chronological ordering across markets is handled automatically.
Network providers
Pull real historical data with one line. All providers return a ProviderSource (an in-memory DataSource) you can pass directly to hz.run(data_source=...) or iterate via .iter_bars() for analysis.
Equities + crypto bars
import horizon as hz
# Yahoo Finance — free, no API key (needs ``pip install yfinance`` or the
# [notebooks] / [research] extra)
src = hz.data.yahoo(["AAPL", "MSFT"], start="2024-01-01", end="2025-01-01",
interval="1d")
# Alpaca — needs ALPACA_API_KEY + ALPACA_SECRET_KEY env vars
src = hz.data.alpaca(["AAPL", "MSFT"], start="2024-01-01", end="2025-01-01",
timeframe="1Day")
# Supported timeframes: "1Min", "5Min", "15Min", "1Hour", "1Day"
# Polygon.io — needs POLYGON_API_KEY env var
src = hz.data.polygon(["AAPL"], start="2024-01-01", timespan="day")
Options chain snapshot
chain = hz.data.alpaca_options_chain(
"AAPL",
expiry="2026-06-19",
option_type="call",
strike_gte=180.0, strike_lte=200.0,
limit=100, # capped at 100 per Alpaca
)
# Returns a list of dicts, one per contract:
for row in chain:
print(row["symbol"], row.get("impliedVolatility"), row.get("greeks"))
Each row has the OCC symbol under "symbol" plus whatever fields the Alpaca
data plan includes (latestQuote, latestTrade, greeks, impliedVolatility). The call returns an empty list on HTTP error. It requires an Alpaca
account with options data access.
Options bars (historical, per contract)
# Build OCC symbols from the chain or by hand, then pull their history
symbols = ["AAPL250620C00200000", "AAPL250620P00180000"]
src = hz.data.alpaca_options_bars(
symbols, start="2025-01-01", end="2025-06-19", timeframe="1Day",
)
for bar in src.iter_bars():
print(bar.market_id, bar.timestamp, bar.close, bar.volume)
The provider paginates internally (Alpaca uses page tokens) and chunks
symbol lists into batches of 100, so you can pass arbitrarily many
contracts. The bars have the same Bar shape as those from the equity
provider, so the same backtest pipeline works.
From file or DataFrame
hz.data.csv("history.csv",
market_col="market_id", date_col="date", price_col="close")
hz.data.dataframe(df, # pandas DataFrame
market_col="symbol", date_col="ts", price_col="close")
hz.data.from_dict({"AAPL": [(datetime(2024, 1, 1), 180.0), ...]})
Discovery
hz.data.available_providers()
# → ['yahoo', 'polygon', 'alpaca', 'alpaca_options', ...]
Writing your own data source
Implement the two-method protocol:
from typing import Iterator
from horizon.data import Bar
class MyCSVSource:
def __init__(self, path: str):
self.path = path
# ... load the CSV into self.bars, a chronologically sorted list of Bars ...
def markets(self) -> list[str]:
return sorted(set(b.market_id for b in self.bars))
def iter_bars(self) -> Iterator[Bar]:
yield from self.bars
Use it:
hz.run(data_source=MyCSVSource("historical.csv"), ...)
Tests
9 tests in tests/test_data.py covering determinism, chronological ordering, initial prices, and regime behavior.