Skip to content

Commit 17056ce

Browse files
Merge pull request #16 from IteraLabs/feature/11-inter-arrival-ts-packaging-deploy
inter-arrival model fit and serve, packaging and deploy
2 parents 4d0eb28 + 22a4f29 commit 17056ce

File tree

113 files changed

+10994
-1324
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

113 files changed

+10994
-1324
lines changed

.cargo/config.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
[build]
22
rustdocflags = ["--html-in-header", "katex-header.html"]
3+
4+
[target.aarch64-unknown-linux-gnu]
5+
linker = "aarch64-unknown-linux-gnu-gcc"

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ members = [
1111
]
1212
default-members = [
1313
"atelier-data",
14+
"atelier-quant",
1415
]
1516

1617
[workspace.package]

atelier-data/Cargo.toml

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,25 @@ include = ["../katex-header.html", "Cargo.toml", "README.md", "src/**/*"]
1616
publish = true
1717

1818
[package.metadata.docs.rs]
19-
rustdoc-args = ["--html-in-header", "katex-header.html"]
19+
rustdoc-args = ["--html-in-header", "katex-header.html", "--cfg", "docsrs"]
2020
private-doc = true
21+
alll-features = true
2122
license = "Apache-2.0"
2223

2324
[lib]
2425
name = "atelier_data"
2526
path = "src/lib.rs"
2627

28+
# ── binaries ──
29+
30+
[[bin]]
31+
name = "market_worker"
32+
path = "src/bin/market_worker.rs"
33+
34+
[[bin]]
35+
name = "data_worker"
36+
path = "src/bin/data_worker.rs"
37+
2738
# ── market_sync ──
2839

2940
[[example]]
@@ -48,11 +59,25 @@ path = "examples/market_sync/coinbase/coinbase_markets.rs"
4859
name = "kraken_markets"
4960
path = "examples/market_sync/kraken/kraken_markets.rs"
5061

62+
# ── worker examples ──
63+
64+
[[example]]
65+
name = "run_data_worker"
66+
path = "examples/data_worker/run_data_worker.rs"
67+
68+
[[example]]
69+
name = "run_market_worker"
70+
path = "examples/market_worker/run_market_worker.rs"
71+
72+
[[example]]
73+
name = "read_market_worker"
74+
path = "examples/market_worker/read_market_worker.rs"
75+
5176
# ── market_strems ──
5277

5378
[[example]]
5479
name = "market_fetch"
55-
path = "examples/market_strems/market_fetch.rs"
80+
path = "examples/market_streams/market_fetch.rs"
5681

5782
# ── bybit tests ──
5883

@@ -126,6 +151,20 @@ path = "tests/types/temporal/test_resolutions.rs"
126151
name = "test_temporal_validations"
127152
path = "tests/types/temporal/test_validations.rs"
128153

154+
# ── worker related tests ──
155+
156+
[[test]]
157+
name = "test_data_worker_connections"
158+
path = "tests/workers/test_data_worker_connections.rs"
159+
160+
[[test]]
161+
name = "test_data_worker_markets"
162+
path = "tests/workers/test_data_worker_markets.rs"
163+
164+
[[test]]
165+
name = "test_worker_gap_detector"
166+
path = "tests/workers/test_worker_gap_detector.rs"
167+
129168
# ── parquet round-trip tests ──
130169

131170
[[test]]
@@ -158,7 +197,7 @@ arrow = { version = "57.2", optional = true }
158197
async-rate-limiter = { version = "1.0", features = ["rt-tokio"] }
159198
async-trait = { version = "0.1" }
160199
chrono = { version = "0.4", features = ["serde"] }
161-
config = { version = "0.13" }
200+
clap = { workspace = true }
162201
csv = { workspace = true }
163202
futures-util = { version = "0.3" }
164203
hex = { version = "0.4" }
@@ -176,7 +215,7 @@ tokio = { workspace = true }
176215
tokio-tungstenite = { version = "0.21", features = ["native-tls"] }
177216
toml = { workspace = true }
178217
tracing = { version = "0.1" }
179-
tracing-subscriber = { version = "0.3" }
218+
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
180219
url = { version = "2.0" }
181220
uuid = { version = "1.0", features = ["v4"] }
182221

atelier-data/README.md

Lines changed: 119 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,68 +1,139 @@
11
# atelier-data
22

3-
# Overview
3+
Market data infrastructure for the **atelier-rs** trading engine.
44

5-
Foundational Data Types and I/O integrations for the atelier-rs project.
5+
This crate provides everything needed to connect to cryptocurrency exchanges,
6+
normalise their heterogeneous WebSocket feeds into a common data model,
7+
synchronise events onto a uniform time grid, and persist the result to
8+
Apache Parquet files.
69

7-
Core data types are:
10+
## Core Data Types
811

9-
OffChain activity
10-
- OrderBook
11-
- PublicTrades
12-
- Liquidations (When available)
13-
- FundingRates (When available)
14-
- OpenInterests (When available)
12+
**Off-chain activity** (market microstructure):
1513

16-
OnChain activity
17-
- Swaps
18-
- LendingRates
14+
| Type | Description |
15+
|------|-------------|
16+
| `Orderbook` | Full-depth limit order book snapshot (bid/ask levels) |
17+
| `OrderbookDelta` | Incremental order book maintained via `NormalizedDelta` updates |
18+
| `Trade` | Public trade execution (price, size, side, timestamp) |
19+
| `Liquidation` | Forced liquidation event |
20+
| `FundingRate` | Perpetual futures funding rate observation |
21+
| `OpenInterest` | Aggregate open interest snapshot |
22+
23+
**Composed types:**
1924

20-
## Orderbook data
25+
| Type | Description |
26+
|------|-------------|
27+
| `MarketSnapshot` | Time-aligned bundle of all market data for one grid period |
28+
| `MarketAggregate` | 15-scalar feature vector derived from a `MarketSnapshot` |
2129

22-
- Snapshots and Deltas
23-
- Metrics
30+
## Exchange Sources
2431

25-
## Sources
32+
| Source | Kind | API | Order Books | Public Trades | Liquidations | Funding Rates | Open Interest |
33+
|--------|------|-----|-------------|---------------|--------------|---------------|---------------|
34+
| Bybit | CEX | WSS | YES / YES | YES / YES | YES / YES | YES / YES | YES / YES |
35+
| Coinbase | CEX | WSS | YES / YES | YES / YES ||||
36+
| Kraken | CEX | WSS | YES / YES | YES / YES ||||
2637

27-
| Source | Kind | Type | API | Data | Implemented | Tests |
28-
| ------------- | ------- | ------- | ------- | ------------- | ------------- | ------- |
29-
| | | | | Order Books | YES | N/A |
30-
| | | | | Public Trades | YES | N/A |
31-
| Bybit | Markets | CEX | WSS | Liquidations | YES | N/A |
32-
| | | | | Funding Rates | YES | N/A |
33-
| | | | | Open Interest | YES | N/A |
34-
|---------------|---------|---------|---------|---------------|---------------|---------|
35-
| | | | | Order Books | N/A | N/A |
36-
| | | | | Public Trades | N/A | N/A |
37-
| Coinbase | Markets | CEX | WSS | Liquidations | N/A | N/A |
38-
| | | | | Funding Rates | N/A | N/A |
39-
| | | | | Open Interest | N/A | N/A |
40-
|---------------|---------|---------|---------|---------------|---------------|---------|
41-
| | | | | Order Books | N/A | N/A |
42-
| | | | | Public Trades | N/A | N/A |
43-
| Kraken | Markets | CEX | WSS | Liquidations | N/A | N/A |
44-
| | | | | Funding Rates | N/A | N/A |
45-
| | | | | Open Interest | N/A | N/A |
38+
*Format: Implemented / Tested. Dashes indicate the exchange does not expose
39+
the data type on its spot/linear WebSocket API.*
4640

41+
## Workers
4742

48-
<br>
43+
Two worker types handle end-to-end data collection:
4944

50-
---
45+
**DataWorker** — raw event ingestion without synchronisation. Connects to a
46+
live exchange WebSocket feed, decodes events, and delivers them through a
47+
pluggable `OutputSink` pipeline. Configuration is driven by a TOML manifest
48+
(`DataWorkerManifest`). Handles reconnection, backoff, health monitoring,
49+
and gap detection automatically.
50+
51+
**MarketWorker** — synchronised market snapshots. Extends `DataWorker`'s
52+
ingestion with a `MarketSynchronizer` that bins heterogeneous events onto
53+
a uniform nanosecond grid, producing `MarketSnapshot` objects at each tick.
54+
Multiple `ClockMode` strategies are supported: `OrderbookDriven`,
55+
`TradeDriven`, `LiquidationDriven`, and `ExternalClock`. Snapshots are
56+
delivered through the same `OutputSink` pipeline and can be flushed to
57+
Parquet automatically.
58+
59+
## Output Sinks
60+
61+
The `OutputSink` trait defines where worker output goes. Multiple sinks
62+
run simultaneously via `OutputSinkSet` (fan-out):
63+
64+
| Sink | Status | Description |
65+
|------|--------|-------------|
66+
| `ChannelSink` | Working | Wraps `TopicRegistry` broadcast channels for pub/sub |
67+
| `TerminalSink` | Working | Debug/tracing terminal output |
68+
| `ParquetSink` | Working | Buffers `MarketSnapshot`s, decomposes and flushes to per-datatype Parquet files |
69+
70+
## Parquet Persistence
71+
72+
Requires `--features parquet`. All five data types support read and write:
73+
74+
| Data Type | Writer | Reader |
75+
|-----------|--------|--------|
76+
| Orderbooks | `write_ob_parquet` | `read_ob_parquet` |
77+
| Trades | `write_trades_parquet_timestamped` | `read_trades_parquet` |
78+
| Liquidations | `write_liquidations_parquet_timestamped` | `read_liquidations_parquet` |
79+
| Funding Rates | `write_funding_parquet_timestamped` | `read_funding_parquet` |
80+
| Open Interest | `write_oi_parquet_timestamped` | `read_oi_parquet` |
5181

52-
**`atelier-data`** is a member of the [atelier-rs](https://github.com/iteralabs/atelier-rs) workspace, which has other published crates:
82+
### Filename Convention
5383

54-
- [atelier-engine](https://crates.io/crates/atelier-engine):
55-
- [atelier-quant](https://crates.io/crates/atelier-quant):
56-
- [atelier-retro](https://crates.io/crates/atelier-retro):
57-
- [atelier-rs](https://crates.io/crates/atelier-rs):
84+
All timestamped writers produce files following this pattern:
5885

59-
there are Github hosted artifacts:
86+
```
87+
{SYMBOL}_{DATATYPE}_{MODE}_{TIMESTAMP}.parquet
88+
```
89+
90+
Where `MODE` is `"sync"` for grid-aligned data or `"raw"` for unprocessed
91+
captures. Symbols containing `/` (e.g. Kraken's `BTC/USDT`) are sanitised
92+
to `-` in the filename (`BTC-USDT`) while the Parquet data retains the
93+
original symbol string. Examples:
94+
95+
```
96+
BTCUSDT_ob_sync_20260226_153000.123.parquet
97+
ETHUSDT_trades_raw_20260226_160000.456.parquet
98+
BTC-USDT_ob_sync_20260226_153000.123.parquet
99+
```
100+
101+
Files are organised into subdirectories per data type: `orderbooks/`,
102+
`trades/`, `liquidations/`, `fundings/`, `open_interests/`.
103+
104+
## Feature Flags
105+
106+
| Flag | Effect |
107+
|------|--------|
108+
| `parquet` | Enables Apache Parquet I/O (adds `arrow` + `parquet` deps) |
109+
| `torch` | Enables `tch`-based tensor conversion in the `datasets` module |
110+
111+
## Examples
112+
113+
| Example | Description | Command |
114+
|---------|-------------|---------|
115+
| `run_data_worker` | Raw event ingestion via DataWorker | `cargo run -p atelier_data --example run_data_worker -- --config <path>` |
116+
| `run_market_worker` | Synchronised snapshots to Parquet via MarketWorker | `cargo run -p atelier_data --example run_market_worker --features parquet -- --config <path>` |
117+
| `read_market_worker` | Read Parquet files and print per-symbol stats | `cargo run -p atelier_data --example read_market_worker --features parquet -- --dir <path>` |
118+
| `bybit_markets` | Bybit market snapshot collection (standalone) | `cargo run -p atelier_data --example bybit_markets --features parquet -- --config <path>` |
119+
| `coinbase_markets` | Coinbase market snapshot collection | `cargo run -p atelier_data --example coinbase_markets --features parquet -- --config <path>` |
120+
| `kraken_markets` | Kraken market snapshot collection | `cargo run -p atelier_data --example kraken_markets --features parquet -- --config <path>` |
121+
| `market_load` | Load and verify most recent Parquet files | `cargo run -p atelier_data --example market_load --features parquet -- --config <path>` |
122+
| `market_fetch` | Multi-exchange raw stream collector (Bybit/Coinbase/Kraken) | `cargo run -p atelier_data --example market_fetch --features parquet` |
123+
| `multi_sync_workers` | Multi-worker manifest parser (stub) | `cargo run -p atelier_data --example multi_sync_workers -- --config <path>` |
124+
125+
---
60126

61-
- [benches](https://github.com/IteraLabs/atelier-rs/tree/main/benches):
62-
- [datasets](https://github.com/IteraLabs/atelier-rs/tree/main/datasets):
127+
**`atelier-data`** is a member of the [atelier-rs](https://github.com/iteralabs/atelier-rs) workspace:
63128

64-
and consider this for the Development cycle:
129+
- [atelier-engine](https://crates.io/crates/atelier-engine)
130+
- [atelier-quant](https://crates.io/crates/atelier-quant)
131+
- [atelier-retro](https://crates.io/crates/atelier-retro)
132+
- [atelier-rs](https://crates.io/crates/atelier-rs)
65133

66-
- [examples](https://github.com/IteraLabs/atelier-rs/tree/main/examples):
67-
- [tests](https://github.com/IteraLabs/atelier-rs/tree/main/tests):
134+
Development resources:
68135

136+
- [examples](https://github.com/IteraLabs/atelier-rs/tree/main/atelier-data/examples)
137+
- [tests](https://github.com/IteraLabs/atelier-rs/tree/main/atelier-data/tests)
138+
- [benches](https://github.com/IteraLabs/atelier-rs/tree/main/benches)
139+
- [datasets](https://github.com/IteraLabs/atelier-rs/tree/main/datasets)
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# data_worker — Lean raw-data ingestion configuration
2+
#
3+
# This manifest drives the `data_worker` binary. Each [[workers]] entry
4+
# spawns a single WebSocket connection that publishes raw (un-normalised)
5+
# events to topic-keyed broadcast channels.
6+
#
7+
# Usage:
8+
# data_worker --config configs/data_worker.toml
9+
#
10+
# Topics produced per worker (when enabled):
11+
# orderbook.{depth}.{symbol} — raw orderbook snapshots / deltas
12+
# trade.all.{symbol} — public trade events
13+
# liquidation.all.{symbol} — forced liquidation events
14+
# funding.all.{symbol} — funding rate updates
15+
# open_interest.all.{symbol} — open interest snapshots
16+
17+
[defaults]
18+
# sync_mode is inherited from the MarketSnapshotConfig schema but is
19+
# unused by the data_worker (no grid synchronisation). Set to any valid
20+
# value; "on_trade" is a safe default.
21+
sync_mode = "on_trade"
22+
# flush_threshold is also unused (no Parquet writes) — set to 0.
23+
flush_threshold = 0
24+
25+
[defaults.update_frequency]
26+
value = 100
27+
unit = "Millis"
28+
29+
[defaults.datatypes]
30+
collect_orderbooks = true
31+
orderbook_depth = 50
32+
collect_trades = true
33+
collect_liquidations = false
34+
collect_funding_rates = false
35+
collect_open_interest = false
36+
37+
[defaults.logs]
38+
n_orderbooks = 10
39+
n_trades = 10
40+
n_liquidations = 0
41+
n_fundings = 0
42+
n_open_interests = 0
43+
44+
# ── Workers ─────────────────────────────────────────────────────────────
45+
46+
[[workers]]
47+
exchange = "bybit"
48+
symbol = "BTCUSDT"
49+
50+
[[workers]]
51+
exchange = "bybit"
52+
symbol = "SOLUSDT"
53+
54+
# [[connection]]
55+
# jitter_ms = 250 // for the startup time variation to avoid thundering-herd
56+
57+
# ── Output ──────────────────────────────────────────────────────────────
58+
59+
# base_dir is unused by the data_worker (no disk writes) but required
60+
# by the MarketSnapshotConfig schema.
61+
[output]
62+
base_dir = "/tmp/data_worker"
63+
64+
# ── Session ─────────────────────────────────────────────────────────────
65+
66+
[session]
67+
# Run indefinitely (Ctrl-C to stop). Uncomment to set a time limit:
68+
# duration_hours = 8

0 commit comments

Comments
 (0)