xlog_cuda/device_runtime/runtime.rs
1//! [`XlogDeviceRuntime`] — per-CUDA-ordinal singleton hosting the
2//! device-runtime allocator stack.
3//!
4//! Replaces the per-`CudaKernelProvider` `GpuMemoryManager` model with
5//! a single live runtime per physical GPU. All `CudaKernelProvider`s
6//! on a given ordinal share the same runtime once the migration
7//! commit lands; until then this type is constructed and used by
8//! tests only.
9//!
10//! Singleton lifetime: leaked-Box, so the returned `&'static` borrows
11//! are valid for the process. No teardown on drop — appropriate for a
12//! GPU device runtime that should outlive any single executor.
13//!
14//! # Initialization race semantics
15//!
16//! Earlier revisions used `OnceLock::get_or_init(|| leaked_box)`
17//! after building the runtime outside the lock. That pattern leaked
18//! the loser's runtime (and its CUDA context handle) when two
19//! threads raced on the first access for an ordinal.
20//!
21//! This module now uses an explicit per-ordinal `Mutex` plus
22//! `OnceLock`: callers fast-path on `OnceLock::get()`, and on a miss
23//! take the per-ordinal mutex, double-check the `OnceLock`, and only
24//! the winner inside the mutex builds and stores the runtime. The
25//! mutex is held only across the build, so subsequent reads are still
26//! lock-free.
27
28use std::sync::Arc;
29use std::sync::Mutex;
30use std::sync::OnceLock;
31
32use xlog_core::{Result, XlogError};
33
34use super::direct::DirectCudaResource;
35use super::resource::{
36 Access, AllocTag, BlockId, DeviceBlock, DeviceMemoryResource, ResourceResult, StreamId,
37};
38use super::stream_pool::StreamPool;
39use crate::CudaDevice;
40
41/// Maximum CUDA ordinal supported by the singleton table. CUDA itself
42/// caps at 16 visible devices in typical configurations; raise here
43/// only when a multi-GPU node demands it.
44pub const MAX_DEVICE_ORDINALS: usize = 16;
45
46/// Per-ordinal singleton table. Each slot is initialized at most once
47/// via `OnceLock`, gated by [`INIT_LOCKS`] so failed initialization
48/// does not leak partial state.
49static RUNTIMES: [OnceLock<&'static XlogDeviceRuntime>; MAX_DEVICE_ORDINALS] =
50 [const { OnceLock::new() }; MAX_DEVICE_ORDINALS];
51
52/// Per-ordinal initialization mutex. Only the holder may build and
53/// store a runtime in [`RUNTIMES`]. Held across the device-open and
54/// resource-construction calls so concurrent first callers do not
55/// race-leak loser runtimes.
56static INIT_LOCKS: [Mutex<()>; MAX_DEVICE_ORDINALS] =
57 [const { Mutex::new(()) }; MAX_DEVICE_ORDINALS];
58
59/// Per-CUDA-ordinal device-runtime singleton.
60///
61/// Owns the device handle, stream pool, and resource stack. Allocate
62/// / deallocate calls forward to the resource. The resource is fixed
63/// at construction (currently always [`DirectCudaResource`]); a
64/// future commit will swap in [`AsyncCudaResource`] as the default
65/// while keeping the direct backend reachable for sanitizer mode.
66pub struct XlogDeviceRuntime {
67 device_ordinal: u32,
68 device: Arc<CudaDevice>,
69 stream_pool: Arc<StreamPool>,
70 resource: Mutex<Box<dyn DeviceMemoryResource + Send + Sync>>,
71}
72
73impl XlogDeviceRuntime {
74 /// Compose an owned runtime around a caller-supplied resource
75 /// stack. **Not** a singleton — the returned value is *not*
76 /// stored in [`RUNTIMES`] and does not interact with `try_get`.
77 ///
78 /// Intended uses:
79 /// * Tests that need to drive a specific backend (e.g.,
80 /// `AsyncCudaResource`) through the same facade production
81 /// code uses, instead of constructing the resource directly.
82 /// * Future decorator stacks (`LoggingResource`,
83 /// `GlobalDeviceBudget`, `DebugGuardResource`) that wrap the
84 /// base resource before installation.
85 ///
86 /// The `device` and `stream_pool` arguments must be consistent
87 /// with `device_ordinal` (the pool must be bound to the same
88 /// device handle, and the device must be the one the resource
89 /// allocates against). The constructor does not verify this —
90 /// callers that compose mismatched parts get undefined
91 /// runtime-level behavior, but the per-resource device-ordinal
92 /// check on `deallocate` will still surface obvious mistakes as
93 /// `ResourceError::Driver`.
94 ///
95 /// The singleton path remains [`Self::try_get`], which today
96 /// always installs the cudarc default (non-pooled) backend
97 /// ([`DirectCudaResource`]). Swapping the singleton's default
98 /// resource is a separate later change gated on
99 /// `GlobalDeviceBudget` and `LoggingResource` landing.
100 pub fn with_resource(
101 device: Arc<CudaDevice>,
102 device_ordinal: u32,
103 stream_pool: Arc<StreamPool>,
104 resource: Box<dyn DeviceMemoryResource + Send + Sync>,
105 ) -> Self {
106 Self {
107 device_ordinal,
108 device,
109 stream_pool,
110 resource: Mutex::new(resource),
111 }
112 }
113
114 /// Get the singleton for `ordinal`, initializing it on first
115 /// access. Subsequent calls return the same `&'static`.
116 ///
117 /// Errors:
118 /// * `XlogError::Kernel` if `ordinal >= MAX_DEVICE_ORDINALS`.
119 /// * `XlogError::Kernel` if the CUDA device cannot be opened.
120 ///
121 /// Concurrency: at most one thread builds the runtime for a
122 /// given ordinal. Other concurrent first callers block on the
123 /// per-ordinal init mutex until the winner publishes via
124 /// `OnceLock::set`, after which they observe the published
125 /// runtime via the inside-mutex double-check or the lock-free
126 /// fast path on subsequent calls.
127 pub fn try_get(ordinal: u32) -> Result<&'static XlogDeviceRuntime> {
128 let idx = ordinal as usize;
129 if idx >= MAX_DEVICE_ORDINALS {
130 return Err(XlogError::Kernel(format!(
131 "XlogDeviceRuntime: ordinal {} exceeds MAX_DEVICE_ORDINALS={}",
132 ordinal, MAX_DEVICE_ORDINALS
133 )));
134 }
135 // Fast path: another thread already initialized this slot.
136 if let Some(rt) = RUNTIMES[idx].get() {
137 return Ok(*rt);
138 }
139
140 // Slow path: take the per-ordinal init mutex. Only one
141 // thread per ordinal builds the runtime; the rest wait here
142 // and observe the published value on the double-check below.
143 let _guard = INIT_LOCKS[idx]
144 .lock()
145 .expect("XlogDeviceRuntime init mutex poisoned");
146
147 // Double-check inside the lock: a previous holder may have
148 // initialized while we were waiting for the mutex.
149 if let Some(rt) = RUNTIMES[idx].get() {
150 return Ok(*rt);
151 }
152
153 // We are the first writer for this ordinal. Build the
154 // runtime; if any step fails, return the error and leave
155 // RUNTIMES[idx] uninitialized so the next caller can retry.
156 let device = Arc::new(CudaDevice::new(ordinal as usize).map_err(|e| {
157 XlogError::Kernel(format!(
158 "XlogDeviceRuntime: failed to open device {}: {}",
159 ordinal, e
160 ))
161 })?);
162 let stream_pool = Arc::new(StreamPool::with_defaults(Arc::clone(&device)));
163 let resource: Box<dyn DeviceMemoryResource + Send + Sync> =
164 Box::new(DirectCudaResource::new(Arc::clone(&device), ordinal));
165 let runtime = Box::new(XlogDeviceRuntime {
166 device_ordinal: ordinal,
167 device,
168 stream_pool,
169 resource: Mutex::new(resource),
170 });
171 let leaked: &'static XlogDeviceRuntime = Box::leak(runtime);
172
173 // We hold INIT_LOCKS[idx] and confirmed RUNTIMES[idx] is
174 // empty under that lock, so this `set` cannot fail. Fall
175 // through to a hard panic if it does — it indicates a
176 // process-internal bug we cannot recover from.
177 RUNTIMES[idx]
178 .set(leaked)
179 .map_err(|_| ())
180 .expect("XlogDeviceRuntime: OnceLock::set raced under INIT_LOCKS — bug");
181 Ok(leaked)
182 }
183
184 /// CUDA ordinal this runtime serves.
185 pub fn device_ordinal(&self) -> u32 {
186 self.device_ordinal
187 }
188
189 /// Borrow the device handle.
190 pub fn device(&self) -> &Arc<CudaDevice> {
191 &self.device
192 }
193
194 /// Borrow the stream pool.
195 pub fn stream_pool(&self) -> &Arc<StreamPool> {
196 &self.stream_pool
197 }
198
199 /// Allocate via the underlying resource. Stream-ordered: the
200 /// returned [`DeviceBlock`] is bound to `stream`.
201 pub fn allocate(
202 &self,
203 bytes: usize,
204 stream: StreamId,
205 tag: AllocTag,
206 ) -> ResourceResult<DeviceBlock> {
207 self.resource
208 .lock()
209 .expect("device-runtime resource poisoned")
210 .allocate(bytes, stream, tag)
211 }
212
213 /// Deallocate via the underlying resource.
214 pub fn deallocate(&self, block: DeviceBlock) -> ResourceResult<()> {
215 self.resource
216 .lock()
217 .expect("device-runtime resource poisoned")
218 .deallocate(block)
219 }
220
221 /// Sum of bytes currently outstanding on this device, as reported
222 /// by the underlying resource. Used by the global-budget adaptor
223 /// (later commit) and the parallel-stress acceptance test.
224 pub fn bytes_outstanding(&self) -> usize {
225 self.resource
226 .lock()
227 .expect("device-runtime resource poisoned")
228 .bytes_outstanding()
229 }
230
231 /// Drain pending async frees on the underlying resource. No-op
232 /// for synchronous backends. Callers that need an accurate
233 /// `bytes_outstanding` reading after a burst of asynchronous
234 /// deallocations should call this first.
235 pub fn reap_pending(&self) -> ResourceResult<()> {
236 self.resource
237 .lock()
238 .expect("device-runtime resource poisoned")
239 .reap_pending()
240 }
241
242 /// Record that work has been (or is being) submitted on
243 /// `use_stream` that touches `block`. Forwards to the
244 /// underlying resource stack
245 /// (`GlobalDeviceBudget` → `LoggingResource` → `AsyncCudaResource`),
246 /// where the stream-ordered backend attaches a CUDA event so
247 /// `block.alloc_stream` waits on it before the queued
248 /// `cuMemFreeAsync` runs. This is the production-reachable
249 /// hook the future xlog launch builder will call for
250 /// `read` / `write` / `read_write` buffer args; until that
251 /// lands, callers that submit raw CUDA work on a stream
252 /// other than `block.alloc_stream` should call this directly.
253 /// See [`DeviceMemoryResource::record_block_use`] for the
254 /// underlying contract.
255 pub fn record_block_use(
256 &self,
257 block: &DeviceBlock,
258 use_stream: StreamId,
259 ) -> ResourceResult<()> {
260 self.resource
261 .lock()
262 .expect("device-runtime resource poisoned")
263 .record_block_use(block, use_stream)
264 }
265
266 /// Whether the active resource stack tracks cross-stream
267 /// uses (i.e., supports `record_block_use`). The launch
268 /// recorder's preflight checks this BEFORE queuing CUDA
269 /// work, so a misconfigured runtime fails loudly at the
270 /// boundary rather than after the launch is in flight.
271 pub fn supports_block_use_tracking(&self) -> bool {
272 self.resource
273 .lock()
274 .expect("device-runtime resource poisoned")
275 .supports_block_use_tracking()
276 }
277
278 /// Pre-launch hook: queue cross-stream waits required for
279 /// `use_stream` to safely access `block` with `access`
280 /// semantics. MUST be called BEFORE the GPU work is enqueued
281 /// on `use_stream`. Forwards to the resource stack; see
282 /// [`DeviceMemoryResource::prepare_block_use`] for the
283 /// underlying contract.
284 pub fn prepare_block_use(
285 &self,
286 block: BlockId,
287 use_stream: StreamId,
288 access: Access,
289 ) -> ResourceResult<()> {
290 self.resource
291 .lock()
292 .expect("device-runtime resource poisoned")
293 .prepare_block_use(block, use_stream, access)
294 }
295
296 /// Post-launch hook: record an event on `use_stream`
297 /// capturing the work just enqueued and update `block`'s
298 /// dependency state. MUST be called AFTER the launch /
299 /// copy is queued. Forwards to the resource stack; see
300 /// [`DeviceMemoryResource::finish_block_use`] for the
301 /// underlying contract.
302 pub fn finish_block_use(
303 &self,
304 block: BlockId,
305 use_stream: StreamId,
306 access: Access,
307 ) -> ResourceResult<()> {
308 self.resource
309 .lock()
310 .expect("device-runtime resource poisoned")
311 .finish_block_use(block, use_stream, access)
312 }
313
314 /// Convenience for helper-internal scratch allocations that
315 /// will be immediately written / read on `use_stream`.
316 ///
317 /// Looks up the [`BlockId`] from the slice's runtime block
318 /// and calls [`Self::prepare_block_use`] with `access`. Use
319 /// this directly after `GpuMemoryManager::alloc` when the
320 /// buffer's first cross-stream consumer is the same operator
321 /// (e.g., a hash-table bucket array memset on `launch_stream`
322 /// against a buffer freshly allocated on the manager's
323 /// default stream).
324 ///
325 /// Returns `Err(ResourceError::StreamMisuse)` if `slice` is
326 /// not runtime-backed — strict callers should ensure their
327 /// memory manager carries a runtime.
328 pub fn prepare_first_use<T: cudarc::driver::DeviceRepr>(
329 &self,
330 slice: &crate::memory::TrackedCudaSlice<T>,
331 use_stream: StreamId,
332 access: Access,
333 ) -> ResourceResult<()> {
334 let block = slice.runtime_block().ok_or_else(|| {
335 super::resource::ResourceError::StreamMisuse(
336 "prepare_first_use: slice is not runtime-backed (the helper's \
337 GpuMemoryManager must be built via with_runtime)"
338 .to_string(),
339 )
340 })?;
341 self.prepare_block_use(BlockId::from_block(block), use_stream, access)
342 }
343
344 /// Convenience for helper-internal scratch finish: looks up
345 /// the [`BlockId`] from the slice and forwards to
346 /// [`Self::finish_block_use`].
347 pub fn finish_first_use<T: cudarc::driver::DeviceRepr>(
348 &self,
349 slice: &crate::memory::TrackedCudaSlice<T>,
350 use_stream: StreamId,
351 access: Access,
352 ) -> ResourceResult<()> {
353 let block = slice.runtime_block().ok_or_else(|| {
354 super::resource::ResourceError::StreamMisuse(
355 "finish_first_use: slice is not runtime-backed".to_string(),
356 )
357 })?;
358 self.finish_block_use(BlockId::from_block(block), use_stream, access)
359 }
360}
361
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes the tests in this module that allocate on the
    /// process-wide ordinal-0 singleton. The test harness runs tests
    /// on parallel threads by default, and exact `bytes_outstanding`
    /// assertions would otherwise race with allocations made by a
    /// sibling test on the same singleton.
    ///
    /// NOTE(review): this only covers tests in this module; tests
    /// elsewhere in the crate that allocate on the singleton could
    /// still interfere — confirm whether any exist.
    static SINGLETON_ACCOUNTING: Mutex<()> = Mutex::new(());

    /// Take the accounting lock, ignoring poisoning: a failed
    /// sibling test must not cascade into unrelated failures here.
    fn lock_singleton_accounting() -> std::sync::MutexGuard<'static, ()> {
        SINGLETON_ACCOUNTING
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Singleton for ordinal 0, or `None` when `try_get` fails
    /// (e.g. no usable CUDA device), in which case tests return
    /// early without asserting anything.
    fn try_runtime() -> Option<&'static XlogDeviceRuntime> {
        XlogDeviceRuntime::try_get(0).ok()
    }

    #[test]
    fn try_get_returns_same_singleton() {
        let Some(a) = try_runtime() else {
            return;
        };
        let b = XlogDeviceRuntime::try_get(0).expect("re-get");
        assert!(std::ptr::eq(a, b), "singleton must be stable for ordinal 0");
        assert_eq!(a.device_ordinal(), 0);
    }

    #[test]
    fn allocate_then_deallocate_via_runtime() {
        let Some(rt) = try_runtime() else {
            return;
        };
        // Exact byte deltas are asserted below, so keep sibling
        // tests off the shared singleton for the duration.
        let _accounting = lock_singleton_accounting();

        let before = rt.bytes_outstanding();
        let block = rt
            .allocate(2048, StreamId::DEFAULT, AllocTag::UNTAGGED)
            .expect("alloc");
        assert_eq!(block.bytes, 2048);
        assert_eq!(rt.bytes_outstanding(), before + 2048);
        rt.deallocate(block).expect("dealloc");
        rt.reap_pending().expect("reap pending");
        assert_eq!(rt.bytes_outstanding(), before);
    }

    #[test]
    fn try_get_rejects_out_of_range_ordinal() {
        let err = XlogDeviceRuntime::try_get(MAX_DEVICE_ORDINALS as u32);
        assert!(err.is_err());
    }

    #[test]
    fn with_resource_composes_owned_runtime_outside_singleton() {
        use super::super::async_resource::AsyncCudaResource;

        let Some(rt) = try_runtime() else {
            return;
        };
        let device = Arc::clone(rt.device());
        let pool = Arc::new(StreamPool::with_defaults(Arc::clone(&device)));
        let resource = Box::new(AsyncCudaResource::new(
            Arc::clone(&device),
            0,
            Arc::clone(&pool),
        ));

        let owned = XlogDeviceRuntime::with_resource(device, 0, pool, resource);
        assert_eq!(owned.device_ordinal(), 0);

        // `owned` carries its own freshly built resource, so its
        // byte accounting starts at zero and is not shared with the
        // singleton.
        let block = owned
            .allocate(1024, StreamId::DEFAULT, AllocTag::UNTAGGED)
            .expect("alloc through composed runtime");
        assert_eq!(block.bytes, 1024);
        assert_eq!(owned.bytes_outstanding(), 1024);
        owned.deallocate(block).expect("dealloc");
        owned.reap_pending().expect("reap");
        assert_eq!(owned.bytes_outstanding(), 0);

        // Composed runtime is not stored in the singleton table:
        // the singleton for ordinal 0 is whatever `try_get` returns,
        // which must be a different memory address.
        let singleton = XlogDeviceRuntime::try_get(0).expect("singleton");
        assert!(
            !std::ptr::eq(&owned, singleton),
            "with_resource must not alias the singleton slot"
        );
    }

    /// `try_get` installs `DirectCudaResource` by default. The
    /// runtime's `record_block_use` must therefore return
    /// `StreamMisuse` (the trait's default) rather than silently
    /// claiming success — anything else would let a launch
    /// builder running against the singleton observe `Ok(())`
    /// while no event is actually recorded, reproducing the
    /// cross-stream use-after-free this whole layer exists to
    /// prevent. See the trait-level doc on
    /// `DeviceMemoryResource::record_block_use`.
    #[test]
    fn try_get_runtime_record_block_use_rejected_with_stream_misuse() {
        let Some(rt) = try_runtime() else {
            return;
        };
        // This test allocates on the shared singleton; serialize it
        // against the byte-accounting test.
        let _accounting = lock_singleton_accounting();

        let block = rt
            .allocate(64, StreamId::DEFAULT, AllocTag::UNTAGGED)
            .expect("alloc through runtime");
        let outcome = rt.record_block_use(&block, StreamId::DEFAULT);
        // Free before asserting so a failed expectation below does
        // not leave the block outstanding on the shared singleton.
        rt.deallocate(block).expect("dealloc still works");

        match outcome {
            Err(super::super::resource::ResourceError::StreamMisuse(msg)) => {
                assert!(
                    msg.contains("unsupported"),
                    "expected 'unsupported' in StreamMisuse message, got {:?}",
                    msg
                );
            }
            other => panic!(
                "XlogDeviceRuntime::try_get default (DirectCudaResource) must \
                 reject record_block_use with StreamMisuse; got {:?}",
                other
            ),
        }
    }
}