xlog_cuda/device_runtime/
runtime.rs

1//! [`XlogDeviceRuntime`] — per-CUDA-ordinal singleton hosting the
2//! device-runtime allocator stack.
3//!
4//! Replaces the per-`CudaKernelProvider` `GpuMemoryManager` model with
5//! a single live runtime per physical GPU. All `CudaKernelProvider`s
6//! on a given ordinal share the same runtime once the migration
7//! commit lands; until then this type is constructed and used by
8//! tests only.
9//!
10//! Singleton lifetime: leaked-Box, so the returned `&'static` borrows
11//! are valid for the process. No teardown on drop — appropriate for a
12//! GPU device runtime that should outlive any single executor.
13//!
14//! # Initialization race semantics
15//!
16//! Earlier revisions used `OnceLock::get_or_init(|| leaked_box)`
17//! after building the runtime outside the lock. That pattern leaked
18//! the loser's runtime (and its CUDA context handle) when two
19//! threads raced on the first access for an ordinal.
20//!
21//! This module now uses an explicit per-ordinal `Mutex` plus
22//! `OnceLock`: callers fast-path on `OnceLock::get()`, and on a miss
23//! take the per-ordinal mutex, double-check the `OnceLock`, and only
24//! the winner inside the mutex builds and stores the runtime. The
25//! mutex is held only across the build, so subsequent reads are still
26//! lock-free.
27
28use std::sync::Arc;
29use std::sync::Mutex;
30use std::sync::OnceLock;
31
32use xlog_core::{Result, XlogError};
33
34use super::direct::DirectCudaResource;
35use super::resource::{
36    Access, AllocTag, BlockId, DeviceBlock, DeviceMemoryResource, ResourceResult, StreamId,
37};
38use super::stream_pool::StreamPool;
39use crate::CudaDevice;
40
/// Number of slots in the singleton table, i.e. the exclusive upper
/// bound on ordinals accepted by [`XlogDeviceRuntime::try_get`]
/// (valid ordinals are `0..MAX_DEVICE_ORDINALS`). CUDA itself
/// caps at 16 visible devices in typical configurations; raise here
/// only when a multi-GPU node demands it.
pub const MAX_DEVICE_ORDINALS: usize = 16;

/// Per-ordinal singleton table, indexed by CUDA ordinal. Each slot is
/// initialized at most once via `OnceLock` and holds a `&'static`
/// reference to a leaked runtime. Writes are gated by [`INIT_LOCKS`]
/// so failed initialization does not leak partial state: a slot stays
/// empty until a build has fully succeeded.
static RUNTIMES: [OnceLock<&'static XlogDeviceRuntime>; MAX_DEVICE_ORDINALS] =
    [const { OnceLock::new() }; MAX_DEVICE_ORDINALS];

/// Per-ordinal initialization mutex, indexed like [`RUNTIMES`]. Only
/// the holder may build and store a runtime in [`RUNTIMES`]. Held
/// across the device-open and resource-construction calls so
/// concurrent first callers do not race-leak loser runtimes. The
/// mutex carries no data of its own (`Mutex<()>`); the published
/// state lives entirely in the matching `RUNTIMES` slot.
static INIT_LOCKS: [Mutex<()>; MAX_DEVICE_ORDINALS] =
    [const { Mutex::new(()) }; MAX_DEVICE_ORDINALS];
58
/// Per-CUDA-ordinal device-runtime singleton.
///
/// Owns the device handle, stream pool, and resource stack. Allocate
/// / deallocate calls forward to the resource. The resource is fixed
/// at construction: the [`Self::try_get`] singleton currently always
/// installs [`DirectCudaResource`], while [`Self::with_resource`]
/// accepts any caller-supplied stack. A future commit will swap in
/// [`AsyncCudaResource`](super::async_resource::AsyncCudaResource) as
/// the singleton default while keeping the direct backend reachable
/// for sanitizer mode.
pub struct XlogDeviceRuntime {
    /// CUDA ordinal this runtime serves.
    device_ordinal: u32,
    /// Shared handle to the CUDA device for `device_ordinal`.
    device: Arc<CudaDevice>,
    /// Stream pool; expected to be bound to `device` (not verified
    /// by the constructors).
    stream_pool: Arc<StreamPool>,
    /// Active resource stack. Every forwarding method on the runtime
    /// takes this mutex for the duration of the forwarded call.
    resource: Mutex<Box<dyn DeviceMemoryResource + Send + Sync>>,
}
72
73impl XlogDeviceRuntime {
74    /// Compose an owned runtime around a caller-supplied resource
75    /// stack. **Not** a singleton — the returned value is *not*
76    /// stored in [`RUNTIMES`] and does not interact with `try_get`.
77    ///
78    /// Intended uses:
79    ///   * Tests that need to drive a specific backend (e.g.,
80    ///     `AsyncCudaResource`) through the same facade production
81    ///     code uses, instead of constructing the resource directly.
82    ///   * Future decorator stacks (`LoggingResource`,
83    ///     `GlobalDeviceBudget`, `DebugGuardResource`) that wrap the
84    ///     base resource before installation.
85    ///
86    /// The `device` and `stream_pool` arguments must be consistent
87    /// with `device_ordinal` (the pool must be bound to the same
88    /// device handle, and the device must be the one the resource
89    /// allocates against). The constructor does not verify this —
90    /// callers that compose mismatched parts get undefined
91    /// runtime-level behavior, but the per-resource device-ordinal
92    /// check on `deallocate` will still surface obvious mistakes as
93    /// `ResourceError::Driver`.
94    ///
95    /// The singleton path remains [`Self::try_get`], which today
96    /// always installs the cudarc default (non-pooled) backend
97    /// ([`DirectCudaResource`]). Swapping the singleton's default
98    /// resource is a separate later change gated on
99    /// `GlobalDeviceBudget` and `LoggingResource` landing.
100    pub fn with_resource(
101        device: Arc<CudaDevice>,
102        device_ordinal: u32,
103        stream_pool: Arc<StreamPool>,
104        resource: Box<dyn DeviceMemoryResource + Send + Sync>,
105    ) -> Self {
106        Self {
107            device_ordinal,
108            device,
109            stream_pool,
110            resource: Mutex::new(resource),
111        }
112    }
113
114    /// Get the singleton for `ordinal`, initializing it on first
115    /// access. Subsequent calls return the same `&'static`.
116    ///
117    /// Errors:
118    ///   * `XlogError::Kernel` if `ordinal >= MAX_DEVICE_ORDINALS`.
119    ///   * `XlogError::Kernel` if the CUDA device cannot be opened.
120    ///
121    /// Concurrency: at most one thread builds the runtime for a
122    /// given ordinal. Other concurrent first callers block on the
123    /// per-ordinal init mutex until the winner publishes via
124    /// `OnceLock::set`, after which they observe the published
125    /// runtime via the inside-mutex double-check or the lock-free
126    /// fast path on subsequent calls.
127    pub fn try_get(ordinal: u32) -> Result<&'static XlogDeviceRuntime> {
128        let idx = ordinal as usize;
129        if idx >= MAX_DEVICE_ORDINALS {
130            return Err(XlogError::Kernel(format!(
131                "XlogDeviceRuntime: ordinal {} exceeds MAX_DEVICE_ORDINALS={}",
132                ordinal, MAX_DEVICE_ORDINALS
133            )));
134        }
135        // Fast path: another thread already initialized this slot.
136        if let Some(rt) = RUNTIMES[idx].get() {
137            return Ok(*rt);
138        }
139
140        // Slow path: take the per-ordinal init mutex. Only one
141        // thread per ordinal builds the runtime; the rest wait here
142        // and observe the published value on the double-check below.
143        let _guard = INIT_LOCKS[idx]
144            .lock()
145            .expect("XlogDeviceRuntime init mutex poisoned");
146
147        // Double-check inside the lock: a previous holder may have
148        // initialized while we were waiting for the mutex.
149        if let Some(rt) = RUNTIMES[idx].get() {
150            return Ok(*rt);
151        }
152
153        // We are the first writer for this ordinal. Build the
154        // runtime; if any step fails, return the error and leave
155        // RUNTIMES[idx] uninitialized so the next caller can retry.
156        let device = Arc::new(CudaDevice::new(ordinal as usize).map_err(|e| {
157            XlogError::Kernel(format!(
158                "XlogDeviceRuntime: failed to open device {}: {}",
159                ordinal, e
160            ))
161        })?);
162        let stream_pool = Arc::new(StreamPool::with_defaults(Arc::clone(&device)));
163        let resource: Box<dyn DeviceMemoryResource + Send + Sync> =
164            Box::new(DirectCudaResource::new(Arc::clone(&device), ordinal));
165        let runtime = Box::new(XlogDeviceRuntime {
166            device_ordinal: ordinal,
167            device,
168            stream_pool,
169            resource: Mutex::new(resource),
170        });
171        let leaked: &'static XlogDeviceRuntime = Box::leak(runtime);
172
173        // We hold INIT_LOCKS[idx] and confirmed RUNTIMES[idx] is
174        // empty under that lock, so this `set` cannot fail. Fall
175        // through to a hard panic if it does — it indicates a
176        // process-internal bug we cannot recover from.
177        RUNTIMES[idx]
178            .set(leaked)
179            .map_err(|_| ())
180            .expect("XlogDeviceRuntime: OnceLock::set raced under INIT_LOCKS — bug");
181        Ok(leaked)
182    }
183
    /// CUDA ordinal this runtime serves: the value passed to
    /// [`Self::try_get`] or [`Self::with_resource`] at construction.
    pub fn device_ordinal(&self) -> u32 {
        self.device_ordinal
    }
188
    /// Borrow the device handle. Returns the shared `Arc` by
    /// reference; callers that need their own handle can
    /// `Arc::clone` it.
    pub fn device(&self) -> &Arc<CudaDevice> {
        &self.device
    }
193
    /// Borrow the stream pool. Returns the shared `Arc` by
    /// reference; callers that need their own handle can
    /// `Arc::clone` it.
    pub fn stream_pool(&self) -> &Arc<StreamPool> {
        &self.stream_pool
    }
198
199    /// Allocate via the underlying resource. Stream-ordered: the
200    /// returned [`DeviceBlock`] is bound to `stream`.
201    pub fn allocate(
202        &self,
203        bytes: usize,
204        stream: StreamId,
205        tag: AllocTag,
206    ) -> ResourceResult<DeviceBlock> {
207        self.resource
208            .lock()
209            .expect("device-runtime resource poisoned")
210            .allocate(bytes, stream, tag)
211    }
212
213    /// Deallocate via the underlying resource.
214    pub fn deallocate(&self, block: DeviceBlock) -> ResourceResult<()> {
215        self.resource
216            .lock()
217            .expect("device-runtime resource poisoned")
218            .deallocate(block)
219    }
220
221    /// Sum of bytes currently outstanding on this device, as reported
222    /// by the underlying resource. Used by the global-budget adaptor
223    /// (later commit) and the parallel-stress acceptance test.
224    pub fn bytes_outstanding(&self) -> usize {
225        self.resource
226            .lock()
227            .expect("device-runtime resource poisoned")
228            .bytes_outstanding()
229    }
230
231    /// Drain pending async frees on the underlying resource. No-op
232    /// for synchronous backends. Callers that need an accurate
233    /// `bytes_outstanding` reading after a burst of asynchronous
234    /// deallocations should call this first.
235    pub fn reap_pending(&self) -> ResourceResult<()> {
236        self.resource
237            .lock()
238            .expect("device-runtime resource poisoned")
239            .reap_pending()
240    }
241
242    /// Record that work has been (or is being) submitted on
243    /// `use_stream` that touches `block`. Forwards to the
244    /// underlying resource stack
245    /// (`GlobalDeviceBudget` → `LoggingResource` → `AsyncCudaResource`),
246    /// where the stream-ordered backend attaches a CUDA event so
247    /// `block.alloc_stream` waits on it before the queued
248    /// `cuMemFreeAsync` runs. This is the production-reachable
249    /// hook the future xlog launch builder will call for
250    /// `read` / `write` / `read_write` buffer args; until that
251    /// lands, callers that submit raw CUDA work on a stream
252    /// other than `block.alloc_stream` should call this directly.
253    /// See [`DeviceMemoryResource::record_block_use`] for the
254    /// underlying contract.
255    pub fn record_block_use(
256        &self,
257        block: &DeviceBlock,
258        use_stream: StreamId,
259    ) -> ResourceResult<()> {
260        self.resource
261            .lock()
262            .expect("device-runtime resource poisoned")
263            .record_block_use(block, use_stream)
264    }
265
266    /// Whether the active resource stack tracks cross-stream
267    /// uses (i.e., supports `record_block_use`). The launch
268    /// recorder's preflight checks this BEFORE queuing CUDA
269    /// work, so a misconfigured runtime fails loudly at the
270    /// boundary rather than after the launch is in flight.
271    pub fn supports_block_use_tracking(&self) -> bool {
272        self.resource
273            .lock()
274            .expect("device-runtime resource poisoned")
275            .supports_block_use_tracking()
276    }
277
278    /// Pre-launch hook: queue cross-stream waits required for
279    /// `use_stream` to safely access `block` with `access`
280    /// semantics. MUST be called BEFORE the GPU work is enqueued
281    /// on `use_stream`. Forwards to the resource stack; see
282    /// [`DeviceMemoryResource::prepare_block_use`] for the
283    /// underlying contract.
284    pub fn prepare_block_use(
285        &self,
286        block: BlockId,
287        use_stream: StreamId,
288        access: Access,
289    ) -> ResourceResult<()> {
290        self.resource
291            .lock()
292            .expect("device-runtime resource poisoned")
293            .prepare_block_use(block, use_stream, access)
294    }
295
296    /// Post-launch hook: record an event on `use_stream`
297    /// capturing the work just enqueued and update `block`'s
298    /// dependency state. MUST be called AFTER the launch /
299    /// copy is queued. Forwards to the resource stack; see
300    /// [`DeviceMemoryResource::finish_block_use`] for the
301    /// underlying contract.
302    pub fn finish_block_use(
303        &self,
304        block: BlockId,
305        use_stream: StreamId,
306        access: Access,
307    ) -> ResourceResult<()> {
308        self.resource
309            .lock()
310            .expect("device-runtime resource poisoned")
311            .finish_block_use(block, use_stream, access)
312    }
313
314    /// Convenience for helper-internal scratch allocations that
315    /// will be immediately written / read on `use_stream`.
316    ///
317    /// Looks up the [`BlockId`] from the slice's runtime block
318    /// and calls [`Self::prepare_block_use`] with `access`. Use
319    /// this directly after `GpuMemoryManager::alloc` when the
320    /// buffer's first cross-stream consumer is the same operator
321    /// (e.g., a hash-table bucket array memset on `launch_stream`
322    /// against a buffer freshly allocated on the manager's
323    /// default stream).
324    ///
325    /// Returns `Err(ResourceError::StreamMisuse)` if `slice` is
326    /// not runtime-backed — strict callers should ensure their
327    /// memory manager carries a runtime.
328    pub fn prepare_first_use<T: cudarc::driver::DeviceRepr>(
329        &self,
330        slice: &crate::memory::TrackedCudaSlice<T>,
331        use_stream: StreamId,
332        access: Access,
333    ) -> ResourceResult<()> {
334        let block = slice.runtime_block().ok_or_else(|| {
335            super::resource::ResourceError::StreamMisuse(
336                "prepare_first_use: slice is not runtime-backed (the helper's \
337                 GpuMemoryManager must be built via with_runtime)"
338                    .to_string(),
339            )
340        })?;
341        self.prepare_block_use(BlockId::from_block(block), use_stream, access)
342    }
343
344    /// Convenience for helper-internal scratch finish: looks up
345    /// the [`BlockId`] from the slice and forwards to
346    /// [`Self::finish_block_use`].
347    pub fn finish_first_use<T: cudarc::driver::DeviceRepr>(
348        &self,
349        slice: &crate::memory::TrackedCudaSlice<T>,
350        use_stream: StreamId,
351        access: Access,
352    ) -> ResourceResult<()> {
353        let block = slice.runtime_block().ok_or_else(|| {
354            super::resource::ResourceError::StreamMisuse(
355                "finish_first_use: slice is not runtime-backed".to_string(),
356            )
357        })?;
358        self.finish_block_use(BlockId::from_block(block), use_stream, access)
359    }
360}
361
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes the tests in this module that allocate through the
    /// shared ordinal-0 singleton. The test harness runs `#[test]`
    /// functions on parallel threads by default, and the singleton's
    /// `bytes_outstanding` figure is shared by all of them, so an
    /// allocation made by one test could otherwise land inside
    /// another test's before/after accounting window and make its
    /// exact-equality assertions flaky.
    ///
    /// NOTE(review): this only covers tests in this module; tests
    /// elsewhere that allocate through `try_get(0)` would need to
    /// share a lock as well — confirm none assert exact byte counts.
    static SINGLETON_ALLOC_LOCK: Mutex<()> = Mutex::new(());

    /// Take [`SINGLETON_ALLOC_LOCK`], ignoring poisoning. The mutex
    /// guards no data, so one failed test must not cascade into
    /// unrelated "poisoned" failures in the others.
    fn lock_singleton_allocs() -> std::sync::MutexGuard<'static, ()> {
        SINGLETON_ALLOC_LOCK
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Ordinal-0 singleton, or `None` when it cannot be initialized
    /// (e.g. no usable CUDA device); callers then return early so
    /// the test passes vacuously.
    fn try_runtime() -> Option<&'static XlogDeviceRuntime> {
        XlogDeviceRuntime::try_get(0).ok()
    }

    #[test]
    fn try_get_returns_same_singleton() {
        let Some(a) = try_runtime() else {
            return;
        };
        let b = XlogDeviceRuntime::try_get(0).expect("re-get");
        assert!(std::ptr::eq(a, b), "singleton must be stable for ordinal 0");
        assert_eq!(a.device_ordinal(), 0);
    }

    #[test]
    fn allocate_then_deallocate_via_runtime() {
        let Some(rt) = try_runtime() else {
            return;
        };
        // Hold the lock across the whole accounting window so no
        // sibling test allocates on the singleton in between.
        let _serial = lock_singleton_allocs();
        let before = rt.bytes_outstanding();
        let block = rt
            .allocate(2048, StreamId::DEFAULT, AllocTag::UNTAGGED)
            .expect("alloc");
        assert_eq!(block.bytes, 2048);
        assert_eq!(rt.bytes_outstanding(), before + 2048);
        rt.deallocate(block).expect("dealloc");
        rt.reap_pending().expect("reap pending");
        assert_eq!(rt.bytes_outstanding(), before);
    }

    #[test]
    fn try_get_rejects_out_of_range_ordinal() {
        let err = XlogDeviceRuntime::try_get(MAX_DEVICE_ORDINALS as u32);
        assert!(err.is_err());
    }

    #[test]
    fn with_resource_composes_owned_runtime_outside_singleton() {
        use super::super::async_resource::AsyncCudaResource;

        let Some(rt) = try_runtime() else {
            return;
        };
        let device = Arc::clone(rt.device());
        let pool = Arc::new(StreamPool::with_defaults(Arc::clone(&device)));
        let resource = Box::new(AsyncCudaResource::new(
            Arc::clone(&device),
            0,
            Arc::clone(&pool),
        ));

        // The composed runtime has its own resource instance, so its
        // byte accounting starts at zero and is not shared with the
        // singleton; no serialization lock is needed here.
        let owned = XlogDeviceRuntime::with_resource(device, 0, pool, resource);
        assert_eq!(owned.device_ordinal(), 0);

        let block = owned
            .allocate(1024, StreamId::DEFAULT, AllocTag::UNTAGGED)
            .expect("alloc through composed runtime");
        assert_eq!(block.bytes, 1024);
        assert_eq!(owned.bytes_outstanding(), 1024);
        owned.deallocate(block).expect("dealloc");
        owned.reap_pending().expect("reap");
        assert_eq!(owned.bytes_outstanding(), 0);

        // Composed runtime is not stored in the singleton table:
        // the singleton for ordinal 0 is whatever `try_get` returns,
        // which must be a different memory address.
        let singleton = XlogDeviceRuntime::try_get(0).expect("singleton");
        assert!(
            !std::ptr::eq(&owned, singleton),
            "with_resource must not alias the singleton slot"
        );
    }

    /// `try_get` installs `DirectCudaResource` by default. The
    /// runtime's `record_block_use` must therefore return
    /// `StreamMisuse` (the trait's default) rather than silently
    /// claiming success — anything else would let a launch
    /// builder running against the singleton observe `Ok(())`
    /// while no event is actually recorded, reproducing the
    /// cross-stream use-after-free this whole layer exists to
    /// prevent. See the trait-level doc on
    /// `DeviceMemoryResource::record_block_use`.
    #[test]
    fn try_get_runtime_record_block_use_rejected_with_stream_misuse() {
        let Some(rt) = try_runtime() else {
            return;
        };
        // This test allocates on the shared singleton, so it must
        // not overlap with the accounting test above.
        let _serial = lock_singleton_allocs();
        let block = rt
            .allocate(64, StreamId::DEFAULT, AllocTag::UNTAGGED)
            .expect("alloc through runtime");
        let outcome = rt.record_block_use(&block, StreamId::DEFAULT);
        // Release the block before asserting on `outcome`, so a
        // failed assertion below cannot leave 64 bytes permanently
        // outstanding on the process-wide singleton.
        rt.deallocate(block).expect("dealloc still works");
        match outcome {
            Err(super::super::resource::ResourceError::StreamMisuse(msg)) => {
                assert!(
                    msg.contains("unsupported"),
                    "expected 'unsupported' in StreamMisuse message, got {:?}",
                    msg
                );
            }
            other => panic!(
                "XlogDeviceRuntime::try_get default (DirectCudaResource) must \
                 reject record_block_use with StreamMisuse; got {:?}",
                other
            ),
        }
    }
}
xlog_cuda/device_runtime/runtime.rs

xlog_cuda/device_runtime/
runtime.rs