Struct CudaStream
pub struct CudaStream { /* private fields */ }Expand description
A wrapper around sys::CUstream that you can schedule work on.
- Create with [CudaContext::new_stream()], [CudaContext::default_stream()], or CudaStream::fork().
Work done on this is asynchronous with respect to the host.
See also: CUDA C/C++ Streams and Concurrency; 3. Stream synchronization behavior; 6.6. Event Management; Out-of-order execution; Dependence analysis.
Implementations§
§impl CudaStream
impl CudaStream
pub fn fork(&self) -> Result<Arc<CudaStream>, DriverError>
pub fn fork(&self) -> Result<Arc<CudaStream>, DriverError>
Creates a new stream and then makes the new stream wait on self.
pub fn cu_stream(&self) -> *mut CUstream_st
pub fn cu_stream(&self) -> *mut CUstream_st
pub fn synchronize(&self) -> Result<(), DriverError>
pub fn synchronize(&self) -> Result<(), DriverError>
Will only block CPU if you call [CudaContext::set_flags()] with sys::CUctx_flags::CU_CTX_SCHED_BLOCKING_SYNC.
See cuda docs
pub fn record_event(
&self,
flags: Option<CUevent_flags_enum>,
) -> Result<CudaEvent, DriverError>
pub fn record_event( &self, flags: Option<CUevent_flags_enum>, ) -> Result<CudaEvent, DriverError>
Creates a new [CudaEvent] and records the current work in the stream to the event.
pub fn wait(&self, event: &CudaEvent) -> Result<(), DriverError>
pub fn wait(&self, event: &CudaEvent) -> Result<(), DriverError>
Waits for the work recorded in [CudaEvent] to be completed.
You can record new work in event after calling this method without
affecting this call.
See cuda docs
pub fn join(&self, other: &CudaStream) -> Result<(), DriverError>
pub fn join(&self, other: &CudaStream) -> Result<(), DriverError>
Ensures this stream waits for the current workload in other to complete.
This is shorthand for recording an event on other with record_event() and then passing that event to self.wait().
§impl CudaStream
impl CudaStream
pub fn null<T>(self: &Arc<CudaStream>) -> Result<CudaSlice<T>, DriverError>
pub fn null<T>(self: &Arc<CudaStream>) -> Result<CudaSlice<T>, DriverError>
Allocates an empty CudaSlice with 0 length.
pub unsafe fn alloc<T>(
self: &Arc<CudaStream>,
len: usize,
) -> Result<CudaSlice<T>, DriverError>where
T: DeviceRepr,
pub unsafe fn alloc<T>(
self: &Arc<CudaStream>,
len: usize,
) -> Result<CudaSlice<T>, DriverError>where
T: DeviceRepr,
pub fn alloc_zeros<T>(
self: &Arc<CudaStream>,
len: usize,
) -> Result<CudaSlice<T>, DriverError>where
T: DeviceRepr + ValidAsZeroBits,
pub fn alloc_zeros<T>(
self: &Arc<CudaStream>,
len: usize,
) -> Result<CudaSlice<T>, DriverError>where
T: DeviceRepr + ValidAsZeroBits,
Allocates a CudaSlice with len elements of type T. All values are zeroed out.
pub fn memset_zeros<T, Dst>(
self: &Arc<CudaStream>,
dst: &mut Dst,
) -> Result<(), DriverError>
pub fn memset_zeros<T, Dst>( self: &Arc<CudaStream>, dst: &mut Dst, ) -> Result<(), DriverError>
Sets all the memory in dst to 0. dst can be a CudaSlice or CudaViewMut.
pub fn memcpy_stod<T, Src>(
self: &Arc<CudaStream>,
src: &Src,
) -> Result<CudaSlice<T>, DriverError>where
T: DeviceRepr,
Src: HostSlice<T> + ?Sized,
👎Deprecated: Use clone_htod
pub fn memcpy_stod<T, Src>(
self: &Arc<CudaStream>,
src: &Src,
) -> Result<CudaSlice<T>, DriverError>where
T: DeviceRepr,
Src: HostSlice<T> + ?Sized,
Use clone_htod
Copy a [T]/Vec<T>/[PinnedHostSlice<T>] to a new CudaSlice.
pub fn clone_htod<T, Src>(
self: &Arc<CudaStream>,
src: &Src,
) -> Result<CudaSlice<T>, DriverError>where
T: DeviceRepr,
Src: HostSlice<T> + ?Sized,
pub fn clone_htod<T, Src>(
self: &Arc<CudaStream>,
src: &Src,
) -> Result<CudaSlice<T>, DriverError>where
T: DeviceRepr,
Src: HostSlice<T> + ?Sized,
Copy a [T]/Vec<T>/[PinnedHostSlice<T>] to a new CudaSlice.
pub fn memcpy_htod<T, Src, Dst>(
self: &Arc<CudaStream>,
src: &Src,
dst: &mut Dst,
) -> Result<(), DriverError>
pub fn memcpy_htod<T, Src, Dst>( self: &Arc<CudaStream>, src: &Src, dst: &mut Dst, ) -> Result<(), DriverError>
Copy a [T]/Vec<T>/[PinnedHostSlice<T>] into an existing CudaSlice/CudaViewMut.
pub fn memcpy_dtov<T, Src>(
self: &Arc<CudaStream>,
src: &Src,
) -> Result<Vec<T>, DriverError>where
T: DeviceRepr,
Src: DevicePtr<T>,
👎Deprecated: Use clone_dtoh
pub fn memcpy_dtov<T, Src>(
self: &Arc<CudaStream>,
src: &Src,
) -> Result<Vec<T>, DriverError>where
T: DeviceRepr,
Src: DevicePtr<T>,
Use clone_dtoh
pub fn clone_dtoh<T, Src>(
self: &Arc<CudaStream>,
src: &Src,
) -> Result<Vec<T>, DriverError>where
T: DeviceRepr,
Src: DevicePtr<T>,
pub fn clone_dtoh<T, Src>(
self: &Arc<CudaStream>,
src: &Src,
) -> Result<Vec<T>, DriverError>where
T: DeviceRepr,
Src: DevicePtr<T>,
pub fn memcpy_dtoh<T, Src, Dst>(
self: &Arc<CudaStream>,
src: &Src,
dst: &mut Dst,
) -> Result<(), DriverError>
pub fn memcpy_dtoh<T, Src, Dst>( self: &Arc<CudaStream>, src: &Src, dst: &mut Dst, ) -> Result<(), DriverError>
pub fn memcpy_dtod<T, Src, Dst>(
self: &Arc<CudaStream>,
src: &Src,
dst: &mut Dst,
) -> Result<(), DriverError>where
Src: DevicePtr<T>,
Dst: DevicePtrMut<T>,
pub fn memcpy_dtod<T, Src, Dst>(
self: &Arc<CudaStream>,
src: &Src,
dst: &mut Dst,
) -> Result<(), DriverError>where
Src: DevicePtr<T>,
Dst: DevicePtrMut<T>,
Copy a CudaSlice/CudaView to an existing CudaSlice/CudaViewMut.
pub fn clone_dtod<T, Src>(
self: &Arc<CudaStream>,
src: &Src,
) -> Result<CudaSlice<T>, DriverError>where
T: DeviceRepr,
Src: DevicePtr<T>,
pub fn clone_dtod<T, Src>(
self: &Arc<CudaStream>,
src: &Src,
) -> Result<CudaSlice<T>, DriverError>where
T: DeviceRepr,
Src: DevicePtr<T>,
§impl CudaStream
impl CudaStream
pub unsafe fn upgrade_device_ptr<T>(
self: &Arc<CudaStream>,
cu_device_ptr: u64,
len: usize,
) -> CudaSlice<T>
pub unsafe fn upgrade_device_ptr<T>( self: &Arc<CudaStream>, cu_device_ptr: u64, len: usize, ) -> CudaSlice<T>
Creates a CudaSlice from a sys::CUdeviceptr. Useful in conjunction with
CudaSlice::leak().
§Safety
- cu_device_ptr must be a valid allocation
- cu_device_ptr must have space for len * std::mem::size_of::<T>() bytes
- The memory may not be valid for type T, so some sort of memset operation should be called on the memory.
§impl CudaStream
impl CudaStream
pub fn begin_capture(
&self,
mode: CUstreamCaptureMode_enum,
) -> Result<(), DriverError>
pub fn begin_capture( &self, mode: CUstreamCaptureMode_enum, ) -> Result<(), DriverError>
See cuda docs
pub fn end_capture(
self: &Arc<CudaStream>,
flags: CUgraphInstantiate_flags_enum,
) -> Result<Option<CudaGraph>, DriverError>
pub fn end_capture( self: &Arc<CudaStream>, flags: CUgraphInstantiate_flags_enum, ) -> Result<Option<CudaGraph>, DriverError>
See cuda docs
flags is passed to cuGraphInstantiate
pub fn capture_status(&self) -> Result<CUstreamCaptureStatus_enum, DriverError>
pub fn capture_status(&self) -> Result<CUstreamCaptureStatus_enum, DriverError>
See cuda docs
§impl CudaStream
impl CudaStream
pub fn launch_builder<'a>(&'a self, func: &'a CudaFunction) -> LaunchArgs<'a>
pub fn launch_builder<'a>(&'a self, func: &'a CudaFunction) -> LaunchArgs<'a>
Creates a new kernel launch builder that will launch func on stream self.
Add arguments to the builder using [LaunchArgs::arg()], and submit it to the stream using [LaunchArgs::launch()].