solana/perf/src/cuda_runtime.rs

// Module for CUDA-related helper functions and wrappers.
//
// cudaHostRegister/cudaHostUnregister -
//    APIs for page-pinning memory. The CUDA driver/hardware cannot overlap
//    copies from host memory to GPU memory unless the memory is page-pinned and
//    cannot be paged to disk. The CUDA driver provides these interfaces to pin and unpin memory.

use crate::perf_libs;
use crate::recycler::Reset;
use std::ops::{Deref, DerefMut};

#[cfg(feature = "pin_gpu_memory")]
use std::os::raw::c_int;

#[cfg(feature = "pin_gpu_memory")]
const CUDA_SUCCESS: c_int = 0;

/// Page-pins the heap buffer backing `_mem` by registering it through the
/// loaded perf library's `cuda_host_register` entry point.
///
/// The registered region starts at the vector's data pointer and spans its
/// full capacity in bytes (`capacity * size_of::<T>()`), not just the
/// initialized length.
///
/// Without the `pin_gpu_memory` feature, or when `perf_libs::api()` returns
/// `None`, this does nothing. A failing registration is logged and otherwise
/// ignored; nothing is reported back to the caller.
pub fn pin<T>(_mem: &mut Vec<T>) {
    #[cfg(feature = "pin_gpu_memory")]
    {
        use core::ffi::c_void;
        use std::mem::size_of;

        if let Some(api) = perf_libs::api() {
            let base = _mem.as_mut_ptr() as *mut c_void;
            let bytes = _mem.capacity() * size_of::<T>();
            // Only the foreign call itself needs the unsafe scope.
            let err = unsafe { (api.cuda_host_register)(base, bytes, 0) };
            if err != CUDA_SUCCESS {
                error!(
                    "cudaHostRegister error: {} ptr: {:?} bytes: {}",
                    err,
                    _mem.as_ptr(),
                    bytes
                );
            }
        }
    }
}

/// Unregisters a previously pinned region through the loaded perf library's
/// `cuda_host_unregister` entry point.
///
/// `_mem` is passed to the library unchanged (cast to `*mut c_void`), so it
/// must be the same address that was handed to `pin`, i.e. the vector's data
/// pointer.
///
/// Without the `pin_gpu_memory` feature, or when `perf_libs::api()` returns
/// `None`, this does nothing. A failing call is logged and otherwise ignored.
pub fn unpin<T>(_mem: *mut T) {
    #[cfg(feature = "pin_gpu_memory")]
    {
        use core::ffi::c_void;

        if let Some(api) = perf_libs::api() {
            // Only the foreign call itself needs the unsafe scope.
            let err = unsafe { (api.cuda_host_unregister)(_mem as *mut c_void) };
            if err != CUDA_SUCCESS {
                error!("cudaHostUnregister returned: {} ptr: {:?}", err, _mem);
            }
        }
    }
}

/// A vector wrapper where the underlying memory can be
/// page-pinned. Controlled by flags in case user only wants
/// to pin in certain circumstances.
#[derive(Debug)]
pub struct PinnedVec<T> {
    // The wrapped vector; its heap buffer is what gets pinned/unpinned.
    x: Vec<T>,
    // Bookkeeping flag: set after `pin` has been called for the current
    // buffer and cleared after `unpin`. It records that a pin was attempted,
    // not that the registration succeeded (`pin` reports no result).
    pinned: bool,
    // Opt-in flag set by `set_pinnable`/`reserve_and_pin`; `check_ptr` only
    // re-pins after a reallocation when this is true.
    pinnable: bool,
}

impl<T: Default + Clone> Reset for PinnedVec<T> {
    /// Shrinks the vector to length zero via `PinnedVec::resize`.
    ///
    /// The default value is only a placeholder required by `resize`; no
    /// element is ever added when the target length is zero.
    fn reset(&mut self) {
        let filler = T::default();
        self.resize(0, filler);
    }
}

impl<T: Clone> Default for PinnedVec<T> {
    /// Creates an empty vector that is neither pinned nor pinnable.
    fn default() -> Self {
        // `from_vec` wraps the vector with both flags cleared.
        Self::from_vec(Vec::new())
    }
}

impl<T> Deref for PinnedVec<T> {
    type Target = Vec<T>;

    /// Gives read-only access to the wrapped vector, so `Vec`/slice methods
    /// and indexing work directly on a `PinnedVec`.
    fn deref(&self) -> &Vec<T> {
        &self.x
    }
}

impl<T> DerefMut for PinnedVec<T> {
    /// Gives mutable access to the wrapped vector.
    ///
    /// NOTE(review): growth performed through this reference (anything other
    /// than `PinnedVec::push`/`resize`/`reserve_and_pin`) does not update the
    /// pin bookkeeping, so a pinned buffer could be reallocated without being
    /// unpinned first. Confirm callers only use it for in-place mutation.
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.x
    }
}

/// Shared-reference iterator over a `PinnedVec`; a thin wrapper around the
/// underlying slice iterator.
pub struct PinnedIter<'a, T>(std::slice::Iter<'a, T>);

/// Mutable-reference iterator over a `PinnedVec`; a thin wrapper around the
/// underlying mutable slice iterator.
pub struct PinnedIterMut<'a, T>(std::slice::IterMut<'a, T>);

impl<'a, T> Iterator for PinnedIter<'a, T> {
    type Item = &'a T;

    /// Yields the next shared reference from the wrapped slice iterator.
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }

    /// Forwards the wrapped slice iterator's exact bounds. Without this the
    /// trait default `(0, None)` applies and consumers such as `collect` or
    /// `extend` cannot preallocate.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}

impl<'a, T> Iterator for PinnedIterMut<'a, T> {
    type Item = &'a mut T;

    /// Yields the next mutable reference from the wrapped slice iterator.
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }

    /// Forwards the wrapped slice iterator's exact bounds. Without this the
    /// trait default `(0, None)` applies and consumers such as `collect` or
    /// `extend` cannot preallocate.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}

impl<'a, T> IntoIterator for &'a mut PinnedVec<T> {
    // NOTE(review): although the receiver is `&mut`, iteration yields shared
    // references through `PinnedIter`. `PinnedIterMut` looks like the intended
    // iterator, but switching would change the public `Item` type — confirm
    // with callers before altering.
    type Item = &'a T;
    type IntoIter = PinnedIter<'a, T>;

    /// Iterates over the elements by shared reference.
    fn into_iter(self) -> PinnedIter<'a, T> {
        PinnedIter(self.x.iter())
    }
}

impl<'a, T> IntoIterator for &'a PinnedVec<T> {
    type Item = &'a T;
    type IntoIter = PinnedIter<'a, T>;

    /// Iterates over the elements by shared reference, enabling
    /// `for item in &pinned_vec`.
    fn into_iter(self) -> PinnedIter<'a, T> {
        PinnedIter(self.x.iter())
    }
}

impl<T: Clone> PinnedVec<T> {
    /// Makes sure the vector can hold at least `size` elements, marks it
    /// pinnable, and pins the backing buffer unless it is already flagged as
    /// pinned.
    ///
    /// If growth is needed while the buffer is pinned, the current buffer is
    /// unpinned first because `reserve` may move it.
    pub fn reserve_and_pin(&mut self, size: usize) {
        if self.x.capacity() < size {
            if self.pinned {
                // Unregister the heap buffer, i.e. the same address `pin`
                // registered. Passing `&mut self.x` here would coerce to
                // `*mut Vec<T>` and hand the driver the address of the `Vec`
                // header instead.
                unpin(self.x.as_mut_ptr());
                self.pinned = false;
            }
            // NOTE(review): `Vec::reserve` takes an *additional* element
            // count, so a non-empty vector ends up with room for
            // `len + size`. Left unchanged to keep existing capacities.
            self.x.reserve(size);
        }
        self.set_pinnable();
        if !self.pinned {
            pin(&mut self.x);
            self.pinned = true;
        }
    }

    /// Opts this vector in to automatic re-pinning after `push`/`resize`
    /// reallocate the buffer (see `check_ptr`).
    pub fn set_pinnable(&mut self) {
        self.pinnable = true;
    }

    /// Wraps an existing vector. The result is neither pinned nor pinnable.
    pub fn from_vec(source: Vec<T>) -> Self {
        Self {
            x: source,
            pinned: false,
            pinnable: false,
        }
    }

    /// Creates an empty vector with room for `capacity` elements. The result
    /// is neither pinned nor pinnable.
    pub fn with_capacity(capacity: usize) -> Self {
        let x = Vec::with_capacity(capacity);
        Self {
            x,
            pinned: false,
            pinnable: false,
        }
    }

    /// Returns an iterator over shared references to the elements.
    pub fn iter(&self) -> PinnedIter<'_, T> {
        PinnedIter(self.x.iter())
    }

    /// Returns an iterator over mutable references to the elements.
    pub fn iter_mut(&mut self) -> PinnedIterMut<'_, T> {
        PinnedIterMut(self.x.iter_mut())
    }

    /// Returns `true` when the vector holds no elements.
    pub fn is_empty(&self) -> bool {
        self.x.is_empty()
    }

    /// Returns the number of elements currently stored.
    pub fn len(&self) -> usize {
        self.x.len()
    }

    /// Returns a raw const pointer to the backing buffer.
    pub fn as_ptr(&self) -> *const T {
        self.x.as_ptr()
    }

    /// Returns a raw mutable pointer to the backing buffer.
    pub fn as_mut_ptr(&mut self) -> *mut T {
        self.x.as_mut_ptr()
    }

    /// Appends `x`, keeping the pin bookkeeping consistent across a
    /// reallocation.
    ///
    /// A full, pinned buffer is unpinned before the push (the push is about to
    /// reallocate); afterwards `check_ptr` pins the new buffer if the vector
    /// is pinnable.
    pub fn push(&mut self, x: T) {
        let old_ptr = self.x.as_mut_ptr();
        let old_capacity = self.x.capacity();
        // Predict realloc and unpin
        if self.pinned && self.x.capacity() == self.x.len() {
            unpin(old_ptr);
            self.pinned = false;
        }
        self.x.push(x);
        self.check_ptr(old_ptr, old_capacity, "push");
    }

    /// Resizes to `size` elements, filling new slots with clones of `elem`,
    /// keeping the pin bookkeeping consistent across a reallocation.
    ///
    /// A pinned buffer that is too small for `size` is unpinned before the
    /// resize; afterwards `check_ptr` pins the new buffer if the vector is
    /// pinnable.
    pub fn resize(&mut self, size: usize, elem: T) {
        let old_ptr = self.x.as_mut_ptr();
        let old_capacity = self.x.capacity();
        // Predict realloc and unpin.
        if self.pinned && self.x.capacity() < size {
            unpin(old_ptr);
            self.pinned = false;
        }
        self.x.resize(size, elem);
        self.check_ptr(old_ptr, old_capacity, "resize");
    }

    /// Re-pins the buffer after a mutation if it moved or changed capacity.
    ///
    /// Does nothing unless the perf library is loaded (`perf_libs::api()` is
    /// `Some`) and the vector has been marked pinnable. `from` names the
    /// calling operation and is used for the trace log only.
    fn check_ptr(&mut self, old_ptr: *mut T, old_capacity: usize, from: &'static str) {
        if perf_libs::api().is_some()
            && self.pinnable
            && (self.x.as_ptr() != old_ptr || self.x.capacity() != old_capacity)
        {
            // NOTE(review): `push`/`resize` already clear `pinned` before a
            // predicted reallocation, so this looks like a defensive
            // fallback — confirm whether it can actually be reached.
            if self.pinned {
                unpin(old_ptr);
            }

            trace!(
                "pinning from check_ptr old: {} size: {} from: {}",
                old_capacity,
                self.x.capacity(),
                from
            );
            pin(&mut self.x);
            self.pinned = true;
        }
    }
}

impl<T: Clone> Clone for PinnedVec<T> {
    /// Clones the contents into a fresh buffer and carries over both flags.
    ///
    /// When the source is flagged as pinned, the new buffer is pinned too, so
    /// the clone's `pinned` flag always mirrors the source's.
    fn clone(&self) -> Self {
        let mut copy = self.x.clone();
        if self.pinned {
            pin(&mut copy);
        }
        // The logged size is the capacity of the source vector.
        debug!(
            "clone PinnedVec: size: {} pinned?: {} pinnable?: {}",
            self.x.capacity(),
            self.pinned,
            self.pinnable
        );
        Self {
            x: copy,
            pinned: self.pinned,
            pinnable: self.pinnable,
        }
    }
}

impl<T> Drop for PinnedVec<T> {
    /// Unpins the backing buffer, if it is flagged as pinned, before the
    /// wrapped vector is dropped.
    fn drop(&mut self) {
        if !self.pinned {
            return;
        }
        let buffer = self.x.as_mut_ptr();
        unpin(buffer);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Exercises push/resize on a pinnable vector and checks indexing,
    // length, emptiness and iteration order.
    #[test]
    fn test_pinned_vec() {
        let mut mem = PinnedVec::with_capacity(10);
        mem.set_pinnable();
        mem.push(50);
        mem.resize(2, 10);

        assert_eq!(mem.len(), 2);
        assert!(!mem.is_empty());
        assert_eq!(mem[0], 50);
        assert_eq!(mem[1], 10);

        let mut iter = mem.iter();
        assert_eq!(iter.next(), Some(&50));
        assert_eq!(iter.next(), Some(&10));
        assert_eq!(iter.next(), None);
    }
}
Page-pin packet memory for cuda (#4250) * Page-pin packet memory for cuda Bring back recyclers and pin offset buffers * Add packet recycler to streamer * Add set_pinnable to sigverify vecs to pin them * Add packets reset test * Add test for recycler and reduce the gc lock critical section * Add comments/tests to cuda_runtime * Add recycler to recv_blobs path. * Add trace/names for debug and PacketsRecycler to bench-streamer * Predict realloc and unpin beforehand. * Add helper to reserve and pin * Cap buffered packets length * Call cuda wrapper functions 2019-06-27 00:32:32 -07:00			`// Module for cuda-related helper functions and wrappers.`
			`//`
			`// cudaHostRegister/cudaHostUnregister -`
			`// apis for page-pinning memory. Cuda driver/hardware cannot overlap`
			`// copies from host memory to GPU memory unless the memory is page-pinned and`
			`// cannot be paged to disk. The cuda driver provides these interfaces to pin and unpin memory.`

Pull perf into a separate module. (#6718) automerge 2019-11-04 20:13:43 -08:00			`use crate::perf_libs;`
Page-pin packet memory for cuda (#4250) * Page-pin packet memory for cuda Bring back recyclers and pin offset buffers * Add packet recycler to streamer * Add set_pinnable to sigverify vecs to pin them * Add packets reset test * Add test for recycler and reduce the gc lock critical section * Add comments/tests to cuda_runtime * Add recycler to recv_blobs path. * Add trace/names for debug and PacketsRecycler to bench-streamer * Predict realloc and unpin beforehand. * Add helper to reserve and pin * Cap buffered packets length * Call cuda wrapper functions 2019-06-27 00:32:32 -07:00			`use crate::recycler::Reset;`
			`use std::ops::{Deref, DerefMut};`

Remove CUDA feature (#6094) 2019-09-26 13:36:51 -07:00			`#[cfg(feature = "pin_gpu_memory")]`
Page-pin packet memory for cuda (#4250) * Page-pin packet memory for cuda Bring back recyclers and pin offset buffers * Add packet recycler to streamer * Add set_pinnable to sigverify vecs to pin them * Add packets reset test * Add test for recycler and reduce the gc lock critical section * Add comments/tests to cuda_runtime * Add recycler to recv_blobs path. * Add trace/names for debug and PacketsRecycler to bench-streamer * Predict realloc and unpin beforehand. * Add helper to reserve and pin * Cap buffered packets length * Call cuda wrapper functions 2019-06-27 00:32:32 -07:00			`use std::os::raw::c_int;`

Remove CUDA feature (#6094) 2019-09-26 13:36:51 -07:00			`#[cfg(feature = "pin_gpu_memory")]`
Page-pin packet memory for cuda (#4250) * Page-pin packet memory for cuda Bring back recyclers and pin offset buffers * Add packet recycler to streamer * Add set_pinnable to sigverify vecs to pin them * Add packets reset test * Add test for recycler and reduce the gc lock critical section * Add comments/tests to cuda_runtime * Add recycler to recv_blobs path. * Add trace/names for debug and PacketsRecycler to bench-streamer * Predict realloc and unpin beforehand. * Add helper to reserve and pin * Cap buffered packets length * Call cuda wrapper functions 2019-06-27 00:32:32 -07:00			`const CUDA_SUCCESS: c_int = 0;`

			`pub fn pin<T>(_mem: &mut Vec<T>) {`
Remove CUDA feature (#6094) 2019-09-26 13:36:51 -07:00			`#[cfg(feature = "pin_gpu_memory")]`
			`{`
			`if let Some(api) = perf_libs::api() {`
			`unsafe {`
			`use core::ffi::c_void;`
			`use std::mem::size_of;`
Disable pinned gpu memory (#5753) 2019-08-31 16:44:07 -07:00
Remove CUDA feature (#6094) 2019-09-26 13:36:51 -07:00			`let err = (api.cuda_host_register)(`
			`_mem.as_mut_ptr() as *mut c_void,`
			`_mem.capacity() * size_of::<T>(),`
			`0,`
			`);`
			`if err != CUDA_SUCCESS {`
			`error!(`
			`"cudaHostRegister error: {} ptr: {:?} bytes: {}",`
			`err,`
			`_mem.as_ptr(),`
			`_mem.capacity() * size_of::<T>()`
			`);`
			`}`
			`}`
Page-pin packet memory for cuda (#4250) * Page-pin packet memory for cuda Bring back recyclers and pin offset buffers * Add packet recycler to streamer * Add set_pinnable to sigverify vecs to pin them * Add packets reset test * Add test for recycler and reduce the gc lock critical section * Add comments/tests to cuda_runtime * Add recycler to recv_blobs path. * Add trace/names for debug and PacketsRecycler to bench-streamer * Predict realloc and unpin beforehand. * Add helper to reserve and pin * Cap buffered packets length * Call cuda wrapper functions 2019-06-27 00:32:32 -07:00			`}`
			`}`
			`}`

			`pub fn unpin<T>(_mem: *mut T) {`
Remove CUDA feature (#6094) 2019-09-26 13:36:51 -07:00			`#[cfg(feature = "pin_gpu_memory")]`
			`{`
			`if let Some(api) = perf_libs::api() {`
			`unsafe {`
			`use core::ffi::c_void;`
Disable pinned gpu memory (#5753) 2019-08-31 16:44:07 -07:00
Remove CUDA feature (#6094) 2019-09-26 13:36:51 -07:00			`let err = (api.cuda_host_unregister)(_mem as *mut c_void);`
			`if err != CUDA_SUCCESS {`
			`error!("cudaHostUnregister returned: {} ptr: {:?}", err, _mem);`
			`}`
			`}`
Page-pin packet memory for cuda (#4250) * Page-pin packet memory for cuda Bring back recyclers and pin offset buffers * Add packet recycler to streamer * Add set_pinnable to sigverify vecs to pin them * Add packets reset test * Add test for recycler and reduce the gc lock critical section * Add comments/tests to cuda_runtime * Add recycler to recv_blobs path. * Add trace/names for debug and PacketsRecycler to bench-streamer * Predict realloc and unpin beforehand. * Add helper to reserve and pin * Cap buffered packets length * Call cuda wrapper functions 2019-06-27 00:32:32 -07:00			`}`
			`}`
			`}`

			`// A vector wrapper where the underlying memory can be`
			`// page-pinned. Controlled by flags in case user only wants`
			`// to pin in certain circumstances.`
			`#[derive(Debug)]`
			`pub struct PinnedVec<T> {`
			`x: Vec<T>,`
			`pinned: bool,`
			`pinnable: bool,`
			`}`

Sign shreds on the GPU (#6595) * sign gpu shreds * wip * checks * tests build * test * tests * test * nits * sign cpu test * write out the sigs in parallel * clippy * cpu test * prepare secret for gpu * woot! * update * bump perf libs 2019-11-02 06:23:14 -07:00			`impl<T: Default + Clone> Reset for PinnedVec<T> {`
Page-pin packet memory for cuda (#4250) * Page-pin packet memory for cuda Bring back recyclers and pin offset buffers * Add packet recycler to streamer * Add set_pinnable to sigverify vecs to pin them * Add packets reset test * Add test for recycler and reduce the gc lock critical section * Add comments/tests to cuda_runtime * Add recycler to recv_blobs path. * Add trace/names for debug and PacketsRecycler to bench-streamer * Predict realloc and unpin beforehand. * Add helper to reserve and pin * Cap buffered packets length * Call cuda wrapper functions 2019-06-27 00:32:32 -07:00			`fn reset(&mut self) {`
Sign shreds on the GPU (#6595) * sign gpu shreds * wip * checks * tests build * test * tests * test * nits * sign cpu test * write out the sigs in parallel * clippy * cpu test * prepare secret for gpu * woot! * update * bump perf libs 2019-11-02 06:23:14 -07:00			`self.resize(0, T::default());`
Page-pin packet memory for cuda (#4250) * Page-pin packet memory for cuda Bring back recyclers and pin offset buffers * Add packet recycler to streamer * Add set_pinnable to sigverify vecs to pin them * Add packets reset test * Add test for recycler and reduce the gc lock critical section * Add comments/tests to cuda_runtime * Add recycler to recv_blobs path. * Add trace/names for debug and PacketsRecycler to bench-streamer * Predict realloc and unpin beforehand. * Add helper to reserve and pin * Cap buffered packets length * Call cuda wrapper functions 2019-06-27 00:32:32 -07:00			`}`
			`}`

			`impl<T: Clone> Default for PinnedVec<T> {`
			`fn default() -> Self {`
			`Self {`
			`x: Vec::new(),`
			`pinned: false,`
			`pinnable: false,`
			`}`
			`}`
			`}`

			`impl<T> Deref for PinnedVec<T> {`
			`type Target = Vec<T>;`

			`fn deref(&self) -> &Self::Target {`
			`&self.x`
			`}`
			`}`

			`impl<T> DerefMut for PinnedVec<T> {`
			`fn deref_mut(&mut self) -> &mut Vec<T> {`
			`&mut self.x`
			`}`
			`}`

			`pub struct PinnedIter<'a, T>(std::slice::Iter<'a, T>);`

			`pub struct PinnedIterMut<'a, T>(std::slice::IterMut<'a, T>);`

			`impl<'a, T> Iterator for PinnedIter<'a, T> {`
			`type Item = &'a T;`

			`fn next(&mut self) -> Option<Self::Item> {`
			`self.0.next()`
			`}`
			`}`

			`impl<'a, T> Iterator for PinnedIterMut<'a, T> {`
			`type Item = &'a mut T;`

			`fn next(&mut self) -> Option<Self::Item> {`
			`self.0.next()`
			`}`
			`}`

			`impl<'a, T> IntoIterator for &'a mut PinnedVec<T> {`
			`type Item = &'a T;`
			`type IntoIter = PinnedIter<'a, T>;`

			`fn into_iter(self) -> Self::IntoIter {`
			`PinnedIter(self.iter())`
			`}`
			`}`

			`impl<'a, T> IntoIterator for &'a PinnedVec<T> {`
			`type Item = &'a T;`
			`type IntoIter = PinnedIter<'a, T>;`

			`fn into_iter(self) -> Self::IntoIter {`
			`PinnedIter(self.iter())`
			`}`
			`}`

			`impl<T: Clone> PinnedVec<T> {`
			`pub fn reserve_and_pin(&mut self, size: usize) {`
			`if self.x.capacity() < size {`
			`if self.pinned {`
			`unpin(&mut self.x);`
			`self.pinned = false;`
			`}`
			`self.x.reserve(size);`
			`}`
			`self.set_pinnable();`
			`if !self.pinned {`
			`pin(&mut self.x);`
			`self.pinned = true;`
			`}`
			`}`

			`pub fn set_pinnable(&mut self) {`
			`self.pinnable = true;`
			`}`

			`pub fn from_vec(source: Vec<T>) -> Self {`
			`Self {`
			`x: source,`
			`pinned: false,`
			`pinnable: false,`
			`}`
			`}`

			`pub fn with_capacity(capacity: usize) -> Self {`
			`let x = Vec::with_capacity(capacity);`
			`Self {`
			`x,`
			`pinned: false,`
			`pinnable: false,`
			`}`
			`}`

			`pub fn iter(&self) -> PinnedIter<T> {`
			`PinnedIter(self.x.iter())`
			`}`

			`pub fn iter_mut(&mut self) -> PinnedIterMut<T> {`
			`PinnedIterMut(self.x.iter_mut())`
			`}`

			`pub fn is_empty(&self) -> bool {`
			`self.x.is_empty()`
			`}`

			`pub fn len(&self) -> usize {`
			`self.x.len()`
			`}`

			`pub fn as_ptr(&self) -> *const T {`
			`self.x.as_ptr()`
			`}`

			`pub fn as_mut_ptr(&mut self) -> *mut T {`
			`self.x.as_mut_ptr()`
			`}`

			`pub fn push(&mut self, x: T) {`
			`let old_ptr = self.x.as_mut_ptr();`
			`let old_capacity = self.x.capacity();`
			`// Predict realloc and unpin`
			`if self.pinned && self.x.capacity() == self.x.len() {`
			`unpin(old_ptr);`
			`self.pinned = false;`
			`}`
			`self.x.push(x);`
			`self.check_ptr(old_ptr, old_capacity, "push");`
			`}`

			`pub fn resize(&mut self, size: usize, elem: T) {`
			`let old_ptr = self.x.as_mut_ptr();`
			`let old_capacity = self.x.capacity();`
			`// Predict realloc and unpin.`
			`if self.pinned && self.x.capacity() < size {`
			`unpin(old_ptr);`
			`self.pinned = false;`
			`}`
			`self.x.resize(size, elem);`
			`self.check_ptr(old_ptr, old_capacity, "resize");`
			`}`

			`fn check_ptr(&mut self, _old_ptr: *mut T, _old_capacity: usize, _from: &'static str) {`
Remove remnants of the cuda feature flag (#6298) automerge 2019-10-09 16:09:36 -07:00			`let api = perf_libs::api();`
			`if api.is_some()`
			`&& self.pinnable`
			`&& (self.x.as_ptr() != _old_ptr \|\| self.x.capacity() != _old_capacity)`
Page-pin packet memory for cuda (#4250) * Page-pin packet memory for cuda Bring back recyclers and pin offset buffers * Add packet recycler to streamer * Add set_pinnable to sigverify vecs to pin them * Add packets reset test * Add test for recycler and reduce the gc lock critical section * Add comments/tests to cuda_runtime * Add recycler to recv_blobs path. * Add trace/names for debug and PacketsRecycler to bench-streamer * Predict realloc and unpin beforehand. * Add helper to reserve and pin * Cap buffered packets length * Call cuda wrapper functions 2019-06-27 00:32:32 -07:00			`{`
Remove remnants of the cuda feature flag (#6298) automerge 2019-10-09 16:09:36 -07:00			`if self.pinned {`
			`unpin(_old_ptr);`
Page-pin packet memory for cuda (#4250) * Page-pin packet memory for cuda Bring back recyclers and pin offset buffers * Add packet recycler to streamer * Add set_pinnable to sigverify vecs to pin them * Add packets reset test * Add test for recycler and reduce the gc lock critical section * Add comments/tests to cuda_runtime * Add recycler to recv_blobs path. * Add trace/names for debug and PacketsRecycler to bench-streamer * Predict realloc and unpin beforehand. * Add helper to reserve and pin * Cap buffered packets length * Call cuda wrapper functions 2019-06-27 00:32:32 -07:00			`}`
Remove remnants of the cuda feature flag (#6298) automerge 2019-10-09 16:09:36 -07:00
			`trace!(`
			`"pinning from check_ptr old: {} size: {} from: {}",`
			`_old_capacity,`
			`self.x.capacity(),`
			`_from`
			`);`
			`pin(&mut self.x);`
			`self.pinned = true;`
Page-pin packet memory for cuda (#4250) * Page-pin packet memory for cuda Bring back recyclers and pin offset buffers * Add packet recycler to streamer * Add set_pinnable to sigverify vecs to pin them * Add packets reset test * Add test for recycler and reduce the gc lock critical section * Add comments/tests to cuda_runtime * Add recycler to recv_blobs path. * Add trace/names for debug and PacketsRecycler to bench-streamer * Predict realloc and unpin beforehand. * Add helper to reserve and pin * Cap buffered packets length * Call cuda wrapper functions 2019-06-27 00:32:32 -07:00			`}`
			`}`
			`}`

			`impl<T: Clone> Clone for PinnedVec<T> {`
			`fn clone(&self) -> Self {`
			`let mut x = self.x.clone();`
			`let pinned = if self.pinned {`
			`pin(&mut x);`
			`true`
			`} else {`
			`false`
			`};`
			`debug!(`
			`"clone PinnedVec: size: {} pinned?: {} pinnable?: {}",`
			`self.x.capacity(),`
			`self.pinned,`
			`self.pinnable`
			`);`
			`Self {`
			`x,`
			`pinned,`
			`pinnable: self.pinnable,`
			`}`
			`}`
			`}`

			`impl<T> Drop for PinnedVec<T> {`
			`fn drop(&mut self) {`
			`if self.pinned {`
			`unpin(self.x.as_mut_ptr());`
			`}`
			`}`
			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`

			`#[test]`
			`fn test_pinned_vec() {`
			`let mut mem = PinnedVec::with_capacity(10);`
			`mem.set_pinnable();`
			`mem.push(50);`
			`mem.resize(2, 10);`
			`assert_eq!(mem[0], 50);`
			`assert_eq!(mem[1], 10);`
			`assert_eq!(mem.len(), 2);`
			`assert_eq!(mem.is_empty(), false);`
			`let mut iter = mem.iter();`
			`assert_eq!(*iter.next().unwrap(), 50);`
			`assert_eq!(*iter.next().unwrap(), 10);`
			`assert_eq!(iter.next(), None);`
			`}`
			`}`