zcash-sync/src/gpu/cuda.rs

262 lines
8.2 KiB
Rust

use crate::chain::DecryptedBlock;
use crate::gpu::{collect_nf, GPUProcessor};
use crate::lw_rpc::CompactBlock;
use crate::{Hash, hash::GENERATORS_EXP};
use anyhow::Result;
use ff::BatchInverter;
use jubjub::Fq;
use lazy_static::lazy_static;
use rustacuda::context::CurrentContext;
use rustacuda::launch;
use rustacuda::prelude::*;
use std::convert::TryInto;
use std::ffi::CString;
use std::sync::Mutex;
use zcash_primitives::consensus::Network;
use zcash_primitives::sapling::SaplingIvk;
const THREADS_PER_BLOCK: usize = 256usize;
const BUFFER_SIZE: usize = 96usize;
lazy_static! {
pub static ref CUDA_CONTEXT: Mutex<Option<CudaContext>> = Mutex::new(CudaContext::new().ok());
}
pub struct CudaContext {
device: Device,
context: Context,
hash_module: Module,
trial_decrypt_module: Module,
stream: Stream,
generators: DeviceBuffer<u8>,
}
unsafe impl Send for CudaContext {}
impl CudaContext {
pub fn new() -> Result<Self> {
let r = Self::new_inner();
if let Err(ref err) = r {
log::info!("CUDA Initialization Error {:?}", err);
}
log::info!("CUDA Initialized");
r
}
fn new_inner() -> Result<Self> {
rustacuda::init(rustacuda::CudaFlags::empty())?;
let device = Device::get_device(0)?;
let context =
Context::create_and_push(ContextFlags::MAP_HOST | ContextFlags::SCHED_AUTO, device)?;
let ptx = CString::new(include_str!("./cuda/hash.ptx"))?;
let hash_module = Module::load_from_string(&ptx)?;
let ptx = CString::new(include_str!("./cuda/trial_decrypt.ptx"))?;
let trial_decrypt_module = Module::load_from_string(&ptx)?;
let stream = Stream::new(StreamFlags::DEFAULT, None)?;
log::info!("Prepare Generators");
let generators_len = GENERATORS_EXP.len();
let mut gens = vec![0u8; generators_len * 128];
for i in 0..generators_len {
GENERATORS_EXP[i].copy_to_slice(&mut gens[i * 128..(i + 1) * 128]);
}
let generators = DeviceBuffer::from_slice(&gens)?;
Ok(CudaContext {
device,
context,
hash_module,
trial_decrypt_module,
stream,
generators,
})
}
pub fn total_memory(&self) -> Result<usize> {
let mem = self.device.total_memory()?.saturating_sub(500_000_000);
// leave 500 MB of GPU for other stuff;
log::info!("Cuda memory {}", mem);
Ok(mem)
}
pub fn batch_hash(&mut self, depth: u8, data: &[Hash]) -> Result<Vec<Hash>> {
CurrentContext::set_current(&self.context)?;
let n = data.len() / 2;
let mut in_data = DeviceBuffer::from_slice(data)?;
let mut out_data = unsafe { DeviceBuffer::<u8>::zeroed(n * 32 * 2)? };
unsafe {
// Launch the kernel again using the `function` form:
let function_name = CString::new("pedersen_hash")?;
let hash = self.hash_module.get_function(&function_name)?;
let blocks = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
let stream = &self.stream;
let result = launch!(hash<<<(blocks as u32, 1, 1), (THREADS_PER_BLOCK as u32, 1, 1), 1024, stream>>>(
n,
depth,
self.generators.as_device_ptr(),
in_data.as_device_ptr(),
out_data.as_device_ptr()
));
result?;
}
self.stream.synchronize()?;
let mut res = vec![0u8; n * 32 * 2];
out_data.copy_to(&mut res)?;
let mut p = vec![];
let mut q: Vec<AffinePoint> = vec![AffinePoint::default(); n];
for i in 0..n {
let b = i * 64;
let u = Fq::from_bytes(&res[b..b + 32].try_into().unwrap()).unwrap();
let z = Fq::from_bytes(&res[b + 32..b + 64].try_into().unwrap()).unwrap();
q[i].u = z;
p.push(u);
}
BatchInverter::invert_with_internal_scratch(&mut q, |q| &mut q.u, |q| &mut q.v);
let mut out = vec![];
for i in 0..n {
let hash: Hash = (p[i] * &q[i].u).to_bytes();
// println!("{} {} {} {}", i, hex::encode(&data[i * 2]), hex::encode(&data[i * 2 + 1]), hex::encode(&hash));
out.push(hash);
}
Ok(out)
}
}
pub struct CudaProcessor {
network: Network,
decrypted_blocks: Vec<DecryptedBlock>,
encrypted_data: Vec<u8>,
encrypted_data_device: DeviceBuffer<u8>,
ivk_device: DeviceBuffer<u8>,
decrypted_data: Vec<u8>,
n: usize,
block_count: usize,
}
impl CudaProcessor {
pub fn setup_decrypt(network: &Network, blocks: Vec<CompactBlock>) -> Result<Self> {
let m = CUDA_CONTEXT.lock().unwrap();
let cuda_context = m.as_ref().unwrap();
CurrentContext::set_current(&cuda_context.context).unwrap();
let decrypted_blocks = collect_nf(blocks)?;
let mut data_buffer = vec![];
let mut i = 0;
for db in decrypted_blocks.iter() {
let mut position_in_block = 0;
let b = &db.compact_block;
for tx in b.vtx.iter() {
for co in tx.outputs.iter() {
if !co.epk.is_empty() {
data_buffer.extend(&co.epk);
data_buffer.extend(&co.ciphertext);
data_buffer.extend(&u64::to_le_bytes(position_in_block as u64));
data_buffer.extend(&[0u8; 4]); // padding
i += 1;
}
position_in_block += 1;
}
}
}
let n = i;
let block_count = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
let encrypted_data_device = unsafe { DeviceBuffer::uninitialized(data_buffer.len())? };
let ivk_device = unsafe { DeviceBuffer::zeroed(32)? };
let decrypted_data = vec![0u8; n * BUFFER_SIZE];
let this = CudaProcessor {
network: network.clone(),
decrypted_blocks,
encrypted_data: data_buffer,
encrypted_data_device: encrypted_data_device,
ivk_device: ivk_device,
decrypted_data,
n,
block_count,
};
Ok(this)
}
}
impl GPUProcessor for CudaProcessor {
fn decrypt_account(&mut self, ivk: &SaplingIvk) -> Result<()> {
log::info!("n = {}", self.n);
if self.n == 0 {
return Ok(());
}
let mut ivk_fr = ivk.0;
ivk_fr = ivk_fr.double(); // multiply by cofactor
ivk_fr = ivk_fr.double();
ivk_fr = ivk_fr.double();
self.encrypted_data_device.copy_from(&self.encrypted_data)?;
self.ivk_device.copy_from(&ivk_fr.to_bytes())?;
let m = CUDA_CONTEXT.lock().unwrap();
let cuda_context = m.as_ref().unwrap();
// decrypt all the blocks for the current account
unsafe {
// Launch the kernel again using the `function` form:
let function_name = CString::new("trial_decrypt_full").unwrap();
let trial_decrypt_full = cuda_context
.trial_decrypt_module
.get_function(&function_name)
.unwrap();
let stream = &cuda_context.stream;
let result = launch!(trial_decrypt_full<<<(self.block_count as u32, 1, 1), (THREADS_PER_BLOCK as u32, 1, 1), 0, stream>>>(
self.n,
self.ivk_device.as_device_ptr(),
self.encrypted_data_device.as_device_ptr()
));
result.unwrap();
}
cuda_context.stream.synchronize().unwrap();
self.encrypted_data_device
.copy_to(&mut self.decrypted_data)
.unwrap();
Ok(())
}
fn get_decrypted_blocks(self) -> Result<Vec<DecryptedBlock>> {
Ok(self.decrypted_blocks)
}
fn network(&self) -> Network {
self.network
}
fn buffer_stride() -> usize {
BUFFER_SIZE
}
fn borrow_buffers(&mut self) -> (&[u8], &mut [DecryptedBlock]) {
(
self.decrypted_data.as_slice(),
self.decrypted_blocks.as_mut_slice(),
)
}
}
#[derive(Default, Clone)]
struct AffinePoint {
u: Fq,
v: Fq,
}