262 lines
8.2 KiB
Rust
262 lines
8.2 KiB
Rust
use crate::chain::DecryptedBlock;
|
|
use crate::gpu::{collect_nf, GPUProcessor};
|
|
use crate::lw_rpc::CompactBlock;
|
|
use crate::{Hash, hash::GENERATORS_EXP};
|
|
use anyhow::Result;
|
|
use ff::BatchInverter;
|
|
use jubjub::Fq;
|
|
use lazy_static::lazy_static;
|
|
use rustacuda::context::CurrentContext;
|
|
use rustacuda::launch;
|
|
use rustacuda::prelude::*;
|
|
use std::convert::TryInto;
|
|
use std::ffi::CString;
|
|
use std::sync::Mutex;
|
|
use zcash_primitives::consensus::Network;
|
|
use zcash_primitives::sapling::SaplingIvk;
|
|
|
|
const THREADS_PER_BLOCK: usize = 256usize;
|
|
const BUFFER_SIZE: usize = 96usize;
|
|
|
|
lazy_static! {
|
|
pub static ref CUDA_CONTEXT: Mutex<Option<CudaContext>> = Mutex::new(CudaContext::new().ok());
|
|
}
|
|
|
|
pub struct CudaContext {
|
|
device: Device,
|
|
context: Context,
|
|
hash_module: Module,
|
|
trial_decrypt_module: Module,
|
|
stream: Stream,
|
|
generators: DeviceBuffer<u8>,
|
|
}
|
|
|
|
unsafe impl Send for CudaContext {}
|
|
|
|
impl CudaContext {
|
|
pub fn new() -> Result<Self> {
|
|
let r = Self::new_inner();
|
|
if let Err(ref err) = r {
|
|
log::info!("CUDA Initialization Error {:?}", err);
|
|
}
|
|
log::info!("CUDA Initialized");
|
|
r
|
|
}
|
|
|
|
fn new_inner() -> Result<Self> {
|
|
rustacuda::init(rustacuda::CudaFlags::empty())?;
|
|
|
|
let device = Device::get_device(0)?;
|
|
let context =
|
|
Context::create_and_push(ContextFlags::MAP_HOST | ContextFlags::SCHED_AUTO, device)?;
|
|
|
|
let ptx = CString::new(include_str!("./cuda/hash.ptx"))?;
|
|
let hash_module = Module::load_from_string(&ptx)?;
|
|
|
|
let ptx = CString::new(include_str!("./cuda/trial_decrypt.ptx"))?;
|
|
let trial_decrypt_module = Module::load_from_string(&ptx)?;
|
|
|
|
let stream = Stream::new(StreamFlags::DEFAULT, None)?;
|
|
|
|
log::info!("Prepare Generators");
|
|
let generators_len = GENERATORS_EXP.len();
|
|
let mut gens = vec![0u8; generators_len * 128];
|
|
for i in 0..generators_len {
|
|
GENERATORS_EXP[i].copy_to_slice(&mut gens[i * 128..(i + 1) * 128]);
|
|
}
|
|
|
|
let generators = DeviceBuffer::from_slice(&gens)?;
|
|
Ok(CudaContext {
|
|
device,
|
|
context,
|
|
hash_module,
|
|
trial_decrypt_module,
|
|
stream,
|
|
generators,
|
|
})
|
|
}
|
|
|
|
pub fn total_memory(&self) -> Result<usize> {
|
|
let mem = self.device.total_memory()?.saturating_sub(500_000_000);
|
|
// leave 500 MB of GPU for other stuff;
|
|
log::info!("Cuda memory {}", mem);
|
|
Ok(mem)
|
|
}
|
|
|
|
pub fn batch_hash(&mut self, depth: u8, data: &[Hash]) -> Result<Vec<Hash>> {
|
|
CurrentContext::set_current(&self.context)?;
|
|
|
|
let n = data.len() / 2;
|
|
let mut in_data = DeviceBuffer::from_slice(data)?;
|
|
let mut out_data = unsafe { DeviceBuffer::<u8>::zeroed(n * 32 * 2)? };
|
|
|
|
unsafe {
|
|
// Launch the kernel again using the `function` form:
|
|
let function_name = CString::new("pedersen_hash")?;
|
|
let hash = self.hash_module.get_function(&function_name)?;
|
|
|
|
let blocks = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
let stream = &self.stream;
|
|
let result = launch!(hash<<<(blocks as u32, 1, 1), (THREADS_PER_BLOCK as u32, 1, 1), 1024, stream>>>(
|
|
n,
|
|
depth,
|
|
self.generators.as_device_ptr(),
|
|
in_data.as_device_ptr(),
|
|
out_data.as_device_ptr()
|
|
));
|
|
result?;
|
|
}
|
|
self.stream.synchronize()?;
|
|
|
|
let mut res = vec![0u8; n * 32 * 2];
|
|
out_data.copy_to(&mut res)?;
|
|
|
|
let mut p = vec![];
|
|
let mut q: Vec<AffinePoint> = vec![AffinePoint::default(); n];
|
|
for i in 0..n {
|
|
let b = i * 64;
|
|
let u = Fq::from_bytes(&res[b..b + 32].try_into().unwrap()).unwrap();
|
|
let z = Fq::from_bytes(&res[b + 32..b + 64].try_into().unwrap()).unwrap();
|
|
q[i].u = z;
|
|
p.push(u);
|
|
}
|
|
BatchInverter::invert_with_internal_scratch(&mut q, |q| &mut q.u, |q| &mut q.v);
|
|
let mut out = vec![];
|
|
for i in 0..n {
|
|
let hash: Hash = (p[i] * &q[i].u).to_bytes();
|
|
// println!("{} {} {} {}", i, hex::encode(&data[i * 2]), hex::encode(&data[i * 2 + 1]), hex::encode(&hash));
|
|
out.push(hash);
|
|
}
|
|
|
|
Ok(out)
|
|
}
|
|
}
|
|
|
|
pub struct CudaProcessor {
|
|
network: Network,
|
|
decrypted_blocks: Vec<DecryptedBlock>,
|
|
encrypted_data: Vec<u8>,
|
|
encrypted_data_device: DeviceBuffer<u8>,
|
|
ivk_device: DeviceBuffer<u8>,
|
|
decrypted_data: Vec<u8>,
|
|
n: usize,
|
|
block_count: usize,
|
|
}
|
|
|
|
impl CudaProcessor {
|
|
pub fn setup_decrypt(network: &Network, blocks: Vec<CompactBlock>) -> Result<Self> {
|
|
let m = CUDA_CONTEXT.lock().unwrap();
|
|
let cuda_context = m.as_ref().unwrap();
|
|
CurrentContext::set_current(&cuda_context.context).unwrap();
|
|
|
|
let decrypted_blocks = collect_nf(blocks)?;
|
|
|
|
let mut data_buffer = vec![];
|
|
let mut i = 0;
|
|
for db in decrypted_blocks.iter() {
|
|
let mut position_in_block = 0;
|
|
let b = &db.compact_block;
|
|
for tx in b.vtx.iter() {
|
|
for co in tx.outputs.iter() {
|
|
if !co.epk.is_empty() {
|
|
data_buffer.extend(&co.epk);
|
|
data_buffer.extend(&co.ciphertext);
|
|
data_buffer.extend(&u64::to_le_bytes(position_in_block as u64));
|
|
data_buffer.extend(&[0u8; 4]); // padding
|
|
i += 1;
|
|
}
|
|
position_in_block += 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
let n = i;
|
|
let block_count = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
let encrypted_data_device = unsafe { DeviceBuffer::uninitialized(data_buffer.len())? };
|
|
let ivk_device = unsafe { DeviceBuffer::zeroed(32)? };
|
|
|
|
let decrypted_data = vec![0u8; n * BUFFER_SIZE];
|
|
let this = CudaProcessor {
|
|
network: network.clone(),
|
|
decrypted_blocks,
|
|
encrypted_data: data_buffer,
|
|
encrypted_data_device: encrypted_data_device,
|
|
ivk_device: ivk_device,
|
|
decrypted_data,
|
|
n,
|
|
block_count,
|
|
};
|
|
Ok(this)
|
|
}
|
|
}
|
|
|
|
impl GPUProcessor for CudaProcessor {
|
|
fn decrypt_account(&mut self, ivk: &SaplingIvk) -> Result<()> {
|
|
log::info!("n = {}", self.n);
|
|
if self.n == 0 {
|
|
return Ok(());
|
|
}
|
|
let mut ivk_fr = ivk.0;
|
|
ivk_fr = ivk_fr.double(); // multiply by cofactor
|
|
ivk_fr = ivk_fr.double();
|
|
ivk_fr = ivk_fr.double();
|
|
|
|
self.encrypted_data_device.copy_from(&self.encrypted_data)?;
|
|
self.ivk_device.copy_from(&ivk_fr.to_bytes())?;
|
|
|
|
let m = CUDA_CONTEXT.lock().unwrap();
|
|
let cuda_context = m.as_ref().unwrap();
|
|
// decrypt all the blocks for the current account
|
|
unsafe {
|
|
// Launch the kernel again using the `function` form:
|
|
let function_name = CString::new("trial_decrypt_full").unwrap();
|
|
let trial_decrypt_full = cuda_context
|
|
.trial_decrypt_module
|
|
.get_function(&function_name)
|
|
.unwrap();
|
|
|
|
let stream = &cuda_context.stream;
|
|
let result = launch!(trial_decrypt_full<<<(self.block_count as u32, 1, 1), (THREADS_PER_BLOCK as u32, 1, 1), 0, stream>>>(
|
|
self.n,
|
|
self.ivk_device.as_device_ptr(),
|
|
self.encrypted_data_device.as_device_ptr()
|
|
));
|
|
result.unwrap();
|
|
}
|
|
cuda_context.stream.synchronize().unwrap();
|
|
|
|
self.encrypted_data_device
|
|
.copy_to(&mut self.decrypted_data)
|
|
.unwrap();
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn get_decrypted_blocks(self) -> Result<Vec<DecryptedBlock>> {
|
|
Ok(self.decrypted_blocks)
|
|
}
|
|
|
|
fn network(&self) -> Network {
|
|
self.network
|
|
}
|
|
|
|
fn buffer_stride() -> usize {
|
|
BUFFER_SIZE
|
|
}
|
|
|
|
fn borrow_buffers(&mut self) -> (&[u8], &mut [DecryptedBlock]) {
|
|
(
|
|
self.decrypted_data.as_slice(),
|
|
self.decrypted_blocks.as_mut_slice(),
|
|
)
|
|
}
|
|
}
|
|
|
|
#[derive(Default, Clone)]
|
|
struct AffinePoint {
|
|
u: Fq,
|
|
v: Fq,
|
|
}
|