KvStore - A data-store to support BlockTree (#2897)
* Mostly implement key-value store and add integration points Essential key-value store functionality is implemented, needs more work to be integrated, tested, and activated. Behind the `kvstore` feature.
This commit is contained in:
parent
3073ebb20d
commit
56b0ba2601
|
@ -1098,6 +1098,15 @@ dependencies = [
|
|||
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memmap"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.2.1"
|
||||
|
@ -1976,6 +1985,7 @@ dependencies = [
|
|||
"libc 0.2.50 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"matches 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"memmap 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"nix 0.13.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"rand_chacha 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
|
@ -3071,6 +3081,7 @@ dependencies = [
|
|||
"checksum matches 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08"
|
||||
"checksum memchr 2.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "db4c41318937f6e76648f42826b1d9ade5c09cafb5aef7e351240a70f39206e9"
|
||||
"checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff"
|
||||
"checksum memmap 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b"
|
||||
"checksum memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0f9dc261e2b62d7a622bf416ea3c5245cdd5d9a7fcc428c0d06804dfce1775b3"
|
||||
"checksum mime 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ba626b8a6de5da682e1caa06bdb42a335aee5a84db8e5046a3e8ab17ba0a3ae0"
|
||||
"checksum mime 0.3.13 (registry+https://github.com/rust-lang/crates.io-index)" = "3e27ca21f40a310bd06d9031785f4801710d566c184a6e15bad4f1d9b65f9425"
|
||||
|
|
|
@ -17,6 +17,7 @@ codecov = { repository = "solana-labs/solana", branch = "master", service = "git
|
|||
chacha = []
|
||||
cuda = []
|
||||
erasure = []
|
||||
kvstore = ["memmap"]
|
||||
|
||||
[dependencies]
|
||||
bincode = "1.1.2"
|
||||
|
@ -33,6 +34,7 @@ jsonrpc-pubsub = "10.1.0"
|
|||
jsonrpc-ws-server = "10.1.0"
|
||||
libc = "0.2.50"
|
||||
log = "0.4.2"
|
||||
memmap = { version = "0.7.0", optional = true }
|
||||
nix = "0.13.0"
|
||||
rand = "0.6.5"
|
||||
rand_chacha = "0.1.1"
|
||||
|
|
|
@ -0,0 +1,189 @@
|
|||
#![cfg(feature = "kvstore")]
|
||||
#![feature(test)]
|
||||
extern crate test;
|
||||
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use rand::{self, thread_rng, Rng};
|
||||
|
||||
use test::Bencher;
|
||||
|
||||
use solana::kvstore::{Config, Key, KvStore};
|
||||
|
||||
const SMALL_SIZE: usize = 512;
|
||||
const LARGE_SIZE: usize = 32 * 1024;
|
||||
const HUGE_SIZE: usize = 64 * 1024;
|
||||
|
||||
fn bench_write(bench: &mut Bencher, rows: &[(Key, Vec<u8>)], ledger_path: &str) {
|
||||
let store = KvStore::open_default(&ledger_path).unwrap();
|
||||
|
||||
bench.iter(move || {
|
||||
store.put_many(rows.iter()).expect("Failed to insert rows");
|
||||
});
|
||||
|
||||
teardown(&ledger_path);
|
||||
}
|
||||
|
||||
fn bench_write_partitioned(bench: &mut Bencher, rows: &[(Key, Vec<u8>)], ledger_path: &str) {
|
||||
let path = Path::new(ledger_path);
|
||||
let storage_dirs = (0..4)
|
||||
.map(|i| path.join(format!("parition-{}", i)))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let store = KvStore::partitioned(&ledger_path, &storage_dirs, Config::default()).unwrap();
|
||||
|
||||
bench.iter(move || {
|
||||
store.put_many(rows.iter()).expect("Failed to insert rows");
|
||||
});
|
||||
|
||||
teardown(&ledger_path);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
#[ignore]
|
||||
fn bench_write_small(bench: &mut Bencher) {
|
||||
let ledger_path = setup("bench_write_small");
|
||||
let num_entries = 32 * 1024;
|
||||
let rows = gen_pairs(SMALL_SIZE).take(num_entries).collect::<Vec<_>>();
|
||||
bench_write(bench, &rows, &ledger_path.to_string_lossy());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
#[ignore]
|
||||
fn bench_write_small_partitioned(bench: &mut Bencher) {
|
||||
let ledger_path = setup("bench_write_small_partitioned");
|
||||
let num_entries = 32 * 1024;
|
||||
let rows = gen_pairs(SMALL_SIZE).take(num_entries).collect::<Vec<_>>();
|
||||
bench_write_partitioned(bench, &rows, &ledger_path.to_string_lossy());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
#[ignore]
|
||||
fn bench_write_large(bench: &mut Bencher) {
|
||||
let ledger_path = setup("bench_write_large");
|
||||
let num_entries = 32 * 1024;
|
||||
let rows = gen_pairs(LARGE_SIZE).take(num_entries).collect::<Vec<_>>();
|
||||
bench_write(bench, &rows, &ledger_path.to_string_lossy());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
#[ignore]
|
||||
fn bench_write_huge(bench: &mut Bencher) {
|
||||
let ledger_path = setup("bench_write_huge");
|
||||
let num_entries = 32 * 1024;
|
||||
let rows = gen_pairs(HUGE_SIZE).take(num_entries).collect::<Vec<_>>();
|
||||
bench_write(bench, &rows, &ledger_path.to_string_lossy());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
#[ignore]
|
||||
fn bench_read_sequential(bench: &mut Bencher) {
|
||||
let ledger_path = setup("bench_read_sequential");
|
||||
let store = KvStore::open_default(&ledger_path).unwrap();
|
||||
|
||||
// Insert some big and small blobs into the ledger
|
||||
let num_small_blobs = 32 * 1024;
|
||||
let num_large_blobs = 32 * 1024;
|
||||
let total_blobs = num_small_blobs + num_large_blobs;
|
||||
|
||||
let small = gen_data(SMALL_SIZE).take(num_small_blobs);
|
||||
let large = gen_data(LARGE_SIZE).take(num_large_blobs);
|
||||
let rows = gen_seq_keys().zip(small.chain(large));
|
||||
|
||||
let _ = store.put_many(rows);
|
||||
|
||||
let num_reads = total_blobs / 15;
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
bench.iter(move || {
|
||||
// Generate random starting point in the range [0, total_blobs - 1], read num_reads blobs sequentially
|
||||
let start_index = rng.gen_range(0, num_small_blobs + num_large_blobs);
|
||||
for i in start_index..start_index + num_reads {
|
||||
let i = i as u64;
|
||||
let k = Key::from((i, i, i));
|
||||
let _ = store.get(&k);
|
||||
}
|
||||
});
|
||||
|
||||
teardown(&ledger_path);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
#[ignore]
|
||||
fn bench_read_random(bench: &mut Bencher) {
|
||||
let ledger_path = setup("bench_read_sequential");
|
||||
let store = KvStore::open_default(&ledger_path).unwrap();
|
||||
|
||||
// Insert some big and small blobs into the ledger
|
||||
let num_small_blobs = 32 * 1024;
|
||||
let num_large_blobs = 32 * 1024;
|
||||
let total_blobs = num_small_blobs + num_large_blobs;
|
||||
|
||||
let small = gen_data(SMALL_SIZE).take(num_small_blobs);
|
||||
let large = gen_data(LARGE_SIZE).take(num_large_blobs);
|
||||
let rows = gen_seq_keys().zip(small.chain(large));
|
||||
|
||||
let _ = store.put_many(rows);
|
||||
|
||||
let num_reads = total_blobs / 15;
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
// Generate a num_reads sized random sample of indexes in range [0, total_blobs - 1],
|
||||
// simulating random reads
|
||||
let indexes: Vec<u64> = (0..num_reads)
|
||||
.map(|_| rng.gen_range(0, total_blobs as u64))
|
||||
.collect();
|
||||
|
||||
bench.iter(move || {
|
||||
for &i in indexes.iter() {
|
||||
let i = i as u64;
|
||||
let k = Key::from((i, i, i));
|
||||
let _ = store.get(&k);
|
||||
}
|
||||
});
|
||||
|
||||
teardown(&ledger_path);
|
||||
}
|
||||
|
||||
fn setup(test_name: &str) -> PathBuf {
|
||||
let dir = Path::new("kvstore-bench").join(test_name);;
|
||||
|
||||
let _ig = fs::remove_dir_all(&dir);
|
||||
fs::create_dir_all(&dir).unwrap();
|
||||
|
||||
dir
|
||||
}
|
||||
|
||||
fn gen_seq_keys() -> impl Iterator<Item = Key> {
|
||||
let mut n = 0;
|
||||
|
||||
std::iter::repeat_with(move || {
|
||||
let key = Key::from((n, n, n));
|
||||
n += 1;
|
||||
|
||||
key
|
||||
})
|
||||
}
|
||||
|
||||
fn gen_keys() -> impl Iterator<Item = Key> {
|
||||
let mut rng = thread_rng();
|
||||
|
||||
std::iter::repeat_with(move || {
|
||||
let buf = rng.gen();
|
||||
|
||||
Key(buf)
|
||||
})
|
||||
}
|
||||
|
||||
fn gen_data(size: usize) -> impl Iterator<Item = Vec<u8>> {
|
||||
std::iter::repeat(vec![1u8; size])
|
||||
}
|
||||
|
||||
fn gen_pairs(data_size: usize) -> impl Iterator<Item = (Key, Vec<u8>)> {
|
||||
gen_keys().zip(gen_data(data_size))
|
||||
}
|
||||
|
||||
fn teardown<P: AsRef<Path>>(p: P) {
|
||||
KvStore::destroy(p).expect("Expect successful store destruction");
|
||||
}
|
|
@ -3,121 +3,81 @@
|
|||
//! access read to a persistent file-based ledger.
|
||||
|
||||
use crate::entry::Entry;
|
||||
#[cfg(feature = "kvstore")]
|
||||
use crate::kvstore;
|
||||
use crate::packet::{Blob, SharedBlob, BLOB_HEADER_SIZE};
|
||||
use crate::result::{Error, Result};
|
||||
|
||||
use bincode::{deserialize, serialize};
|
||||
use byteorder::{BigEndian, ByteOrder, ReadBytesExt};
|
||||
|
||||
use hashbrown::HashMap;
|
||||
use rocksdb::{
|
||||
ColumnFamily, ColumnFamilyDescriptor, DBRawIterator, IteratorMode, Options, WriteBatch, DB,
|
||||
};
|
||||
use serde::de::DeserializeOwned;
|
||||
|
||||
#[cfg(not(feature = "kvstore"))]
|
||||
use rocksdb;
|
||||
|
||||
use serde::Serialize;
|
||||
|
||||
use solana_sdk::genesis_block::GenesisBlock;
|
||||
use solana_sdk::hash::Hash;
|
||||
use solana_sdk::signature::{Keypair, KeypairUtil};
|
||||
use solana_sdk::timing::DEFAULT_TICKS_PER_SLOT;
|
||||
|
||||
use std::borrow::{Borrow, Cow};
|
||||
use std::cell::RefCell;
|
||||
use std::cmp;
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
use std::rc::Rc;
|
||||
use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
|
||||
use std::sync::Arc;
|
||||
|
||||
pub type BlocktreeRawIterator = rocksdb::DBRawIterator;
|
||||
mod db;
|
||||
#[cfg(feature = "kvstore")]
|
||||
mod kvs;
|
||||
#[cfg(not(feature = "kvstore"))]
|
||||
mod rocks;
|
||||
|
||||
#[cfg(feature = "kvstore")]
|
||||
use self::kvs::{DataCf, ErasureCf, Kvs, MetaCf};
|
||||
#[cfg(not(feature = "kvstore"))]
|
||||
use self::rocks::{DataCf, ErasureCf, MetaCf, Rocks};
|
||||
|
||||
pub use db::{
|
||||
Cursor, Database, IDataCf, IErasureCf, IMetaCf, IWriteBatch, LedgerColumnFamily,
|
||||
LedgerColumnFamilyRaw,
|
||||
};
|
||||
|
||||
#[cfg(not(feature = "kvstore"))]
|
||||
pub type BlocktreeRawIterator = <Rocks as Database>::Cursor;
|
||||
#[cfg(feature = "kvstore")]
|
||||
pub type BlocktreeRawIterator = <Kvs as Database>::Cursor;
|
||||
|
||||
#[cfg(not(feature = "kvstore"))]
|
||||
pub type WriteBatch = <Rocks as Database>::WriteBatch;
|
||||
#[cfg(feature = "kvstore")]
|
||||
pub type WriteBatch = <Kvs as Database>::WriteBatch;
|
||||
|
||||
#[cfg(not(feature = "kvstore"))]
|
||||
type KeyRef = <Rocks as Database>::KeyRef;
|
||||
#[cfg(feature = "kvstore")]
|
||||
type KeyRef = <Kvs as Database>::KeyRef;
|
||||
|
||||
#[cfg(not(feature = "kvstore"))]
|
||||
pub type Key = <Rocks as Database>::Key;
|
||||
#[cfg(feature = "kvstore")]
|
||||
pub type Key = <Kvs as Database>::Key;
|
||||
|
||||
#[cfg(not(feature = "kvstore"))]
|
||||
pub const BLOCKTREE_DIRECTORY: &str = "rocksdb";
|
||||
// A good value for this is the number of cores on the machine
|
||||
const TOTAL_THREADS: i32 = 8;
|
||||
const MAX_WRITE_BUFFER_SIZE: usize = 512 * 1024 * 1024;
|
||||
#[cfg(feature = "kvstore")]
|
||||
pub const BLOCKTREE_DIRECTORY: &str = "kvstore";
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum BlocktreeError {
|
||||
BlobForIndexExists,
|
||||
InvalidBlobData,
|
||||
RocksDb(rocksdb::Error),
|
||||
}
|
||||
|
||||
impl std::convert::From<rocksdb::Error> for Error {
|
||||
fn from(e: rocksdb::Error) -> Error {
|
||||
Error::BlocktreeError(BlocktreeError::RocksDb(e))
|
||||
}
|
||||
}
|
||||
|
||||
pub trait LedgerColumnFamily {
|
||||
type ValueType: DeserializeOwned + Serialize;
|
||||
|
||||
fn get(&self, key: &[u8]) -> Result<Option<Self::ValueType>> {
|
||||
let db = self.db();
|
||||
let data_bytes = db.get_cf(self.handle(), key)?;
|
||||
|
||||
if let Some(raw) = data_bytes {
|
||||
let result: Self::ValueType = deserialize(&raw)?;
|
||||
Ok(Some(result))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
fn get_bytes(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
|
||||
let db = self.db();
|
||||
let data_bytes = db.get_cf(self.handle(), key)?;
|
||||
Ok(data_bytes.map(|x| x.to_vec()))
|
||||
}
|
||||
|
||||
fn put_bytes(&self, key: &[u8], serialized_value: &[u8]) -> Result<()> {
|
||||
let db = self.db();
|
||||
db.put_cf(self.handle(), &key, &serialized_value)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn put(&self, key: &[u8], value: &Self::ValueType) -> Result<()> {
|
||||
let db = self.db();
|
||||
let serialized = serialize(value)?;
|
||||
db.put_cf(self.handle(), &key, &serialized)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete(&self, key: &[u8]) -> Result<()> {
|
||||
let db = self.db();
|
||||
db.delete_cf(self.handle(), &key)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn db(&self) -> &Arc<DB>;
|
||||
fn handle(&self) -> ColumnFamily;
|
||||
}
|
||||
|
||||
pub trait LedgerColumnFamilyRaw {
|
||||
fn get(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
|
||||
let db = self.db();
|
||||
let data_bytes = db.get_cf(self.handle(), key)?;
|
||||
Ok(data_bytes.map(|x| x.to_vec()))
|
||||
}
|
||||
|
||||
fn put(&self, key: &[u8], serialized_value: &[u8]) -> Result<()> {
|
||||
let db = self.db();
|
||||
db.put_cf(self.handle(), &key, &serialized_value)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete(&self, key: &[u8]) -> Result<()> {
|
||||
let db = self.db();
|
||||
db.delete_cf(self.handle(), &key)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn raw_iterator(&self) -> BlocktreeRawIterator {
|
||||
let db = self.db();
|
||||
db.raw_iterator_cf(self.handle())
|
||||
.expect("Expected to be able to open database iterator")
|
||||
}
|
||||
|
||||
fn handle(&self) -> ColumnFamily;
|
||||
fn db(&self) -> &Arc<DB>;
|
||||
#[cfg(feature = "kvstore")]
|
||||
KvsDb(kvstore::Error),
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
|
||||
|
@ -171,156 +131,13 @@ impl SlotMeta {
|
|||
}
|
||||
}
|
||||
|
||||
pub struct MetaCf {
|
||||
db: Arc<DB>,
|
||||
}
|
||||
|
||||
impl MetaCf {
|
||||
pub fn new(db: Arc<DB>) -> Self {
|
||||
MetaCf { db }
|
||||
}
|
||||
|
||||
pub fn key(slot: u64) -> Vec<u8> {
|
||||
let mut key = vec![0u8; 8];
|
||||
BigEndian::write_u64(&mut key[0..8], slot);
|
||||
key
|
||||
}
|
||||
|
||||
pub fn get_slot_meta(&self, slot: u64) -> Result<Option<SlotMeta>> {
|
||||
let key = Self::key(slot);
|
||||
self.get(&key)
|
||||
}
|
||||
|
||||
pub fn put_slot_meta(&self, slot: u64, slot_meta: &SlotMeta) -> Result<()> {
|
||||
let key = Self::key(slot);
|
||||
self.put(&key, slot_meta)
|
||||
}
|
||||
|
||||
pub fn index_from_key(key: &[u8]) -> Result<u64> {
|
||||
let mut rdr = io::Cursor::new(&key[..]);
|
||||
let index = rdr.read_u64::<BigEndian>()?;
|
||||
Ok(index)
|
||||
}
|
||||
}
|
||||
|
||||
impl LedgerColumnFamily for MetaCf {
|
||||
type ValueType = SlotMeta;
|
||||
|
||||
fn db(&self) -> &Arc<DB> {
|
||||
&self.db
|
||||
}
|
||||
|
||||
fn handle(&self) -> ColumnFamily {
|
||||
self.db.cf_handle(META_CF).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
// The data column family
|
||||
pub struct DataCf {
|
||||
db: Arc<DB>,
|
||||
}
|
||||
|
||||
impl DataCf {
|
||||
pub fn new(db: Arc<DB>) -> Self {
|
||||
DataCf { db }
|
||||
}
|
||||
|
||||
pub fn get_by_slot_index(&self, slot: u64, index: u64) -> Result<Option<Vec<u8>>> {
|
||||
let key = Self::key(slot, index);
|
||||
self.get(&key)
|
||||
}
|
||||
|
||||
pub fn delete_by_slot_index(&self, slot: u64, index: u64) -> Result<()> {
|
||||
let key = Self::key(slot, index);
|
||||
self.delete(&key)
|
||||
}
|
||||
|
||||
pub fn put_by_slot_index(&self, slot: u64, index: u64, serialized_value: &[u8]) -> Result<()> {
|
||||
let key = Self::key(slot, index);
|
||||
self.put(&key, serialized_value)
|
||||
}
|
||||
|
||||
pub fn key(slot: u64, index: u64) -> Vec<u8> {
|
||||
let mut key = vec![0u8; 16];
|
||||
BigEndian::write_u64(&mut key[0..8], slot);
|
||||
BigEndian::write_u64(&mut key[8..16], index);
|
||||
key
|
||||
}
|
||||
|
||||
pub fn slot_from_key(key: &[u8]) -> Result<u64> {
|
||||
let mut rdr = io::Cursor::new(&key[0..8]);
|
||||
let height = rdr.read_u64::<BigEndian>()?;
|
||||
Ok(height)
|
||||
}
|
||||
|
||||
pub fn index_from_key(key: &[u8]) -> Result<u64> {
|
||||
let mut rdr = io::Cursor::new(&key[8..16]);
|
||||
let index = rdr.read_u64::<BigEndian>()?;
|
||||
Ok(index)
|
||||
}
|
||||
}
|
||||
|
||||
impl LedgerColumnFamilyRaw for DataCf {
|
||||
fn db(&self) -> &Arc<DB> {
|
||||
&self.db
|
||||
}
|
||||
|
||||
fn handle(&self) -> ColumnFamily {
|
||||
self.db.cf_handle(DATA_CF).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
// The erasure column family
|
||||
pub struct ErasureCf {
|
||||
db: Arc<DB>,
|
||||
}
|
||||
|
||||
impl ErasureCf {
|
||||
pub fn new(db: Arc<DB>) -> Self {
|
||||
ErasureCf { db }
|
||||
}
|
||||
pub fn delete_by_slot_index(&self, slot: u64, index: u64) -> Result<()> {
|
||||
let key = Self::key(slot, index);
|
||||
self.delete(&key)
|
||||
}
|
||||
|
||||
pub fn get_by_slot_index(&self, slot: u64, index: u64) -> Result<Option<Vec<u8>>> {
|
||||
let key = Self::key(slot, index);
|
||||
self.get(&key)
|
||||
}
|
||||
|
||||
pub fn put_by_slot_index(&self, slot: u64, index: u64, serialized_value: &[u8]) -> Result<()> {
|
||||
let key = Self::key(slot, index);
|
||||
self.put(&key, serialized_value)
|
||||
}
|
||||
|
||||
pub fn key(slot: u64, index: u64) -> Vec<u8> {
|
||||
DataCf::key(slot, index)
|
||||
}
|
||||
|
||||
pub fn slot_from_key(key: &[u8]) -> Result<u64> {
|
||||
DataCf::slot_from_key(key)
|
||||
}
|
||||
|
||||
pub fn index_from_key(key: &[u8]) -> Result<u64> {
|
||||
DataCf::index_from_key(key)
|
||||
}
|
||||
}
|
||||
|
||||
impl LedgerColumnFamilyRaw for ErasureCf {
|
||||
fn db(&self) -> &Arc<DB> {
|
||||
&self.db
|
||||
}
|
||||
|
||||
fn handle(&self) -> ColumnFamily {
|
||||
self.db.cf_handle(ERASURE_CF).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
// ledger window
|
||||
pub struct Blocktree {
|
||||
// Underlying database is automatically closed in the Drop implementation of DB
|
||||
db: Arc<DB>,
|
||||
#[cfg(not(feature = "kvstore"))]
|
||||
db: Arc<Rocks>,
|
||||
#[cfg(feature = "kvstore")]
|
||||
db: Arc<Kvs>,
|
||||
meta_cf: MetaCf,
|
||||
data_cf: DataCf,
|
||||
erasure_cf: ErasureCf,
|
||||
|
@ -336,47 +153,6 @@ pub const DATA_CF: &str = "data";
|
|||
pub const ERASURE_CF: &str = "erasure";
|
||||
|
||||
impl Blocktree {
|
||||
// Opens a Ledger in directory, provides "infinite" window of blobs
|
||||
pub fn open(ledger_path: &str) -> Result<Self> {
|
||||
fs::create_dir_all(&ledger_path)?;
|
||||
let ledger_path = Path::new(ledger_path).join(BLOCKTREE_DIRECTORY);
|
||||
|
||||
// Use default database options
|
||||
let db_options = Self::get_db_options();
|
||||
|
||||
// Column family names
|
||||
let meta_cf_descriptor = ColumnFamilyDescriptor::new(META_CF, Self::get_cf_options());
|
||||
let data_cf_descriptor = ColumnFamilyDescriptor::new(DATA_CF, Self::get_cf_options());
|
||||
let erasure_cf_descriptor = ColumnFamilyDescriptor::new(ERASURE_CF, Self::get_cf_options());
|
||||
let cfs = vec![
|
||||
meta_cf_descriptor,
|
||||
data_cf_descriptor,
|
||||
erasure_cf_descriptor,
|
||||
];
|
||||
|
||||
// Open the database
|
||||
let db = Arc::new(DB::open_cf_descriptors(&db_options, ledger_path, cfs)?);
|
||||
|
||||
// Create the metadata column family
|
||||
let meta_cf = MetaCf::new(db.clone());
|
||||
|
||||
// Create the data column family
|
||||
let data_cf = DataCf::new(db.clone());
|
||||
|
||||
// Create the erasure column family
|
||||
let erasure_cf = ErasureCf::new(db.clone());
|
||||
|
||||
let ticks_per_slot = DEFAULT_TICKS_PER_SLOT;
|
||||
Ok(Blocktree {
|
||||
db,
|
||||
meta_cf,
|
||||
data_cf,
|
||||
erasure_cf,
|
||||
new_blobs_signals: vec![],
|
||||
ticks_per_slot,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn open_with_signal(ledger_path: &str) -> Result<(Self, Receiver<bool>)> {
|
||||
let mut blocktree = Self::open(ledger_path)?;
|
||||
let (signal_sender, signal_receiver) = sync_channel(1);
|
||||
|
@ -422,14 +198,6 @@ impl Blocktree {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
pub fn destroy(ledger_path: &str) -> Result<()> {
|
||||
// DB::destroy() fails if `ledger_path` doesn't exist
|
||||
fs::create_dir_all(&ledger_path)?;
|
||||
let ledger_path = Path::new(ledger_path).join(BLOCKTREE_DIRECTORY);
|
||||
DB::destroy(&Options::default(), &ledger_path)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_next_slot(&self, slot: u64) -> Result<Option<u64>> {
|
||||
let mut db_iterator = self.db.raw_iterator_cf(self.meta_cf.handle())?;
|
||||
db_iterator.seek(&MetaCf::key(slot + 1));
|
||||
|
@ -526,7 +294,7 @@ impl Blocktree {
|
|||
I: IntoIterator,
|
||||
I::Item: Borrow<Blob>,
|
||||
{
|
||||
let mut write_batch = WriteBatch::default();
|
||||
let mut write_batch = self.db.batch()?;
|
||||
// A map from slot to a 2-tuple of metadata: (working copy, backup copy),
|
||||
// so we can detect changes to the slot metadata later
|
||||
let mut slot_meta_working_set = HashMap::new();
|
||||
|
@ -672,24 +440,6 @@ impl Blocktree {
|
|||
Ok((total_blobs, total_current_size as u64))
|
||||
}
|
||||
|
||||
/// Return an iterator for all the entries in the given file.
|
||||
pub fn read_ledger(&self) -> Result<impl Iterator<Item = Entry>> {
|
||||
let mut db_iterator = self.db.raw_iterator_cf(self.data_cf.handle())?;
|
||||
|
||||
db_iterator.seek_to_first();
|
||||
Ok(EntryIterator {
|
||||
db_iterator,
|
||||
blockhash: None,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn read_ledger_blobs(&self) -> impl Iterator<Item = Blob> {
|
||||
self.db
|
||||
.iterator_cf(self.data_cf.handle(), IteratorMode::Start)
|
||||
.unwrap()
|
||||
.map(|(_, blob_data)| Blob::new(&blob_data))
|
||||
}
|
||||
|
||||
pub fn get_coding_blob_bytes(&self, slot: u64, index: u64) -> Result<Option<Vec<u8>>> {
|
||||
self.erasure_cf.get_by_slot_index(slot, index)
|
||||
}
|
||||
|
@ -703,7 +453,7 @@ impl Blocktree {
|
|||
self.erasure_cf.put_by_slot_index(slot, index, bytes)
|
||||
}
|
||||
|
||||
pub fn put_data_raw(&self, key: &[u8], value: &[u8]) -> Result<()> {
|
||||
pub fn put_data_raw(&self, key: &KeyRef, value: &[u8]) -> Result<()> {
|
||||
self.data_cf.put(key, value)
|
||||
}
|
||||
|
||||
|
@ -738,9 +488,9 @@ impl Blocktree {
|
|||
slot: u64,
|
||||
start_index: u64,
|
||||
end_index: u64,
|
||||
key: &dyn Fn(u64, u64) -> Vec<u8>,
|
||||
slot_from_key: &dyn Fn(&[u8]) -> Result<u64>,
|
||||
index_from_key: &dyn Fn(&[u8]) -> Result<u64>,
|
||||
key: &dyn Fn(u64, u64) -> Key,
|
||||
slot_from_key: &dyn Fn(&KeyRef) -> Result<u64>,
|
||||
index_from_key: &dyn Fn(&KeyRef) -> Result<u64>,
|
||||
max_missing: usize,
|
||||
) -> Vec<u64> {
|
||||
if start_index >= end_index || max_missing == 0 {
|
||||
|
@ -897,27 +647,6 @@ impl Blocktree {
|
|||
.collect()
|
||||
}
|
||||
|
||||
fn get_cf_options() -> Options {
|
||||
let mut options = Options::default();
|
||||
options.set_max_write_buffer_number(32);
|
||||
options.set_write_buffer_size(MAX_WRITE_BUFFER_SIZE);
|
||||
options.set_max_bytes_for_level_base(MAX_WRITE_BUFFER_SIZE as u64);
|
||||
options
|
||||
}
|
||||
|
||||
fn get_db_options() -> Options {
|
||||
let mut options = Options::default();
|
||||
options.create_if_missing(true);
|
||||
options.create_missing_column_families(true);
|
||||
options.increase_parallelism(TOTAL_THREADS);
|
||||
options.set_max_background_flushes(4);
|
||||
options.set_max_background_compactions(4);
|
||||
options.set_max_write_buffer_number(32);
|
||||
options.set_write_buffer_size(MAX_WRITE_BUFFER_SIZE);
|
||||
options.set_max_bytes_for_level_base(MAX_WRITE_BUFFER_SIZE as u64);
|
||||
options
|
||||
}
|
||||
|
||||
fn slot_has_updates(slot_meta: &SlotMeta, slot_meta_backup: &Option<SlotMeta>) -> bool {
|
||||
// We should signal that there are updates if we extended the chain of consecutive blocks starting
|
||||
// from block 0, which is true iff:
|
||||
|
@ -1204,7 +933,7 @@ impl Blocktree {
|
|||
bootstrap_meta.received = last.index() + 1;
|
||||
bootstrap_meta.is_rooted = true;
|
||||
|
||||
let mut batch = WriteBatch::default();
|
||||
let mut batch = self.db.batch()?;
|
||||
batch.put_cf(
|
||||
self.meta_cf.handle(),
|
||||
&meta_key,
|
||||
|
@ -1220,45 +949,6 @@ impl Blocktree {
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: all this goes away with Blocktree
|
||||
struct EntryIterator {
|
||||
db_iterator: DBRawIterator,
|
||||
|
||||
// TODO: remove me when replay_stage is iterating by block (Blocktree)
|
||||
// this verification is duplicating that of replay_stage, which
|
||||
// can do this in parallel
|
||||
blockhash: Option<Hash>,
|
||||
// https://github.com/rust-rocksdb/rust-rocksdb/issues/234
|
||||
// rocksdb issue: the _blocktree member must be lower in the struct to prevent a crash
|
||||
// when the db_iterator member above is dropped.
|
||||
// _blocktree is unused, but dropping _blocktree results in a broken db_iterator
|
||||
// you have to hold the database open in order to iterate over it, and in order
|
||||
// for db_iterator to be able to run Drop
|
||||
// _blocktree: Blocktree,
|
||||
}
|
||||
|
||||
impl Iterator for EntryIterator {
|
||||
type Item = Entry;
|
||||
|
||||
fn next(&mut self) -> Option<Entry> {
|
||||
if self.db_iterator.valid() {
|
||||
if let Some(value) = self.db_iterator.value() {
|
||||
if let Ok(entry) = deserialize::<Entry>(&value[BLOB_HEADER_SIZE..]) {
|
||||
if let Some(blockhash) = self.blockhash {
|
||||
if !entry.verify(&blockhash) {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
self.db_iterator.next();
|
||||
self.blockhash = Some(entry.hash);
|
||||
return Some(entry);
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// Creates a new ledger with slot 0 full of ticks (and only ticks).
|
||||
//
|
||||
// Returns the blockhash that can be used to append entries with.
|
||||
|
|
|
@ -0,0 +1,195 @@
|
|||
use crate::entry::Entry;
|
||||
use crate::result::{Error, Result};
|
||||
|
||||
use bincode::{deserialize, serialize};
|
||||
|
||||
use serde::de::DeserializeOwned;
|
||||
use serde::Serialize;
|
||||
|
||||
use std::borrow::Borrow;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub trait Database: Sized + Send + Sync {
|
||||
type Error: Into<Error>;
|
||||
type Key: Borrow<Self::KeyRef>;
|
||||
type KeyRef: ?Sized;
|
||||
type ColumnFamily;
|
||||
type Cursor: Cursor<Self>;
|
||||
type EntryIter: Iterator<Item = Entry>;
|
||||
type WriteBatch: IWriteBatch<Self>;
|
||||
|
||||
fn cf_handle(&self, cf: &str) -> Option<Self::ColumnFamily>;
|
||||
|
||||
fn get_cf(&self, cf: Self::ColumnFamily, key: &Self::KeyRef) -> Result<Option<Vec<u8>>>;
|
||||
|
||||
fn put_cf(&self, cf: Self::ColumnFamily, key: &Self::KeyRef, data: &[u8]) -> Result<()>;
|
||||
|
||||
fn delete_cf(&self, cf: Self::ColumnFamily, key: &Self::KeyRef) -> Result<()>;
|
||||
|
||||
fn raw_iterator_cf(&self, cf: Self::ColumnFamily) -> Result<Self::Cursor>;
|
||||
|
||||
fn write(&self, batch: Self::WriteBatch) -> Result<()>;
|
||||
|
||||
fn batch(&self) -> Result<Self::WriteBatch>;
|
||||
}
|
||||
|
||||
pub trait Cursor<D: Database> {
|
||||
fn valid(&self) -> bool;
|
||||
|
||||
fn seek(&mut self, key: &D::KeyRef);
|
||||
|
||||
fn seek_to_first(&mut self);
|
||||
|
||||
fn next(&mut self);
|
||||
|
||||
fn key(&self) -> Option<D::Key>;
|
||||
|
||||
fn value(&self) -> Option<Vec<u8>>;
|
||||
}
|
||||
|
||||
pub trait IWriteBatch<D: Database> {
|
||||
fn put_cf(&mut self, cf: D::ColumnFamily, key: &D::KeyRef, data: &[u8]) -> Result<()>;
|
||||
}
|
||||
|
||||
pub trait IDataCf<D: Database>: LedgerColumnFamilyRaw<D> {
|
||||
fn new(db: Arc<D>) -> Self;
|
||||
|
||||
fn get_by_slot_index(&self, slot: u64, index: u64) -> Result<Option<Vec<u8>>> {
|
||||
let key = Self::key(slot, index);
|
||||
self.get(key.borrow())
|
||||
}
|
||||
|
||||
fn delete_by_slot_index(&self, slot: u64, index: u64) -> Result<()> {
|
||||
let key = Self::key(slot, index);
|
||||
self.delete(&key.borrow())
|
||||
}
|
||||
|
||||
fn put_by_slot_index(&self, slot: u64, index: u64, serialized_value: &[u8]) -> Result<()> {
|
||||
let key = Self::key(slot, index);
|
||||
self.put(key.borrow(), serialized_value)
|
||||
}
|
||||
|
||||
fn key(slot: u64, index: u64) -> D::Key;
|
||||
|
||||
fn slot_from_key(key: &D::KeyRef) -> Result<u64>;
|
||||
|
||||
fn index_from_key(key: &D::KeyRef) -> Result<u64>;
|
||||
}
|
||||
|
||||
pub trait IErasureCf<D: Database>: LedgerColumnFamilyRaw<D> {
|
||||
fn new(db: Arc<D>) -> Self;
|
||||
|
||||
fn delete_by_slot_index(&self, slot: u64, index: u64) -> Result<()> {
|
||||
let key = Self::key(slot, index);
|
||||
self.delete(key.borrow())
|
||||
}
|
||||
|
||||
fn get_by_slot_index(&self, slot: u64, index: u64) -> Result<Option<Vec<u8>>> {
|
||||
let key = Self::key(slot, index);
|
||||
self.get(key.borrow())
|
||||
}
|
||||
|
||||
fn put_by_slot_index(&self, slot: u64, index: u64, serialized_value: &[u8]) -> Result<()> {
|
||||
let key = Self::key(slot, index);
|
||||
self.put(key.borrow(), serialized_value)
|
||||
}
|
||||
|
||||
fn key(slot: u64, index: u64) -> D::Key;
|
||||
|
||||
fn slot_from_key(key: &D::KeyRef) -> Result<u64>;
|
||||
|
||||
fn index_from_key(key: &D::KeyRef) -> Result<u64>;
|
||||
}
|
||||
|
||||
pub trait IMetaCf<D: Database>: LedgerColumnFamily<D, ValueType = super::SlotMeta> {
|
||||
fn new(db: Arc<D>) -> Self;
|
||||
|
||||
fn key(slot: u64) -> D::Key;
|
||||
|
||||
fn get_slot_meta(&self, slot: u64) -> Result<Option<super::SlotMeta>> {
|
||||
let key = Self::key(slot);
|
||||
self.get(key.borrow())
|
||||
}
|
||||
|
||||
fn put_slot_meta(&self, slot: u64, slot_meta: &super::SlotMeta) -> Result<()> {
|
||||
let key = Self::key(slot);
|
||||
self.put(key.borrow(), slot_meta)
|
||||
}
|
||||
|
||||
fn index_from_key(key: &D::KeyRef) -> Result<u64>;
|
||||
}
|
||||
|
||||
pub trait LedgerColumnFamily<D: Database> {
|
||||
type ValueType: DeserializeOwned + Serialize;
|
||||
|
||||
fn get(&self, key: &D::KeyRef) -> Result<Option<Self::ValueType>> {
|
||||
let db = self.db();
|
||||
let data_bytes = db.get_cf(self.handle(), key)?;
|
||||
|
||||
if let Some(raw) = data_bytes {
|
||||
let result: Self::ValueType = deserialize(&raw)?;
|
||||
Ok(Some(result))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
fn get_bytes(&self, key: &D::KeyRef) -> Result<Option<Vec<u8>>> {
|
||||
let db = self.db();
|
||||
let data_bytes = db.get_cf(self.handle(), key)?;
|
||||
Ok(data_bytes.map(|x| x.to_vec()))
|
||||
}
|
||||
|
||||
fn put_bytes(&self, key: &D::KeyRef, serialized_value: &[u8]) -> Result<()> {
|
||||
let db = self.db();
|
||||
db.put_cf(self.handle(), key, &serialized_value)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn put(&self, key: &D::KeyRef, value: &Self::ValueType) -> Result<()> {
|
||||
let db = self.db();
|
||||
let serialized = serialize(value)?;
|
||||
db.put_cf(self.handle(), key, &serialized)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete(&self, key: &D::KeyRef) -> Result<()> {
|
||||
let db = self.db();
|
||||
db.delete_cf(self.handle(), key)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn db(&self) -> &Arc<D>;
|
||||
|
||||
fn handle(&self) -> D::ColumnFamily;
|
||||
}
|
||||
|
||||
pub trait LedgerColumnFamilyRaw<D: Database> {
|
||||
fn get(&self, key: &D::KeyRef) -> Result<Option<Vec<u8>>> {
|
||||
let db = self.db();
|
||||
let data_bytes = db.get_cf(self.handle(), key)?;
|
||||
Ok(data_bytes.map(|x| x.to_vec()))
|
||||
}
|
||||
|
||||
fn put(&self, key: &D::KeyRef, serialized_value: &[u8]) -> Result<()> {
|
||||
let db = self.db();
|
||||
db.put_cf(self.handle(), &key, &serialized_value)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete(&self, key: &D::KeyRef) -> Result<()> {
|
||||
let db = self.db();
|
||||
db.delete_cf(self.handle(), &key)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn raw_iterator(&self) -> D::Cursor {
|
||||
let db = self.db();
|
||||
db.raw_iterator_cf(self.handle())
|
||||
.expect("Expected to be able to open database iterator")
|
||||
}
|
||||
|
||||
fn handle(&self) -> D::ColumnFamily;
|
||||
|
||||
fn db(&self) -> &Arc<D>;
|
||||
}
|
|
@ -0,0 +1,265 @@
|
|||
use crate::entry::Entry;
|
||||
use crate::kvstore::{self, Key};
|
||||
use crate::packet::Blob;
|
||||
use crate::result::{Error, Result};
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::db::{
|
||||
Cursor, Database, IDataCf, IErasureCf, IMetaCf, IWriteBatch, LedgerColumnFamily,
|
||||
LedgerColumnFamilyRaw,
|
||||
};
|
||||
use super::{Blocktree, BlocktreeError};
|
||||
|
||||
/// Placeholder handle for the KvStore-backed database backend.
/// TODO: wire this up to `kvstore::KvStore` once integration lands.
#[derive(Debug)]
pub struct Kvs(());

/// The metadata column family
#[derive(Debug)]
pub struct MetaCf {
    db: Arc<Kvs>,
}

/// The data column family
#[derive(Debug)]
pub struct DataCf {
    db: Arc<Kvs>,
}

/// The erasure column family
#[derive(Debug)]
pub struct ErasureCf {
    db: Arc<Kvs>,
}

/// Dummy struct to get things compiling
/// TODO: all this goes away with Blocktree
pub struct EntryIterator(i32);
/// Dummy struct to get things compiling
pub struct KvsCursor;
/// Dummy struct to get things compiling
pub struct ColumnFamily;
/// Dummy struct to get things compiling
pub struct KvsWriteBatch;
|
||||
|
||||
/// KvStore-backed `Blocktree` entry points. All stubs for now.
impl Blocktree {
    /// Opens a Ledger in directory, provides "infinite" window of blobs
    /// TODO: not yet implemented for the KvStore backend.
    pub fn open(_ledger_path: &str) -> Result<Blocktree> {
        unimplemented!()
    }

    /// Returns an iterator over every blob in the ledger.
    /// The trailing expression exists only to pin the opaque return type;
    /// it is unreachable because of the `unimplemented!()` above.
    #[allow(unreachable_code)]
    pub fn read_ledger_blobs(&self) -> impl Iterator<Item = Blob> {
        unimplemented!();
        self.read_ledger().unwrap().map(|_| Blob::new(&[]))
    }

    /// Return an iterator for all the entries in the given file.
    /// The `unimplemented!()` inside the constructor makes this a stub
    /// while still fixing the opaque iterator type.
    #[allow(unreachable_code)]
    pub fn read_ledger(&self) -> Result<impl Iterator<Item = Entry>> {
        Ok(EntryIterator(unimplemented!()))
    }

    /// Removes the ledger at `_ledger_path`. TODO: not yet implemented.
    pub fn destroy(_ledger_path: &str) -> Result<()> {
        unimplemented!()
    }
}
|
||||
|
||||
/// `Database` plumbing for the KvStore backend.
/// Every method is currently a stub; this impl exists so the generic
/// ledger code compiles against `Kvs` behind the `kvstore` feature.
impl Database for Kvs {
    type Error = kvstore::Error;
    type Key = Key;
    // KvStore keys are small owned values, so the "borrowed" key form is
    // the same type as the owned one.
    type KeyRef = Key;
    type ColumnFamily = ColumnFamily;
    type Cursor = KvsCursor;
    type EntryIter = EntryIterator;
    type WriteBatch = KvsWriteBatch;

    fn cf_handle(&self, _cf: &str) -> Option<ColumnFamily> {
        unimplemented!()
    }

    fn get_cf(&self, _cf: ColumnFamily, _key: &Key) -> Result<Option<Vec<u8>>> {
        unimplemented!()
    }

    fn put_cf(&self, _cf: ColumnFamily, _key: &Key, _data: &[u8]) -> Result<()> {
        unimplemented!()
    }

    fn delete_cf(&self, _cf: Self::ColumnFamily, _key: &Key) -> Result<()> {
        unimplemented!()
    }

    fn raw_iterator_cf(&self, _cf: Self::ColumnFamily) -> Result<Self::Cursor> {
        unimplemented!()
    }

    fn write(&self, _batch: Self::WriteBatch) -> Result<()> {
        unimplemented!()
    }

    fn batch(&self) -> Result<Self::WriteBatch> {
        unimplemented!()
    }
}
|
||||
|
||||
/// Stub cursor for the KvStore backend; every operation is unimplemented.
impl Cursor<Kvs> for KvsCursor {
    fn valid(&self) -> bool {
        unimplemented!()
    }

    fn seek(&mut self, _key: &Key) {
        unimplemented!()
    }

    fn seek_to_first(&mut self) {
        unimplemented!()
    }

    fn next(&mut self) {
        unimplemented!()
    }

    fn key(&self) -> Option<Key> {
        unimplemented!()
    }

    fn value(&self) -> Option<Vec<u8>> {
        unimplemented!()
    }
}
|
||||
|
||||
/// Stub write-batch for the KvStore backend.
impl IWriteBatch<Kvs> for KvsWriteBatch {
    fn put_cf(&mut self, _cf: ColumnFamily, _key: &Key, _data: &[u8]) -> Result<()> {
        unimplemented!()
    }
}
|
||||
|
||||
/// Stubbed data column family for the KvStore backend; only construction
/// works so far.
impl IDataCf<Kvs> for DataCf {
    fn new(db: Arc<Kvs>) -> Self {
        DataCf { db }
    }

    fn get_by_slot_index(&self, _slot: u64, _index: u64) -> Result<Option<Vec<u8>>> {
        unimplemented!()
    }

    fn delete_by_slot_index(&self, _slot: u64, _index: u64) -> Result<()> {
        unimplemented!()
    }

    fn put_by_slot_index(&self, _slot: u64, _index: u64, _serialized_value: &[u8]) -> Result<()> {
        unimplemented!()
    }

    fn key(_slot: u64, _index: u64) -> Key {
        unimplemented!()
    }

    fn slot_from_key(_key: &Key) -> Result<u64> {
        unimplemented!()
    }

    fn index_from_key(_key: &Key) -> Result<u64> {
        unimplemented!()
    }
}
|
||||
|
||||
/// Stubbed erasure column family for the KvStore backend. Key handling
/// delegates to `DataCf` so both families share one key layout.
impl IErasureCf<Kvs> for ErasureCf {
    fn new(db: Arc<Kvs>) -> Self {
        ErasureCf { db }
    }

    fn delete_by_slot_index(&self, _slot: u64, _index: u64) -> Result<()> {
        unimplemented!()
    }

    fn get_by_slot_index(&self, _slot: u64, _index: u64) -> Result<Option<Vec<u8>>> {
        unimplemented!()
    }

    fn put_by_slot_index(&self, _slot: u64, _index: u64, _serialized_value: &[u8]) -> Result<()> {
        unimplemented!()
    }

    // Delegated: erasure keys use the same (slot, index) encoding as data
    // keys, which is itself still a stub on this backend.
    fn key(slot: u64, index: u64) -> Key {
        DataCf::key(slot, index)
    }

    fn slot_from_key(key: &Key) -> Result<u64> {
        DataCf::slot_from_key(key)
    }

    fn index_from_key(key: &Key) -> Result<u64> {
        DataCf::index_from_key(key)
    }
}
|
||||
|
||||
/// Stubbed metadata column family for the KvStore backend.
impl IMetaCf<Kvs> for MetaCf {
    fn new(db: Arc<Kvs>) -> Self {
        MetaCf { db }
    }

    fn key(_slot: u64) -> Key {
        unimplemented!()
    }

    fn get_slot_meta(&self, _slot: u64) -> Result<Option<super::SlotMeta>> {
        unimplemented!()
    }

    fn put_slot_meta(&self, _slot: u64, _slot_meta: &super::SlotMeta) -> Result<()> {
        unimplemented!()
    }

    fn index_from_key(_key: &Key) -> Result<u64> {
        unimplemented!()
    }
}
|
||||
|
||||
impl LedgerColumnFamilyRaw<Kvs> for DataCf {
    fn db(&self) -> &Arc<Kvs> {
        &self.db
    }

    // Panics today: `Kvs::cf_handle` is still a stub.
    fn handle(&self) -> ColumnFamily {
        self.db.cf_handle(super::DATA_CF).unwrap()
    }
}
|
||||
|
||||
impl LedgerColumnFamilyRaw<Kvs> for ErasureCf {
    fn db(&self) -> &Arc<Kvs> {
        &self.db
    }

    // Panics today: `Kvs::cf_handle` is still a stub.
    fn handle(&self) -> ColumnFamily {
        self.db.cf_handle(super::ERASURE_CF).unwrap()
    }
}
|
||||
|
||||
impl LedgerColumnFamily<Kvs> for MetaCf {
    // Slot metadata is the typed value stored in this family.
    type ValueType = super::SlotMeta;

    fn db(&self) -> &Arc<Kvs> {
        &self.db
    }

    // Panics today: `Kvs::cf_handle` is still a stub.
    fn handle(&self) -> ColumnFamily {
        self.db.cf_handle(super::META_CF).unwrap()
    }
}
|
||||
|
||||
/// Lets `?` lift KvStore errors into the crate-wide `Error` type.
impl std::convert::From<kvstore::Error> for Error {
    fn from(e: kvstore::Error) -> Error {
        Error::BlocktreeError(BlocktreeError::KvsDb(e))
    }
}
|
||||
|
||||
/// TODO: all this goes away with Blocktree
/// Stub entry iterator for the KvStore backend.
impl Iterator for EntryIterator {
    type Item = Entry;

    fn next(&mut self) -> Option<Entry> {
        unimplemented!()
    }
}
|
|
@ -0,0 +1,400 @@
|
|||
use crate::entry::Entry;
|
||||
use crate::packet::{Blob, BLOB_HEADER_SIZE};
|
||||
use crate::result::{Error, Result};
|
||||
|
||||
use bincode::deserialize;
|
||||
|
||||
use byteorder::{BigEndian, ByteOrder, ReadBytesExt};
|
||||
|
||||
use rocksdb::{
|
||||
self, ColumnFamily, ColumnFamilyDescriptor, DBRawIterator, IteratorMode, Options,
|
||||
WriteBatch as RWriteBatch, DB,
|
||||
};
|
||||
|
||||
use solana_sdk::hash::Hash;
|
||||
use solana_sdk::timing::DEFAULT_TICKS_PER_SLOT;
|
||||
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::db::{
|
||||
Cursor, Database, IDataCf, IErasureCf, IMetaCf, IWriteBatch, LedgerColumnFamily,
|
||||
LedgerColumnFamilyRaw,
|
||||
};
|
||||
use super::{Blocktree, BlocktreeError};
|
||||
|
||||
// A good value for this is the number of cores on the machine
|
||||
const TOTAL_THREADS: i32 = 8;
|
||||
const MAX_WRITE_BUFFER_SIZE: usize = 512 * 1024 * 1024;
|
||||
|
||||
/// Newtype wrapper over a RocksDB handle implementing the generic
/// `Database` interface.
#[derive(Debug)]
pub struct Rocks(rocksdb::DB);

/// The metadata column family
#[derive(Debug)]
pub struct MetaCf {
    db: Arc<Rocks>,
}

/// The data column family
#[derive(Debug)]
pub struct DataCf {
    db: Arc<Rocks>,
}

/// The erasure column family
#[derive(Debug)]
pub struct ErasureCf {
    db: Arc<Rocks>,
}

/// TODO: all this goes away with Blocktree
pub struct EntryIterator {
    db_iterator: DBRawIterator,

    // TODO: remove me when replay_stage is iterating by block (Blocktree)
    // this verification is duplicating that of replay_stage, which
    // can do this in parallel
    //
    // Hash of the previously yielded entry, used to verify the chain.
    blockhash: Option<Hash>,
    // https://github.com/rust-rocksdb/rust-rocksdb/issues/234
    // rocksdb issue: the _blocktree member must be lower in the struct to prevent a crash
    // when the db_iterator member above is dropped.
    // _blocktree is unused, but dropping _blocktree results in a broken db_iterator
    // you have to hold the database open in order to iterate over it, and in order
    // for db_iterator to be able to run Drop
    //    _blocktree: Blocktree,
}
|
||||
|
||||
impl Blocktree {
    /// Opens a Ledger in directory, provides "infinite" window of blobs
    ///
    /// Creates the directory and the meta/data/erasure column families if
    /// they do not exist yet.
    pub fn open(ledger_path: &str) -> Result<Blocktree> {
        fs::create_dir_all(&ledger_path)?;
        let ledger_path = Path::new(ledger_path).join(super::BLOCKTREE_DIRECTORY);

        // Use default database options
        let db_options = Blocktree::get_db_options();

        // Column family names
        let meta_cf_descriptor =
            ColumnFamilyDescriptor::new(super::META_CF, Blocktree::get_cf_options());
        let data_cf_descriptor =
            ColumnFamilyDescriptor::new(super::DATA_CF, Blocktree::get_cf_options());
        let erasure_cf_descriptor =
            ColumnFamilyDescriptor::new(super::ERASURE_CF, Blocktree::get_cf_options());
        let cfs = vec![
            meta_cf_descriptor,
            data_cf_descriptor,
            erasure_cf_descriptor,
        ];

        // Open the database
        let db = Arc::new(Rocks(DB::open_cf_descriptors(
            &db_options,
            ledger_path,
            cfs,
        )?));

        // Create the metadata column family
        let meta_cf = MetaCf::new(db.clone());

        // Create the data column family
        let data_cf = DataCf::new(db.clone());

        // Create the erasure column family
        let erasure_cf = ErasureCf::new(db.clone());

        let ticks_per_slot = DEFAULT_TICKS_PER_SLOT;
        Ok(Blocktree {
            db,
            meta_cf,
            data_cf,
            erasure_cf,
            new_blobs_signals: vec![],
            ticks_per_slot,
        })
    }

    /// Returns an iterator over every blob in the data column family, in
    /// key order.
    pub fn read_ledger_blobs(&self) -> impl Iterator<Item = Blob> {
        self.db
            .0
            .iterator_cf(self.data_cf.handle(), IteratorMode::Start)
            .unwrap()
            .map(|(_, blob_data)| Blob::new(&blob_data))
    }

    /// Return an iterator for all the entries in the given file.
    pub fn read_ledger(&self) -> Result<impl Iterator<Item = Entry>> {
        let mut db_iterator = self.db.raw_iterator_cf(self.data_cf.handle())?;

        db_iterator.seek_to_first();
        Ok(EntryIterator {
            db_iterator,
            blockhash: None,
        })
    }

    /// Deletes the blocktree database under `ledger_path`.
    pub fn destroy(ledger_path: &str) -> Result<()> {
        // DB::destroy() fails if `ledger_path` doesn't exist
        fs::create_dir_all(&ledger_path)?;
        let ledger_path = Path::new(ledger_path).join(super::BLOCKTREE_DIRECTORY);
        DB::destroy(&Options::default(), &ledger_path)?;
        Ok(())
    }

    /// Per-column-family RocksDB options: large write buffers.
    fn get_cf_options() -> Options {
        let mut options = Options::default();
        options.set_max_write_buffer_number(32);
        options.set_write_buffer_size(MAX_WRITE_BUFFER_SIZE);
        options.set_max_bytes_for_level_base(MAX_WRITE_BUFFER_SIZE as u64);
        options
    }

    /// Database-wide RocksDB options: auto-create the DB and its column
    /// families, parallel background work, and the same write-buffer
    /// sizing as `get_cf_options`.
    fn get_db_options() -> Options {
        let mut options = Options::default();
        options.create_if_missing(true);
        options.create_missing_column_families(true);
        options.increase_parallelism(TOTAL_THREADS);
        options.set_max_background_flushes(4);
        options.set_max_background_compactions(4);
        options.set_max_write_buffer_number(32);
        options.set_write_buffer_size(MAX_WRITE_BUFFER_SIZE);
        options.set_max_bytes_for_level_base(MAX_WRITE_BUFFER_SIZE as u64);
        options
    }
}
|
||||
|
||||
/// `Database` implementation that forwards directly to RocksDB.
impl Database for Rocks {
    type Error = rocksdb::Error;
    type Key = Vec<u8>;
    // Borrowed keys are plain byte slices.
    type KeyRef = [u8];
    type ColumnFamily = ColumnFamily;
    type Cursor = DBRawIterator;
    type EntryIter = EntryIterator;
    type WriteBatch = RWriteBatch;

    /// Looks up a column-family handle by name.
    fn cf_handle(&self, cf: &str) -> Option<ColumnFamily> {
        self.0.cf_handle(cf)
    }

    /// Reads the value at `key`, copying it out of RocksDB's buffer.
    fn get_cf(&self, cf: ColumnFamily, key: &[u8]) -> Result<Option<Vec<u8>>> {
        let opt = self.0.get_cf(cf, key)?;
        Ok(opt.map(|dbvec| dbvec.to_vec()))
    }

    fn put_cf(&self, cf: ColumnFamily, key: &[u8], data: &[u8]) -> Result<()> {
        self.0.put_cf(cf, key, data)?;
        Ok(())
    }

    fn delete_cf(&self, cf: Self::ColumnFamily, key: &[u8]) -> Result<()> {
        self.0.delete_cf(cf, key).map_err(From::from)
    }

    fn raw_iterator_cf(&self, cf: Self::ColumnFamily) -> Result<Self::Cursor> {
        Ok(self.0.raw_iterator_cf(cf)?)
    }

    /// Atomically applies a prepared write batch.
    fn write(&self, batch: Self::WriteBatch) -> Result<()> {
        self.0.write(batch).map_err(From::from)
    }

    fn batch(&self) -> Result<Self::WriteBatch> {
        Ok(RWriteBatch::default())
    }
}
|
||||
|
||||
/// Cursor adapter: every method delegates to the inherent `DBRawIterator`
/// method of the same name (fully-qualified calls avoid infinite
/// recursion with the trait methods).
impl Cursor<Rocks> for DBRawIterator {
    fn valid(&self) -> bool {
        DBRawIterator::valid(self)
    }

    fn seek(&mut self, key: &[u8]) {
        DBRawIterator::seek(self, key)
    }

    fn seek_to_first(&mut self) {
        DBRawIterator::seek_to_first(self)
    }

    fn next(&mut self) {
        DBRawIterator::next(self)
    }

    fn key(&self) -> Option<Vec<u8>> {
        DBRawIterator::key(self)
    }

    fn value(&self) -> Option<Vec<u8>> {
        DBRawIterator::value(self)
    }
}
|
||||
|
||||
/// Write-batch adapter delegating to RocksDB's batch `put_cf`.
impl IWriteBatch<Rocks> for RWriteBatch {
    fn put_cf(&mut self, cf: ColumnFamily, key: &[u8], data: &[u8]) -> Result<()> {
        RWriteBatch::put_cf(self, cf, key, data)?;
        Ok(())
    }
}
|
||||
|
||||
impl IDataCf<Rocks> for DataCf {
    fn new(db: Arc<Rocks>) -> Self {
        DataCf { db }
    }

    /// Reads the blob stored at (slot, index), if any.
    fn get_by_slot_index(&self, slot: u64, index: u64) -> Result<Option<Vec<u8>>> {
        let key = Self::key(slot, index);
        self.get(&key)
    }

    /// Deletes the blob stored at (slot, index).
    fn delete_by_slot_index(&self, slot: u64, index: u64) -> Result<()> {
        let key = Self::key(slot, index);
        self.delete(&key)
    }

    /// Writes `serialized_value` at (slot, index).
    fn put_by_slot_index(&self, slot: u64, index: u64, serialized_value: &[u8]) -> Result<()> {
        let key = Self::key(slot, index);
        self.put(&key, serialized_value)
    }

    /// Data keys are 16 bytes: big-endian slot then big-endian index, so
    /// lexicographic key order matches (slot, index) order.
    fn key(slot: u64, index: u64) -> Vec<u8> {
        let mut key = vec![0u8; 16];
        BigEndian::write_u64(&mut key[0..8], slot);
        BigEndian::write_u64(&mut key[8..16], index);
        key
    }

    /// Decodes the slot from the first 8 bytes of a data key.
    fn slot_from_key(key: &[u8]) -> Result<u64> {
        let mut rdr = io::Cursor::new(&key[0..8]);
        let height = rdr.read_u64::<BigEndian>()?;
        Ok(height)
    }

    /// Decodes the index from bytes 8..16 of a data key.
    fn index_from_key(key: &[u8]) -> Result<u64> {
        let mut rdr = io::Cursor::new(&key[8..16]);
        let index = rdr.read_u64::<BigEndian>()?;
        Ok(index)
    }
}
|
||||
|
||||
/// Erasure blobs reuse the data column family's (slot, index) key layout;
/// all key handling delegates to `DataCf`.
impl IErasureCf<Rocks> for ErasureCf {
    fn new(db: Arc<Rocks>) -> Self {
        ErasureCf { db }
    }

    /// Deletes the erasure blob stored at (slot, index).
    fn delete_by_slot_index(&self, slot: u64, index: u64) -> Result<()> {
        let key = Self::key(slot, index);
        self.delete(&key)
    }

    /// Reads the erasure blob stored at (slot, index), if any.
    fn get_by_slot_index(&self, slot: u64, index: u64) -> Result<Option<Vec<u8>>> {
        let key = Self::key(slot, index);
        self.get(&key)
    }

    /// Writes `serialized_value` at (slot, index).
    fn put_by_slot_index(&self, slot: u64, index: u64, serialized_value: &[u8]) -> Result<()> {
        let key = Self::key(slot, index);
        self.put(&key, serialized_value)
    }

    fn key(slot: u64, index: u64) -> Vec<u8> {
        DataCf::key(slot, index)
    }

    fn slot_from_key(key: &[u8]) -> Result<u64> {
        DataCf::slot_from_key(key)
    }

    fn index_from_key(key: &[u8]) -> Result<u64> {
        DataCf::index_from_key(key)
    }
}
|
||||
|
||||
impl IMetaCf<Rocks> for MetaCf {
    fn new(db: Arc<Rocks>) -> Self {
        MetaCf { db }
    }

    /// Meta keys are 8 bytes: the big-endian slot number.
    fn key(slot: u64) -> Vec<u8> {
        let mut key = vec![0u8; 8];
        BigEndian::write_u64(&mut key[0..8], slot);
        key
    }

    fn get_slot_meta(&self, slot: u64) -> Result<Option<super::SlotMeta>> {
        let key = Self::key(slot);
        self.get(&key)
    }

    fn put_slot_meta(&self, slot: u64, slot_meta: &super::SlotMeta) -> Result<()> {
        let key = Self::key(slot);
        self.put(&key, slot_meta)
    }

    /// Decodes the u64 at the front of a meta key. Despite the
    /// trait-mandated name, this is the slot number — meta keys carry no
    /// blob index.
    fn index_from_key(key: &[u8]) -> Result<u64> {
        let mut rdr = io::Cursor::new(&key[..]);
        let index = rdr.read_u64::<BigEndian>()?;
        Ok(index)
    }
}
|
||||
|
||||
impl LedgerColumnFamilyRaw<Rocks> for DataCf {
    fn db(&self) -> &Arc<Rocks> {
        &self.db
    }

    // Panics if the data column family was not opened with the database.
    fn handle(&self) -> ColumnFamily {
        self.db.cf_handle(super::DATA_CF).unwrap()
    }
}
|
||||
|
||||
impl LedgerColumnFamilyRaw<Rocks> for ErasureCf {
    fn db(&self) -> &Arc<Rocks> {
        &self.db
    }

    // Panics if the erasure column family was not opened with the database.
    fn handle(&self) -> ColumnFamily {
        self.db.cf_handle(super::ERASURE_CF).unwrap()
    }
}
|
||||
|
||||
impl LedgerColumnFamily<Rocks> for MetaCf {
    // Slot metadata is the typed value stored in this family.
    type ValueType = super::SlotMeta;

    fn db(&self) -> &Arc<Rocks> {
        &self.db
    }

    // Panics if the meta column family was not opened with the database.
    fn handle(&self) -> ColumnFamily {
        self.db.cf_handle(super::META_CF).unwrap()
    }
}
|
||||
|
||||
/// Lets `?` lift RocksDB errors into the crate-wide `Error` type.
impl std::convert::From<rocksdb::Error> for Error {
    fn from(e: rocksdb::Error) -> Error {
        Error::BlocktreeError(BlocktreeError::RocksDb(e))
    }
}
|
||||
|
||||
/// TODO: all this goes away with Blocktree
impl Iterator for EntryIterator {
    type Item = Entry;

    /// Yields the next deserialized `Entry`, verifying each entry against
    /// the hash of the previously yielded one. Iteration stops (returns
    /// `None`) at the end of the column family, on an undeserializable
    /// blob, or on the first entry that fails verification.
    fn next(&mut self) -> Option<Entry> {
        if self.db_iterator.valid() {
            if let Some(value) = self.db_iterator.value() {
                // Skip the blob header; the serialized entry follows it.
                if let Ok(entry) = deserialize::<Entry>(&value[BLOB_HEADER_SIZE..]) {
                    if let Some(blockhash) = self.blockhash {
                        if !entry.verify(&blockhash) {
                            return None;
                        }
                    }
                    // Advance the cursor and remember this entry's hash for
                    // verifying the next one.
                    self.db_iterator.next();
                    self.blockhash = Some(entry.hash);
                    return Some(entry);
                }
            }
        }
        None
    }
}
|
|
@ -0,0 +1,345 @@
|
|||
use crate::kvstore::mapper::{Disk, Mapper, Memory};
|
||||
use crate::kvstore::sstable::SSTable;
|
||||
use crate::kvstore::storage::WriteState;
|
||||
use crate::kvstore::writelog::WriteLog;
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::ops::RangeInclusive;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::mpsc::{Receiver, Sender};
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::thread::JoinHandle;
|
||||
|
||||
mod compactor;
|
||||
mod error;
|
||||
mod io_utils;
|
||||
mod mapper;
|
||||
mod readtx;
|
||||
mod sstable;
|
||||
mod storage;
|
||||
mod writelog;
|
||||
mod writetx;
|
||||
|
||||
pub use self::error::{Error, Result};
|
||||
pub use self::readtx::ReadTx as Snapshot;
|
||||
pub use self::sstable::Key;
|
||||
pub use self::writetx::WriteTx;
|
||||
|
||||
// File (under the store root) holding serialized table metadata.
const TABLES_FILE: &str = "tables.meta";
// File (under the store root) holding the write-log backing the memtable.
const LOG_FILE: &str = "mem-log";
// Default target size of a compacted table page (64 MiB).
const DEFAULT_TABLE_SIZE: usize = 64 * 1024 * 1024;
// Default memtable size that triggers a flush to disk (64 MiB).
const DEFAULT_MEM_SIZE: usize = 64 * 1024 * 1024;
// Default level-0 table count that triggers compaction.
const DEFAULT_MAX_PAGES: usize = 10;
|
||||
|
||||
/// Tunables for a `KvStore` instance.
#[derive(Debug, PartialEq, Copy, Clone)]
pub struct Config {
    // Memtable size (bytes) that forces a flush to table storage.
    pub max_mem: usize,
    // Level-0 table count that triggers background compaction.
    pub max_tables: usize,
    // Target size (bytes) of a compacted table page.
    pub page_size: usize,
    // When true, tables live in memory instead of on disk.
    pub in_memory: bool,
}
|
||||
|
||||
/// A log-structured key-value store: writes land in an in-memory table
/// backed by a write-log and are periodically flushed into levelled
/// SSTables that a background thread compacts.
#[derive(Debug)]
pub struct KvStore {
    // Memtable plus its write-ahead log.
    write: RwLock<WriteState>,
    // Levelled SSTables; index 0 is the newest level.
    tables: RwLock<Vec<BTreeMap<Key, SSTable>>>,
    config: Config,
    // Directory holding the write-log and table metadata.
    root: PathBuf,
    // Maps logical tables onto storage (disk or memory).
    mapper: Arc<dyn Mapper>,
    // Channels to the background compactor, locked so `&self` methods can
    // use them.
    req_tx: RwLock<Sender<compactor::Req>>,
    resp_rx: RwLock<Receiver<compactor::Resp>>,
    compactor_handle: JoinHandle<()>,
}
|
||||
|
||||
impl KvStore {
|
||||
pub fn open_default<P>(root: P) -> Result<Self>
|
||||
where
|
||||
P: AsRef<Path>,
|
||||
{
|
||||
let mapper = Disk::single(root.as_ref());
|
||||
open(root.as_ref(), Arc::new(mapper), Config::default())
|
||||
}
|
||||
|
||||
pub fn open<P>(root: P, config: Config) -> Result<Self>
|
||||
where
|
||||
P: AsRef<Path>,
|
||||
{
|
||||
let mapper: Arc<dyn Mapper> = if config.in_memory {
|
||||
Arc::new(Memory::new())
|
||||
} else {
|
||||
Arc::new(Disk::single(root.as_ref()))
|
||||
};
|
||||
open(root.as_ref(), mapper, config)
|
||||
}
|
||||
|
||||
pub fn partitioned<P, P2>(root: P, storage_dirs: &[P2], config: Config) -> Result<Self>
|
||||
where
|
||||
P: AsRef<Path>,
|
||||
P2: AsRef<Path>,
|
||||
{
|
||||
let mapper = Disk::new(storage_dirs);
|
||||
open(root.as_ref(), Arc::new(mapper), config)
|
||||
}
|
||||
|
||||
pub fn config(&self) -> &Config {
|
||||
&self.config
|
||||
}
|
||||
|
||||
pub fn put(&self, key: &Key, data: &[u8]) -> Result<()> {
|
||||
self.ensure_mem()?;
|
||||
|
||||
let mut write = self.write.write().unwrap();
|
||||
|
||||
write.put(key, data)?;
|
||||
write.commit += 1;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn put_many<Iter, Tup, K, V>(&self, rows: Iter) -> Result<()>
|
||||
where
|
||||
Iter: Iterator<Item = Tup>,
|
||||
Tup: std::borrow::Borrow<(K, V)>,
|
||||
K: std::borrow::Borrow<Key>,
|
||||
V: std::borrow::Borrow<[u8]>,
|
||||
{
|
||||
{
|
||||
let mut write = self.write.write().unwrap();
|
||||
|
||||
for pair in rows {
|
||||
let tup = pair.borrow();
|
||||
let (key, data) = (tup.0.borrow(), tup.1.borrow());
|
||||
write.put(key, data)?;
|
||||
}
|
||||
write.commit += 1;
|
||||
}
|
||||
|
||||
self.ensure_mem()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get(&self, key: &Key) -> Result<Option<Vec<u8>>> {
|
||||
self.query_compactor()?;
|
||||
|
||||
let (write_state, tables) = (self.write.read().unwrap(), self.tables.read().unwrap());
|
||||
|
||||
storage::get(&write_state.values, &*tables, key)
|
||||
}
|
||||
|
||||
pub fn delete(&self, key: &Key) -> Result<()> {
|
||||
self.query_compactor()?;
|
||||
|
||||
{
|
||||
let mut write = self.write.write().unwrap();
|
||||
|
||||
write.delete(key)?;
|
||||
write.commit += 1;
|
||||
}
|
||||
|
||||
self.ensure_mem()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn delete_many<Iter, K>(&self, rows: Iter) -> Result<()>
|
||||
where
|
||||
Iter: Iterator<Item = K>,
|
||||
K: std::borrow::Borrow<Key>,
|
||||
{
|
||||
self.query_compactor()?;
|
||||
|
||||
{
|
||||
let mut write = self.write.write().unwrap();
|
||||
for k in rows {
|
||||
let key = k.borrow();
|
||||
write.delete(key)?;
|
||||
}
|
||||
write.commit += 1;
|
||||
}
|
||||
|
||||
self.ensure_mem()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn transaction(&self) -> Result<WriteTx> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
pub fn commit(&self, _txn: WriteTx) -> Result<()> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
pub fn snapshot(&self) -> Snapshot {
|
||||
let (state, tables) = (self.write.read().unwrap(), self.tables.read().unwrap());
|
||||
|
||||
Snapshot::new(state.values.clone(), tables.clone())
|
||||
}
|
||||
|
||||
pub fn range(
|
||||
&self,
|
||||
range: RangeInclusive<Key>,
|
||||
) -> Result<impl Iterator<Item = (Key, Vec<u8>)>> {
|
||||
self.query_compactor()?;
|
||||
|
||||
let (write_state, tables) = (self.write.read().unwrap(), self.tables.read().unwrap());
|
||||
storage::range(&write_state.values, &*tables, range)
|
||||
}
|
||||
|
||||
pub fn destroy<P>(path: P) -> Result<()>
|
||||
where
|
||||
P: AsRef<Path>,
|
||||
{
|
||||
let path = path.as_ref();
|
||||
if !path.exists() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
fs::remove_dir_all(path)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn query_compactor(&self) -> Result<()> {
|
||||
if let (Ok(mut req_tx), Ok(mut resp_rx), Ok(mut tables)) = (
|
||||
self.req_tx.try_write(),
|
||||
self.resp_rx.try_write(),
|
||||
self.tables.try_write(),
|
||||
) {
|
||||
query_compactor(
|
||||
&self.root,
|
||||
&*self.mapper,
|
||||
&mut *tables,
|
||||
&mut *resp_rx,
|
||||
&mut *req_tx,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn ensure_mem(&self) -> Result<()> {
|
||||
let trigger_compact = {
|
||||
let mut write_rw = self.write.write().unwrap();
|
||||
|
||||
if write_rw.mem_size < self.config.max_mem {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut tables = self.tables.write().unwrap();
|
||||
storage::flush_table(&write_rw.values, &*self.mapper, &mut *tables)?;
|
||||
|
||||
write_rw.reset()?;
|
||||
write_rw.commit += 1;
|
||||
|
||||
is_lvl0_full(&tables, &self.config)
|
||||
};
|
||||
|
||||
dump_tables(&self.root, &*self.mapper).unwrap();
|
||||
if trigger_compact {
|
||||
let tables_path = self.root.join(TABLES_FILE);
|
||||
self.req_tx
|
||||
.write()
|
||||
.unwrap()
|
||||
.send(compactor::Req::Start(tables_path))
|
||||
.expect("compactor thread dead");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Config {
    /// Disk-backed store with 64 MiB memtable/pages and a 10-table level 0.
    fn default() -> Config {
        Config {
            max_mem: DEFAULT_MEM_SIZE,
            max_tables: DEFAULT_MAX_PAGES,
            page_size: DEFAULT_TABLE_SIZE,
            in_memory: false,
        }
    }
}
|
||||
|
||||
fn open(root: &Path, mapper: Arc<dyn Mapper>, config: Config) -> Result<KvStore> {
|
||||
let root = root.to_path_buf();
|
||||
let log_path = root.join(LOG_FILE);
|
||||
if !root.exists() {
|
||||
fs::create_dir(&root)?;
|
||||
}
|
||||
|
||||
let write_log = WriteLog::open(&log_path, config.max_mem)?;
|
||||
let mem = write_log.materialize()?;
|
||||
|
||||
let write = RwLock::new(WriteState::new(write_log, mem));
|
||||
|
||||
let tables = load_tables(&root, &*mapper)?;
|
||||
let tables = RwLock::new(tables);
|
||||
|
||||
let cfg = compactor::Config {
|
||||
max_pages: config.max_tables,
|
||||
page_size: config.page_size,
|
||||
};
|
||||
let (req_tx, resp_rx, compactor_handle) = compactor::spawn_compactor(Arc::clone(&mapper), cfg)
|
||||
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
|
||||
let (req_tx, resp_rx) = (RwLock::new(req_tx), RwLock::new(resp_rx));
|
||||
|
||||
Ok(KvStore {
|
||||
write,
|
||||
tables,
|
||||
config,
|
||||
mapper,
|
||||
root,
|
||||
req_tx,
|
||||
resp_rx,
|
||||
compactor_handle,
|
||||
})
|
||||
}
|
||||
|
||||
/// Restores the levelled table set from `root/tables.meta`, or returns an
/// empty set when no metadata file exists yet (first start).
fn load_tables(root: &Path, mapper: &dyn Mapper) -> Result<Vec<BTreeMap<Key, SSTable>>> {
    let mut tables = Vec::new();
    let meta_path = root.join(TABLES_FILE);

    if meta_path.exists() {
        mapper.load_state_from(&meta_path)?;
        tables = SSTable::sorted_tables(&mapper.active_set()?);
    }

    Ok(tables)
}
|
||||
|
||||
fn dump_tables(root: &Path, mapper: &Mapper) -> Result<()> {
|
||||
mapper.serialize_state_to(&root.join(TABLES_FILE))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn query_compactor(
|
||||
root: &Path,
|
||||
mapper: &dyn Mapper,
|
||||
tables: &mut Vec<BTreeMap<Key, SSTable>>,
|
||||
resp_rx: &mut Receiver<compactor::Resp>,
|
||||
req_tx: &mut Sender<compactor::Req>,
|
||||
) -> Result<()> {
|
||||
match resp_rx.try_recv() {
|
||||
Ok(compactor::Resp::Done(new_tables)) => {
|
||||
std::mem::replace(tables, new_tables);
|
||||
dump_tables(root, mapper)?;
|
||||
req_tx.send(compactor::Req::Gc).unwrap();
|
||||
}
|
||||
Ok(compactor::Resp::Failed(e)) => {
|
||||
return Err(e);
|
||||
}
|
||||
// Nothing available, do nothing
|
||||
_ => {}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_lvl0_full(tables: &[BTreeMap<Key, SSTable>], config: &Config) -> bool {
|
||||
if tables.is_empty() {
|
||||
false
|
||||
} else {
|
||||
tables[0].len() > config.max_tables
|
||||
}
|
||||
}
|
|
@ -0,0 +1,223 @@
|
|||
use crate::kvstore::error::{Error, Result};
|
||||
use crate::kvstore::mapper::{Kind, Mapper};
|
||||
use crate::kvstore::sstable::{Key, Merged, SSTable};
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::mpsc::{channel, Receiver, Sender};
|
||||
use std::sync::Arc;
|
||||
use std::thread::{self, JoinHandle};
|
||||
|
||||
type TableVec = Vec<BTreeMap<Key, SSTable>>;
|
||||
type TableSlice<'a> = &'a [BTreeMap<Key, SSTable>];
|
||||
|
||||
/// Compaction tunables handed to the background thread.
#[derive(Debug, Copy, Clone)]
pub struct Config {
    // Table-count threshold used to decide when a level needs compacting
    // — TODO confirm the exact per-level rule in `level_needs_compact`.
    pub max_pages: usize,
    // Target size (bytes) of each compacted table.
    pub page_size: usize,
}

/// Requests sent to the compactor thread.
#[derive(Debug)]
pub enum Req {
    // Run a compaction; the path points at the persisted table metadata.
    Start(PathBuf),
    // Delete tables previously rotated into the trash.
    Gc,
}

/// Replies from the compactor thread.
#[derive(Debug)]
pub enum Resp {
    // Compaction finished; carries the new levelled table set.
    Done(TableVec),
    // Compaction failed with this error.
    Failed(Error),
}
|
||||
|
||||
/// Spawns the background compaction thread.
///
/// Returns the request sender, the response receiver, and the thread's
/// join handle. The thread exits once the request sender is dropped
/// (its receive loop ends) or it can no longer deliver a response.
pub fn spawn_compactor(
    mapper: Arc<dyn Mapper>,
    config: Config,
) -> Result<(Sender<Req>, Receiver<Resp>, JoinHandle<()>)> {
    let (req_tx, req_rx) = channel();
    let (resp_tx, resp_rx) = channel();

    let handle = thread::spawn(move || {
        // Any loop error simply terminates the thread.
        let _ignored = run_loop(mapper, config, req_rx, resp_tx);
    });

    Ok((req_tx, resp_rx, handle))
}
|
||||
|
||||
/// Request-processing loop for the compactor thread. Runs until the
/// request channel is closed or a response can no longer be delivered.
fn run_loop(
    mapper: Arc<dyn Mapper>,
    config: Config,
    req_rx: Receiver<Req>,
    resp_tx: Sender<Resp>,
) -> Result<()> {
    while let Ok(msg) = req_rx.recv() {
        match msg {
            Req::Start(_) => {
                // Run a full compaction and report success or failure back
                // to the store.
                let new_tables_res = run_compaction(&*mapper, &config);

                match new_tables_res {
                    Ok(new_tables) => {
                        resp_tx.send(Resp::Done(new_tables))?;
                    }
                    Err(e) => {
                        resp_tx.send(Resp::Failed(e))?;
                    }
                }
            }
            Req::Gc => {
                // Best effort: failures while emptying the trash are ignored.
                let _ = mapper.empty_trash();
            }
        }
    }

    Ok(())
}
|
||||
|
||||
/// Runs a full compaction pass: merges levels 0 and 1, repeatedly
/// compacts any upper level that still needs it, then rotates the
/// superseded tables into the trash.
fn run_compaction(mapper: &dyn Mapper, config: &Config) -> Result<TableVec> {
    let mut tables = load_tables(mapper)?;

    compact_level_0(mapper, &mut tables, config)?;

    for level in 1..tables.len() {
        while level_needs_compact(level as u8, config, &tables) {
            compact_upper_level(mapper, &mut tables, config, level as u8)?;
        }
    }

    // move old tables to garbage
    mapper.rotate_tables()?;

    Ok(tables)
}
|
||||
|
||||
/// Merges levels 0 and 1 into a fresh set of level-1 tables.
///
/// All tables in the first two levels are merged newest-wins via `Merged`,
/// then re-split into tables capped at `config.page_size`. Levels 0 and 1
/// are cleared and the merged output becomes the new level 1.
fn compact_level_0(mapper: &dyn Mapper, tables: &mut TableVec, config: &Config) -> Result<()> {
    assert!(!tables.is_empty());

    // Make sure a level 1 exists to receive the merged output.
    if tables.len() == 1 {
        tables.push(BTreeMap::new());
    }

    let mut new_tables = BTreeMap::new();
    {
        // Full-range scans over every table in levels 0 and 1.
        let sources = tables
            .iter()
            .take(2)
            .map(BTreeMap::values)
            .flatten()
            .map(|sst| sst.range(&(Key::ALL_INCLUSIVE)))
            .collect::<Result<Vec<_>>>()?;

        let mut iter = Merged::new(sources).peekable();
        // Keep cutting new size-capped tables until the merged stream drains.
        while iter.peek().is_some() {
            let sst = mapper.make_table(Kind::Compaction, &mut |mut data_wtr, mut index_wtr| {
                SSTable::create_capped(
                    &mut iter,
                    1,
                    config.page_size as u64,
                    &mut data_wtr,
                    &mut index_wtr,
                );
            })?;

            new_tables.insert(sst.meta().start, sst);
        }
    }

    tables[0].clear();
    tables[1].clear();

    // The merged output replaces level 1.
    tables[1].append(&mut new_tables);

    Ok(())
}
|
||||
|
||||
/// Compacts one table out of `level` into `level + 1`.
///
/// The table with the greatest start key in `level` is merged (newest-wins)
/// with every table in the next level whose key range overlaps it; the
/// merged, size-capped output replaces those inputs in `level + 1`.
fn compact_upper_level(
    mapper: &dyn Mapper,
    pages: &mut TableVec,
    config: &Config,
    level: u8,
) -> Result<()> {
    assert!(1 <= level && (level as usize) < pages.len());
    assert!(!pages[level as usize].is_empty());

    let next_level = level + 1;
    let level = level as usize;

    // Make sure the destination level exists.
    if next_level as usize == pages.len() {
        pages.push(BTreeMap::new());
    }

    // Pick the table with the greatest start key in this level.
    let (&key, chosen_sst) = pages[level].iter().next_back().unwrap();
    let (start, end) = {
        let meta = chosen_sst.meta();
        (meta.start, meta.end)
    };

    let mut page_keys = Vec::new();
    let mut merge_with = Vec::new();

    // Collect every next-level table whose key range overlaps the chosen one.
    for (key, sst) in pages[next_level as usize].iter() {
        if sst.is_overlap(&(start..=end)) {
            page_keys.push(*key);
            merge_with.push(sst);
        }
    }

    let mut new_tables = BTreeMap::new();
    {
        // Merge the overlapping tables plus the chosen table itself.
        let sources = merge_with
            .into_iter()
            .chain(std::iter::once(chosen_sst))
            .map(|sst| sst.range(&(Key::ALL_INCLUSIVE)))
            .collect::<Result<Vec<_>>>()?;

        let mut iter = Merged::new(sources).peekable();

        // Re-split the merged stream into size-capped tables for next_level.
        while iter.peek().is_some() {
            let sst = mapper.make_table(Kind::Compaction, &mut |mut data_wtr, mut index_wtr| {
                SSTable::create_capped(
                    &mut iter,
                    next_level,
                    config.page_size as u64,
                    &mut data_wtr,
                    &mut index_wtr,
                );
            })?;

            new_tables.insert(sst.meta().start, sst);
        }
    }

    // delete merged page and merged pages in next level
    pages[level].remove(&key).unwrap();

    for start_key in page_keys {
        pages[next_level as usize].remove(&start_key).unwrap();
    }

    pages[next_level as usize].append(&mut new_tables);

    Ok(())
}
|
||||
|
||||
fn load_tables(mapper: &dyn Mapper) -> Result<TableVec> {
|
||||
Ok(SSTable::sorted_tables(&mapper.active_set()?))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn level_max(level: u8, config: &Config) -> usize {
|
||||
match level {
|
||||
0 => config.max_pages,
|
||||
x => 10usize.pow(u32::from(x)),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn level_needs_compact(level: u8, config: &Config, tables: TableSlice) -> bool {
|
||||
if level as usize >= tables.len() {
|
||||
return false;
|
||||
}
|
||||
|
||||
let max = level_max(level, config);
|
||||
|
||||
tables[level as usize].len() > max
|
||||
}
|
|
@ -0,0 +1,76 @@
|
|||
use std::error::Error as StdErr;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::result::Result as StdRes;
|
||||
use std::sync::mpsc::{RecvError, SendError, TryRecvError};
|
||||
|
||||
/// Convenience alias: kvstore operations all fail with [`Error`].
pub type Result<T> = StdRes<T, Error>;

/// Unified error type for the kvstore.
#[derive(Debug)]
pub enum Error {
    // Underlying I/O failure.
    Io(io::Error),
    // (De)serialization failure; on-disk state may be corrupted.
    Corrupted(bincode::Error),
    // mpsc channel send/recv failure between store threads.
    Channel(Box<dyn StdErr + Sync + Send>),
    // Requested item does not exist.
    Missing,
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
Error::Corrupted(_) => write!(f, "Serialization error: Store may be corrupted"),
|
||||
Error::Channel(e) => write!(f, "Internal communication error: {}", e),
|
||||
Error::Io(e) => write!(f, "I/O error: {}", e),
|
||||
Error::Missing => write!(f, "Item not present in ledger"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl StdErr for Error {
|
||||
fn source(&self) -> Option<&(dyn StdErr + 'static)> {
|
||||
match self {
|
||||
Error::Io(e) => Some(e),
|
||||
Error::Corrupted(ref e) => Some(e),
|
||||
Error::Channel(e) => Some(e.as_ref()),
|
||||
Error::Missing => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Allow `?` on plain I/O errors.
impl From<io::Error> for Error {
    fn from(e: io::Error) -> Self {
        Error::Io(e)
    }
}

// `BufWriter::into_inner` failures carry an io::Error; unwrap it.
impl<W> From<io::IntoInnerError<W>> for Error {
    fn from(e: io::IntoInnerError<W>) -> Self {
        Error::Io(e.into())
    }
}

// bincode failures indicate a corrupted (or incompatible) store.
impl From<bincode::Error> for Error {
    fn from(e: bincode::Error) -> Self {
        Error::Corrupted(e)
    }
}

// Channel send failures; `T: Send + Sync + 'static` so the error can be boxed.
impl<T> From<SendError<T>> for Error
where
    T: Send + Sync + 'static,
{
    fn from(e: SendError<T>) -> Self {
        Error::Channel(Box::new(e))
    }
}

// Blocking receive failures (sender dropped).
impl From<RecvError> for Error {
    fn from(e: RecvError) -> Self {
        Error::Channel(Box::new(e))
    }
}

// Non-blocking receive failures (empty or disconnected).
impl From<TryRecvError> for Error {
    fn from(e: TryRecvError) -> Self {
        Error::Channel(Box::new(e))
    }
}
|
|
@ -0,0 +1,131 @@
|
|||
use memmap::Mmap;
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufWriter, Seek, SeekFrom, Write};
|
||||
use std::ops::Deref;
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
// Panic message used when a shared in-memory buffer's lock is poisoned.
const BACKING_ERR: &str = "In-memory table lock poisoned; concurrency error";

/// Read-only byte view over table contents: either a file-backed memory
/// map or a shared in-memory buffer.
#[derive(Debug)]
pub enum MemMap {
    Disk(Mmap),
    Mem(Arc<RwLock<Vec<u8>>>),
}

/// Write half matching `MemMap`: a buffered file writer or a writer over a
/// shared in-memory buffer.
#[derive(Debug)]
pub enum Writer {
    Disk(BufWriter<File>),
    Mem(SharedWriter),
}

/// Seekable writer over a shared, growable byte buffer.
#[derive(Debug)]
pub struct SharedWriter {
    // Backing buffer, shared with readers via `MemMap::Mem`.
    buf: Arc<RwLock<Vec<u8>>>,
    // Current write cursor, in bytes from the start of the buffer.
    pos: u64,
}
|
||||
|
||||
impl SharedWriter {
|
||||
pub fn new(buf: Arc<RwLock<Vec<u8>>>) -> SharedWriter {
|
||||
SharedWriter { buf, pos: 0 }
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for MemMap {
    type Target = [u8];

    // Presents both variants as a plain byte slice.
    fn deref(&self) -> &[u8] {
        match self {
            MemMap::Disk(mmap) => mmap.deref(),
            MemMap::Mem(vec) => {
                let buf = vec.read().expect(BACKING_ERR);
                let slice = buf.as_slice();

                // transmute lifetime. Relying on the RwLock + immutability for safety
                // SAFETY/NOTE(review): the read guard is dropped when this arm
                // returns, so the returned slice aliases the Vec's heap buffer
                // unguarded. This is only sound if the buffer is never grown,
                // shrunk, or written after being wrapped in a MemMap — confirm
                // that invariant holds at every call site.
                unsafe { std::mem::transmute(slice) }
            }
        }
    }
}
|
||||
|
||||
impl Write for SharedWriter {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
use std::cmp;
|
||||
|
||||
let mut vec = self.buf.write().expect(BACKING_ERR);
|
||||
|
||||
// Calc ranges
|
||||
let space_remaining = vec.len() - self.pos as usize;
|
||||
let copy_len = cmp::min(buf.len(), space_remaining);
|
||||
let copy_src_range = 0..copy_len;
|
||||
let append_src_range = copy_len..buf.len();
|
||||
let copy_dest_range = self.pos as usize..(self.pos as usize + copy_len);
|
||||
|
||||
// Copy then append
|
||||
(&mut vec[copy_dest_range]).copy_from_slice(&buf[copy_src_range]);
|
||||
vec.extend_from_slice(&buf[append_src_range]);
|
||||
|
||||
let written = buf.len();
|
||||
|
||||
self.pos += written as u64;
|
||||
|
||||
Ok(written)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
|
||||
let _written = self.write(buf)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Seek for SharedWriter {
    // Repositions the write cursor without touching the buffer contents.
    fn seek(&mut self, to: SeekFrom) -> io::Result<u64> {
        self.pos = match to {
            SeekFrom::Start(new_pos) => new_pos,
            // NOTE(review): a negative result of these signed additions is
            // cast to u64 and wraps to a huge position instead of returning
            // an error — confirm callers never seek before the start.
            SeekFrom::Current(diff) => (self.pos as i64 + diff) as u64,
            SeekFrom::End(rpos) => (self.buf.read().expect(BACKING_ERR).len() as i64 + rpos) as u64,
        };

        Ok(self.pos)
    }
}
|
||||
|
||||
impl Write for Writer {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
match self {
|
||||
Writer::Disk(ref mut wtr) => wtr.write(buf),
|
||||
Writer::Mem(ref mut wtr) => wtr.write(buf),
|
||||
}
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
match self {
|
||||
Writer::Disk(ref mut wtr) => {
|
||||
wtr.flush()?;
|
||||
wtr.get_mut().sync_data()?;
|
||||
Ok(())
|
||||
}
|
||||
Writer::Mem(ref mut wtr) => wtr.flush(),
|
||||
}
|
||||
}
|
||||
|
||||
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
|
||||
match self {
|
||||
Writer::Disk(ref mut wtr) => wtr.write_all(buf),
|
||||
Writer::Mem(ref mut wtr) => wtr.write_all(buf),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Seek for Writer {
|
||||
fn seek(&mut self, pos: SeekFrom) -> io::Result<u64> {
|
||||
match self {
|
||||
Writer::Disk(ref mut wtr) => wtr.seek(pos),
|
||||
Writer::Mem(ref mut wtr) => wtr.seek(pos),
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
use crate::kvstore::io_utils::Writer;
|
||||
use crate::kvstore::sstable::SSTable;
|
||||
use crate::kvstore::Result;
|
||||
|
||||
use std::path::Path;
|
||||
use std::sync::RwLock;
|
||||
|
||||
mod disk;
|
||||
mod memory;
|
||||
|
||||
pub use self::disk::Disk;
|
||||
pub use self::memory::Memory;
|
||||
|
||||
/// Storage backend abstraction: creates, rotates, enumerates, and persists
/// tables (implemented by `Disk` and `Memory`).
pub trait Mapper: std::fmt::Debug + Send + Sync {
    /// Creates a new table of `kind`; `func` receives (data, index) writers
    /// to populate it, and the filled table is returned.
    // NOTE(review): bare `FnMut` trait object predates `dyn`; migrate to
    // `&mut dyn FnMut(..)` together with all implementors.
    fn make_table(&self, kind: Kind, func: &mut FnMut(Writer, Writer)) -> Result<SSTable>;
    /// Promotes compacted tables to active and retires old active tables.
    fn rotate_tables(&self) -> Result<()>;
    /// Permanently removes garbage tables.
    fn empty_trash(&self) -> Result<()>;
    /// Returns the current set of active tables.
    fn active_set(&self) -> Result<Vec<SSTable>>;
    /// Persists mapper bookkeeping state to `path`.
    fn serialize_state_to(&self, path: &Path) -> Result<()>;
    /// Restores mapper bookkeeping state from `path`.
    fn load_state_from(&self, path: &Path) -> Result<()>;
}
|
||||
|
||||
/// Lifecycle state of a table within a mapper.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize)]
pub enum Kind {
    // Serving reads.
    Active,
    // Being produced by a compaction pass.
    Compaction,
    // Retired; removed on the next `empty_trash`.
    Garbage,
}
|
||||
|
||||
/// Convenience extension for running a closure against the contents of an
/// `RwLock` without spelling out guard handling at every call site.
///
/// All methods panic if the lock is poisoned; the `try_` variants also
/// panic if the lock cannot be acquired immediately.
pub trait RwLockExt<T> {
    fn read_as<U, F: FnOnce(&T) -> U>(&self, f: F) -> U;
    fn write_as<U, F: FnOnce(&mut T) -> U>(&self, f: F) -> U;
    fn try_read_as<U, F: FnOnce(&T) -> U>(&self, f: F) -> U;
    fn try_write_as<U, F: FnOnce(&mut T) -> U>(&self, f: F) -> U;
}

impl<T> RwLockExt<T> for RwLock<T> {
    fn read_as<U, F: FnOnce(&T) -> U>(&self, f: F) -> U {
        let guard = self.read().unwrap();
        f(&guard)
    }

    fn write_as<U, F: FnOnce(&mut T) -> U>(&self, f: F) -> U {
        let mut guard = self.write().unwrap();
        f(&mut guard)
    }

    fn try_read_as<U, F: FnOnce(&T) -> U>(&self, f: F) -> U {
        let guard = self.try_read().unwrap();
        f(&guard)
    }

    fn try_write_as<U, F: FnOnce(&mut T) -> U>(&self, f: F) -> U {
        let mut guard = self.try_write().unwrap();
        f(&mut guard)
    }
}
|
|
@ -0,0 +1,215 @@
|
|||
use crate::kvstore::io_utils::{MemMap, Writer};
|
||||
use crate::kvstore::mapper::{Kind, Mapper, RwLockExt};
|
||||
use crate::kvstore::sstable::SSTable;
|
||||
use crate::kvstore::Result;
|
||||
|
||||
use memmap::Mmap;
|
||||
|
||||
use rand::{rngs::SmallRng, seq::SliceRandom, FromEntropy, Rng};
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{self, BufReader, BufWriter};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
/// Identifier of an on-disk table: a random number plus its lifecycle kind.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
struct Id {
    id: u32,
    kind: Kind,
}
|
||||
|
||||
/// File-backed `Mapper`: tables live as `<id>.sstable` / `<id>.index`
/// files spread over one or more storage directories.
#[derive(Debug)]
pub struct Disk {
    // NOTE(review): not used by the visible code (`choose_storage` and
    // `next_id` use `rand::thread_rng()` instead) — confirm purpose.
    rng: RwLock<SmallRng>,
    // Table id -> file paths for every known table.
    mappings: RwLock<HashMap<Id, PathInfo>>,
    // Directories new tables may be placed in.
    storage_dirs: RwLock<Vec<PathBuf>>,
}
|
||||
|
||||
impl Disk {
|
||||
pub fn single(dir: &Path) -> Self {
|
||||
Disk::new(&[dir])
|
||||
}
|
||||
|
||||
pub fn new<P: AsRef<Path>>(storage_dirs: &[P]) -> Self {
|
||||
if storage_dirs.is_empty() {
|
||||
panic!("Disk Mapper requires at least one storage director");
|
||||
}
|
||||
|
||||
let storage_dirs = storage_dirs
|
||||
.iter()
|
||||
.map(AsRef::as_ref)
|
||||
.map(Path::to_path_buf)
|
||||
.collect();
|
||||
|
||||
Disk {
|
||||
storage_dirs: RwLock::new(storage_dirs),
|
||||
mappings: RwLock::new(HashMap::new()),
|
||||
rng: RwLock::new(SmallRng::from_entropy()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// On-disk locations of one table's pieces.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct PathInfo {
    // Path of the `<id>.sstable` data file.
    pub data: PathBuf,
    // Path of the `<id>.index` index file.
    pub index: PathBuf,
}
|
||||
|
||||
impl Disk {
    #[inline]
    // Picks a random storage directory for a new table, creating it on
    // first use.
    // NOTE(review): uses `rand::thread_rng()` even though `Disk` carries a
    // `rng` field — confirm whether the field was meant to be used here.
    fn choose_storage(&self) -> PathBuf {
        let mut rng = rand::thread_rng();
        let path = self
            .storage_dirs
            .read_as(|storage| storage.choose(&mut rng).unwrap().to_path_buf());
        if !path.exists() {
            fs::create_dir_all(&path).expect("couldn't create table storage directory");
        }

        path
    }

    #[inline]
    // Records the id -> file-paths association for a newly created table.
    fn add_mapping(&self, tref: Id, paths: PathInfo) {
        let mut map = self.mappings.write().unwrap();
        map.insert(tref, paths);
    }
}
|
||||
|
||||
impl Mapper for Disk {
    // Creates files for a new table, lets `func` fill them through buffered
    // writers, records the mapping, then reopens the files as memory maps.
    fn make_table(&self, kind: Kind, func: &mut FnMut(Writer, Writer)) -> Result<SSTable> {
        let storage = self.choose_storage();

        let id = next_id(kind);
        let paths = mk_paths(id, &storage);
        let (data, index) = mk_writers(&paths)?;

        func(data, index);

        self.add_mapping(id, paths.clone());

        let (data, index) = mk_maps(&paths)?;
        let sst = SSTable::from_parts(Arc::new(data), Arc::new(index))?;
        Ok(sst)
    }

    // Re-labels every table: Active -> Garbage, Compaction -> Active;
    // existing Garbage stays Garbage. Files are untouched, only ids change.
    fn rotate_tables(&self) -> Result<()> {
        let mut map = self.mappings.write().unwrap();
        let mut new_map = HashMap::new();

        for (tref, paths) in map.drain() {
            let new_kind = match tref.kind {
                Kind::Active => Kind::Garbage,
                Kind::Compaction => Kind::Active,
                k => k,
            };
            let new_ref = next_id(new_kind);
            new_map.insert(new_ref, paths);
        }
        *map = new_map;

        Ok(())
    }

    // Deletes the files of every Garbage table and drops their mappings.
    fn empty_trash(&self) -> Result<()> {
        self.mappings.write_as(|map| {
            let to_rm = map
                .keys()
                .filter(|tref| tref.kind == Kind::Garbage)
                .cloned()
                .collect::<Vec<_>>();

            for tref in to_rm {
                let paths = map.remove(&tref).unwrap();
                fs::remove_file(&paths.index)?;
                fs::remove_file(&paths.data)?;
            }

            Ok(())
        })
    }

    // Memory-maps every Active table and returns them as SSTables.
    fn active_set(&self) -> Result<Vec<SSTable>> {
        let map = self.mappings.read().unwrap();
        let active = map.iter().filter(|(tref, _)| tref.kind == Kind::Active);
        let mut vec = Vec::new();

        for (_, paths) in active {
            let (data, index): (MemMap, MemMap) = mk_maps(paths)?;
            let sst = SSTable::from_parts(Arc::new(data), Arc::new(index))?;

            vec.push(sst);
        }
        Ok(vec)
    }

    // Serializes (storage_dirs, mappings) with bincode into `path`,
    // truncating any previous contents.
    fn serialize_state_to(&self, path: &Path) -> Result<()> {
        let file = OpenOptions::new()
            .create(true)
            .write(true)
            .truncate(true)
            .open(path)?;
        let wtr = BufWriter::new(file);

        self.mappings.read_as(|mappings| {
            self.storage_dirs
                .read_as(|storage| bincode::serialize_into(wtr, &(storage, mappings)))
        })?;

        Ok(())
    }

    // Restores (storage_dirs, mappings) previously written by
    // `serialize_state_to`, replacing the current state wholesale.
    fn load_state_from(&self, path: &Path) -> Result<()> {
        let rdr = BufReader::new(File::open(path)?);
        let (new_storage, new_mappings) = bincode::deserialize_from(rdr)?;

        self.storage_dirs.write_as(|storage| {
            self.mappings.write_as(|mappings| {
                *storage = new_storage;
                *mappings = new_mappings;
            })
        });

        Ok(())
    }
}
|
||||
|
||||
fn mk_writers(paths: &PathInfo) -> io::Result<(Writer, Writer)> {
|
||||
let mut opts = OpenOptions::new();
|
||||
opts.create(true).append(true);
|
||||
|
||||
let data = BufWriter::new(opts.open(&paths.data)?);
|
||||
let index = BufWriter::new(opts.open(&paths.index)?);
|
||||
|
||||
Ok((Writer::Disk(data), Writer::Disk(index)))
|
||||
}
|
||||
|
||||
// Memory-maps a table's data and index files read-only.
fn mk_maps(paths: &PathInfo) -> io::Result<(MemMap, MemMap)> {
    let (data_file, index_file) = (File::open(&paths.data)?, File::open(&paths.index)?);
    // SAFETY/NOTE(review): `Mmap::map` is only sound while no other process
    // or writer mutates/truncates these files; the writers are closed
    // before mapping in `make_table`, but external modification is not
    // guarded against — confirm this is acceptable.
    let (data, index) = unsafe { (Mmap::map(&data_file)?, Mmap::map(&index_file)?) };
    Ok((MemMap::Disk(data), MemMap::Disk(index)))
}
|
||||
|
||||
fn mk_paths(tref: Id, dir: &Path) -> PathInfo {
|
||||
let (data_name, index_name) = mk_filenames(tref.id);
|
||||
PathInfo {
|
||||
data: dir.join(data_name),
|
||||
index: dir.join(index_name),
|
||||
}
|
||||
}
|
||||
|
||||
/// Derives the `<n>.sstable` / `<n>.index` file names for table number `n`.
#[inline]
fn mk_filenames(n: u32) -> (String, String) {
    (format!("{}.sstable", n), format!("{}.index", n))
}
|
||||
|
||||
#[inline]
|
||||
fn next_id(kind: Kind) -> Id {
|
||||
Id {
|
||||
id: rand::thread_rng().gen(),
|
||||
kind,
|
||||
}
|
||||
}
|
|
@ -0,0 +1,144 @@
|
|||
use crate::kvstore::io_utils::{MemMap, SharedWriter, Writer};
|
||||
use crate::kvstore::mapper::{Kind, Mapper, RwLockExt};
|
||||
use crate::kvstore::sstable::SSTable;
|
||||
use crate::kvstore::Result;
|
||||
|
||||
use rand::{rngs::SmallRng, FromEntropy, Rng};
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
type Id = u32;
// Per-table (data, index) byte buffers, shared between writers and readers.
type TableMap = HashMap<Id, (Arc<RwLock<Vec<u8>>>, Arc<RwLock<Vec<u8>>>)>;
type Backing = Arc<RwLock<TableMap>>;

// Panic message used when a backing map's lock is poisoned.
const BACKING_ERR_MSG: &str = "In-memory table lock poisoned; concurrency error";

/// Fully in-memory `Mapper` implementation: tables live in shared byte
/// buffers instead of files.
#[derive(Debug)]
pub struct Memory {
    // Active set.
    tables: Backing,
    // Tables being produced by compaction.
    compaction: Backing,
    // Retired tables awaiting `empty_trash`.
    garbage: Backing,
    // NOTE(review): not used by the visible code — confirm purpose.
    meta: Arc<RwLock<Vec<u8>>>,
    // NOTE(review): not used by the visible code (`next_id` uses
    // `rand::thread_rng()`) — confirm purpose.
    rng: RwLock<SmallRng>,
}
|
||||
|
||||
impl Memory {
|
||||
pub fn new() -> Self {
|
||||
fn init_backing() -> Backing {
|
||||
Arc::new(RwLock::new(HashMap::new()))
|
||||
}
|
||||
Memory {
|
||||
tables: init_backing(),
|
||||
compaction: init_backing(),
|
||||
garbage: init_backing(),
|
||||
meta: Arc::new(RwLock::new(vec![])),
|
||||
rng: RwLock::new(SmallRng::from_entropy()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Memory {
    #[inline]
    // Maps a table `Kind` to the backing store holding tables of that kind.
    fn get_backing(&self, kind: Kind) -> &Backing {
        match kind {
            Kind::Active => &self.tables,
            Kind::Compaction => &self.compaction,
            Kind::Garbage => &self.garbage,
        }
    }
}
|
||||
|
||||
impl Mapper for Memory {
|
||||
fn make_table(&self, kind: Kind, func: &mut FnMut(Writer, Writer)) -> Result<SSTable> {
|
||||
let backing = self.get_backing(kind);
|
||||
let id = next_id();
|
||||
|
||||
let (data, index) = backing.write_as(|tables| get_memory_writers_for(id, tables))?;
|
||||
func(data, index);
|
||||
|
||||
backing.read_as(|map| get_table(id, map))
|
||||
}
|
||||
|
||||
fn rotate_tables(&self) -> Result<()> {
|
||||
use std::mem::swap;
|
||||
|
||||
let (mut active, mut compact, mut garbage) = (
|
||||
self.tables.write().expect(BACKING_ERR_MSG),
|
||||
self.compaction.write().expect(BACKING_ERR_MSG),
|
||||
self.garbage.write().expect(BACKING_ERR_MSG),
|
||||
);
|
||||
|
||||
// compacted tables => active set
|
||||
swap(&mut active, &mut compact);
|
||||
// old active set => garbage
|
||||
garbage.extend(compact.drain());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn empty_trash(&self) -> Result<()> {
|
||||
self.garbage.write().expect(BACKING_ERR_MSG).clear();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn active_set(&self) -> Result<Vec<SSTable>> {
|
||||
let active = self.tables.read().expect(BACKING_ERR_MSG);
|
||||
|
||||
let mut tables = Vec::with_capacity(active.len());
|
||||
for tref in active.keys() {
|
||||
let sst = get_table(*tref, &*active)?;
|
||||
tables.push(sst);
|
||||
}
|
||||
|
||||
Ok(tables)
|
||||
}
|
||||
|
||||
fn serialize_state_to(&self, _: &Path) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn load_state_from(&self, _: &Path) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn get_memory_writers_for(id: Id, backing: &mut TableMap) -> Result<(Writer, Writer)> {
|
||||
let data_buf = Arc::new(RwLock::new(vec![]));
|
||||
let index_buf = Arc::new(RwLock::new(vec![]));
|
||||
|
||||
backing.insert(id, (Arc::clone(&data_buf), Arc::clone(&index_buf)));
|
||||
|
||||
let data_wtr = SharedWriter::new(data_buf);
|
||||
let index_wtr = SharedWriter::new(index_buf);
|
||||
|
||||
let data = Writer::Mem(data_wtr);
|
||||
let index = Writer::Mem(index_wtr);
|
||||
|
||||
Ok((data, index))
|
||||
}
|
||||
|
||||
fn get_memmaps(id: Id, map: &TableMap) -> Result<(MemMap, MemMap)> {
|
||||
let entry = map
|
||||
.get(&id)
|
||||
.expect("Map should always be present, given a Id that's not destroyed");
|
||||
|
||||
let data = MemMap::Mem(Arc::clone(&entry.0));
|
||||
let index = MemMap::Mem(Arc::clone(&entry.1));
|
||||
|
||||
Ok((data, index))
|
||||
}
|
||||
|
||||
fn get_table(id: Id, map: &TableMap) -> Result<SSTable> {
|
||||
let (data, index) = get_memmaps(id, map)?;
|
||||
let sst = SSTable::from_parts(Arc::new(data), Arc::new(index))?;
|
||||
|
||||
Ok(sst)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next_id() -> Id {
|
||||
rand::thread_rng().gen()
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
use crate::kvstore::error::Result;
|
||||
use crate::kvstore::sstable::{Key, SSTable, Value};
|
||||
use crate::kvstore::storage;
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::RangeInclusive;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Read-only snapshot transaction: an immutable copy of the in-memory
/// table plus the on-disk table set at the time of creation.
#[derive(Debug)]
pub struct ReadTx {
    mem: Arc<BTreeMap<Key, Value>>,
    tables: Arc<[BTreeMap<Key, SSTable>]>,
}
|
||||
|
||||
impl ReadTx {
|
||||
pub fn new(mem: BTreeMap<Key, Value>, tables: Vec<BTreeMap<Key, SSTable>>) -> ReadTx {
|
||||
ReadTx {
|
||||
mem: Arc::new(mem),
|
||||
tables: Arc::from(tables.into_boxed_slice()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, key: &Key) -> Result<Option<Vec<u8>>> {
|
||||
storage::get(&self.mem, &*self.tables, key)
|
||||
}
|
||||
|
||||
pub fn range(
|
||||
&self,
|
||||
range: RangeInclusive<Key>,
|
||||
) -> Result<impl Iterator<Item = (Key, Vec<u8>)>> {
|
||||
storage::range(&self.mem, &*self.tables, range)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,476 @@
|
|||
use crate::kvstore::error::Result;
|
||||
use crate::kvstore::io_utils::{MemMap, Writer};
|
||||
|
||||
use byteorder::{BigEndian, ByteOrder, WriteBytesExt};
|
||||
|
||||
use std::borrow::Borrow;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::io::{prelude::*, Cursor, Seek, SeekFrom};
|
||||
use std::ops::RangeInclusive;
|
||||
use std::sync::Arc;
|
||||
use std::u64;
|
||||
|
||||
// Index file header layout:
// ___________________________________________
// | start_key | end_key | level | data_size |
// -------------------------------------------
const IDX_META_SIZE: usize = KEY_LEN + KEY_LEN + 1 + 8;

// Serialized key length: three big-endian u64 components.
const KEY_LEN: usize = 3 * 8;
// Value pointer layout within an index record:
// _________________
// | offset | size |
// -----------------
const PTR_SIZE: usize = 2 * 8;
// Index record layout:
// __________________________________________
// | key | timestamp | pointer OR tombstone |
// ------------------------------------------
const INDEX_ENTRY_SIZE: usize = KEY_LEN + 8 + PTR_SIZE;
// Represented by zero offset and size
const TOMBSTONE: [u8; PTR_SIZE] = [0u8; PTR_SIZE];
|
||||
|
||||
/// Immutable sorted table: a data blob plus a key index, each behind a
/// cheaply-clonable memory-map handle.
#[derive(Clone, Debug)]
pub struct SSTable {
    data: Arc<MemMap>,
    index: Arc<MemMap>,
    meta: IndexMeta,
}

/// Parsed index-file header (see the layout diagram by the constants).
#[derive(Debug, PartialEq, Clone)]
pub struct IndexMeta {
    pub level: u8,
    pub data_size: u64,
    pub start: Key,
    pub end: Key,
}

/// 24-byte big-endian key; byte-wise order equals logical order.
#[derive(Debug, Default, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, Hash)]
pub struct Key(pub [u8; 24]);

/// Location of a value inside the data blob.
#[derive(Debug, PartialEq, PartialOrd, Eq, Ord, Copy, Clone)]
pub struct Index {
    pub offset: u64,
    pub size: u64,
}

/// Timestamped value; `val == None` marks a deletion (tombstone).
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Value {
    pub ts: i64,
    pub val: Option<Vec<u8>>,
}

/// An iterator that produces logical view over a set of SSTables
pub struct Merged<I> {
    sources: Vec<I>,
    // Next candidate from each source, keyed by (key, source index).
    heads: BTreeMap<(Key, usize), Value>,
    // Freshest timestamp handled per key, for staleness checks.
    seen: HashMap<Key, i64>,
}
|
||||
|
||||
impl SSTable {
|
||||
pub fn meta(&self) -> &IndexMeta {
|
||||
&self.meta
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn num_keys(&self) -> u64 {
|
||||
((self.index.len() - IDX_META_SIZE) / INDEX_ENTRY_SIZE) as u64
|
||||
}
|
||||
|
||||
pub fn get(&self, key: &Key) -> Result<Option<Value>> {
|
||||
let range = *key..=*key;
|
||||
let found_opt = self.range(&range)?.find(|(k, _)| k == key).map(|(_, v)| v);
|
||||
Ok(found_opt)
|
||||
}
|
||||
|
||||
pub fn range(&self, range: &RangeInclusive<Key>) -> Result<impl Iterator<Item = (Key, Value)>> {
|
||||
Ok(Scan::new(
|
||||
range.clone(),
|
||||
Arc::clone(&self.data),
|
||||
Arc::clone(&self.index),
|
||||
))
|
||||
}
|
||||
|
||||
pub fn create_capped<I, K, V>(
|
||||
rows: &mut I,
|
||||
level: u8,
|
||||
max_table_size: u64,
|
||||
data_wtr: &mut Writer,
|
||||
index_wtr: &mut Writer,
|
||||
) where
|
||||
I: Iterator<Item = (K, V)>,
|
||||
K: Borrow<Key>,
|
||||
V: Borrow<Value>,
|
||||
{
|
||||
const DATA_ERR: &str = "Error writing table data";
|
||||
const INDEX_ERR: &str = "Error writing index data";
|
||||
|
||||
let (data_size, index) =
|
||||
flush_mem_table_capped(rows, data_wtr, max_table_size).expect(DATA_ERR);
|
||||
|
||||
data_wtr.flush().expect(DATA_ERR);
|
||||
|
||||
let (&start, &end) = (
|
||||
index.keys().next().unwrap(),
|
||||
index.keys().next_back().unwrap(),
|
||||
);
|
||||
|
||||
let meta = IndexMeta {
|
||||
start,
|
||||
end,
|
||||
level,
|
||||
data_size,
|
||||
};
|
||||
|
||||
flush_index(&index, &meta, index_wtr).expect(INDEX_ERR);
|
||||
index_wtr.flush().expect(INDEX_ERR);
|
||||
}
|
||||
|
||||
pub fn create<I, K, V>(rows: &mut I, level: u8, data_wtr: &mut Writer, index_wtr: &mut Writer)
|
||||
where
|
||||
I: Iterator<Item = (K, V)>,
|
||||
K: Borrow<Key>,
|
||||
V: Borrow<Value>,
|
||||
{
|
||||
SSTable::create_capped(rows, level, u64::MAX, data_wtr, index_wtr);
|
||||
}
|
||||
|
||||
pub fn from_parts(data: Arc<MemMap>, index: Arc<MemMap>) -> Result<Self> {
|
||||
sst_from_parts(data, index)
|
||||
}
|
||||
|
||||
pub fn could_contain(&self, key: &Key) -> bool {
|
||||
self.meta.start <= *key && *key <= self.meta.end
|
||||
}
|
||||
|
||||
pub fn is_overlap(&self, range: &RangeInclusive<Key>) -> bool {
|
||||
let r = self.meta.start..=self.meta.end;
|
||||
overlapping(&r, range)
|
||||
}
|
||||
|
||||
pub fn sorted_tables(tables: &[SSTable]) -> Vec<BTreeMap<Key, SSTable>> {
|
||||
let mut sorted = Vec::new();
|
||||
|
||||
for sst in tables {
|
||||
let (key, level) = {
|
||||
let meta = sst.meta();
|
||||
(meta.start, meta.level)
|
||||
};
|
||||
|
||||
while level as usize >= tables.len() {
|
||||
sorted.push(BTreeMap::new());
|
||||
}
|
||||
sorted[level as usize].insert(key, sst.clone());
|
||||
}
|
||||
|
||||
sorted
|
||||
}
|
||||
}
|
||||
|
||||
impl Key {
    /// Smallest possible key (all zero bytes).
    pub const MIN: Key = Key([0u8; KEY_LEN as usize]);
    /// Largest possible key (all 0xFF bytes).
    pub const MAX: Key = Key([255u8; KEY_LEN as usize]);
    /// Range covering the entire key space.
    pub const ALL_INCLUSIVE: RangeInclusive<Key> = RangeInclusive::new(Key::MIN, Key::MAX);

    /// Writes the raw 24 key bytes to `wtr`.
    pub fn write<W: Write>(&self, wtr: &mut W) -> Result<()> {
        wtr.write_all(&self.0)?;
        Ok(())
    }

    /// Reads a key from a byte slice.
    ///
    /// # Panics
    ///
    /// Panics if `bytes` is not exactly 24 bytes long
    /// (`copy_from_slice` requires equal lengths).
    pub fn read(bytes: &[u8]) -> Key {
        let mut key = Key::default();
        key.0.copy_from_slice(bytes);
        key
    }
}
|
||||
|
||||
/// Forward cursor over one table's index, yielding records within `bounds`.
struct Scan {
    bounds: RangeInclusive<Key>,
    data: Arc<MemMap>,
    index: Arc<MemMap>,
    // Byte offset of the next index record; `usize::MAX` once exhausted.
    index_pos: usize,
}
|
||||
|
||||
impl Scan {
    // Starts a scan positioned just past the index meta header.
    fn new(bounds: RangeInclusive<Key>, data: Arc<MemMap>, index: Arc<MemMap>) -> Self {
        Scan {
            bounds,
            data,
            index,
            index_pos: IDX_META_SIZE as usize,
        }
    }

    /// Advances to the next index record within `bounds`; returns
    /// `Ok(None)` once past the upper bound or the end of the index.
    fn step(&mut self) -> Result<Option<(Key, Value)>> {
        while self.index_pos < self.index.len() {
            let pos = self.index_pos as usize;
            let end = pos + INDEX_ENTRY_SIZE;
            let (key, ts, idx) = read_index_rec(&self.index[pos..end]);

            // Skip records below the lower bound.
            if key < *self.bounds.start() {
                self.index_pos = end;
                continue;
            }

            // Records are key-ordered, so past the upper bound we are
            // done; poison the cursor so the scan never resumes.
            if *self.bounds.end() < key {
                self.index_pos = std::usize::MAX;
                return Ok(None);
            }

            // `idx` is None for a tombstone; otherwise fetch value bytes.
            let bytes_opt = idx.map(|ptr| get_val(&self.data, ptr).to_vec());

            let val = Value { ts, val: bytes_opt };

            self.index_pos = end;

            return Ok(Some((key, val)));
        }

        Ok(None)
    }
}
|
||||
|
||||
impl From<(u64, u64, u64)> for Key {
|
||||
fn from((k0, k1, k2): (u64, u64, u64)) -> Self {
|
||||
let mut buf = [0u8; KEY_LEN as usize];
|
||||
|
||||
BigEndian::write_u64(&mut buf[..8], k0);
|
||||
BigEndian::write_u64(&mut buf[8..16], k1);
|
||||
BigEndian::write_u64(&mut buf[16..], k2);
|
||||
|
||||
Key(buf)
|
||||
}
|
||||
}
|
||||
|
||||
impl Index {
    // Serializes the pointer as two big-endian u64s: offset then size.
    fn write<W: Write>(&self, wtr: &mut W) -> Result<()> {
        wtr.write_u64::<BigEndian>(self.offset)?;
        wtr.write_u64::<BigEndian>(self.size)?;
        Ok(())
    }

    // Parses a pointer from the first 16 bytes of `bytes`.
    #[inline]
    fn read(bytes: &[u8]) -> Index {
        let offset = BigEndian::read_u64(&bytes[..8]);
        let size = BigEndian::read_u64(&bytes[8..16]);

        Index { offset, size }
    }
}
|
||||
|
||||
impl IndexMeta {
    // Serializes the header: start key, end key, level byte, data size
    // (big-endian) — must mirror `read` below and IDX_META_SIZE.
    fn write<W: Write>(&self, wtr: &mut W) -> Result<()> {
        self.start.write(wtr)?;
        self.end.write(wtr)?;
        wtr.write_u8(self.level)?;
        wtr.write_u64::<BigEndian>(self.data_size)?;
        Ok(())
    }

    // Parses the 57-byte header: keys at [0,24) and [24,48), level at 48,
    // data size at [49,57).
    fn read(data: &[u8]) -> Self {
        let start = Key::read(&data[..24]);
        let end = Key::read(&data[24..48]);
        let level = data[48];
        let data_size = BigEndian::read_u64(&data[49..57]);

        IndexMeta {
            start,
            end,
            level,
            data_size,
        }
    }
}
|
||||
|
||||
impl<I> Merged<I>
where
    I: Iterator<Item = (Key, Value)>,
{
    /// Builds a merged view by priming one head entry per source.
    ///
    /// Heads are keyed by `(key, source_idx)`, so for equal keys the
    /// source with the smaller index is consumed first.
    pub fn new(mut sources: Vec<I>) -> Self {
        let mut heads = BTreeMap::new();

        for (source_idx, source) in sources.iter_mut().enumerate() {
            if let Some((k, v)) = source.next() {
                heads.insert((k, source_idx), v);
            }
        }

        Merged {
            sources,
            heads,
            seen: HashMap::new(),
        }
    }
}
|
||||
|
||||
impl<I> Iterator for Merged<I>
where
    I: Iterator<Item = (Key, Value)>,
{
    type Item = (Key, Value);

    /// Yields the next live (non-deleted, non-stale) key/value pair across
    /// all sources, in key order.
    fn next(&mut self) -> Option<Self::Item> {
        while !self.heads.is_empty() {
            // The smallest (key, source_idx) pair is the next candidate.
            let (key, source_idx) = *self.heads.keys().next().unwrap();
            let val = self.heads.remove(&(key, source_idx)).unwrap();

            // replace: refill the head slot from the source just consumed.
            if let Some((k, v)) = self.sources[source_idx].next() {
                self.heads.insert((k, source_idx), v);
            }

            // merge logic
            // if deleted, remember
            // NOTE(review): a candidate whose ts equals the seen ts is
            // treated as stale — confirm equal timestamps cannot carry
            // distinct live values.
            let (deleted, stale) = match self.seen.get(&key) {
                Some(&seen_ts) if seen_ts < val.ts => {
                    // fresh val
                    self.seen.insert(key, val.ts);
                    (val.val.is_none(), false)
                }
                Some(_) => (val.val.is_none(), true),
                None => {
                    self.seen.insert(key, val.ts);
                    (val.val.is_none(), false)
                }
            };

            if !(stale || deleted) {
                return Some((key, val));
            }
        }

        None
    }
}
|
||||
|
||||
impl Iterator for Scan {
    type Item = (Key, Value);

    fn next(&mut self) -> Option<Self::Item> {
        // `index_pos` past the end of the index mapping means exhausted.
        if self.index_pos as usize >= self.index.len() {
            return None;
        }

        match self.step() {
            Ok(opt) => opt,
            Err(_) => {
                // On any read error, poison the cursor so every subsequent
                // call short-circuits to None instead of retrying.
                self.index_pos = std::usize::MAX;
                None
            }
        }
    }
}
|
||||
|
||||
fn sst_from_parts(data: Arc<MemMap>, index: Arc<MemMap>) -> Result<SSTable> {
|
||||
let len = index.len() as usize;
|
||||
|
||||
assert!(len > IDX_META_SIZE);
|
||||
assert_eq!((len - IDX_META_SIZE) % INDEX_ENTRY_SIZE, 0);
|
||||
|
||||
let mut rdr = Cursor::new(&**index);
|
||||
let mut idx_buf = [0; IDX_META_SIZE];
|
||||
rdr.read_exact(&mut idx_buf)?;
|
||||
|
||||
let meta = IndexMeta::read(&idx_buf);
|
||||
|
||||
Ok(SSTable { data, index, meta })
|
||||
}
|
||||
|
||||
fn flush_index(
|
||||
index: &BTreeMap<Key, (i64, Option<Index>)>,
|
||||
meta: &IndexMeta,
|
||||
wtr: &mut Writer,
|
||||
) -> Result<()> {
|
||||
meta.write(wtr)?;
|
||||
|
||||
for (&key, &(ts, idx)) in index.iter() {
|
||||
write_index_rec(wtr, (key, ts, idx))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn flush_mem_table_capped<I, K, V>(
|
||||
rows: &mut I,
|
||||
wtr: &mut Writer,
|
||||
max_table_size: u64,
|
||||
) -> Result<(u64, BTreeMap<Key, (i64, Option<Index>)>)>
|
||||
where
|
||||
I: Iterator<Item = (K, V)>,
|
||||
K: Borrow<Key>,
|
||||
V: Borrow<Value>,
|
||||
{
|
||||
let mut ssi = BTreeMap::new();
|
||||
let mut size = 0;
|
||||
|
||||
for (key, val) in rows {
|
||||
let (key, val) = (key.borrow(), val.borrow());
|
||||
let ts = val.ts;
|
||||
|
||||
let (index, item_size) = match val.val {
|
||||
Some(ref bytes) => (Some(write_val(wtr, bytes)?), bytes.len()),
|
||||
None => (None, 0),
|
||||
};
|
||||
|
||||
size += item_size as u64;
|
||||
ssi.insert(*key, (ts, index));
|
||||
|
||||
if size >= max_table_size {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok((size, ssi))
|
||||
}
|
||||
|
||||
/// True when the two inclusive ranges share at least one point.
#[inline]
fn overlapping<T: Ord + Eq>(r1: &RangeInclusive<T>, r2: &RangeInclusive<T>) -> bool {
    // De Morgan form of: r1.start() <= r2.end() && r2.start() <= r1.end()
    !(r1.start() > r2.end() || r2.start() > r1.end())
}
|
||||
|
||||
#[inline]
|
||||
fn write_val<W: Write + Seek>(wtr: &mut W, val: &[u8]) -> Result<Index> {
|
||||
let offset = wtr.seek(SeekFrom::Current(0))?;
|
||||
let size = val.len() as u64;
|
||||
|
||||
wtr.write_all(val)?;
|
||||
Ok(Index { offset, size })
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_val(mmap: &MemMap, idx: Index) -> &[u8] {
|
||||
let row = &mmap[idx.offset as usize..(idx.offset + idx.size) as usize];
|
||||
assert_eq!(row.len(), idx.size as usize);
|
||||
row
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn write_index_rec<W: Write>(wtr: &mut W, (key, ts, ptr): (Key, i64, Option<Index>)) -> Result<()> {
|
||||
key.write(wtr)?;
|
||||
|
||||
wtr.write_i64::<BigEndian>(ts)?;
|
||||
|
||||
match ptr {
|
||||
Some(idx) => idx.write(wtr)?,
|
||||
None => wtr.write_all(&TOMBSTONE)?,
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn read_index_rec(bytes: &[u8]) -> (Key, i64, Option<Index>) {
|
||||
assert_eq!(bytes.len(), INDEX_ENTRY_SIZE);
|
||||
const TS_END: usize = KEY_LEN + 8;
|
||||
|
||||
let mut key_buf = [0; KEY_LEN as usize];
|
||||
key_buf.copy_from_slice(&bytes[..KEY_LEN as usize]);
|
||||
let key = Key(key_buf);
|
||||
let ts = BigEndian::read_i64(&bytes[KEY_LEN..TS_END]);
|
||||
|
||||
let idx_slice = &bytes[TS_END..INDEX_ENTRY_SIZE];
|
||||
let idx = if idx_slice == TOMBSTONE {
|
||||
None
|
||||
} else {
|
||||
Some(Index::read(idx_slice))
|
||||
};
|
||||
|
||||
(key, ts, idx)
|
||||
}
|
|
@ -0,0 +1,175 @@
|
|||
use crate::kvstore::error::Result;
|
||||
use crate::kvstore::mapper::{Kind, Mapper};
|
||||
use crate::kvstore::sstable::{Key, Merged, SSTable, Value};
|
||||
use crate::kvstore::writelog::WriteLog;
|
||||
|
||||
use chrono::Utc;
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
// In-memory table mapping keys to their latest (possibly tombstoned) values.
type MemTable = BTreeMap<Key, Value>;

// Per-entry bookkeeping overhead: 8-byte timestamp + 24-byte (3 * 8) key.
const OVERHEAD: usize = 8 + 3 * 8;
// A failed write-ahead-log write is unrecoverable, so it panics with this.
const LOG_ERR: &str = "Write to log failed! Halting.";
|
||||
|
||||
#[derive(Debug)]
pub struct WriteState {
    // Timestamp stamped onto every put/delete issued through this state.
    pub commit: i64,
    // Write-ahead log backing `values`.
    pub log: WriteLog,
    // In-memory table of live and tombstoned entries.
    pub values: MemTable,
    // Approximate memory used by `values`, as measured by `val_mem_use`.
    pub mem_size: usize,
}
|
||||
|
||||
impl WriteState {
|
||||
pub fn new(log: WriteLog, values: BTreeMap<Key, Value>) -> WriteState {
|
||||
let mem_size = values.values().fold(0, |acc, elem| acc + val_mem_use(elem));
|
||||
WriteState {
|
||||
commit: Utc::now().timestamp(),
|
||||
log,
|
||||
mem_size,
|
||||
values,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn put(&mut self, key: &Key, data: &[u8]) -> Result<()> {
|
||||
use std::collections::btree_map::Entry;
|
||||
let ts = self.commit;
|
||||
let value = Value {
|
||||
ts,
|
||||
val: Some(data.to_vec()),
|
||||
};
|
||||
self.log.log_put(key, ts, data).expect(LOG_ERR);
|
||||
|
||||
self.mem_size += val_mem_use(&value);
|
||||
|
||||
match self.values.entry(*key) {
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert(value);
|
||||
}
|
||||
Entry::Occupied(mut entry) => {
|
||||
let old = entry.insert(value);
|
||||
self.mem_size -= val_mem_use(&old);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn delete(&mut self, key: &Key) -> Result<()> {
|
||||
use std::collections::btree_map::Entry;
|
||||
let ts = self.commit;
|
||||
let value = Value { ts, val: None };
|
||||
|
||||
self.log.log_delete(key, ts).expect(LOG_ERR);
|
||||
|
||||
self.mem_size += val_mem_use(&value);
|
||||
|
||||
match self.values.entry(*key) {
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert(value);
|
||||
}
|
||||
Entry::Occupied(mut entry) => {
|
||||
let old = entry.insert(value);
|
||||
self.mem_size -= val_mem_use(&old);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn reset(&mut self) -> Result<()> {
|
||||
self.values.clear();
|
||||
self.log.reset()?;
|
||||
self.mem_size = 0;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// Flushes the whole mem-table into a new active-storage SSTable and registers
// it in level 0 under its starting key. No-op when the mem-table is empty.
pub fn flush_table(
    mem: &MemTable,
    mapper: &dyn Mapper,
    pages: &mut Vec<BTreeMap<Key, SSTable>>,
) -> Result<()> {
    if mem.is_empty() {
        return Ok(());
    };

    // Level 0 must exist before we can insert into it.
    if pages.is_empty() {
        pages.push(BTreeMap::new());
    }

    let mut iter = mem.iter();
    // Write the table at level 0 via the mapper-provided writers.
    // NOTE(review): the result of SSTable::create (if it returns one) is
    // discarded here — confirm failures surface through the writers/mapper.
    let sst = mapper.make_table(Kind::Active, &mut |mut data_wtr, mut index_wtr| {
        SSTable::create(&mut iter, 0, &mut data_wtr, &mut index_wtr);
    })?;

    let first = sst.meta().start;

    pages[0].insert(first, sst);
    Ok(())
}
|
||||
|
||||
pub fn get(mem: &MemTable, pages: &[BTreeMap<Key, SSTable>], key: &Key) -> Result<Option<Vec<u8>>> {
|
||||
if let Some(idx) = mem.get(key) {
|
||||
return Ok(idx.val.clone());
|
||||
}
|
||||
|
||||
let mut candidates = Vec::new();
|
||||
|
||||
for level in pages.iter() {
|
||||
for (_, sst) in level.iter().rev() {
|
||||
if sst.could_contain(key) {
|
||||
if let Some(val) = sst.get(&key)? {
|
||||
candidates.push((*key, val));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let merged = Merged::new(vec![candidates.into_iter()])
|
||||
.next()
|
||||
.map(|(_, v)| v.val.unwrap());
|
||||
Ok(merged)
|
||||
}
|
||||
|
||||
pub fn range(
|
||||
mem: &MemTable,
|
||||
tables: &[BTreeMap<Key, SSTable>],
|
||||
range: std::ops::RangeInclusive<Key>,
|
||||
) -> Result<impl Iterator<Item = (Key, Vec<u8>)>> {
|
||||
let mut sources: Vec<Box<dyn Iterator<Item = (Key, Value)>>> = Vec::new();
|
||||
|
||||
let mem = mem
|
||||
.range(range.clone())
|
||||
.map(|(k, v)| (*k, v.clone()))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut disk = Vec::new();
|
||||
|
||||
for level in tables.iter() {
|
||||
for sst in level.values() {
|
||||
let iter = sst.range(&range)?;
|
||||
let iter = Box::new(iter) as Box<dyn Iterator<Item = (Key, Value)>>;
|
||||
|
||||
disk.push(iter);
|
||||
}
|
||||
}
|
||||
|
||||
sources.push(Box::new(mem.into_iter()));
|
||||
sources.extend(disk);
|
||||
|
||||
let rows = Merged::new(sources).map(|(k, v)| (k, v.val.unwrap()));
|
||||
|
||||
Ok(rows)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn val_mem_use(val: &Value) -> usize {
|
||||
OVERHEAD + val.val.as_ref().map(Vec::len).unwrap_or(0)
|
||||
}
|
||||
|
||||
// TODO: Write basic tests using mem-table
|
||||
// 1. test put + delete works right
|
||||
// 2. test delete of unknown key recorded
|
||||
// 3. check memory usage calcs
|
|
@ -0,0 +1,105 @@
|
|||
use crate::kvstore::error::Result;
|
||||
use crate::kvstore::sstable::Value;
|
||||
use crate::kvstore::Key;
|
||||
|
||||
use byteorder::{BigEndian, ByteOrder, ReadBytesExt};
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
#[derive(Debug)]
pub struct WriteLog {
    // Path to the on-disk log file; `materialize` re-reads it from here.
    log_path: PathBuf,
    // Buffered append-only writer over the log file.
    log_writer: BufWriter<File>,
    // NOTE(review): stored at construction but not consulted anywhere in
    // this file — confirm its intended use (batch-flush sizing?).
    max_batch_size: usize,
}
|
||||
|
||||
impl WriteLog {
    /// Opens (creating if needed) the log file at `path` in append mode.
    pub fn open(path: &Path, max_batch_size: usize) -> Result<Self> {
        let log_writer = BufWriter::new(
            fs::OpenOptions::new()
                .create(true)
                .append(true)
                .open(path)?,
        );
        let log_path = path.to_path_buf();

        Ok(WriteLog {
            log_writer,
            log_path,
            max_batch_size,
        })
    }

    /// Discards all logged records: flushes pending bytes, truncates the
    /// file to zero length, and rewinds the write position.
    pub fn reset(&mut self) -> Result<()> {
        self.log_writer.flush()?;
        let file = self.log_writer.get_mut();
        file.set_len(0)?;
        file.seek(SeekFrom::Start(0))?;

        Ok(())
    }

    /// Appends one record. On-disk layout: 8-byte big-endian record length,
    /// then the record itself (24-byte key, 8-byte timestamp, 1-byte
    /// presence flag, value bytes). `rec_len` excludes the length prefix.
    pub fn log_put(&mut self, key: &Key, ts: i64, val: &[u8]) -> Result<()> {
        let rec_len = 24 + 8 + 1 + val.len() as u64;
        let mut buf = vec![0u8; rec_len as usize + 8];

        log_to_buffer(&mut buf, rec_len, key, ts, val);

        self.log_writer.write_all(&buf)?;
        Ok(())
    }

    /// Logs a deletion as a put with an empty value; the presence flag ends
    /// up 0, which is how `materialize` distinguishes deletes. Note this
    /// makes an explicit put of an empty value indistinguishable from a
    /// delete in the log.
    pub fn log_delete(&mut self, key: &Key, ts: i64) -> Result<()> {
        self.log_put(key, ts, &[])
    }

    // TODO: decide how to configure/schedule calling this
    /// Flushes buffered records and fsyncs the log file to stable storage.
    #[allow(dead_code)]
    pub fn sync(&mut self) -> Result<()> {
        self.log_writer.flush()?;
        self.log_writer.get_mut().sync_all()?;
        Ok(())
    }

    /// Replays the log file into a fresh table. Later records for a key
    /// overwrite earlier ones; a clear (0) presence flag yields a tombstone
    /// (`val == None`). Replay stops at the first failed length read
    /// (normally end-of-file).
    pub fn materialize(&self) -> Result<BTreeMap<Key, Value>> {
        let mut table = BTreeMap::new();
        if !self.log_path.exists() {
            return Ok(table);
        }

        let mut rdr = BufReader::new(File::open(&self.log_path)?);
        let mut buf = vec![];

        while let Ok(rec_len) = rdr.read_u64::<BigEndian>() {
            buf.resize(rec_len as usize, 0);
            rdr.read_exact(&mut buf)?;

            let key = Key::read(&buf[0..24]);
            let ts = BigEndian::read_i64(&buf[24..32]);
            let exists = buf[32] != 0;

            let val = if exists {
                Some(buf[33..].to_vec())
            } else {
                None
            };
            let value = Value { ts, val };

            table.insert(key, value);
        }

        Ok(table)
    }
}
|
||||
|
||||
#[inline]
|
||||
fn log_to_buffer(buf: &mut [u8], rec_len: u64, key: &Key, ts: i64, val: &[u8]) {
|
||||
BigEndian::write_u64(&mut buf[..8], rec_len);
|
||||
(&mut buf[8..32]).copy_from_slice(&key.0);
|
||||
BigEndian::write_i64(&mut buf[32..40], ts);
|
||||
buf[40] = (!val.is_empty()) as u8;
|
||||
(&mut buf[41..]).copy_from_slice(val);
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
use crate::kvstore::error::Result;
|
||||
use crate::kvstore::sstable::Key;
|
||||
|
||||
/// Placeholder handle for a write transaction; the API is stubbed out and
/// not yet wired to any store state.
#[derive(Debug)]
pub struct WriteTx<'a> {
    // Placeholder borrow pinning the transaction lifetime until real
    // transaction state lands.
    _dummy: &'a mut (),
}
|
||||
|
||||
impl<'a> WriteTx<'a> {
    /// Stages a put of `_data` under `_key`. Not yet implemented — always panics.
    pub fn put(&mut self, _key: &Key, _data: &[u8]) -> Result<()> {
        unimplemented!()
    }

    /// Stages a deletion of `_key`. Not yet implemented — always panics.
    pub fn delete(&mut self, _key: &Key) -> Result<()> {
        unimplemented!()
    }
}
|
|
@ -39,6 +39,8 @@ pub mod fetch_stage;
|
|||
pub mod fullnode;
|
||||
pub mod gen_keys;
|
||||
pub mod gossip_service;
|
||||
#[cfg(feature = "kvstore")]
|
||||
pub mod kvstore;
|
||||
pub mod leader_confirmation_service;
|
||||
pub mod leader_schedule;
|
||||
pub mod leader_schedule_utils;
|
||||
|
|
|
@ -0,0 +1,252 @@
|
|||
#![cfg(feature = "kvstore")]
|
||||
use rand::{thread_rng, Rng};
|
||||
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use solana::kvstore::{Config, Key, KvStore};
|
||||
|
||||
const KB: usize = 1024;
|
||||
const HALF_KB: usize = 512;
|
||||
|
||||
// Round-trips a single key/value pair through the store.
#[test]
fn test_put_get() {
    let path = setup("test_put_get");

    let cfg = Config {
        max_mem: 64 * KB,
        max_tables: 5,
        page_size: 64 * KB,
        ..Config::default()
    };

    let store = KvStore::open(&path, cfg).unwrap();
    let (key, bytes) = gen_pairs(HALF_KB).next().unwrap();

    store.put(&key, &bytes).expect("put fail");
    let fetched = store.get(&key).expect("get fail").expect("missing");

    assert_eq!(bytes, fetched);

    teardown(&path);
}
|
||||
|
||||
// Bulk-inserts sorted pairs and verifies a full-range scan returns them all.
#[test]
fn test_put_get_many() {
    let path = setup("test_put_get_many");

    let cfg = Config {
        max_mem: 64 * KB,
        max_tables: 5,
        page_size: 64 * KB,
        ..Config::default()
    };
    let lsm = KvStore::open(&path, cfg).unwrap();

    let mut pairs: Vec<_> = gen_pairs(HALF_KB).take(1024).collect();
    pairs.sort_unstable_by_key(|(k, _)| *k);

    // Borrow instead of `pairs.clone().drain(..)` — put_many accepts
    // borrowed pairs (see test_partitioned), so the full clone was wasted.
    lsm.put_many(pairs.iter()).expect("put_many fail");

    let retrieved: Vec<(Key, Vec<u8>)> =
        lsm.range(Key::ALL_INCLUSIVE).expect("range fail").collect();

    assert!(!retrieved.is_empty());
    assert_eq!(pairs.len(), retrieved.len());
    assert_eq!(pairs, retrieved);

    teardown(&path);
}
|
||||
|
||||
// Inserts pairs, deletes a contiguous slice of them one at a time, and
// verifies only the survivors come back from a full-range scan.
#[test]
fn test_delete() {
    let path = setup("test_delete");

    let cfg = Config {
        max_mem: 64 * KB,
        max_tables: 5,
        page_size: 64 * KB,
        ..Config::default()
    };
    let store = KvStore::open(&path, cfg).unwrap();

    let mut pairs: Vec<_> = gen_pairs(HALF_KB).take(64 * 6).collect();
    pairs.sort_unstable_by_key(|(k, _)| *k);

    for (key, data) in &pairs {
        store.put(key, data).expect("put fail");
    }

    // Draining removes the deleted pairs from `pairs`, keeping it in sync
    // with the expected store contents.
    for (key, _) in pairs.drain(64..128) {
        store.delete(&key).expect("delete fail");
    }

    let retrieved: Vec<(Key, Vec<u8>)> =
        store.range(Key::ALL_INCLUSIVE).expect("range fail").collect();

    assert!(!retrieved.is_empty());
    assert_eq!(pairs.len(), retrieved.len());
    assert_eq!(pairs, retrieved);

    teardown(&path);
}
|
||||
|
||||
// Same as test_delete but exercises the bulk delete_many entry point.
#[test]
fn test_delete_many() {
    let path = setup("test_delete_many");

    let cfg = Config {
        max_mem: 64 * KB,
        max_tables: 5,
        page_size: 64 * KB,
        ..Config::default()
    };
    let store = KvStore::open(&path, cfg).unwrap();

    let mut pairs: Vec<_> = gen_pairs(HALF_KB).take(64 * 6).collect();
    pairs.sort_unstable_by_key(|(k, _)| *k);

    for (key, data) in &pairs {
        store.put(key, data).expect("put fail");
    }

    // Draining removes the deleted pairs from `pairs`, keeping it in sync
    // with the expected store contents.
    let doomed = pairs.drain(320..384).map(|(k, _)| k);
    store.delete_many(doomed).expect("delete_many fail");

    let retrieved: Vec<(Key, Vec<u8>)> =
        store.range(Key::ALL_INCLUSIVE).expect("range fail").collect();

    assert!(!retrieved.is_empty());
    assert_eq!(pairs.len(), retrieved.len());
    assert_eq!(pairs, retrieved);

    teardown(&path);
}
|
||||
|
||||
// Verifies durability: puts and deletes survive dropping the store handle
// and reopening from disk.
#[test]
fn test_close_reopen() {
    let path = setup("test_close_reopen");
    let cfg = Config::default();
    let store = KvStore::open(&path, cfg).unwrap();

    let mut pairs: Vec<_> = gen_pairs(KB).take(1024).collect();
    pairs.sort_unstable_by_key(|(k, _)| *k);

    for (key, data) in &pairs {
        store.put(key, data).expect("put fail");
    }

    for (key, _) in pairs.drain(64..128) {
        store.delete(&key).expect("delete fail");
    }

    // Dropping the handle closes the store; reopening must recover state.
    drop(store);
    let store = KvStore::open(&path, cfg).unwrap();

    let retrieved: Vec<(Key, Vec<u8>)> =
        store.range(Key::ALL_INCLUSIVE).expect("range fail").collect();

    assert!(!retrieved.is_empty());
    assert_eq!(pairs.len(), retrieved.len());
    assert_eq!(pairs, retrieved);

    teardown(&path);
}
|
||||
|
||||
// Exercises the partitioned store: data is spread across several storage
// directories but reads/writes behave identically to a single-dir store.
#[test]
fn test_partitioned() {
    let path = setup("test_partitioned");

    let cfg = Config {
        max_mem: 64 * KB,
        max_tables: 5,
        page_size: 64 * KB,
        ..Config::default()
    };

    // Fixed directory-name typo: "parition" -> "partition".
    let storage_dirs = (0..4)
        .map(|i| path.join(format!("partition-{}", i)))
        .collect::<Vec<_>>();

    let lsm = KvStore::partitioned(&path, &storage_dirs, cfg).unwrap();

    let mut pairs: Vec<_> = gen_pairs(HALF_KB).take(64 * 12).collect();
    pairs.sort_unstable_by_key(|(k, _)| *k);

    lsm.put_many(pairs.iter()).expect("put_many fail");

    // drain iterator deletes from `pairs`
    let keys_to_delete = pairs.drain(320..384).map(|(k, _)| k);

    lsm.delete_many(keys_to_delete).expect("delete_many fail");

    let retrieved: Vec<(Key, Vec<u8>)> =
        lsm.range(Key::ALL_INCLUSIVE).expect("range fail").collect();

    assert!(!retrieved.is_empty());
    assert_eq!(pairs.len(), retrieved.len());
    assert_eq!(pairs, retrieved);

    teardown(&path);
}
|
||||
|
||||
// Runs the bulk put/delete/range cycle entirely in memory (no disk tables).
#[test]
fn test_in_memory() {
    let path = setup("test_in_memory");

    let cfg = Config {
        max_mem: 64 * KB,
        max_tables: 5,
        page_size: 64 * KB,
        in_memory: true,
    };
    let store = KvStore::open(&path, cfg).unwrap();

    let mut pairs: Vec<_> = gen_pairs(HALF_KB).take(64 * 12).collect();
    pairs.sort_unstable_by_key(|(k, _)| *k);

    store.put_many(pairs.iter()).expect("put_many fail");

    // Draining removes the deleted pairs from `pairs`, keeping it in sync
    // with the expected store contents.
    let doomed = pairs.drain(320..384).map(|(k, _)| k);
    store.delete_many(doomed).expect("delete_many fail");

    let retrieved: Vec<(Key, Vec<u8>)> =
        store.range(Key::ALL_INCLUSIVE).expect("range fail").collect();

    assert!(!retrieved.is_empty());
    assert_eq!(pairs.len(), retrieved.len());
    assert_eq!(pairs, retrieved);

    teardown(&path);
}
|
||||
|
||||
/// Creates (recreating if present) a fresh per-test scratch directory under
/// `kvstore-test/` and returns its path.
fn setup(test_name: &str) -> PathBuf {
    // Fixed stray double semicolon (empty statement) on this line.
    let dir = Path::new("kvstore-test").join(test_name);

    // Best-effort removal: the directory may not exist on a clean run.
    let _ig = fs::remove_dir_all(&dir);
    fs::create_dir_all(&dir).unwrap();

    dir
}
|
||||
|
||||
// Destroys the store rooted at `p`; failure fails the calling test.
fn teardown(p: &Path) {
    KvStore::destroy(p).expect("Expect successful store destruction");
}
|
||||
|
||||
fn gen_pairs(data_size: usize) -> impl Iterator<Item = (Key, Vec<u8>)> {
|
||||
let mut rng = thread_rng();
|
||||
|
||||
std::iter::repeat_with(move || {
|
||||
let data = vec![0u8; data_size];
|
||||
let buf = rng.gen();
|
||||
|
||||
(Key(buf), data)
|
||||
})
|
||||
}
|
Loading…
Reference in New Issue