Clean up locks in KvStore (#3358)

* Lift all shared mutable state into KvStore

commit is now an AtomicUsize

The in-memory table and write-log are now struct members behind individual RwLocks.
This commit is contained in:
Mark E. Sinclair 2019-03-18 19:04:31 -05:00 committed by GitHub
parent ef111dcbe1
commit 5d73ab299b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 148 additions and 169 deletions

View File

@ -1,12 +1,13 @@
use crate::mapper::{Disk, Mapper, Memory};
use crate::sstable::SSTable;
use crate::storage::WriteState;
use crate::storage::MemTable;
use crate::writelog::WriteLog;
use std::collections::BTreeMap;
use std::fs;
use std::io;
use std::ops::RangeInclusive;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::mpsc::{Receiver, Sender};
use std::sync::{Arc, Mutex, RwLock};
use std::thread::JoinHandle;
@ -35,6 +36,7 @@ const LOG_FILE: &str = "mem-log";
const DEFAULT_TABLE_SIZE: usize = 64 * 1024 * 1024;
const DEFAULT_MEM_SIZE: usize = 64 * 1024 * 1024;
const DEFAULT_MAX_PAGES: usize = 10;
const COMMIT_ORDERING: Ordering = Ordering::Relaxed;
#[derive(Debug, PartialEq, Copy, Clone)]
pub struct Config {
@ -47,10 +49,12 @@ pub struct Config {
#[derive(Debug)]
pub struct KvStore {
write: RwLock<WriteState>,
tables: RwLock<Vec<BTreeMap<Key, SSTable>>>,
config: Config,
root: PathBuf,
commit: AtomicUsize,
mem: RwLock<MemTable>,
log: Arc<RwLock<WriteLog>>,
tables: RwLock<Vec<BTreeMap<Key, SSTable>>>,
mapper: Arc<dyn Mapper>,
sender: Mutex<Sender<compactor::Req>>,
receiver: Mutex<Receiver<compactor::Resp>>,
@ -92,12 +96,13 @@ impl KvStore {
}
pub fn put(&self, key: &Key, data: &[u8]) -> Result<()> {
self.ensure_mem()?;
let mut memtable = self.mem.write().unwrap();
let mut log = self.log.write().unwrap();
let commit = self.commit.fetch_add(1, COMMIT_ORDERING) as i64;
let mut write = self.write.write().unwrap();
storage::put(&mut *memtable, &mut *log, key, commit as i64, data)?;
write.put(key, data)?;
write.commit += 1;
self.ensure_memtable(&mut *memtable, &mut *log)?;
Ok(())
}
@ -109,18 +114,23 @@ impl KvStore {
K: std::borrow::Borrow<Key>,
V: std::borrow::Borrow<[u8]>,
{
{
let mut write = self.write.write().unwrap();
let mut memtable = self.mem.write().unwrap();
let mut log = self.log.write().unwrap();
let commit = self.commit.fetch_add(1, COMMIT_ORDERING) as i64;
for pair in rows {
let tup = pair.borrow();
let (key, data) = (tup.0.borrow(), tup.1.borrow());
write.put(key, data)?;
}
write.commit += 1;
let (ref key, ref data) = pair.borrow();
storage::put(
&mut *memtable,
&mut *log,
key.borrow(),
commit,
data.borrow(),
)?;
}
self.ensure_mem()?;
self.ensure_memtable(&mut *memtable, &mut *log)?;
Ok(())
}
@ -128,22 +138,20 @@ impl KvStore {
pub fn get(&self, key: &Key) -> Result<Option<Vec<u8>>> {
self.query_compactor()?;
let (write_state, tables) = (self.write.read().unwrap(), self.tables.read().unwrap());
let (memtable, tables) = (self.mem.read().unwrap(), self.tables.read().unwrap());
storage::get(&write_state.values, &*tables, key)
storage::get(&memtable.values, &*tables, key)
}
pub fn delete(&self, key: &Key) -> Result<()> {
self.query_compactor()?;
let mut memtable = self.mem.write().unwrap();
let mut log = self.log.write().unwrap();
let commit = self.commit.fetch_add(1, COMMIT_ORDERING) as i64;
{
let mut write = self.write.write().unwrap();
storage::delete(&mut *memtable, &mut *log, key, commit)?;
write.delete(key)?;
write.commit += 1;
}
self.ensure_memtable(&mut *memtable, &mut *log)?;
self.ensure_mem()?;
Ok(())
}
@ -152,18 +160,16 @@ impl KvStore {
Iter: Iterator<Item = K>,
K: std::borrow::Borrow<Key>,
{
self.query_compactor()?;
let mut memtable = self.mem.write().unwrap();
let mut log = self.log.write().unwrap();
let commit = self.commit.fetch_add(1, COMMIT_ORDERING) as i64;
{
let mut write = self.write.write().unwrap();
for k in rows {
let key = k.borrow();
write.delete(key)?;
}
write.commit += 1;
for key in rows {
storage::delete(&mut *memtable, &mut *log, key.borrow(), commit)?;
}
self.ensure_mem()?;
self.ensure_memtable(&mut *memtable, &mut *log)?;
Ok(())
}
@ -176,9 +182,12 @@ impl KvStore {
}
pub fn snapshot(&self) -> Snapshot {
let (state, tables) = (self.write.read().unwrap(), self.tables.read().unwrap());
let (memtable, tables) = (
self.mem.read().unwrap().values.clone(),
self.tables.read().unwrap().clone(),
);
Snapshot::new(state.values.clone(), tables.clone())
Snapshot::new(memtable, tables)
}
pub fn range(
@ -187,8 +196,9 @@ impl KvStore {
) -> Result<impl Iterator<Item = (Key, Vec<u8>)>> {
self.query_compactor()?;
let (write_state, tables) = (self.write.read().unwrap(), self.tables.read().unwrap());
storage::range(&write_state.values, &*tables, range)
let (memtable, tables) = (self.mem.read().unwrap(), self.tables.read().unwrap());
storage::range(&memtable.values, &*tables, range)
}
pub fn destroy<P>(path: P) -> Result<()>
@ -222,31 +232,22 @@ impl KvStore {
Ok(())
}
fn ensure_mem(&self) -> Result<()> {
let trigger_compact = {
let mut write_rw = self.write.write().unwrap();
if write_rw.mem_size < self.config.max_mem {
fn ensure_memtable(&self, mem: &mut MemTable, log: &mut WriteLog) -> Result<()> {
if mem.mem_size < self.config.max_mem {
return Ok(());
}
let mut tables = self.tables.write().unwrap();
storage::flush_table(&write_rw.values, &*self.mapper, &mut *tables)?;
write_rw.reset()?;
write_rw.commit += 1;
storage::flush_table(&mem.values, &*self.mapper, &mut *tables)?;
mem.values.clear();
mem.mem_size = 0;
log.reset().expect("Write-log rotation failed");
is_lvl0_full(&tables, &self.config)
};
if is_lvl0_full(&tables, &self.config) {
let sender = self.sender.lock().unwrap();
dump_tables(&self.root, &*self.mapper).unwrap();
if trigger_compact {
let tables_path = self.root.join(TABLES_FILE);
self.sender
.lock()
.unwrap()
.send(compactor::Req::Start(tables_path))
.expect("compactor thread dead");
sender.send(compactor::Req::Start(PathBuf::new()))?;
}
Ok(())
@ -274,17 +275,16 @@ fn open(root: &Path, mapper: Arc<dyn Mapper>, config: Config) -> Result<KvStore>
fs::create_dir(&root)?;
}
let write_log = WriteLog::open(&log_path, config.log_config)?;
let mem = if restore_log && !config.in_memory {
write_log.materialize()?
let commit = chrono::Utc::now().timestamp();
let mut log = WriteLog::open(&log_path, config.log_config)?;
let values = if restore_log && !config.in_memory {
log.materialize()?
} else {
BTreeMap::new()
};
let write = RwLock::new(WriteState::new(write_log, mem));
let mem = MemTable::new(values);
let tables = load_tables(&root, &*mapper)?;
let tables = RwLock::new(tables);
let cfg = compactor::Config {
max_pages: config.max_tables,
@ -292,16 +292,17 @@ fn open(root: &Path, mapper: Arc<dyn Mapper>, config: Config) -> Result<KvStore>
};
let (sender, receiver, compactor_handle) = compactor::spawn_compactor(Arc::clone(&mapper), cfg)
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
let (sender, receiver) = (Mutex::new(sender), Mutex::new(receiver));
Ok(KvStore {
write,
tables,
config,
mapper,
root,
sender,
receiver,
commit: AtomicUsize::new(commit as usize),
mem: RwLock::new(mem),
log: Arc::new(RwLock::new(log)),
tables: RwLock::new(tables),
mapper,
sender: Mutex::new(sender),
receiver: Mutex::new(receiver),
compactor_handle,
})
}

View File

@ -154,7 +154,7 @@ impl SSTable {
(meta.start, meta.level)
};
while level as usize >= tables.len() {
while level as usize >= sorted.len() {
sorted.push(BTreeMap::new());
}
sorted[level as usize].insert(key, sst.clone());

View File

@ -2,92 +2,79 @@ use crate::error::Result;
use crate::mapper::{Kind, Mapper};
use crate::sstable::{Key, Merged, SSTable, Value};
use crate::writelog::WriteLog;
use chrono::Utc;
use std::collections::btree_map::Entry;
use std::collections::BTreeMap;
type MemTable = BTreeMap<Key, Value>;
// Size of timestamp + size of key
const OVERHEAD: usize = 8 + 3 * 8;
const LOG_ERR: &str = "Write to log failed! Halting.";
#[derive(Debug)]
pub struct WriteState {
pub commit: i64,
pub log: WriteLog,
pub values: MemTable,
pub struct MemTable {
pub mem_size: usize,
pub values: BTreeMap<Key, Value>,
}
impl WriteState {
pub fn new(log: WriteLog, values: BTreeMap<Key, Value>) -> WriteState {
impl MemTable {
pub fn new(values: BTreeMap<Key, Value>) -> MemTable {
let mem_size = values.values().fold(0, |acc, elem| acc + val_mem_use(elem));
WriteState {
commit: Utc::now().timestamp(),
log,
mem_size,
values,
MemTable { mem_size, values }
}
}
pub fn put(&mut self, key: &Key, data: &[u8]) -> Result<()> {
use std::collections::btree_map::Entry;
let ts = self.commit;
pub fn put(
mem: &mut MemTable,
log: &mut WriteLog,
key: &Key,
commit: i64,
data: &[u8],
) -> Result<()> {
log.log_put(key, commit, data).expect(LOG_ERR);
let value = Value {
ts,
ts: commit,
val: Some(data.to_vec()),
};
self.log.log_put(key, ts, data).expect(LOG_ERR);
self.mem_size += val_mem_use(&value);
mem.mem_size += val_mem_use(&value);
match self.values.entry(*key) {
match mem.values.entry(*key) {
Entry::Vacant(entry) => {
entry.insert(value);
}
Entry::Occupied(mut entry) => {
let old = entry.insert(value);
self.mem_size -= val_mem_use(&old);
mem.mem_size -= val_mem_use(&old);
}
}
Ok(())
}
pub fn delete(&mut self, key: &Key) -> Result<()> {
use std::collections::btree_map::Entry;
let ts = self.commit;
let value = Value { ts, val: None };
pub fn delete(mem: &mut MemTable, log: &mut WriteLog, key: &Key, commit: i64) -> Result<()> {
log.log_delete(key, commit).expect(LOG_ERR);
let value = Value {
ts: commit,
val: None,
};
self.log.log_delete(key, ts).expect(LOG_ERR);
mem.mem_size += val_mem_use(&value);
self.mem_size += val_mem_use(&value);
match self.values.entry(*key) {
match mem.values.entry(*key) {
Entry::Vacant(entry) => {
entry.insert(value);
}
Entry::Occupied(mut entry) => {
let old = entry.insert(value);
self.mem_size -= val_mem_use(&old);
mem.mem_size -= val_mem_use(&old);
}
}
Ok(())
}
pub fn reset(&mut self) -> Result<()> {
self.values.clear();
self.log.reset()?;
self.mem_size = 0;
Ok(())
}
}
pub fn flush_table(
mem: &MemTable,
mem: &BTreeMap<Key, Value>,
mapper: &dyn Mapper,
pages: &mut Vec<BTreeMap<Key, SSTable>>,
) -> Result<()> {
@ -110,7 +97,11 @@ pub fn flush_table(
Ok(())
}
pub fn get(mem: &MemTable, pages: &[BTreeMap<Key, SSTable>], key: &Key) -> Result<Option<Vec<u8>>> {
pub fn get(
mem: &BTreeMap<Key, Value>,
pages: &[BTreeMap<Key, SSTable>],
key: &Key,
) -> Result<Option<Vec<u8>>> {
if let Some(idx) = mem.get(key) {
return Ok(idx.val.clone());
}
@ -134,7 +125,7 @@ pub fn get(mem: &MemTable, pages: &[BTreeMap<Key, SSTable>], key: &Key) -> Resul
}
pub fn range(
mem: &MemTable,
mem: &BTreeMap<Key, Value>,
tables: &[BTreeMap<Key, SSTable>],
range: std::ops::RangeInclusive<Key>,
) -> Result<impl Iterator<Item = (Key, Vec<u8>)>> {
@ -144,21 +135,17 @@ pub fn range(
.range(range.clone())
.map(|(k, v)| (*k, v.clone()))
.collect::<Vec<_>>();
let mut disk = Vec::new();
sources.push(Box::new(mem.into_iter()));
for level in tables.iter() {
for sst in level.values() {
let iter = sst.range(&range)?;
let iter = Box::new(iter) as Box<dyn Iterator<Item = (Key, Value)>>;
disk.push(iter);
sources.push(iter);
}
}
sources.push(Box::new(mem.into_iter()));
sources.extend(disk);
let rows = Merged::new(sources).map(|(k, v)| (k, v.val.unwrap()));
Ok(rows)

View File

@ -8,7 +8,6 @@ use std::collections::BTreeMap;
use std::fs::{self, File};
use std::io::{Read, Write};
use std::path::{Path, PathBuf};
use std::sync::RwLock;
// RocksDb's log uses this size.
// May be worth making configurable and experimenting
@ -17,7 +16,7 @@ const BLOCK_SIZE: usize = 32 * 1024;
#[derive(Debug)]
pub struct WriteLog {
log_path: PathBuf,
logger: RwLock<Logger>,
logger: Logger,
config: Config,
in_memory: bool,
}
@ -35,7 +34,7 @@ impl WriteLog {
Ok(WriteLog {
config,
log_path: path.to_path_buf(),
logger: RwLock::new(Logger::disk(file)),
logger: Logger::disk(file),
in_memory: false,
})
}
@ -44,15 +43,13 @@ impl WriteLog {
pub fn memory(config: Config) -> WriteLog {
WriteLog {
config,
logger: RwLock::new(Logger::memory()),
logger: Logger::memory(),
log_path: Path::new("").to_path_buf(),
in_memory: true,
}
}
pub fn reset(&self) -> Result<()> {
let mut logger = self.logger.write().unwrap();
pub fn reset(&mut self) -> Result<()> {
let new_logger = if self.in_memory {
Logger::memory()
} else {
@ -60,44 +57,38 @@ impl WriteLog {
Logger::disk(file)
};
*logger = new_logger;
self.logger = new_logger;
Ok(())
}
pub fn log_put(&self, key: &Key, ts: i64, val: &[u8]) -> Result<()> {
let mut logger = self.logger.write().unwrap();
log(&mut logger, key, ts, Some(val))?;
pub fn log_put(&mut self, key: &Key, ts: i64, val: &[u8]) -> Result<()> {
log(&mut self.logger, key, ts, Some(val))?;
if self.config.sync_every_write {
sync(&mut logger, self.config.use_fsync)?;
sync(&mut self.logger, self.config.use_fsync)?;
}
Ok(())
}
pub fn log_delete(&self, key: &Key, ts: i64) -> Result<()> {
let mut logger = self.logger.write().unwrap();
log(&mut logger, key, ts, None)?;
pub fn log_delete(&mut self, key: &Key, ts: i64) -> Result<()> {
log(&mut self.logger, key, ts, None)?;
if self.config.sync_every_write {
sync(&mut logger, self.config.use_fsync)?;
sync(&mut self.logger, self.config.use_fsync)?;
}
Ok(())
}
#[allow(dead_code)]
pub fn sync(&self) -> Result<()> {
let mut logger = self.logger.write().unwrap();
sync(&mut logger, self.config.use_fsync)
pub fn sync(&mut self) -> Result<()> {
sync(&mut self.logger, self.config.use_fsync)
}
pub fn materialize(&self) -> Result<BTreeMap<Key, Value>> {
let mmap = self.logger.write().unwrap().writer.mmap()?;
pub fn materialize(&mut self) -> Result<BTreeMap<Key, Value>> {
let mmap = self.logger.writer.mmap()?;
read_log(&mmap)
}
}
@ -281,7 +272,7 @@ mod test {
#[test]
fn test_log_round_trip() {
let wal = WriteLog::memory(Config::default());
let mut wal = WriteLog::memory(Config::default());
let values: BTreeMap<Key, Value> = (0u64..100)
.map(|n| {
@ -313,7 +304,7 @@ mod test {
fn test_reset() {
use crate::error::Error;
let wal = WriteLog::memory(Config::default());
let mut wal = WriteLog::memory(Config::default());
let values: BTreeMap<Key, Value> = (0u64..100)
.map(|n| {