From 58d1bbfe6f30a68f4e04a08d399b1be832e87527 Mon Sep 17 00:00:00 2001
From: Cory Fields
Date: Wed, 17 Feb 2016 14:35:35 -0500
Subject: [PATCH 01/19] leveldb: integrate leveldb into our buildsystem

leveldb's buildsystem causes us a few problems:
- breaks out-of-tree builds
- forces flags used for some tools
- limits cross builds

Rather than continuing to add wrappers around it, simply integrate it
into our build.
---
 configure.ac                 | 22 ++++++++--
 src/Makefile.am              | 19 ++-------
 src/Makefile.leveldb.include | 81 ++++++++++++++++++++++++++++++++++++
 3 files changed, 104 insertions(+), 18 deletions(-)
 create mode 100644 src/Makefile.leveldb.include

diff --git a/configure.ac b/configure.ac
index b25c75956..fb2e10bf5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -356,7 +356,7 @@ case $host in
     fi
 
     CPPFLAGS="$CPPFLAGS -D_MT -DWIN32 -D_WINDOWS -DBOOST_THREAD_USE_LIB"
-    LEVELDB_TARGET_FLAGS="TARGET_OS=OS_WINDOWS_CROSSCOMPILE"
+    LEVELDB_TARGET_FLAGS="-DOS_WINDOWS"
     if test "x$CXXFLAGS_overridden" = "xno"; then
       CXXFLAGS="$CXXFLAGS -w"
     fi
@@ -378,7 +378,7 @@ case $host in
     ;;
   *darwin*)
     TARGET_OS=darwin
-    LEVELDB_TARGET_FLAGS="TARGET_OS=Darwin"
+    LEVELDB_TARGET_FLAGS="-DOS_MACOSX"
     if test x$cross_compiling != xyes; then
       BUILD_OS=darwin
       AC_CHECK_PROG([PORT],port, port)
@@ -435,9 +435,11 @@ case $host in
     OBJCXXFLAGS="$CXXFLAGS"
     ;;
   *linux*)
-    TARGET_OS=linux
+    LEVELDB_TARGET_FLAGS="-DOS_LINUX"
     ;;
   *)
+    OTHER_OS=`echo ${host_os} | awk '{print toupper($0)}'`
+    LEVELDB_TARGET_FLAGS="-DOS_${OTHER_OS}"
     ;;
 esac
@@ -647,6 +649,18 @@ if test x$use_reduce_exports = xyes; then
   [AC_MSG_ERROR([Cannot set default symbol visibility. Use --disable-reduce-exports.])])
 fi
 
+dnl This can go away when we require c++11
+TEMP_CXXFLAGS="$CXXFLAGS"
+CXXFLAGS="$CXXFLAGS -std=c++0x"
+AC_MSG_CHECKING(for c++11 atomics)
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+  #include <atomic>
+  ]],[[]])],
+ [ AC_MSG_RESULT(yes); LEVELDB_ATOMIC_CPPFLAGS="-DLEVELDB_ATOMIC_PRESENT"; LEVELDB_ATOMIC_CXXFLAGS="-std=c++0x"],
+ [ AC_MSG_RESULT(no)]
+)
+CXXFLAGS="$TEMP_CXXFLAGS"
+
 LEVELDB_CPPFLAGS=
 LIBLEVELDB=
 LIBMEMENV=
@@ -929,6 +943,8 @@ AC_SUBST(LIBTOOL_APP_LDFLAGS)
 AC_SUBST(BOOST_LIBS)
 AC_SUBST(TESTDEFS)
 AC_SUBST(LEVELDB_TARGET_FLAGS)
+AC_SUBST(LEVELDB_ATOMIC_CPPFLAGS)
+AC_SUBST(LEVELDB_ATOMIC_CXXFLAGS)
 AC_SUBST(CRYPTO_LIBS)
 AC_SUBST(SSL_LIBS)
 AC_SUBST(EVENT_LIBS)
diff --git a/src/Makefile.am b/src/Makefile.am
index 35a657f5d..24007c4d5 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -11,21 +11,6 @@ AM_CXXFLAGS = $(DEBUG_CXXFLAGS) $(HARDENED_CXXFLAGS) $(ERROR_CXXFLAGS) $(GPROF_C
 AM_CPPFLAGS = $(DEBUG_CPPFLAGS) $(HARDENED_CPPFLAGS)
 EXTRA_LIBRARIES =
 
-if EMBEDDED_LEVELDB
-LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/include
-LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/helpers/memenv
-LIBLEVELDB += $(builddir)/leveldb/libleveldb.a
-LIBMEMENV += $(builddir)/leveldb/libmemenv.a
-
-# NOTE: This dependency is not strictly necessary, but without it make may try to build both in parallel, which breaks the LevelDB build system in a race
-$(LIBLEVELDB): $(LIBMEMENV)
-
-$(LIBLEVELDB) $(LIBMEMENV):
-	@echo "Building LevelDB ..." && $(MAKE) -C $(@D) $(@F) CXX="$(CXX)" \
-	  CC="$(CC)" PLATFORM=$(TARGET_OS) AR="$(AR)" $(LEVELDB_TARGET_FLAGS) \
-	  OPT="$(AM_CXXFLAGS) $(PIE_FLAGS) $(CXXFLAGS) $(AM_CPPFLAGS) $(CPPFLAGS) -D__STDC_LIMIT_MACROS"
-endif
-
 BITCOIN_CONFIG_INCLUDES=-I$(builddir)/config
 BITCOIN_INCLUDES=-I$(builddir) -I$(builddir)/obj $(BDB_CPPFLAGS) $(BOOST_CPPFLAGS) $(LEVELDB_CPPFLAGS) $(CRYPTO_CFLAGS) $(SSL_CFLAGS)
@@ -645,6 +630,10 @@ endif
 	@test -f $(PROTOC)
 	$(AM_V_GEN) $(PROTOC) --cpp_out=$(@D) --proto_path=$(abspath $(<D)) $<
 
+if EMBEDDED_LEVELDB
+include Makefile.leveldb.include
+endif
+

Date: Thu, 28 Apr 2016 17:46:35 -0400
Subject: [PATCH 02/19] build: No need to check for leveldb atomics

They're guaranteed with c++11
---
 configure.ac                 | 14 --------------
 src/Makefile.leveldb.include |  7 ++-----
 2 files changed, 2 insertions(+), 19 deletions(-)

diff --git a/configure.ac b/configure.ac
index fb2e10bf5..54f1729a7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -649,18 +649,6 @@ if test x$use_reduce_exports = xyes; then
   [AC_MSG_ERROR([Cannot set default symbol visibility. Use --disable-reduce-exports.])])
 fi
 
-dnl This can go away when we require c++11
-TEMP_CXXFLAGS="$CXXFLAGS"
-CXXFLAGS="$CXXFLAGS -std=c++0x"
-AC_MSG_CHECKING(for c++11 atomics)
-AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
-  #include <atomic>
-  ]],[[]])],
- [ AC_MSG_RESULT(yes); LEVELDB_ATOMIC_CPPFLAGS="-DLEVELDB_ATOMIC_PRESENT"; LEVELDB_ATOMIC_CXXFLAGS="-std=c++0x"],
- [ AC_MSG_RESULT(no)]
-)
-CXXFLAGS="$TEMP_CXXFLAGS"
-
 LEVELDB_CPPFLAGS=
 LIBLEVELDB=
 LIBMEMENV=
@@ -943,8 +931,6 @@ AC_SUBST(LIBTOOL_APP_LDFLAGS)
 AC_SUBST(BOOST_LIBS)
 AC_SUBST(TESTDEFS)
 AC_SUBST(LEVELDB_TARGET_FLAGS)
-AC_SUBST(LEVELDB_ATOMIC_CPPFLAGS)
-AC_SUBST(LEVELDB_ATOMIC_CXXFLAGS)
 AC_SUBST(CRYPTO_LIBS)
 AC_SUBST(SSL_LIBS)
 AC_SUBST(EVENT_LIBS)
diff --git a/src/Makefile.leveldb.include b/src/Makefile.leveldb.include
index 36a6bc409..88bb0c193 100644
--- a/src/Makefile.leveldb.include
+++ b/src/Makefile.leveldb.include
@@ -13,7 +13,7 @@ LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/helpers/memenv
 LEVELDB_CPPFLAGS_INT =
 LEVELDB_CPPFLAGS_INT += -I$(srcdir)/leveldb
 LEVELDB_CPPFLAGS_INT += $(LEVELDB_TARGET_FLAGS)
-LEVELDB_CPPFLAGS_INT += $(LEVELDB_ATOMIC_CPPFLAGS)
+LEVELDB_CPPFLAGS_INT += -DLEVELDB_ATOMIC_PRESENT
 LEVELDB_CPPFLAGS_INT += -D__STDC_LIMIT_MACROS
 
 if TARGET_WINDOWS
@@ -22,11 +22,8 @@ else
 LEVELDB_CPPFLAGS_INT += -DLEVELDB_PLATFORM_POSIX
 endif
 
-LEVELDB_CXXFLAGS_INT =
-LEVELDB_CXXFLAGS_INT += $(LEVELDB_ATOMIC_CXXFLAGS)
-
 leveldb_libleveldb_a_CPPFLAGS = $(AM_CPPFLAGS) $(LEVELDB_CPPFLAGS_INT) $(LEVELDB_CPPFLAGS)
-leveldb_libleveldb_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS) $(LEVELDB_CXXFLAGS_INT)
+leveldb_libleveldb_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
 
 leveldb_libleveldb_a_SOURCES=
 leveldb_libleveldb_a_SOURCES += leveldb/db/builder.cc

From 9cc749769290af1a51f2fe98f80931e49ad11d71 Mon Sep 17 00:00:00 2001
From: Cory Fields
Date: Wed, 1 Jun 2016 18:05:09 -0400
Subject: [PATCH 03/19] build: out-of-tree fixups

Don't glob the leveldb for dist. That means we need to enumerate the
headers.
--- Makefile.am | 10 ------- src/Makefile.am | 2 +- src/Makefile.leveldb.include | 56 ++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 11 deletions(-) diff --git a/Makefile.am b/Makefile.am index 141f1a849..e8ecd4950 100644 --- a/Makefile.am +++ b/Makefile.am @@ -33,18 +33,8 @@ COVERAGE_INFO = baseline_filtered_combined.info baseline.info \ zcash-gtest.info zcash-gtest_filtered.info zcash-gtest_coverage.info dist-hook: - -$(MAKE) -C $(top_distdir)/src/leveldb clean - -$(MAKE) -C $(top_distdir)/src/secp256k1 distclean -$(GIT) archive --format=tar HEAD -- src/clientversion.cpp | $(AMTAR) -C $(top_distdir) -xf - -distcheck-hook: - $(MKDIR_P) $(top_distdir)/_build/src/leveldb - cp -rf $(top_srcdir)/src/leveldb/* $(top_distdir)/_build/src/leveldb/ - -$(MAKE) -C $(top_distdir)/_build/src/leveldb clean - -distcleancheck: - @: - $(if $(findstring src/,$(MAKECMDGOALS)),$(MAKECMDGOALS), none): FORCE $(MAKE) -C src $(patsubst src/%,%,$@) diff --git a/src/Makefile.am b/src/Makefile.am index 24007c4d5..d2b5633f2 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -590,7 +590,7 @@ CTAES_DIST += crypto/ctaes/ctaes.h CTAES_DIST += crypto/ctaes/README.md CTAES_DIST += crypto/ctaes/test.c -CLEANFILES = leveldb/libleveldb.a leveldb/libmemenv.a *.gcda *.gcno */*.gcno wallet/*/*.gcno +CLEANFILES = *.gcda *.gcno */*.gcno wallet/*/*.gcno DISTCLEANFILES = obj/build.h diff --git a/src/Makefile.leveldb.include b/src/Makefile.leveldb.include index 88bb0c193..4b3cd6364 100644 --- a/src/Makefile.leveldb.include +++ b/src/Makefile.leveldb.include @@ -26,6 +26,61 @@ leveldb_libleveldb_a_CPPFLAGS = $(AM_CPPFLAGS) $(LEVELDB_CPPFLAGS_INT) $(LEVELDB leveldb_libleveldb_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS) leveldb_libleveldb_a_SOURCES= +leveldb_libleveldb_a_SOURCES += leveldb/port/atomic_pointer.h +leveldb_libleveldb_a_SOURCES += leveldb/port/port_example.h +leveldb_libleveldb_a_SOURCES += leveldb/port/port_posix.h +leveldb_libleveldb_a_SOURCES += leveldb/port/win/stdint.h +leveldb_libleveldb_a_SOURCES += leveldb/port/port.h +leveldb_libleveldb_a_SOURCES += leveldb/port/port_win.h +leveldb_libleveldb_a_SOURCES += leveldb/port/thread_annotations.h +leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/db.h +leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/options.h +leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/comparator.h +leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/filter_policy.h +leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/slice.h +leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/table_builder.h +leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/env.h +leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/c.h +leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/iterator.h +leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/cache.h +leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/dumpfile.h +leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/table.h +leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/write_batch.h +leveldb_libleveldb_a_SOURCES += leveldb/include/leveldb/status.h +leveldb_libleveldb_a_SOURCES += leveldb/db/log_format.h +leveldb_libleveldb_a_SOURCES += leveldb/db/memtable.h +leveldb_libleveldb_a_SOURCES += leveldb/db/version_set.h +leveldb_libleveldb_a_SOURCES += leveldb/db/write_batch_internal.h +leveldb_libleveldb_a_SOURCES += leveldb/db/filename.h +leveldb_libleveldb_a_SOURCES += leveldb/db/version_edit.h +leveldb_libleveldb_a_SOURCES += leveldb/db/dbformat.h +leveldb_libleveldb_a_SOURCES += 
leveldb/db/builder.h +leveldb_libleveldb_a_SOURCES += leveldb/db/log_writer.h +leveldb_libleveldb_a_SOURCES += leveldb/db/db_iter.h +leveldb_libleveldb_a_SOURCES += leveldb/db/skiplist.h +leveldb_libleveldb_a_SOURCES += leveldb/db/db_impl.h +leveldb_libleveldb_a_SOURCES += leveldb/db/table_cache.h +leveldb_libleveldb_a_SOURCES += leveldb/db/snapshot.h +leveldb_libleveldb_a_SOURCES += leveldb/db/log_reader.h +leveldb_libleveldb_a_SOURCES += leveldb/table/filter_block.h +leveldb_libleveldb_a_SOURCES += leveldb/table/block_builder.h +leveldb_libleveldb_a_SOURCES += leveldb/table/block.h +leveldb_libleveldb_a_SOURCES += leveldb/table/two_level_iterator.h +leveldb_libleveldb_a_SOURCES += leveldb/table/merger.h +leveldb_libleveldb_a_SOURCES += leveldb/table/format.h +leveldb_libleveldb_a_SOURCES += leveldb/table/iterator_wrapper.h +leveldb_libleveldb_a_SOURCES += leveldb/util/crc32c.h +leveldb_libleveldb_a_SOURCES += leveldb/util/arena.h +leveldb_libleveldb_a_SOURCES += leveldb/util/random.h +leveldb_libleveldb_a_SOURCES += leveldb/util/posix_logger.h +leveldb_libleveldb_a_SOURCES += leveldb/util/hash.h +leveldb_libleveldb_a_SOURCES += leveldb/util/histogram.h +leveldb_libleveldb_a_SOURCES += leveldb/util/coding.h +leveldb_libleveldb_a_SOURCES += leveldb/util/testutil.h +leveldb_libleveldb_a_SOURCES += leveldb/util/mutexlock.h +leveldb_libleveldb_a_SOURCES += leveldb/util/logging.h +leveldb_libleveldb_a_SOURCES += leveldb/util/testharness.h + leveldb_libleveldb_a_SOURCES += leveldb/db/builder.cc leveldb_libleveldb_a_SOURCES += leveldb/db/c.cc leveldb_libleveldb_a_SOURCES += leveldb/db/dbformat.cc @@ -76,3 +131,4 @@ endif leveldb_libmemenv_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS) leveldb_libmemenv_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS) leveldb_libmemenv_a_SOURCES = leveldb/helpers/memenv/memenv.cc +leveldb_libmemenv_a_SOURCES += leveldb/helpers/memenv/memenv.h From 7bf4f0b7e7849b63ddb4adebe53dade349739799 Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Wed, 21 Sep 2016 22:31:23 +0000 Subject: [PATCH 04/19] Add MIT license to Makefiles --- src/Makefile.bench.include | 4 ++++ src/Makefile.leveldb.include | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/Makefile.bench.include b/src/Makefile.bench.include index 5a8356aa5..9a4f3ccd5 100644 --- a/src/Makefile.bench.include +++ b/src/Makefile.bench.include @@ -1,3 +1,7 @@ +# Copyright (c) 2015-2016 The Bitcoin Core developers +# Distributed under the MIT software license, see the accompanying +# file COPYING or http://www.opensource.org/licenses/mit-license.php. + bin_PROGRAMS += bench/bench_bitcoin BENCH_SRCDIR = bench BENCH_BINARY = bench/bench_bitcoin$(EXEEXT) diff --git a/src/Makefile.leveldb.include b/src/Makefile.leveldb.include index 4b3cd6364..d7346aa18 100644 --- a/src/Makefile.leveldb.include +++ b/src/Makefile.leveldb.include @@ -1,3 +1,7 @@ +# Copyright (c) 2016 The Bitcoin Core developers +# Distributed under the MIT software license, see the accompanying +# file COPYING or http://www.opensource.org/licenses/mit-license.php. 
+ LIBLEVELDB_INT = leveldb/libleveldb.a LIBMEMENV_INT = leveldb/libmemenv.a From 1e66c822c3127e959f92ac48feb46c6589f9b895 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Jan=C3=ADk?= Date: Wed, 28 Sep 2016 16:40:20 +0200 Subject: [PATCH 05/19] Do not include env_win.cc on non-Windows systems --- src/Makefile.leveldb.include | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Makefile.leveldb.include b/src/Makefile.leveldb.include index d7346aa18..358f39cbe 100644 --- a/src/Makefile.leveldb.include +++ b/src/Makefile.leveldb.include @@ -117,7 +117,6 @@ leveldb_libleveldb_a_SOURCES += leveldb/util/comparator.cc leveldb_libleveldb_a_SOURCES += leveldb/util/crc32c.cc leveldb_libleveldb_a_SOURCES += leveldb/util/env.cc leveldb_libleveldb_a_SOURCES += leveldb/util/env_posix.cc -leveldb_libleveldb_a_SOURCES += leveldb/util/env_win.cc leveldb_libleveldb_a_SOURCES += leveldb/util/filter_policy.cc leveldb_libleveldb_a_SOURCES += leveldb/util/hash.cc leveldb_libleveldb_a_SOURCES += leveldb/util/histogram.cc From b4be0846d5cd74825f786f0c0ae8e63807e6ebd3 Mon Sep 17 00:00:00 2001 From: Jack Grigg Date: Wed, 23 Sep 2020 00:04:47 +0100 Subject: [PATCH 06/19] Squashed 'src/leveldb/' changes from 20ca81f08..a31c8aa40 a31c8aa40 Add NewAppendableFile for win32 environment 1913d718e Merge upstream LevelDB 1.19 3080a45b6 Increase leveldb version to 1.19. fa6dc010a A zippy change broke test assumptions about the size of compressed output. Fix the tests by allowing more slop in zippy's behavior. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=123432472 06a191b8d fix problems in LevelDB's caching code a7bff697b Fix LevelDB build when asserts are enabled in release builds. (#367) ea992b467 Change std::uint64_t to uint64_t (#354) e84b5bdb5 This CL fixes a bug encountered when reading records from leveldb files that have been split, as in a [] input task split. 321134390 Deleted redundant null ptr check prior to delete. 7306ef856 Merge pull request #348 from randomascii/master 6b18316d0 Fix signed/unsigned mismatch on VC++ builds adbe3eb07 Putting build artifacts in subdirectory. 2d0320a45 Merge pull request #329 from ralphtheninja/travis-badge dd1c3c357 add travis build badge 43fcf23af Merge pull request #328 from cmumford/master 9fcae6164 Added a Travis CI build file. dac40d25f Merge pull request #284 from ideawu/master 8ec241a3b Merge pull request #317 from falvojr/patch-1 5d36bedd1 Merge pull request #272 from vapier/master 4753c9b61 Added a contributors section to README.md e2446d084 Merge pull request #275 from paulirish/patch-1 706b7f8d4 Resolve race when getting approximate-memory-usage property 3c9ff3c69 Only compiling TrimSpace on linux. f8d205cf8 Including atomic_pointer.h in port_posix 889de31a5 Let LevelDB use xcrun to determine Xcode.app path instead of using a hardcoded path. 528c2bc6a Add "approximate-memory-usage" property to leveldb::DB::GetProperty 359b6bcec Add leveldb::Cache::Prune 50e77a826 Fix size_t/int comparison/conversion issues in leveldb. 5208e7952 Added leveldb::Status::IsInvalidArgument() method. ce45404bb Suppress error reporting after seeking but before a valid First or Full record is encountered. b9afa1f2e include -> edf2939c0 Update README.md 65190ac48 Will not reuse manifest if reuse_logs options is false. ac1d69da3 LevelDB now attempts to reuse the preceding MANIFEST and log file when re-opened. 76bba139c fix indent 8fcceb2a6 log compaction output file's level along with number 0e0f07417 documentation. 
improved link c85addcdf readme: improved documentation link ceff6f121 Fix Android/MIPS build. 77948e7ee Add benchmark that measures cost of repeatedly opening the database. 34ad72e3e Move header guard below copyright banner. a75d435d1 Clean up layering of storage/leveldb/... b234f65b3 Added a new fault injection test. c4c38f9c1 Add arm64 support to leveldb. cea9b10e5 Fixed incorrect comment wording for Iterator::Seek. c00c569f2 Deleted old README file. git-subtree-dir: src/leveldb git-subtree-split: a31c8aa408d5594830f7cb20ead1ef1dff51b79e --- .travis.yml | 13 + Makefile | 463 +++++++++++++++------ README | 51 --- README.md | 39 +- build_detect_platform | 2 +- db/corruption_test.cc | 2 +- db/db_bench.cc | 24 +- db/db_impl.cc | 194 +++++---- db/db_impl.h | 8 +- db/db_test.cc | 32 +- db/fault_injection_test.cc | 554 +++++++++++++++++++++++++ db/{leveldb_main.cc => leveldbutil.cc} | 0 db/log_reader.cc | 22 +- db/log_reader.h | 5 + db/log_test.cc | 101 ++++- db/log_writer.cc | 17 +- db/log_writer.h | 6 + db/memtable.h | 5 +- db/recovery_test.cc | 324 +++++++++++++++ db/skiplist.h | 8 +- db/skiplist_test.cc | 2 +- db/snapshot.h | 1 + db/version_set.cc | 40 +- db/version_set.h | 4 +- db/write_batch_internal.h | 1 + doc/index.html | 2 +- helpers/memenv/memenv.cc | 13 + helpers/memenv/memenv_test.cc | 13 +- include/leveldb/cache.h | 11 + include/leveldb/db.h | 4 +- include/leveldb/env.h | 18 + include/leveldb/iterator.h | 2 +- include/leveldb/options.h | 6 + include/leveldb/status.h | 6 + port/atomic_pointer.h | 19 + port/port_posix.cc | 1 - table/filter_block.cc | 4 +- table/format.cc | 3 +- table/iterator_wrapper.h | 3 + table/table.cc | 2 +- table/table_test.cc | 20 +- util/arena.cc | 6 +- util/arena.h | 10 +- util/bloom.cc | 2 +- util/bloom_test.cc | 3 +- util/cache.cc | 138 ++++-- util/cache_test.cc | 42 +- util/env.cc | 4 + util/env_posix.cc | 13 + util/env_win.cc | 37 +- util/options.cc | 2 +- util/testutil.h | 10 + 52 files changed, 1941 insertions(+), 371 deletions(-) create mode 100644 .travis.yml delete mode 100644 README create mode 100644 db/fault_injection_test.cc rename db/{leveldb_main.cc => leveldbutil.cc} (100%) create mode 100644 db/recovery_test.cc diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..f5bd74c45 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,13 @@ +language: cpp +compiler: +- clang +- gcc +os: +- linux +- osx +sudo: false +before_install: +- echo $LANG +- echo $LC_ALL +script: +- make -j 4 check diff --git a/Makefile b/Makefile index 2bd2cadcd..07a5a1ead 100644 --- a/Makefile +++ b/Makefile @@ -20,208 +20,395 @@ $(shell CC="$(CC)" CXX="$(CXX)" TARGET_OS="$(TARGET_OS)" \ # this file is generated by the previous line to set build flags and sources include build_config.mk +TESTS = \ + db/autocompact_test \ + db/c_test \ + db/corruption_test \ + db/db_test \ + db/dbformat_test \ + db/fault_injection_test \ + db/filename_test \ + db/log_test \ + db/recovery_test \ + db/skiplist_test \ + db/version_edit_test \ + db/version_set_test \ + db/write_batch_test \ + helpers/memenv/memenv_test \ + issues/issue178_test \ + issues/issue200_test \ + table/filter_block_test \ + table/table_test \ + util/arena_test \ + util/bloom_test \ + util/cache_test \ + util/coding_test \ + util/crc32c_test \ + util/env_test \ + util/hash_test + +UTILS = \ + db/db_bench \ + db/leveldbutil + +# Put the object files in a subdirectory, but the application at the top of the object dir. 
+PROGNAMES := $(notdir $(TESTS) $(UTILS)) + +# On Linux may need libkyotocabinet-dev for dependency. +BENCHMARKS = \ + doc/bench/db_bench_sqlite3 \ + doc/bench/db_bench_tree_db + CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) LDFLAGS += $(PLATFORM_LDFLAGS) LIBS += $(PLATFORM_LIBS) -LIBOBJECTS = $(SOURCES:.cc=.o) -MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o) +SIMULATOR_OUTDIR=out-ios-x86 +DEVICE_OUTDIR=out-ios-arm -TESTUTIL = ./util/testutil.o -TESTHARNESS = ./util/testharness.o $(TESTUTIL) - -# Note: iOS should probably be using libtool, not ar. ifeq ($(PLATFORM), IOS) +# Note: iOS should probably be using libtool, not ar. AR=xcrun ar +SIMULATORSDK=$(shell xcrun -sdk iphonesimulator --show-sdk-path) +DEVICESDK=$(shell xcrun -sdk iphoneos --show-sdk-path) +DEVICE_CFLAGS = -isysroot "$(DEVICESDK)" -arch armv6 -arch armv7 -arch armv7s -arch arm64 +SIMULATOR_CFLAGS = -isysroot "$(SIMULATORSDK)" -arch i686 -arch x86_64 +STATIC_OUTDIR=out-ios-universal +else +STATIC_OUTDIR=out-static +SHARED_OUTDIR=out-shared +STATIC_PROGRAMS := $(addprefix $(STATIC_OUTDIR)/, $(PROGNAMES)) +SHARED_PROGRAMS := $(addprefix $(SHARED_OUTDIR)/, db_bench) endif -TESTS = \ - arena_test \ - autocompact_test \ - bloom_test \ - c_test \ - cache_test \ - coding_test \ - corruption_test \ - crc32c_test \ - db_test \ - dbformat_test \ - env_test \ - filename_test \ - filter_block_test \ - hash_test \ - issue178_test \ - issue200_test \ - log_test \ - memenv_test \ - skiplist_test \ - table_test \ - version_edit_test \ - version_set_test \ - write_batch_test +STATIC_LIBOBJECTS := $(addprefix $(STATIC_OUTDIR)/, $(SOURCES:.cc=.o)) +STATIC_MEMENVOBJECTS := $(addprefix $(STATIC_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o)) -PROGRAMS = db_bench leveldbutil $(TESTS) -BENCHMARKS = db_bench_sqlite3 db_bench_tree_db +DEVICE_LIBOBJECTS := $(addprefix $(DEVICE_OUTDIR)/, $(SOURCES:.cc=.o)) +DEVICE_MEMENVOBJECTS := $(addprefix $(DEVICE_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o)) -LIBRARY = libleveldb.a -MEMENVLIBRARY = libmemenv.a +SIMULATOR_LIBOBJECTS := $(addprefix $(SIMULATOR_OUTDIR)/, $(SOURCES:.cc=.o)) +SIMULATOR_MEMENVOBJECTS := $(addprefix $(SIMULATOR_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o)) + +SHARED_LIBOBJECTS := $(addprefix $(SHARED_OUTDIR)/, $(SOURCES:.cc=.o)) +SHARED_MEMENVOBJECTS := $(addprefix $(SHARED_OUTDIR)/, $(MEMENV_SOURCES:.cc=.o)) + +TESTUTIL := $(STATIC_OUTDIR)/util/testutil.o +TESTHARNESS := $(STATIC_OUTDIR)/util/testharness.o $(TESTUTIL) + +STATIC_TESTOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(TESTS))) +STATIC_UTILOBJS := $(addprefix $(STATIC_OUTDIR)/, $(addsuffix .o, $(UTILS))) +STATIC_ALLOBJS := $(STATIC_LIBOBJECTS) $(STATIC_MEMENVOBJECTS) $(STATIC_TESTOBJS) $(STATIC_UTILOBJS) $(TESTHARNESS) +DEVICE_ALLOBJS := $(DEVICE_LIBOBJECTS) $(DEVICE_MEMENVOBJECTS) +SIMULATOR_ALLOBJS := $(SIMULATOR_LIBOBJECTS) $(SIMULATOR_MEMENVOBJECTS) default: all # Should we build shared libraries? ifneq ($(PLATFORM_SHARED_EXT),) +# Many leveldb test apps use non-exported API's. Only build a subset for testing. +SHARED_ALLOBJS := $(SHARED_LIBOBJECTS) $(SHARED_MEMENVOBJECTS) $(TESTHARNESS) + ifneq ($(PLATFORM_SHARED_VERSIONED),true) -SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) -SHARED2 = $(SHARED1) -SHARED3 = $(SHARED1) -SHARED = $(SHARED1) +SHARED_LIB1 = libleveldb.$(PLATFORM_SHARED_EXT) +SHARED_LIB2 = $(SHARED_LIB1) +SHARED_LIB3 = $(SHARED_LIB1) +SHARED_LIBS = $(SHARED_LIB1) +SHARED_MEMENVLIB = $(SHARED_OUTDIR)/libmemenv.a else # Update db.h if you change these. 
-SHARED_MAJOR = 1 -SHARED_MINOR = 18 -SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) -SHARED2 = $(SHARED1).$(SHARED_MAJOR) -SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) -SHARED = $(SHARED1) $(SHARED2) $(SHARED3) -$(SHARED1): $(SHARED3) - ln -fs $(SHARED3) $(SHARED1) -$(SHARED2): $(SHARED3) - ln -fs $(SHARED3) $(SHARED2) +SHARED_VERSION_MAJOR = 1 +SHARED_VERSION_MINOR = 19 +SHARED_LIB1 = libleveldb.$(PLATFORM_SHARED_EXT) +SHARED_LIB2 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR) +SHARED_LIB3 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR).$(SHARED_VERSION_MINOR) +SHARED_LIBS = $(SHARED_OUTDIR)/$(SHARED_LIB1) $(SHARED_OUTDIR)/$(SHARED_LIB2) $(SHARED_OUTDIR)/$(SHARED_LIB3) +$(SHARED_OUTDIR)/$(SHARED_LIB1): $(SHARED_OUTDIR)/$(SHARED_LIB3) + ln -fs $(SHARED_LIB3) $(SHARED_OUTDIR)/$(SHARED_LIB1) +$(SHARED_OUTDIR)/$(SHARED_LIB2): $(SHARED_OUTDIR)/$(SHARED_LIB3) + ln -fs $(SHARED_LIB3) $(SHARED_OUTDIR)/$(SHARED_LIB2) +SHARED_MEMENVLIB = $(SHARED_OUTDIR)/libmemenv.a endif -$(SHARED3): - $(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $(SHARED3) $(LIBS) +$(SHARED_OUTDIR)/$(SHARED_LIB3): $(SHARED_LIBOBJECTS) + $(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED_LIB2) $(SHARED_LIBOBJECTS) -o $(SHARED_OUTDIR)/$(SHARED_LIB3) $(LIBS) endif # PLATFORM_SHARED_EXT -all: $(SHARED) $(LIBRARY) +all: $(SHARED_LIBS) $(SHARED_PROGRAMS) $(STATIC_OUTDIR)/libleveldb.a $(STATIC_OUTDIR)/libmemenv.a $(STATIC_PROGRAMS) -check: all $(PROGRAMS) $(TESTS) - for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done +check: $(STATIC_PROGRAMS) + for t in $(notdir $(TESTS)); do echo "***** Running $$t"; $(STATIC_OUTDIR)/$$t || exit 1; done clean: - -rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) */*.o */*/*.o ios-x86/*/*.o ios-arm/*/*.o build_config.mk - -rm -rf ios-x86/* ios-arm/* + -rm -rf out-static out-shared out-ios-x86 out-ios-arm out-ios-universal + -rm -f build_config.mk + -rm -rf ios-x86 ios-arm -$(LIBRARY): $(LIBOBJECTS) - rm -f $@ - $(AR) -rs $@ $(LIBOBJECTS) +$(STATIC_OUTDIR): + mkdir $@ -db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) - $(CXX) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@ $(LIBS) +$(STATIC_OUTDIR)/db: | $(STATIC_OUTDIR) + mkdir $@ -db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) - $(CXX) $(LDFLAGS) doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) -o $@ -lsqlite3 $(LIBS) +$(STATIC_OUTDIR)/helpers/memenv: | $(STATIC_OUTDIR) + mkdir -p $@ -db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) - $(CXX) $(LDFLAGS) doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) -o $@ -lkyotocabinet $(LIBS) +$(STATIC_OUTDIR)/port: | $(STATIC_OUTDIR) + mkdir $@ -leveldbutil: db/leveldb_main.o $(LIBOBJECTS) - $(CXX) $(LDFLAGS) db/leveldb_main.o $(LIBOBJECTS) -o $@ $(LIBS) +$(STATIC_OUTDIR)/table: | $(STATIC_OUTDIR) + mkdir $@ -arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(STATIC_OUTDIR)/util: | $(STATIC_OUTDIR) + mkdir $@ -autocompact_test: db/autocompact_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/autocompact_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +.PHONY: STATIC_OBJDIRS +STATIC_OBJDIRS: \ + $(STATIC_OUTDIR)/db \ + $(STATIC_OUTDIR)/port \ + $(STATIC_OUTDIR)/table \ + $(STATIC_OUTDIR)/util \ + $(STATIC_OUTDIR)/helpers/memenv -bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/bloom_test.o $(LIBOBJECTS) 
$(TESTHARNESS) -o $@ $(LIBS) +$(SHARED_OUTDIR): + mkdir $@ -c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(SHARED_OUTDIR)/db: | $(SHARED_OUTDIR) + mkdir $@ -cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(SHARED_OUTDIR)/helpers/memenv: | $(SHARED_OUTDIR) + mkdir -p $@ -coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(SHARED_OUTDIR)/port: | $(SHARED_OUTDIR) + mkdir $@ -corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(SHARED_OUTDIR)/table: | $(SHARED_OUTDIR) + mkdir $@ -crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(SHARED_OUTDIR)/util: | $(SHARED_OUTDIR) + mkdir $@ -db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +.PHONY: SHARED_OBJDIRS +SHARED_OBJDIRS: \ + $(SHARED_OUTDIR)/db \ + $(SHARED_OUTDIR)/port \ + $(SHARED_OUTDIR)/table \ + $(SHARED_OUTDIR)/util \ + $(SHARED_OUTDIR)/helpers/memenv -dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(DEVICE_OUTDIR): + mkdir $@ -env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(DEVICE_OUTDIR)/db: | $(DEVICE_OUTDIR) + mkdir $@ -filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(DEVICE_OUTDIR)/helpers/memenv: | $(DEVICE_OUTDIR) + mkdir -p $@ -filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(DEVICE_OUTDIR)/port: | $(DEVICE_OUTDIR) + mkdir $@ -hash_test: util/hash_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) util/hash_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(DEVICE_OUTDIR)/table: | $(DEVICE_OUTDIR) + mkdir $@ -issue178_test: issues/issue178_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) issues/issue178_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(DEVICE_OUTDIR)/util: | $(DEVICE_OUTDIR) + mkdir $@ -issue200_test: issues/issue200_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) issues/issue200_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +.PHONY: DEVICE_OBJDIRS +DEVICE_OBJDIRS: \ + $(DEVICE_OUTDIR)/db \ + $(DEVICE_OUTDIR)/port \ + $(DEVICE_OUTDIR)/table \ + $(DEVICE_OUTDIR)/util \ + $(DEVICE_OUTDIR)/helpers/memenv -log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(SIMULATOR_OUTDIR): + mkdir $@ -table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(SIMULATOR_OUTDIR)/db: | $(SIMULATOR_OUTDIR) + mkdir $@ -skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(SIMULATOR_OUTDIR)/helpers/memenv: | $(SIMULATOR_OUTDIR) + mkdir -p $@ -version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) 
$(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(SIMULATOR_OUTDIR)/port: | $(SIMULATOR_OUTDIR) + mkdir $@ -version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(SIMULATOR_OUTDIR)/table: | $(SIMULATOR_OUTDIR) + mkdir $@ -write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) +$(SIMULATOR_OUTDIR)/util: | $(SIMULATOR_OUTDIR) + mkdir $@ -$(MEMENVLIBRARY) : $(MEMENVOBJECTS) - rm -f $@ - $(AR) -rs $@ $(MEMENVOBJECTS) +.PHONY: SIMULATOR_OBJDIRS +SIMULATOR_OBJDIRS: \ + $(SIMULATOR_OUTDIR)/db \ + $(SIMULATOR_OUTDIR)/port \ + $(SIMULATOR_OUTDIR)/table \ + $(SIMULATOR_OUTDIR)/util \ + $(SIMULATOR_OUTDIR)/helpers/memenv -memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) - $(CXX) $(LDFLAGS) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(LIBS) +$(STATIC_ALLOBJS): | STATIC_OBJDIRS +$(DEVICE_ALLOBJS): | DEVICE_OBJDIRS +$(SIMULATOR_ALLOBJS): | SIMULATOR_OBJDIRS +$(SHARED_ALLOBJS): | SHARED_OBJDIRS ifeq ($(PLATFORM), IOS) -# For iOS, create universal object files to be used on both the simulator and +$(DEVICE_OUTDIR)/libleveldb.a: $(DEVICE_LIBOBJECTS) + rm -f $@ + $(AR) -rs $@ $(DEVICE_LIBOBJECTS) + +$(SIMULATOR_OUTDIR)/libleveldb.a: $(SIMULATOR_LIBOBJECTS) + rm -f $@ + $(AR) -rs $@ $(SIMULATOR_LIBOBJECTS) + +$(DEVICE_OUTDIR)/libmemenv.a: $(DEVICE_MEMENVOBJECTS) + rm -f $@ + $(AR) -rs $@ $(DEVICE_MEMENVOBJECTS) + +$(SIMULATOR_OUTDIR)/libmemenv.a: $(SIMULATOR_MEMENVOBJECTS) + rm -f $@ + $(AR) -rs $@ $(SIMULATOR_MEMENVOBJECTS) + +# For iOS, create universal object libraries to be used on both the simulator and # a device. 
-PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms -SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer -DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer -IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString) -IOSARCH=-arch armv6 -arch armv7 -arch armv7s -arch arm64 - -.cc.o: - mkdir -p ios-x86/$(dir $@) - xcrun -sdk iphonesimulator $(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@ - mkdir -p ios-arm/$(dir $@) - xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk $(IOSARCH) -c $< -o ios-arm/$@ - xcrun lipo ios-x86/$@ ios-arm/$@ -create -output $@ - -.c.o: - mkdir -p ios-x86/$(dir $@) - xcrun -sdk iphonesimulator $(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@ - mkdir -p ios-arm/$(dir $@) - xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk $(IOSARCH) -c $< -o ios-arm/$@ - xcrun lipo ios-x86/$@ ios-arm/$@ -create -output $@ +$(STATIC_OUTDIR)/libleveldb.a: $(STATIC_OUTDIR) $(DEVICE_OUTDIR)/libleveldb.a $(SIMULATOR_OUTDIR)/libleveldb.a + lipo -create $(DEVICE_OUTDIR)/libleveldb.a $(SIMULATOR_OUTDIR)/libleveldb.a -output $@ +$(STATIC_OUTDIR)/libmemenv.a: $(STATIC_OUTDIR) $(DEVICE_OUTDIR)/libmemenv.a $(SIMULATOR_OUTDIR)/libmemenv.a + lipo -create $(DEVICE_OUTDIR)/libmemenv.a $(SIMULATOR_OUTDIR)/libmemenv.a -output $@ else -.cc.o: +$(STATIC_OUTDIR)/libleveldb.a:$(STATIC_LIBOBJECTS) + rm -f $@ + $(AR) -rs $@ $(STATIC_LIBOBJECTS) + +$(STATIC_OUTDIR)/libmemenv.a:$(STATIC_MEMENVOBJECTS) + rm -f $@ + $(AR) -rs $@ $(STATIC_MEMENVOBJECTS) +endif + +$(SHARED_MEMENVLIB):$(SHARED_MEMENVOBJECTS) + rm -f $@ + $(AR) -rs $@ $(SHARED_MEMENVOBJECTS) + +$(STATIC_OUTDIR)/db_bench:db/db_bench.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) + $(CXX) $(LDFLAGS) $(CXXFLAGS) db/db_bench.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/db_bench_sqlite3:doc/bench/db_bench_sqlite3.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) + $(CXX) $(LDFLAGS) $(CXXFLAGS) doc/bench/db_bench_sqlite3.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ -lsqlite3 $(LIBS) + +$(STATIC_OUTDIR)/db_bench_tree_db:doc/bench/db_bench_tree_db.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) + $(CXX) $(LDFLAGS) $(CXXFLAGS) doc/bench/db_bench_tree_db.cc $(STATIC_LIBOBJECTS) $(TESTUTIL) -o $@ -lkyotocabinet $(LIBS) + +$(STATIC_OUTDIR)/leveldbutil:db/leveldbutil.cc $(STATIC_LIBOBJECTS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) db/leveldbutil.cc $(STATIC_LIBOBJECTS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/arena_test:util/arena_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) util/arena_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/autocompact_test:db/autocompact_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) db/autocompact_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/bloom_test:util/bloom_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) util/bloom_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/c_test:$(STATIC_OUTDIR)/db/c_test.o $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(STATIC_OUTDIR)/db/c_test.o $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/cache_test:util/cache_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) util/cache_test.cc 
$(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/coding_test:util/coding_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) util/coding_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/corruption_test:db/corruption_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) db/corruption_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/crc32c_test:util/crc32c_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) util/crc32c_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/db_test:db/db_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) db/db_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/dbformat_test:db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/env_test:util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/fault_injection_test:db/fault_injection_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) db/fault_injection_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/filename_test:db/filename_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) db/filename_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/filter_block_test:table/filter_block_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) table/filter_block_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/hash_test:util/hash_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) util/hash_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/issue178_test:issues/issue178_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) issues/issue178_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/issue200_test:issues/issue200_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) issues/issue200_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/log_test:db/log_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) db/log_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/recovery_test:db/recovery_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) db/recovery_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/table_test:table/table_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) table/table_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/skiplist_test:db/skiplist_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) db/skiplist_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/version_edit_test:db/version_edit_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) db/version_edit_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/version_set_test:db/version_set_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) db/version_set_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + 
+$(STATIC_OUTDIR)/write_batch_test:db/write_batch_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) + $(CXX) $(LDFLAGS) $(CXXFLAGS) db/write_batch_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS) + +$(STATIC_OUTDIR)/memenv_test:$(STATIC_OUTDIR)/helpers/memenv/memenv_test.o $(STATIC_OUTDIR)/libmemenv.a $(STATIC_OUTDIR)/libleveldb.a $(TESTHARNESS) + $(XCRUN) $(CXX) $(LDFLAGS) $(STATIC_OUTDIR)/helpers/memenv/memenv_test.o $(STATIC_OUTDIR)/libmemenv.a $(STATIC_OUTDIR)/libleveldb.a $(TESTHARNESS) -o $@ $(LIBS) + +$(SHARED_OUTDIR)/db_bench:$(SHARED_OUTDIR)/db/db_bench.o $(SHARED_LIBS) $(TESTUTIL) + $(XCRUN) $(CXX) $(LDFLAGS) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SHARED_OUTDIR)/db/db_bench.o $(TESTUTIL) $(SHARED_OUTDIR)/$(SHARED_LIB3) -o $@ $(LIBS) + +.PHONY: run-shared +run-shared: $(SHARED_OUTDIR)/db_bench + LD_LIBRARY_PATH=$(SHARED_OUTDIR) $(SHARED_OUTDIR)/db_bench + +$(SIMULATOR_OUTDIR)/%.o: %.cc + xcrun -sdk iphonesimulator $(CXX) $(CXXFLAGS) $(SIMULATOR_CFLAGS) -c $< -o $@ + +$(DEVICE_OUTDIR)/%.o: %.cc + xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) $(DEVICE_CFLAGS) -c $< -o $@ + +$(SIMULATOR_OUTDIR)/%.o: %.c + xcrun -sdk iphonesimulator $(CC) $(CFLAGS) $(SIMULATOR_CFLAGS) -c $< -o $@ + +$(DEVICE_OUTDIR)/%.o: %.c + xcrun -sdk iphoneos $(CC) $(CFLAGS) $(DEVICE_CFLAGS) -c $< -o $@ + +$(STATIC_OUTDIR)/%.o: %.cc $(CXX) $(CXXFLAGS) -c $< -o $@ -.c.o: +$(STATIC_OUTDIR)/%.o: %.c $(CC) $(CFLAGS) -c $< -o $@ -endif + +$(SHARED_OUTDIR)/%.o: %.cc + $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@ + +$(SHARED_OUTDIR)/%.o: %.c + $(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@ diff --git a/README b/README deleted file mode 100644 index 3618adeee..000000000 --- a/README +++ /dev/null @@ -1,51 +0,0 @@ -leveldb: A key-value store -Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) - -The code under this directory implements a system for maintaining a -persistent key/value store. - -See doc/index.html for more explanation. -See doc/impl.html for a brief overview of the implementation. - -The public interface is in include/*.h. Callers should not include or -rely on the details of any other header files in this package. Those -internal APIs may be changed without warning. - -Guide to header files: - -include/db.h - Main interface to the DB: Start here - -include/options.h - Control over the behavior of an entire database, and also - control over the behavior of individual reads and writes. - -include/comparator.h - Abstraction for user-specified comparison function. If you want - just bytewise comparison of keys, you can use the default comparator, - but clients can write their own comparator implementations if they - want custom ordering (e.g. to handle different character - encodings, etc.) - -include/iterator.h - Interface for iterating over data. You can get an iterator - from a DB object. - -include/write_batch.h - Interface for atomically applying multiple updates to a database. - -include/slice.h - A simple module for maintaining a pointer and a length into some - other byte array. - -include/status.h - Status is returned from many of the public interfaces and is used - to report success and various kinds of errors. - -include/env.h - Abstraction of the OS environment. 
A posix implementation of - this interface is in util/env_posix.cc - -include/table.h -include/table_builder.h - Lower-level modules that most clients probably won't use directly diff --git a/README.md b/README.md index 480affb5c..c75b185e0 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ **LevelDB is a fast key-value storage library written at Google that provides an ordered mapping from string keys to string values.** +[![Build Status](https://travis-ci.org/google/leveldb.svg?branch=master)](https://travis-ci.org/google/leveldb) + Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) # Features @@ -10,9 +12,11 @@ Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) * Multiple changes can be made in one atomic batch. * Users can create a transient snapshot to get a consistent view of data. * Forward and backward iteration is supported over the data. - * Data is automatically compressed using the [Snappy compression library](http://code.google.com/p/snappy). + * Data is automatically compressed using the [Snappy compression library](http://google.github.io/snappy/). * External activity (file system operations etc.) is relayed through a virtual interface so users can customize the operating system interactions. - * [Detailed documentation](http://htmlpreview.github.io/?https://github.com/google/leveldb/blob/master/doc/index.html) about how to use the library is included with the source code. + +# Documentation + [LevelDB library documentation](https://rawgit.com/google/leveldb/master/doc/index.html) is online and bundled with the source code. # Limitations @@ -20,6 +24,37 @@ Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) * Only a single process (possibly multi-threaded) can access a particular database at a time. * There is no client-server support builtin to the library. An application that needs such support will have to wrap their own server around the library. +# Contributing to the leveldb Project +The leveldb project welcomes contributions. leveldb's primary goal is to be +a reliable and fast key/value store. Changes that are in line with the +features/limitations outlined above, and meet the requirements below, +will be considered. + +Contribution requirements: + +1. **POSIX only**. We _generally_ will only accept changes that are both + compiled, and tested on a POSIX platform - usually Linux. Very small + changes will sometimes be accepted, but consider that more of an + exception than the rule. + +2. **Stable API**. We strive very hard to maintain a stable API. Changes that + require changes for projects using leveldb _might_ be rejected without + sufficient benefit to the project. + +3. **Tests**: All changes must be accompanied by a new (or changed) test, or + a sufficient explanation as to why a new (or changed) test is not required. + +## Submitting a Pull Request +Before any pull request will be accepted the author must first sign a +Contributor License Agreement (CLA) at https://cla.developers.google.com/. + +In order to keep the commit timeline linear +[squash](https://git-scm.com/book/en/v2/Git-Tools-Rewriting-History#Squashing-Commits) +your changes down to a single commit and [rebase](https://git-scm.com/docs/git-rebase) +on google/leveldb/master. This keeps the commit timeline linear and more easily sync'ed +with the internal repository at Google. More information at GitHub's +[About Git rebase](https://help.github.com/articles/about-git-rebase/) page. 
+ # Performance Here is a performance report (with explanations) from the run of the diff --git a/build_detect_platform b/build_detect_platform index a1101c1bd..d7edab1d8 100755 --- a/build_detect_platform +++ b/build_detect_platform @@ -175,7 +175,7 @@ DIRS="$PREFIX/db $PREFIX/util $PREFIX/table" set -f # temporarily disable globbing so that our patterns aren't expanded PRUNE_TEST="-name *test*.cc -prune" PRUNE_BENCH="-name *_bench.cc -prune" -PRUNE_TOOL="-name leveldb_main.cc -prune" +PRUNE_TOOL="-name leveldbutil.cc -prune" PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o $PRUNE_TOOL -o -name '*.cc' -print | sort | sed "s,^$PREFIX/,," | tr "\n" " "` set +f # re-enable globbing diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 96afc6891..37a484d25 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -36,7 +36,7 @@ class CorruptionTest { tiny_cache_ = NewLRUCache(100); options_.env = &env_; options_.block_cache = tiny_cache_; - dbname_ = test::TmpDir() + "/db_test"; + dbname_ = test::TmpDir() + "/corruption_test"; DestroyDB(dbname_, options_); db_ = NULL; diff --git a/db/db_bench.cc b/db/db_bench.cc index 705a170aa..7a0f5e08c 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -33,6 +33,7 @@ // readmissing -- read N missing keys in random order // readhot -- read N times in random order from 1% section of DB // seekrandom -- N random seeks +// open -- cost of opening a DB // crc32c -- repeated crc32c of 4K of data // acquireload -- load N*1000 times // Meta operations: @@ -99,6 +100,9 @@ static int FLAGS_bloom_bits = -1; // benchmark will fail. static bool FLAGS_use_existing_db = false; +// If true, reuse existing log/MANIFEST files when re-opening a database. +static bool FLAGS_reuse_logs = false; + // Use the db with the following name. 
static const char* FLAGS_db = NULL; @@ -138,6 +142,7 @@ class RandomGenerator { } }; +#if defined(__linux) static Slice TrimSpace(Slice s) { size_t start = 0; while (start < s.size() && isspace(s[start])) { @@ -149,6 +154,7 @@ static Slice TrimSpace(Slice s) { } return Slice(s.data() + start, limit - start); } +#endif static void AppendWithSpace(std::string* str, Slice msg) { if (msg.empty()) return; @@ -442,7 +448,11 @@ class Benchmark { bool fresh_db = false; int num_threads = FLAGS_threads; - if (name == Slice("fillseq")) { + if (name == Slice("open")) { + method = &Benchmark::OpenBench; + num_ /= 10000; + if (num_ < 1) num_ = 1; + } else if (name == Slice("fillseq")) { fresh_db = true; method = &Benchmark::WriteSeq; } else if (name == Slice("fillbatch")) { @@ -695,6 +705,7 @@ class Benchmark { options.write_buffer_size = FLAGS_write_buffer_size; options.max_open_files = FLAGS_open_files; options.filter_policy = filter_policy_; + options.reuse_logs = FLAGS_reuse_logs; Status s = DB::Open(options, FLAGS_db, &db_); if (!s.ok()) { fprintf(stderr, "open error: %s\n", s.ToString().c_str()); @@ -702,6 +713,14 @@ class Benchmark { } } + void OpenBench(ThreadState* thread) { + for (int i = 0; i < num_; i++) { + delete db_; + Open(); + thread->stats.FinishedSingleOp(); + } + } + void WriteSeq(ThreadState* thread) { DoWrite(thread, true); } @@ -941,6 +960,9 @@ int main(int argc, char** argv) { } else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { FLAGS_use_existing_db = n; + } else if (sscanf(argv[i], "--reuse_logs=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_reuse_logs = n; } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { FLAGS_num = n; } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) { diff --git a/db/db_impl.cc b/db/db_impl.cc index 49b95953b..60f4e66e5 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -125,7 +125,7 @@ DBImpl::DBImpl(const Options& raw_options, const std::string& dbname) db_lock_(NULL), shutting_down_(NULL), bg_cv_(&mutex_), - mem_(new MemTable(internal_comparator_)), + mem_(NULL), imm_(NULL), logfile_(NULL), logfile_number_(0), @@ -134,7 +134,6 @@ DBImpl::DBImpl(const Options& raw_options, const std::string& dbname) tmp_batch_(new WriteBatch), bg_compaction_scheduled_(false), manual_compaction_(NULL) { - mem_->Ref(); has_imm_.Release_Store(NULL); // Reserve ten files or so for other uses and give the rest to TableCache. @@ -271,7 +270,7 @@ void DBImpl::DeleteObsoleteFiles() { } } -Status DBImpl::Recover(VersionEdit* edit) { +Status DBImpl::Recover(VersionEdit* edit, bool *save_manifest) { mutex_.AssertHeld(); // Ignore error from CreateDir since the creation of the DB is @@ -301,66 +300,69 @@ Status DBImpl::Recover(VersionEdit* edit) { } } - s = versions_->Recover(); - if (s.ok()) { - SequenceNumber max_sequence(0); + s = versions_->Recover(save_manifest); + if (!s.ok()) { + return s; + } + SequenceNumber max_sequence(0); - // Recover from all newer log files than the ones named in the - // descriptor (new log files may have been added by the previous - // incarnation without registering them in the descriptor). - // - // Note that PrevLogNumber() is no longer used, but we pay - // attention to it in case we are recovering a database - // produced by an older version of leveldb. 
-    const uint64_t min_log = versions_->LogNumber();
-    const uint64_t prev_log = versions_->PrevLogNumber();
-    std::vector<std::string> filenames;
-    s = env_->GetChildren(dbname_, &filenames);
+  // Recover from all newer log files than the ones named in the
+  // descriptor (new log files may have been added by the previous
+  // incarnation without registering them in the descriptor).
+  //
+  // Note that PrevLogNumber() is no longer used, but we pay
+  // attention to it in case we are recovering a database
+  // produced by an older version of leveldb.
+  const uint64_t min_log = versions_->LogNumber();
+  const uint64_t prev_log = versions_->PrevLogNumber();
+  std::vector<std::string> filenames;
+  s = env_->GetChildren(dbname_, &filenames);
+  if (!s.ok()) {
+    return s;
+  }
+  std::set<uint64_t> expected;
+  versions_->AddLiveFiles(&expected);
+  uint64_t number;
+  FileType type;
+  std::vector<uint64_t> logs;
+  for (size_t i = 0; i < filenames.size(); i++) {
+    if (ParseFileName(filenames[i], &number, &type)) {
+      expected.erase(number);
+      if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
+        logs.push_back(number);
+    }
+  }
+  if (!expected.empty()) {
+    char buf[50];
+    snprintf(buf, sizeof(buf), "%d missing files; e.g.",
+             static_cast<int>(expected.size()));
+    return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
+  }
+
+  // Recover in the order in which the logs were generated
+  std::sort(logs.begin(), logs.end());
+  for (size_t i = 0; i < logs.size(); i++) {
+    s = RecoverLogFile(logs[i], (i == logs.size() - 1), save_manifest, edit,
+                       &max_sequence);
     if (!s.ok()) {
       return s;
     }
-    std::set<uint64_t> expected;
-    versions_->AddLiveFiles(&expected);
-    uint64_t number;
-    FileType type;
-    std::vector<uint64_t> logs;
-    for (size_t i = 0; i < filenames.size(); i++) {
-      if (ParseFileName(filenames[i], &number, &type)) {
-        expected.erase(number);
-        if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
-          logs.push_back(number);
-      }
-    }
-    if (!expected.empty()) {
-      char buf[50];
-      snprintf(buf, sizeof(buf), "%d missing files; e.g.",
-               static_cast<int>(expected.size()));
-      return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
-    }
-    // Recover in the order in which the logs were generated
-    std::sort(logs.begin(), logs.end());
-    for (size_t i = 0; i < logs.size(); i++) {
-      s = RecoverLogFile(logs[i], edit, &max_sequence);
-
-      // The previous incarnation may not have written any MANIFEST
-      // records after allocating this log number. So we manually
-      // update the file number allocation counter in VersionSet.
+ versions_->MarkFileNumberUsed(logs[i]); } - return s; + if (versions_->LastSequence() < max_sequence) { + versions_->SetLastSequence(max_sequence); + } + + return Status::OK(); } -Status DBImpl::RecoverLogFile(uint64_t log_number, - VersionEdit* edit, +Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log, + bool* save_manifest, VersionEdit* edit, SequenceNumber* max_sequence) { struct LogReporter : public log::Reader::Reporter { Env* env; @@ -405,6 +407,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, std::string scratch; Slice record; WriteBatch batch; + int compactions = 0; MemTable* mem = NULL; while (reader.ReadRecord(&record, &scratch) && status.ok()) { @@ -432,25 +435,52 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, } if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) { + compactions++; + *save_manifest = true; status = WriteLevel0Table(mem, edit, NULL); + mem->Unref(); + mem = NULL; if (!status.ok()) { // Reflect errors immediately so that conditions like full // file-systems cause the DB::Open() to fail. break; } - mem->Unref(); - mem = NULL; } } - if (status.ok() && mem != NULL) { - status = WriteLevel0Table(mem, edit, NULL); - // Reflect errors immediately so that conditions like full - // file-systems cause the DB::Open() to fail. + delete file; + + // See if we should keep reusing the last log file. + if (status.ok() && options_.reuse_logs && last_log && compactions == 0) { + assert(logfile_ == NULL); + assert(log_ == NULL); + assert(mem_ == NULL); + uint64_t lfile_size; + if (env_->GetFileSize(fname, &lfile_size).ok() && + env_->NewAppendableFile(fname, &logfile_).ok()) { + Log(options_.info_log, "Reusing old log %s \n", fname.c_str()); + log_ = new log::Writer(logfile_, lfile_size); + logfile_number_ = log_number; + if (mem != NULL) { + mem_ = mem; + mem = NULL; + } else { + // mem can be NULL if lognum exists but was empty. + mem_ = new MemTable(internal_comparator_); + mem_->Ref(); + } + } + } + + if (mem != NULL) { + // mem did not get reused; compact it. 
+    if (status.ok()) {
+      *save_manifest = true;
+      status = WriteLevel0Table(mem, edit, NULL);
+    }
+    mem->Unref();
+  }
 
-  if (mem != NULL) mem->Unref();
-  delete file;
   return status;
 }
 
@@ -821,8 +851,9 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
   delete iter;
   if (s.ok()) {
     Log(options_.info_log,
-        "Generated table #%llu: %lld keys, %lld bytes",
+        "Generated table #%llu@%d: %lld keys, %lld bytes",
         (unsigned long long) output_number,
+        compact->compaction->level(),
         (unsigned long long) current_entries,
         (unsigned long long) current_bytes);
   }
@@ -1395,6 +1426,19 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
   } else if (in == "sstables") {
     *value = versions_->current()->DebugString();
     return true;
+  } else if (in == "approximate-memory-usage") {
+    size_t total_usage = options_.block_cache->TotalCharge();
+    if (mem_) {
+      total_usage += mem_->ApproximateMemoryUsage();
+    }
+    if (imm_) {
+      total_usage += imm_->ApproximateMemoryUsage();
+    }
+    char buf[50];
+    snprintf(buf, sizeof(buf), "%llu",
+             static_cast<unsigned long long>(total_usage));
+    value->append(buf);
+    return true;
   }
 
   return false;
@@ -1449,8 +1493,11 @@ Status DB::Open(const Options& options, const std::string& dbname,
   DBImpl* impl = new DBImpl(options, dbname);
   impl->mutex_.Lock();
   VersionEdit edit;
-  Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists
-  if (s.ok()) {
+  // Recover handles create_if_missing, error_if_exists
+  bool save_manifest = false;
+  Status s = impl->Recover(&edit, &save_manifest);
+  if (s.ok() && impl->mem_ == NULL) {
+    // Create new log and a corresponding memtable.
     uint64_t new_log_number = impl->versions_->NewFileNumber();
     WritableFile* lfile;
     s = options.env->NewWritableFile(LogFileName(dbname, new_log_number),
                                      &lfile);
     if (s.ok()) {
       impl->logfile_ = lfile;
       impl->logfile_number_ = new_log_number;
       impl->log_ = new log::Writer(lfile);
-      s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
-    }
-    if (s.ok()) {
-      impl->DeleteObsoleteFiles();
-      impl->MaybeScheduleCompaction();
+      impl->mem_ = new MemTable(impl->internal_comparator_);
+      impl->mem_->Ref();
     }
   }
+  if (s.ok() && save_manifest) {
+    edit.SetPrevLogNumber(0);  // No older logs needed after recovery.
+    edit.SetLogNumber(impl->logfile_number_);
+    s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
+  }
+  if (s.ok()) {
+    impl->DeleteObsoleteFiles();
+    impl->MaybeScheduleCompaction();
+  }
   impl->mutex_.Unlock();
   if (s.ok()) {
+    assert(impl->mem_ != NULL);
     *dbptr = impl;
   } else {
     delete impl;
diff --git a/db/db_impl.h b/db/db_impl.h
index cfc998164..8ff323e72 100644
--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -78,7 +78,8 @@ class DBImpl : public DB {
   // Recover the descriptor from persistent storage. May do a significant
   // amount of work to recover recently logged updates. Any changes to
   // be made to the descriptor are added to *edit.
-  Status Recover(VersionEdit* edit) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  Status Recover(VersionEdit* edit, bool* save_manifest)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   void MaybeIgnoreError(Status* s) const;
 
@@ -90,9 +91,8 @@
   // Errors are recorded in bg_error_.
void CompactMemTable() EXCLUSIVE_LOCKS_REQUIRED(mutex_); - Status RecoverLogFile(uint64_t log_number, - VersionEdit* edit, - SequenceNumber* max_sequence) + Status RecoverLogFile(uint64_t log_number, bool last_log, bool* save_manifest, + VersionEdit* edit, SequenceNumber* max_sequence) EXCLUSIVE_LOCKS_REQUIRED(mutex_); Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base) diff --git a/db/db_test.cc b/db/db_test.cc index 0fed9137d..a0b08bc19 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -193,6 +193,7 @@ class DBTest { // Sequence of option configurations to try enum OptionConfig { kDefault, + kReuse, kFilter, kUncompressed, kEnd @@ -237,7 +238,11 @@ class DBTest { // Return the current option configuration. Options CurrentOptions() { Options options; + options.reuse_logs = false; switch (option_config_) { + case kReuse: + options.reuse_logs = true; + break; case kFilter: options.filter_policy = filter_policy_; break; @@ -558,6 +563,17 @@ TEST(DBTest, GetFromVersions) { } while (ChangeOptions()); } +TEST(DBTest, GetMemUsage) { + do { + ASSERT_OK(Put("foo", "v1")); + std::string val; + ASSERT_TRUE(db_->GetProperty("leveldb.approximate-memory-usage", &val)); + int mem_usage = atoi(val.c_str()); + ASSERT_GT(mem_usage, 0); + ASSERT_LT(mem_usage, 5*1024*1024); + } while (ChangeOptions()); +} + TEST(DBTest, GetSnapshot) { do { // Try with both a short key and a long key @@ -1080,6 +1096,14 @@ TEST(DBTest, ApproximateSizes) { // 0 because GetApproximateSizes() does not account for memtable space ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); + if (options.reuse_logs) { + // Recovery will reuse memtable, and GetApproximateSizes() does not + // account for memtable usage; + Reopen(&options); + ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); + continue; + } + // Check sizes across recovery by reopening a few times for (int run = 0; run < 3; run++) { Reopen(&options); @@ -1123,6 +1147,11 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); + if (options.reuse_logs) { + // Need to force a memtable compaction since recovery does not do so. + ASSERT_OK(dbfull()->TEST_CompactMemTable()); + } + // Check sizes across recovery by reopening a few times for (int run = 0; run < 3; run++) { Reopen(&options); @@ -2084,7 +2113,8 @@ void BM_LogAndApply(int iters, int num_base_files) { InternalKeyComparator cmp(BytewiseComparator()); Options options; VersionSet vset(dbname, &options, NULL, &cmp); - ASSERT_OK(vset.Recover()); + bool save_manifest; + ASSERT_OK(vset.Recover(&save_manifest)); VersionEdit vbase; uint64_t fnum = 1; for (int i = 0; i < num_base_files; i++) { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc new file mode 100644 index 000000000..875dfe81e --- /dev/null +++ b/db/fault_injection_test.cc @@ -0,0 +1,554 @@ +// Copyright 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// This test uses a custom Env to keep track of the state of a filesystem as of +// the last "sync". It then checks for data loss errors by purposely dropping +// file data (or entire files) not protected by a "sync". 
+ +#include "leveldb/db.h" + +#include +#include +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/log_format.h" +#include "db/version_set.h" +#include "leveldb/cache.h" +#include "leveldb/env.h" +#include "leveldb/table.h" +#include "leveldb/write_batch.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +static const int kValueSize = 1000; +static const int kMaxNumValues = 2000; +static const size_t kNumIterations = 3; + +class FaultInjectionTestEnv; + +namespace { + +// Assume a filename, and not a directory name like "/foo/bar/" +static std::string GetDirName(const std::string filename) { + size_t found = filename.find_last_of("/\\"); + if (found == std::string::npos) { + return ""; + } else { + return filename.substr(0, found); + } +} + +Status SyncDir(const std::string& dir) { + // As this is a test it isn't required to *actually* sync this directory. + return Status::OK(); +} + +// A basic file truncation function suitable for this test. +Status Truncate(const std::string& filename, uint64_t length) { + leveldb::Env* env = leveldb::Env::Default(); + + SequentialFile* orig_file; + Status s = env->NewSequentialFile(filename, &orig_file); + if (!s.ok()) + return s; + + char* scratch = new char[length]; + leveldb::Slice result; + s = orig_file->Read(length, &result, scratch); + delete orig_file; + if (s.ok()) { + std::string tmp_name = GetDirName(filename) + "/truncate.tmp"; + WritableFile* tmp_file; + s = env->NewWritableFile(tmp_name, &tmp_file); + if (s.ok()) { + s = tmp_file->Append(result); + delete tmp_file; + if (s.ok()) { + s = env->RenameFile(tmp_name, filename); + } else { + env->DeleteFile(tmp_name); + } + } + } + + delete[] scratch; + + return s; +} + +struct FileState { + std::string filename_; + ssize_t pos_; + ssize_t pos_at_last_sync_; + ssize_t pos_at_last_flush_; + + FileState(const std::string& filename) + : filename_(filename), + pos_(-1), + pos_at_last_sync_(-1), + pos_at_last_flush_(-1) { } + + FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {} + + bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; } + + Status DropUnsyncedData() const; +}; + +} // anonymous namespace + +// A wrapper around WritableFile which informs another Env whenever this file +// is written to or sync'ed. 
+class TestWritableFile : public WritableFile { + public: + TestWritableFile(const FileState& state, + WritableFile* f, + FaultInjectionTestEnv* env); + virtual ~TestWritableFile(); + virtual Status Append(const Slice& data); + virtual Status Close(); + virtual Status Flush(); + virtual Status Sync(); + + private: + FileState state_; + WritableFile* target_; + bool writable_file_opened_; + FaultInjectionTestEnv* env_; + + Status SyncParent(); +}; + +class FaultInjectionTestEnv : public EnvWrapper { + public: + FaultInjectionTestEnv() : EnvWrapper(Env::Default()), filesystem_active_(true) {} + virtual ~FaultInjectionTestEnv() { } + virtual Status NewWritableFile(const std::string& fname, + WritableFile** result); + virtual Status NewAppendableFile(const std::string& fname, + WritableFile** result); + virtual Status DeleteFile(const std::string& f); + virtual Status RenameFile(const std::string& s, const std::string& t); + + void WritableFileClosed(const FileState& state); + Status DropUnsyncedFileData(); + Status DeleteFilesCreatedAfterLastDirSync(); + void DirWasSynced(); + bool IsFileCreatedSinceLastDirSync(const std::string& filename); + void ResetState(); + void UntrackFile(const std::string& f); + // Setting the filesystem to inactive is the test equivalent to simulating a + // system reset. Setting to inactive will freeze our saved filesystem state so + // that it will stop being recorded. It can then be reset back to the state at + // the time of the reset. + bool IsFilesystemActive() const { return filesystem_active_; } + void SetFilesystemActive(bool active) { filesystem_active_ = active; } + + private: + port::Mutex mutex_; + std::map db_file_state_; + std::set new_files_since_last_dir_sync_; + bool filesystem_active_; // Record flushes, syncs, writes +}; + +TestWritableFile::TestWritableFile(const FileState& state, + WritableFile* f, + FaultInjectionTestEnv* env) + : state_(state), + target_(f), + writable_file_opened_(true), + env_(env) { + assert(f != NULL); +} + +TestWritableFile::~TestWritableFile() { + if (writable_file_opened_) { + Close(); + } + delete target_; +} + +Status TestWritableFile::Append(const Slice& data) { + Status s = target_->Append(data); + if (s.ok() && env_->IsFilesystemActive()) { + state_.pos_ += data.size(); + } + return s; +} + +Status TestWritableFile::Close() { + writable_file_opened_ = false; + Status s = target_->Close(); + if (s.ok()) { + env_->WritableFileClosed(state_); + } + return s; +} + +Status TestWritableFile::Flush() { + Status s = target_->Flush(); + if (s.ok() && env_->IsFilesystemActive()) { + state_.pos_at_last_flush_ = state_.pos_; + } + return s; +} + +Status TestWritableFile::SyncParent() { + Status s = SyncDir(GetDirName(state_.filename_)); + if (s.ok()) { + env_->DirWasSynced(); + } + return s; +} + +Status TestWritableFile::Sync() { + if (!env_->IsFilesystemActive()) { + return Status::OK(); + } + // Ensure new files referred to by the manifest are in the filesystem. 
+ Status s = target_->Sync(); + if (s.ok()) { + state_.pos_at_last_sync_ = state_.pos_; + } + if (env_->IsFileCreatedSinceLastDirSync(state_.filename_)) { + Status ps = SyncParent(); + if (s.ok() && !ps.ok()) { + s = ps; + } + } + return s; +} + +Status FaultInjectionTestEnv::NewWritableFile(const std::string& fname, + WritableFile** result) { + WritableFile* actual_writable_file; + Status s = target()->NewWritableFile(fname, &actual_writable_file); + if (s.ok()) { + FileState state(fname); + state.pos_ = 0; + *result = new TestWritableFile(state, actual_writable_file, this); + // NewWritableFile doesn't append to files, so if the same file is + // opened again then it will be truncated - so forget our saved + // state. + UntrackFile(fname); + MutexLock l(&mutex_); + new_files_since_last_dir_sync_.insert(fname); + } + return s; +} + +Status FaultInjectionTestEnv::NewAppendableFile(const std::string& fname, + WritableFile** result) { + WritableFile* actual_writable_file; + Status s = target()->NewAppendableFile(fname, &actual_writable_file); + if (s.ok()) { + FileState state(fname); + state.pos_ = 0; + { + MutexLock l(&mutex_); + if (db_file_state_.count(fname) == 0) { + new_files_since_last_dir_sync_.insert(fname); + } else { + state = db_file_state_[fname]; + } + } + *result = new TestWritableFile(state, actual_writable_file, this); + } + return s; +} + +Status FaultInjectionTestEnv::DropUnsyncedFileData() { + Status s; + MutexLock l(&mutex_); + for (std::map::const_iterator it = + db_file_state_.begin(); + s.ok() && it != db_file_state_.end(); ++it) { + const FileState& state = it->second; + if (!state.IsFullySynced()) { + s = state.DropUnsyncedData(); + } + } + return s; +} + +void FaultInjectionTestEnv::DirWasSynced() { + MutexLock l(&mutex_); + new_files_since_last_dir_sync_.clear(); +} + +bool FaultInjectionTestEnv::IsFileCreatedSinceLastDirSync( + const std::string& filename) { + MutexLock l(&mutex_); + return new_files_since_last_dir_sync_.find(filename) != + new_files_since_last_dir_sync_.end(); +} + +void FaultInjectionTestEnv::UntrackFile(const std::string& f) { + MutexLock l(&mutex_); + db_file_state_.erase(f); + new_files_since_last_dir_sync_.erase(f); +} + +Status FaultInjectionTestEnv::DeleteFile(const std::string& f) { + Status s = EnvWrapper::DeleteFile(f); + ASSERT_OK(s); + if (s.ok()) { + UntrackFile(f); + } + return s; +} + +Status FaultInjectionTestEnv::RenameFile(const std::string& s, + const std::string& t) { + Status ret = EnvWrapper::RenameFile(s, t); + + if (ret.ok()) { + MutexLock l(&mutex_); + if (db_file_state_.find(s) != db_file_state_.end()) { + db_file_state_[t] = db_file_state_[s]; + db_file_state_.erase(s); + } + + if (new_files_since_last_dir_sync_.erase(s) != 0) { + assert(new_files_since_last_dir_sync_.find(t) == + new_files_since_last_dir_sync_.end()); + new_files_since_last_dir_sync_.insert(t); + } + } + + return ret; +} + +void FaultInjectionTestEnv::ResetState() { + // Since we are not destroying the database, the existing files + // should keep their recorded synced/flushed state. Therefore + // we do not reset db_file_state_ and new_files_since_last_dir_sync_. 
+ MutexLock l(&mutex_); + SetFilesystemActive(true); +} + +Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() { + // Because DeleteFile access this container make a copy to avoid deadlock + mutex_.Lock(); + std::set new_files(new_files_since_last_dir_sync_.begin(), + new_files_since_last_dir_sync_.end()); + mutex_.Unlock(); + Status s; + std::set::const_iterator it; + for (it = new_files.begin(); s.ok() && it != new_files.end(); ++it) { + s = DeleteFile(*it); + } + return s; +} + +void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) { + MutexLock l(&mutex_); + db_file_state_[state.filename_] = state; +} + +Status FileState::DropUnsyncedData() const { + ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_; + return Truncate(filename_, sync_pos); +} + +class FaultInjectionTest { + public: + enum ExpectedVerifResult { VAL_EXPECT_NO_ERROR, VAL_EXPECT_ERROR }; + enum ResetMethod { RESET_DROP_UNSYNCED_DATA, RESET_DELETE_UNSYNCED_FILES }; + + FaultInjectionTestEnv* env_; + std::string dbname_; + Cache* tiny_cache_; + Options options_; + DB* db_; + + FaultInjectionTest() + : env_(new FaultInjectionTestEnv), + tiny_cache_(NewLRUCache(100)), + db_(NULL) { + dbname_ = test::TmpDir() + "/fault_test"; + DestroyDB(dbname_, Options()); // Destroy any db from earlier run + options_.reuse_logs = true; + options_.env = env_; + options_.paranoid_checks = true; + options_.block_cache = tiny_cache_; + options_.create_if_missing = true; + } + + ~FaultInjectionTest() { + CloseDB(); + DestroyDB(dbname_, Options()); + delete tiny_cache_; + delete env_; + } + + void ReuseLogs(bool reuse) { + options_.reuse_logs = reuse; + } + + void Build(int start_idx, int num_vals) { + std::string key_space, value_space; + WriteBatch batch; + for (int i = start_idx; i < start_idx + num_vals; i++) { + Slice key = Key(i, &key_space); + batch.Clear(); + batch.Put(key, Value(i, &value_space)); + WriteOptions options; + ASSERT_OK(db_->Write(options, &batch)); + } + } + + Status ReadValue(int i, std::string* val) const { + std::string key_space, value_space; + Slice key = Key(i, &key_space); + Value(i, &value_space); + ReadOptions options; + return db_->Get(options, key, val); + } + + Status Verify(int start_idx, int num_vals, + ExpectedVerifResult expected) const { + std::string val; + std::string value_space; + Status s; + for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) { + Value(i, &value_space); + s = ReadValue(i, &val); + if (expected == VAL_EXPECT_NO_ERROR) { + if (s.ok()) { + ASSERT_EQ(value_space, val); + } + } else if (s.ok()) { + fprintf(stderr, "Expected an error at %d, but was OK\n", i); + s = Status::IOError(dbname_, "Expected value error:"); + } else { + s = Status::OK(); // An expected error + } + } + return s; + } + + // Return the ith key + Slice Key(int i, std::string* storage) const { + char buf[100]; + snprintf(buf, sizeof(buf), "%016d", i); + storage->assign(buf, strlen(buf)); + return Slice(*storage); + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) const { + Random r(k); + return test::RandomString(&r, kValueSize, storage); + } + + Status OpenDB() { + delete db_; + db_ = NULL; + env_->ResetState(); + return DB::Open(options_, dbname_, &db_); + } + + void CloseDB() { + delete db_; + db_ = NULL; + } + + void DeleteAllData() { + Iterator* iter = db_->NewIterator(ReadOptions()); + WriteOptions options; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + 
ASSERT_OK(db_->Delete(WriteOptions(), iter->key())); + } + + delete iter; + } + + void ResetDBState(ResetMethod reset_method) { + switch (reset_method) { + case RESET_DROP_UNSYNCED_DATA: + ASSERT_OK(env_->DropUnsyncedFileData()); + break; + case RESET_DELETE_UNSYNCED_FILES: + ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync()); + break; + default: + assert(false); + } + } + + void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) { + DeleteAllData(); + Build(0, num_pre_sync); + db_->CompactRange(NULL, NULL); + Build(num_pre_sync, num_post_sync); + } + + void PartialCompactTestReopenWithFault(ResetMethod reset_method, + int num_pre_sync, + int num_post_sync) { + env_->SetFilesystemActive(false); + CloseDB(); + ResetDBState(reset_method); + ASSERT_OK(OpenDB()); + ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::VAL_EXPECT_NO_ERROR)); + ASSERT_OK(Verify(num_pre_sync, num_post_sync, FaultInjectionTest::VAL_EXPECT_ERROR)); + } + + void NoWriteTestPreFault() { + } + + void NoWriteTestReopenWithFault(ResetMethod reset_method) { + CloseDB(); + ResetDBState(reset_method); + ASSERT_OK(OpenDB()); + } + + void DoTest() { + Random rnd(0); + ASSERT_OK(OpenDB()); + for (size_t idx = 0; idx < kNumIterations; idx++) { + int num_pre_sync = rnd.Uniform(kMaxNumValues); + int num_post_sync = rnd.Uniform(kMaxNumValues); + + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + PartialCompactTestReopenWithFault(RESET_DROP_UNSYNCED_DATA, + num_pre_sync, + num_post_sync); + + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(RESET_DROP_UNSYNCED_DATA); + + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + // No new files created so we expect all values since no files will be + // dropped. + PartialCompactTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES, + num_pre_sync + num_post_sync, + 0); + + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES); + } + } +}; + +TEST(FaultInjectionTest, FaultTestNoLogReuse) { + ReuseLogs(false); + DoTest(); +} + +TEST(FaultInjectionTest, FaultTestWithLogReuse) { + ReuseLogs(true); + DoTest(); +} + +} // namespace leveldb + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/leveldb_main.cc b/db/leveldbutil.cc similarity index 100% rename from db/leveldb_main.cc rename to db/leveldbutil.cc diff --git a/db/log_reader.cc b/db/log_reader.cc index e44b66c85..a6d304545 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -25,7 +25,8 @@ Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum, eof_(false), last_record_offset_(0), end_of_buffer_offset_(0), - initial_offset_(initial_offset) { + initial_offset_(initial_offset), + resyncing_(initial_offset > 0) { } Reader::~Reader() { @@ -72,8 +73,25 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) { Slice fragment; while (true) { - uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); const unsigned int record_type = ReadPhysicalRecord(&fragment); + + // ReadPhysicalRecord may have only had an empty trailer remaining in its + // internal buffer. Calculate the offset of the next physical record now + // that it has returned, properly accounting for its header size. 
+ uint64_t physical_record_offset = + end_of_buffer_offset_ - buffer_.size() - kHeaderSize - fragment.size(); + + if (resyncing_) { + if (record_type == kMiddleType) { + continue; + } else if (record_type == kLastType) { + resyncing_ = false; + continue; + } else { + resyncing_ = false; + } + } + switch (record_type) { case kFullType: if (in_fragmented_record) { diff --git a/db/log_reader.h b/db/log_reader.h index 6aff79171..8389d61f8 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -73,6 +73,11 @@ class Reader { // Offset at which to start looking for the first record to return uint64_t const initial_offset_; + // True if we are resynchronizing after a seek (initial_offset_ > 0). In + // particular, a run of kMiddleType and kLastType records can be silently + // skipped in this mode + bool resyncing_; + // Extend record types with the following special values enum { kEof = kMaxRecordType + 1, diff --git a/db/log_test.cc b/db/log_test.cc index dcf056265..48a592865 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -79,7 +79,7 @@ class LogTest { virtual Status Skip(uint64_t n) { if (n > contents_.size()) { contents_.clear(); - return Status::NotFound("in-memory file skipepd past end"); + return Status::NotFound("in-memory file skipped past end"); } contents_.remove_prefix(n); @@ -104,23 +104,34 @@ class LogTest { StringSource source_; ReportCollector report_; bool reading_; - Writer writer_; - Reader reader_; + Writer* writer_; + Reader* reader_; // Record metadata for testing initial offset functionality static size_t initial_offset_record_sizes_[]; static uint64_t initial_offset_last_record_offsets_[]; + static int num_initial_offset_records_; public: LogTest() : reading_(false), - writer_(&dest_), - reader_(&source_, &report_, true/*checksum*/, - 0/*initial_offset*/) { + writer_(new Writer(&dest_)), + reader_(new Reader(&source_, &report_, true/*checksum*/, + 0/*initial_offset*/)) { + } + + ~LogTest() { + delete writer_; + delete reader_; + } + + void ReopenForAppend() { + delete writer_; + writer_ = new Writer(&dest_, dest_.contents_.size()); } void Write(const std::string& msg) { ASSERT_TRUE(!reading_) << "Write() after starting to read"; - writer_.AddRecord(Slice(msg)); + writer_->AddRecord(Slice(msg)); } size_t WrittenBytes() const { @@ -134,7 +145,7 @@ class LogTest { } std::string scratch; Slice record; - if (reader_.ReadRecord(&record, &scratch)) { + if (reader_->ReadRecord(&record, &scratch)) { return record.ToString(); } else { return "EOF"; @@ -182,13 +193,18 @@ class LogTest { } void WriteInitialOffsetLog() { - for (int i = 0; i < 4; i++) { + for (int i = 0; i < num_initial_offset_records_; i++) { std::string record(initial_offset_record_sizes_[i], static_cast('a' + i)); Write(record); } } + void StartReadingAt(uint64_t initial_offset) { + delete reader_; + reader_ = new Reader(&source_, &report_, true/*checksum*/, initial_offset); + } + void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) { WriteInitialOffsetLog(); reading_ = true; @@ -208,32 +224,48 @@ class LogTest { source_.contents_ = Slice(dest_.contents_); Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/, initial_offset); - Slice record; - std::string scratch; - ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); - ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset], - record.size()); - ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset], - offset_reader->LastRecordOffset()); - ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]); 
+ + // Read all records from expected_record_offset through the last one. + ASSERT_LT(expected_record_offset, num_initial_offset_records_); + for (; expected_record_offset < num_initial_offset_records_; + ++expected_record_offset) { + Slice record; + std::string scratch; + ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); + ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset], + record.size()); + ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset], + offset_reader->LastRecordOffset()); + ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]); + } delete offset_reader; } - }; size_t LogTest::initial_offset_record_sizes_[] = {10000, // Two sizable records in first block 10000, 2 * log::kBlockSize - 1000, // Span three blocks - 1}; + 1, + 13716, // Consume all but two bytes of block 3. + log::kBlockSize - kHeaderSize, // Consume the entirety of block 4. + }; uint64_t LogTest::initial_offset_last_record_offsets_[] = {0, kHeaderSize + 10000, 2 * (kHeaderSize + 10000), 2 * (kHeaderSize + 10000) + - (2 * log::kBlockSize - 1000) + 3 * kHeaderSize}; + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize, + 2 * (kHeaderSize + 10000) + + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize + + kHeaderSize + 1, + 3 * log::kBlockSize, + }; +// LogTest::initial_offset_last_record_offsets_ must be defined before this. +int LogTest::num_initial_offset_records_ = + sizeof(LogTest::initial_offset_last_record_offsets_)/sizeof(uint64_t); TEST(LogTest, Empty) { ASSERT_EQ("EOF", Read()); @@ -318,6 +350,15 @@ TEST(LogTest, AlignedEof) { ASSERT_EQ("EOF", Read()); } +TEST(LogTest, OpenForAppend) { + Write("hello"); + ReopenForAppend(); + Write("world"); + ASSERT_EQ("hello", Read()); + ASSERT_EQ("world", Read()); + ASSERT_EQ("EOF", Read()); +} + TEST(LogTest, RandomRead) { const int N = 500; Random write_rnd(301); @@ -445,6 +486,22 @@ TEST(LogTest, PartialLastIsIgnored) { ASSERT_EQ(0, DroppedBytes()); } +TEST(LogTest, SkipIntoMultiRecord) { + // Consider a fragmented record: + // first(R1), middle(R1), last(R1), first(R2) + // If initial_offset points to a record after first(R1) but before first(R2) + // incomplete fragment errors are not actual errors, and must be suppressed + // until a new first or full record is encountered. 
+ Write(BigString("foo", 3*kBlockSize)); + Write("correct"); + StartReadingAt(kBlockSize); + + ASSERT_EQ("correct", Read()); + ASSERT_EQ("", ReportMessage()); + ASSERT_EQ(0, DroppedBytes()); + ASSERT_EQ("EOF", Read()); +} + TEST(LogTest, ErrorJoinsRecords) { // Consider two fragmented records: // first(R1) last(R1) first(R2) last(R2) @@ -514,6 +571,10 @@ TEST(LogTest, ReadFourthStart) { 3); } +TEST(LogTest, ReadInitialOffsetIntoBlockPadding) { + CheckInitialOffsetRecord(3 * log::kBlockSize - 3, 5); +} + TEST(LogTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); } diff --git a/db/log_writer.cc b/db/log_writer.cc index 2da99ac08..74a03270d 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -12,13 +12,22 @@ namespace leveldb { namespace log { +static void InitTypeCrc(uint32_t* type_crc) { + for (int i = 0; i <= kMaxRecordType; i++) { + char t = static_cast(i); + type_crc[i] = crc32c::Value(&t, 1); + } +} + Writer::Writer(WritableFile* dest) : dest_(dest), block_offset_(0) { - for (int i = 0; i <= kMaxRecordType; i++) { - char t = static_cast(i); - type_crc_[i] = crc32c::Value(&t, 1); - } + InitTypeCrc(type_crc_); +} + +Writer::Writer(WritableFile* dest, uint64_t dest_length) + : dest_(dest), block_offset_(dest_length % kBlockSize) { + InitTypeCrc(type_crc_); } Writer::~Writer() { diff --git a/db/log_writer.h b/db/log_writer.h index a3a954d96..9e7cc4705 100644 --- a/db/log_writer.h +++ b/db/log_writer.h @@ -22,6 +22,12 @@ class Writer { // "*dest" must be initially empty. // "*dest" must remain live while this Writer is in use. explicit Writer(WritableFile* dest); + + // Create a writer that will append data to "*dest". + // "*dest" must have initial length "dest_length". + // "*dest" must remain live while this Writer is in use. + Writer(WritableFile* dest, uint64_t dest_length); + ~Writer(); Status AddRecord(const Slice& slice); diff --git a/db/memtable.h b/db/memtable.h index 92e90bb09..9f41567cd 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -36,10 +36,7 @@ class MemTable { } // Returns an estimate of the number of bytes of data in use by this - // data structure. - // - // REQUIRES: external synchronization to prevent simultaneous - // operations on the same MemTable. + // data structure. It is safe to call when MemTable is being modified. size_t ApproximateMemoryUsage(); // Return an iterator that yields the contents of the memtable. diff --git a/db/recovery_test.cc b/db/recovery_test.cc new file mode 100644 index 000000000..9596f4288 --- /dev/null +++ b/db/recovery_test.cc @@ -0,0 +1,324 @@ +// Copyright (c) 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "leveldb/db.h" +#include "leveldb/env.h" +#include "leveldb/write_batch.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace leveldb { + +class RecoveryTest { + public: + RecoveryTest() : env_(Env::Default()), db_(NULL) { + dbname_ = test::TmpDir() + "/recovery_test"; + DestroyDB(dbname_, Options()); + Open(); + } + + ~RecoveryTest() { + Close(); + DestroyDB(dbname_, Options()); + } + + DBImpl* dbfull() const { return reinterpret_cast(db_); } + Env* env() const { return env_; } + + bool CanAppend() { + WritableFile* tmp; + Status s = env_->NewAppendableFile(CurrentFileName(dbname_), &tmp); + delete tmp; + if (s.IsNotSupportedError()) { + return false; + } else { + return true; + } + } + + void Close() { + delete db_; + db_ = NULL; + } + + void Open(Options* options = NULL) { + Close(); + Options opts; + if (options != NULL) { + opts = *options; + } else { + opts.reuse_logs = true; // TODO(sanjay): test both ways + opts.create_if_missing = true; + } + if (opts.env == NULL) { + opts.env = env_; + } + ASSERT_OK(DB::Open(opts, dbname_, &db_)); + ASSERT_EQ(1, NumLogs()); + } + + Status Put(const std::string& k, const std::string& v) { + return db_->Put(WriteOptions(), k, v); + } + + std::string Get(const std::string& k, const Snapshot* snapshot = NULL) { + std::string result; + Status s = db_->Get(ReadOptions(), k, &result); + if (s.IsNotFound()) { + result = "NOT_FOUND"; + } else if (!s.ok()) { + result = s.ToString(); + } + return result; + } + + std::string ManifestFileName() { + std::string current; + ASSERT_OK(ReadFileToString(env_, CurrentFileName(dbname_), ¤t)); + size_t len = current.size(); + if (len > 0 && current[len-1] == '\n') { + current.resize(len - 1); + } + return dbname_ + "/" + current; + } + + std::string LogName(uint64_t number) { + return LogFileName(dbname_, number); + } + + size_t DeleteLogFiles() { + std::vector logs = GetFiles(kLogFile); + for (size_t i = 0; i < logs.size(); i++) { + ASSERT_OK(env_->DeleteFile(LogName(logs[i]))) << LogName(logs[i]); + } + return logs.size(); + } + + uint64_t FirstLogFile() { + return GetFiles(kLogFile)[0]; + } + + std::vector GetFiles(FileType t) { + std::vector filenames; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + std::vector result; + for (size_t i = 0; i < filenames.size(); i++) { + uint64_t number; + FileType type; + if (ParseFileName(filenames[i], &number, &type) && type == t) { + result.push_back(number); + } + } + return result; + } + + int NumLogs() { + return GetFiles(kLogFile).size(); + } + + int NumTables() { + return GetFiles(kTableFile).size(); + } + + uint64_t FileSize(const std::string& fname) { + uint64_t result; + ASSERT_OK(env_->GetFileSize(fname, &result)) << fname; + return result; + } + + void CompactMemTable() { + dbfull()->TEST_CompactMemTable(); + } + + // Directly construct a log file that sets key to val. 
+ void MakeLogFile(uint64_t lognum, SequenceNumber seq, Slice key, Slice val) { + std::string fname = LogFileName(dbname_, lognum); + WritableFile* file; + ASSERT_OK(env_->NewWritableFile(fname, &file)); + log::Writer writer(file); + WriteBatch batch; + batch.Put(key, val); + WriteBatchInternal::SetSequence(&batch, seq); + ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch))); + ASSERT_OK(file->Flush()); + delete file; + } + + private: + std::string dbname_; + Env* env_; + DB* db_; +}; + +TEST(RecoveryTest, ManifestReused) { + if (!CanAppend()) { + fprintf(stderr, "skipping test because env does not support appending\n"); + return; + } + ASSERT_OK(Put("foo", "bar")); + Close(); + std::string old_manifest = ManifestFileName(); + Open(); + ASSERT_EQ(old_manifest, ManifestFileName()); + ASSERT_EQ("bar", Get("foo")); + Open(); + ASSERT_EQ(old_manifest, ManifestFileName()); + ASSERT_EQ("bar", Get("foo")); +} + +TEST(RecoveryTest, LargeManifestCompacted) { + if (!CanAppend()) { + fprintf(stderr, "skipping test because env does not support appending\n"); + return; + } + ASSERT_OK(Put("foo", "bar")); + Close(); + std::string old_manifest = ManifestFileName(); + + // Pad with zeroes to make manifest file very big. + { + uint64_t len = FileSize(old_manifest); + WritableFile* file; + ASSERT_OK(env()->NewAppendableFile(old_manifest, &file)); + std::string zeroes(3*1048576 - static_cast(len), 0); + ASSERT_OK(file->Append(zeroes)); + ASSERT_OK(file->Flush()); + delete file; + } + + Open(); + std::string new_manifest = ManifestFileName(); + ASSERT_NE(old_manifest, new_manifest); + ASSERT_GT(10000, FileSize(new_manifest)); + ASSERT_EQ("bar", Get("foo")); + + Open(); + ASSERT_EQ(new_manifest, ManifestFileName()); + ASSERT_EQ("bar", Get("foo")); +} + +TEST(RecoveryTest, NoLogFiles) { + ASSERT_OK(Put("foo", "bar")); + ASSERT_EQ(1, DeleteLogFiles()); + Open(); + ASSERT_EQ("NOT_FOUND", Get("foo")); + Open(); + ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST(RecoveryTest, LogFileReuse) { + if (!CanAppend()) { + fprintf(stderr, "skipping test because env does not support appending\n"); + return; + } + for (int i = 0; i < 2; i++) { + ASSERT_OK(Put("foo", "bar")); + if (i == 0) { + // Compact to ensure current log is empty + CompactMemTable(); + } + Close(); + ASSERT_EQ(1, NumLogs()); + uint64_t number = FirstLogFile(); + if (i == 0) { + ASSERT_EQ(0, FileSize(LogName(number))); + } else { + ASSERT_LT(0, FileSize(LogName(number))); + } + Open(); + ASSERT_EQ(1, NumLogs()); + ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file"; + ASSERT_EQ("bar", Get("foo")); + Open(); + ASSERT_EQ(1, NumLogs()); + ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file"; + ASSERT_EQ("bar", Get("foo")); + } +} + +TEST(RecoveryTest, MultipleMemTables) { + // Make a large log. + const int kNum = 1000; + for (int i = 0; i < kNum; i++) { + char buf[100]; + snprintf(buf, sizeof(buf), "%050d", i); + ASSERT_OK(Put(buf, buf)); + } + ASSERT_EQ(0, NumTables()); + Close(); + ASSERT_EQ(0, NumTables()); + ASSERT_EQ(1, NumLogs()); + uint64_t old_log_file = FirstLogFile(); + + // Force creation of multiple memtables by reducing the write buffer size. 
+ Options opt; + opt.reuse_logs = true; + opt.write_buffer_size = (kNum*100) / 2; + Open(&opt); + ASSERT_LE(2, NumTables()); + ASSERT_EQ(1, NumLogs()); + ASSERT_NE(old_log_file, FirstLogFile()) << "must not reuse log"; + for (int i = 0; i < kNum; i++) { + char buf[100]; + snprintf(buf, sizeof(buf), "%050d", i); + ASSERT_EQ(buf, Get(buf)); + } +} + +TEST(RecoveryTest, MultipleLogFiles) { + ASSERT_OK(Put("foo", "bar")); + Close(); + ASSERT_EQ(1, NumLogs()); + + // Make a bunch of uncompacted log files. + uint64_t old_log = FirstLogFile(); + MakeLogFile(old_log+1, 1000, "hello", "world"); + MakeLogFile(old_log+2, 1001, "hi", "there"); + MakeLogFile(old_log+3, 1002, "foo", "bar2"); + + // Recover and check that all log files were processed. + Open(); + ASSERT_LE(1, NumTables()); + ASSERT_EQ(1, NumLogs()); + uint64_t new_log = FirstLogFile(); + ASSERT_LE(old_log+3, new_log); + ASSERT_EQ("bar2", Get("foo")); + ASSERT_EQ("world", Get("hello")); + ASSERT_EQ("there", Get("hi")); + + // Test that previous recovery produced recoverable state. + Open(); + ASSERT_LE(1, NumTables()); + ASSERT_EQ(1, NumLogs()); + if (CanAppend()) { + ASSERT_EQ(new_log, FirstLogFile()); + } + ASSERT_EQ("bar2", Get("foo")); + ASSERT_EQ("world", Get("hello")); + ASSERT_EQ("there", Get("hi")); + + // Check that introducing an older log file does not cause it to be re-read. + Close(); + MakeLogFile(old_log+1, 2000, "hello", "stale write"); + Open(); + ASSERT_LE(1, NumTables()); + ASSERT_EQ(1, NumLogs()); + if (CanAppend()) { + ASSERT_EQ(new_log, FirstLogFile()); + } + ASSERT_EQ("bar2", Get("foo")); + ASSERT_EQ("world", Get("hello")); + ASSERT_EQ("there", Get("hi")); +} + +} // namespace leveldb + +int main(int argc, char** argv) { + return leveldb::test::RunAllTests(); +} diff --git a/db/skiplist.h b/db/skiplist.h index ed8b09220..8bd77764d 100644 --- a/db/skiplist.h +++ b/db/skiplist.h @@ -1,10 +1,10 @@ -#ifndef STORAGE_LEVELDB_DB_SKIPLIST_H_ -#define STORAGE_LEVELDB_DB_SKIPLIST_H_ - // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// + +#ifndef STORAGE_LEVELDB_DB_SKIPLIST_H_ +#define STORAGE_LEVELDB_DB_SKIPLIST_H_ + // Thread safety // ------------- // diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc index c78f4b4fb..aee1461e1 100644 --- a/db/skiplist_test.cc +++ b/db/skiplist_test.cc @@ -250,7 +250,7 @@ class ConcurrentTest { // Note that generation 0 is never inserted, so it is ok if // <*,0,*> is missing. 
ASSERT_TRUE((gen(pos) == 0) || - (gen(pos) > initial_state.Get(key(pos))) + (gen(pos) > static_cast(initial_state.Get(key(pos)))) ) << "key: " << key(pos) << "; gen: " << gen(pos) << "; initgen: " diff --git a/db/snapshot.h b/db/snapshot.h index e7f8fd2c3..6ed413c42 100644 --- a/db/snapshot.h +++ b/db/snapshot.h @@ -5,6 +5,7 @@ #ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_ #define STORAGE_LEVELDB_DB_SNAPSHOT_H_ +#include "db/dbformat.h" #include "leveldb/db.h" namespace leveldb { diff --git a/db/version_set.cc b/db/version_set.cc index aa83df55e..a5e0f77a6 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -893,7 +893,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { return s; } -Status VersionSet::Recover() { +Status VersionSet::Recover(bool *save_manifest) { struct LogReporter : public log::Reader::Reporter { Status* status; virtual void Corruption(size_t bytes, const Status& s) { @@ -1003,11 +1003,49 @@ Status VersionSet::Recover() { last_sequence_ = last_sequence; log_number_ = log_number; prev_log_number_ = prev_log_number; + + // See if we can reuse the existing MANIFEST file. + if (ReuseManifest(dscname, current)) { + // No need to save new manifest + } else { + *save_manifest = true; + } } return s; } +bool VersionSet::ReuseManifest(const std::string& dscname, + const std::string& dscbase) { + if (!options_->reuse_logs) { + return false; + } + FileType manifest_type; + uint64_t manifest_number; + uint64_t manifest_size; + if (!ParseFileName(dscbase, &manifest_number, &manifest_type) || + manifest_type != kDescriptorFile || + !env_->GetFileSize(dscname, &manifest_size).ok() || + // Make new compacted MANIFEST if old one is too big + manifest_size >= kTargetFileSize) { + return false; + } + + assert(descriptor_file_ == NULL); + assert(descriptor_log_ == NULL); + Status r = env_->NewAppendableFile(dscname, &descriptor_file_); + if (!r.ok()) { + Log(options_->info_log, "Reuse MANIFEST: %s\n", r.ToString().c_str()); + assert(descriptor_file_ == NULL); + return false; + } + + Log(options_->info_log, "Reusing MANIFEST %s\n", dscname.c_str()); + descriptor_log_ = new log::Writer(descriptor_file_, manifest_size); + manifest_file_number_ = manifest_number; + return true; +} + void VersionSet::MarkFileNumberUsed(uint64_t number) { if (next_file_number_ <= number) { next_file_number_ = number + 1; diff --git a/db/version_set.h b/db/version_set.h index 8dc14b8e0..1dec74567 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -179,7 +179,7 @@ class VersionSet { EXCLUSIVE_LOCKS_REQUIRED(mu); // Recover the last saved descriptor from persistent storage. - Status Recover(); + Status Recover(bool *save_manifest); // Return the current version. Version* current() const { return current_; } @@ -274,6 +274,8 @@ class VersionSet { friend class Compaction; friend class Version; + bool ReuseManifest(const std::string& dscname, const std::string& dscbase); + void Finalize(Version* v); void GetRange(const std::vector& inputs, diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index 310a3c891..9448ef7b2 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -5,6 +5,7 @@ #ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ #define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ +#include "db/dbformat.h" #include "leveldb/write_batch.h" namespace leveldb { diff --git a/doc/index.html b/doc/index.html index 3ed0ed9d9..215519258 100644 --- a/doc/index.html +++ b/doc/index.html @@ -22,7 +22,7 @@ directory. 
The following example shows how to open a database, creating it if necessary:

-  #include <assert>
+  #include <cassert>
   #include "leveldb/db.h"
 
   leveldb::DB* db;
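
For context, the complete open-or-create snippet that this hunk touches reads
roughly as follows (a sketch assembled from the surrounding docs; the
"/tmp/testdb" path is illustrative):

    #include <cassert>
    #include "leveldb/db.h"

    leveldb::DB* db;
    leveldb::Options options;
    options.create_if_missing = true;  // create the database if it is absent
    leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
    assert(status.ok());
    // ... use the database ...
    delete db;  // deleting the handle closes the database
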
diff --git a/helpers/memenv/memenv.cc b/helpers/memenv/memenv.cc
index 43ef2e072..9a98884da 100644
--- a/helpers/memenv/memenv.cc
+++ b/helpers/memenv/memenv.cc
@@ -277,6 +277,19 @@ class InMemoryEnv : public EnvWrapper {
     return Status::OK();
   }
 
+  virtual Status NewAppendableFile(const std::string& fname,
+                                   WritableFile** result) {
+    MutexLock lock(&mutex_);
+    FileState** sptr = &file_map_[fname];
+    FileState* file = *sptr;
+    if (file == NULL) {
+      file = new FileState();
+      file->Ref();
+    }
+    *result = new WritableFileImpl(file);
+    return Status::OK();
+  }
+
   virtual bool FileExists(const std::string& fname) {
     MutexLock lock(&mutex_);
     return file_map_.find(fname) != file_map_.end();
diff --git a/helpers/memenv/memenv_test.cc b/helpers/memenv/memenv_test.cc
index a44310fed..5cff77613 100644
--- a/helpers/memenv/memenv_test.cc
+++ b/helpers/memenv/memenv_test.cc
@@ -40,6 +40,8 @@ TEST(MemEnvTest, Basics) {
 
   // Create a file.
   ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(0, file_size);
   delete writable_file;
 
   // Check that the file exists.
@@ -55,9 +57,16 @@ TEST(MemEnvTest, Basics) {
   ASSERT_OK(writable_file->Append("abc"));
   delete writable_file;
 
-  // Check for expected size.
+  // Check that append works.
+  ASSERT_OK(env_->NewAppendableFile("/dir/f", &writable_file));
   ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
   ASSERT_EQ(3, file_size);
+  ASSERT_OK(writable_file->Append("hello"));
+  delete writable_file;
+
+  // Check for expected size.
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(8, file_size);
 
   // Check that renaming works.
   ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
@@ -65,7 +74,7 @@ TEST(MemEnvTest, Basics) {
   ASSERT_TRUE(!env_->FileExists("/dir/f"));
   ASSERT_TRUE(env_->FileExists("/dir/g"));
   ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
-  ASSERT_EQ(3, file_size);
+  ASSERT_EQ(8, file_size);
 
   // Check that opening non-existent file fails.
   SequentialFile* seq_file;
diff --git a/include/leveldb/cache.h b/include/leveldb/cache.h
index 1a201e5e0..6819d5bc4 100644
--- a/include/leveldb/cache.h
+++ b/include/leveldb/cache.h
@@ -81,6 +81,17 @@ class Cache {
   // its cache keys.
   virtual uint64_t NewId() = 0;
 
+  // Remove all cache entries that are not actively in use.  Memory-constrained
+  // applications may wish to call this method to reduce memory usage.
+  // Default implementation of Prune() does nothing.  Subclasses are strongly
+  // encouraged to override the default implementation.  A future release of
+  // leveldb may change Prune() to a pure abstract method.
+  virtual void Prune() {}
+
+  // Return an estimate of the combined charges of all elements stored in the
+  // cache.
+  virtual size_t TotalCharge() const = 0;
+
  private:
   void LRU_Remove(Handle* e);
   void LRU_Append(Handle* e);
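
A usage sketch for the two new Cache methods (the byte budget below is an
arbitrary example value, not something leveldb prescribes):

    // Shrink the cache once its estimated footprint exceeds a budget.
    void MaybeShrinkCache(leveldb::Cache* cache) {
      if (cache->TotalCharge() > 8 << 20) {  // hypothetical 8 MB budget
        cache->Prune();  // drops every entry no client currently holds
      }
    }
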
diff --git a/include/leveldb/db.h b/include/leveldb/db.h
index 4c169bf22..9752cbad5 100644
--- a/include/leveldb/db.h
+++ b/include/leveldb/db.h
@@ -14,7 +14,7 @@ namespace leveldb {
 
 // Update Makefile if you change these
 static const int kMajorVersion = 1;
-static const int kMinorVersion = 18;
+static const int kMinorVersion = 19;
 
 struct Options;
 struct ReadOptions;
@@ -115,6 +115,8 @@ class DB {
   //     about the internal operation of the DB.
   //  "leveldb.sstables" - returns a multi-line string that describes all
   //     of the sstables that make up the db contents.
+  //  "leveldb.approximate-memory-usage" - returns the approximate number of
+  //     bytes of memory in use by the DB.
   virtual bool GetProperty(const Slice& property, std::string* value) = 0;
 
   // For each i in [0,n-1], store in "sizes[i]", the approximate
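
A minimal sketch of polling the new property; like the other properties it
fills in a string and returns false for unknown names:

    std::string mem;
    if (db->GetProperty("leveldb.approximate-memory-usage", &mem)) {
      // "mem" now holds a decimal byte count, e.g. "4194304".
      fprintf(stderr, "approximate memory usage: %s bytes\n", mem.c_str());
    }
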
diff --git a/include/leveldb/env.h b/include/leveldb/env.h
index f709514da..99b6c2141 100644
--- a/include/leveldb/env.h
+++ b/include/leveldb/env.h
@@ -69,6 +69,21 @@ class Env {
   virtual Status NewWritableFile(const std::string& fname,
                                  WritableFile** result) = 0;
 
+  // Create an object that either appends to an existing file, or
+  // writes to a new file (if the file does not exist to begin with).
+  // On success, stores a pointer to the new file in *result and
+  // returns OK.  On failure stores NULL in *result and returns
+  // non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  //
+  // May return an IsNotSupportedError error if this Env does
+  // not allow appending to an existing file.  Users of Env (including
+  // the leveldb implementation) must be prepared to deal with
+  // an Env that does not support appending.
+  virtual Status NewAppendableFile(const std::string& fname,
+                                   WritableFile** result);
+
   // Returns true iff the named file exists.
   virtual bool FileExists(const std::string& fname) = 0;
 
@@ -289,6 +304,9 @@ class EnvWrapper : public Env {
   Status NewWritableFile(const std::string& f, WritableFile** r) {
     return target_->NewWritableFile(f, r);
   }
+  Status NewAppendableFile(const std::string& f, WritableFile** r) {
+    return target_->NewAppendableFile(f, r);
+  }
   bool FileExists(const std::string& f) { return target_->FileExists(f); }
   Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
     return target_->GetChildren(dir, r);
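
Because an Env may legitimately refuse to append, callers are expected to
probe and fall back; a sketch of the pattern (it mirrors the CanAppend()
helper in db/recovery_test.cc earlier in this patch):

    leveldb::WritableFile* file = NULL;
    leveldb::Status s = env->NewAppendableFile(fname, &file);
    if (s.IsNotSupportedError()) {
      // This Env cannot reopen files for append; start a fresh file instead.
      s = env->NewWritableFile(fname, &file);
    }
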
diff --git a/include/leveldb/iterator.h b/include/leveldb/iterator.h
index 76aced04b..da631ed9d 100644
--- a/include/leveldb/iterator.h
+++ b/include/leveldb/iterator.h
@@ -37,7 +37,7 @@ class Iterator {
   // Valid() after this call iff the source is not empty.
   virtual void SeekToLast() = 0;
 
-  // Position at the first key in the source that at or past target
+  // Position at the first key in the source that is at or past target.
   // The iterator is Valid() after this call iff the source contains
   // an entry that comes at or past target.
   virtual void Seek(const Slice& target) = 0;
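
The clarified contract is what bounded range scans rely on; a typical loop,
assuming std::string variables start and limit:

    leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
    for (it->Seek(start); it->Valid() && it->key().ToString() < limit;
         it->Next()) {
      // it->key() is the first key at or past "start", then its successors.
    }
    delete it;
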
diff --git a/include/leveldb/options.h b/include/leveldb/options.h
index 7c9b97345..83a1ef39a 100644
--- a/include/leveldb/options.h
+++ b/include/leveldb/options.h
@@ -128,6 +128,12 @@ struct Options {
   // efficiently detect that and will switch to uncompressed mode.
   CompressionType compression;
 
+  // EXPERIMENTAL: If true, append to existing MANIFEST and log files
+  // when a database is opened.  This can significantly speed up open.
+  //
+  // Default: currently false, but may become true later.
+  bool reuse_logs;
+
   // If non-NULL, use the specified filter policy to reduce disk reads.
   // Many applications will benefit from passing the result of
   // NewBloomFilterPolicy() here.
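
Opting in to the experimental behavior is a single option at open time; a
sketch:

    leveldb::Options options;
    options.create_if_missing = true;
    options.reuse_logs = true;  // EXPERIMENTAL: reuse MANIFEST/log on open
    leveldb::DB* db;
    leveldb::Status s = leveldb::DB::Open(options, "/path/to/db", &db);
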
diff --git a/include/leveldb/status.h b/include/leveldb/status.h
index 11dbd4b47..d9575f975 100644
--- a/include/leveldb/status.h
+++ b/include/leveldb/status.h
@@ -60,6 +60,12 @@ class Status {
   // Returns true iff the status indicates an IOError.
   bool IsIOError() const { return code() == kIOError; }
 
+  // Returns true iff the status indicates a NotSupportedError.
+  bool IsNotSupportedError() const { return code() == kNotSupported; }
+
+  // Returns true iff the status indicates an InvalidArgument.
+  bool IsInvalidArgument() const { return code() == kInvalidArgument; }
+
   // Return a string representation of this status suitable for printing.
   // Returns the string "OK" for success.
   std::string ToString() const;
diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h
index 9bf091f75..1c4c7aafc 100644
--- a/port/atomic_pointer.h
+++ b/port/atomic_pointer.h
@@ -35,8 +35,12 @@
 #define ARCH_CPU_X86_FAMILY 1
 #elif defined(__ARMEL__)
 #define ARCH_CPU_ARM_FAMILY 1
+#elif defined(__aarch64__)
+#define ARCH_CPU_ARM64_FAMILY 1
 #elif defined(__ppc__) || defined(__powerpc__) || defined(__powerpc64__)
 #define ARCH_CPU_PPC_FAMILY 1
+#elif defined(__mips__)
+#define ARCH_CPU_MIPS_FAMILY 1
 #endif
 
 namespace leveldb {
@@ -92,6 +96,13 @@ inline void MemoryBarrier() {
 }
 #define LEVELDB_HAVE_MEMORY_BARRIER
 
+// ARM64
+#elif defined(ARCH_CPU_ARM64_FAMILY)
+inline void MemoryBarrier() {
+  asm volatile("dmb sy" : : : "memory");
+}
+#define LEVELDB_HAVE_MEMORY_BARRIER
+
 // PPC
 #elif defined(ARCH_CPU_PPC_FAMILY) && defined(__GNUC__)
 inline void MemoryBarrier() {
@@ -101,6 +112,13 @@ inline void MemoryBarrier() {
 }
 #define LEVELDB_HAVE_MEMORY_BARRIER
 
+// MIPS
+#elif defined(ARCH_CPU_MIPS_FAMILY) && defined(__GNUC__)
+inline void MemoryBarrier() {
+  __asm__ __volatile__("sync" : : : "memory");
+}
+#define LEVELDB_HAVE_MEMORY_BARRIER
+
 #endif
 
 // AtomicPointer built using platform-specific MemoryBarrier()
@@ -215,6 +233,7 @@ class AtomicPointer {
 #undef LEVELDB_HAVE_MEMORY_BARRIER
 #undef ARCH_CPU_X86_FAMILY
 #undef ARCH_CPU_ARM_FAMILY
+#undef ARCH_CPU_ARM64_FAMILY
 #undef ARCH_CPU_PPC_FAMILY
 
 }  // namespace port
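
On platforms that define MemoryBarrier(), AtomicPointer is built from plain
loads and stores fenced by the barrier, roughly as follows (condensed from
the LEVELDB_HAVE_MEMORY_BARRIER section of this header):

    class AtomicPointer {
     private:
      void* rep_;
     public:
      AtomicPointer() { }
      explicit AtomicPointer(void* p) : rep_(p) {}
      inline void* NoBarrier_Load() const { return rep_; }
      inline void NoBarrier_Store(void* v) { rep_ = v; }
      inline void* Acquire_Load() const {
        void* result = rep_;
        MemoryBarrier();  // later reads cannot be reordered before this load
        return result;
      }
      inline void Release_Store(void* v) {
        MemoryBarrier();  // earlier writes cannot be reordered past this store
        rep_ = v;
      }
    };
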
diff --git a/port/port_posix.cc b/port/port_posix.cc
index 5ba127a5b..30e8007ae 100644
--- a/port/port_posix.cc
+++ b/port/port_posix.cc
@@ -7,7 +7,6 @@
 #include <cstdlib>
 #include <stdio.h>
 #include <string.h>
-#include "util/logging.h"
 
 namespace leveldb {
 namespace port {
diff --git a/table/filter_block.cc b/table/filter_block.cc
index 203e15c8b..4e78b954f 100644
--- a/table/filter_block.cc
+++ b/table/filter_block.cc
@@ -68,7 +68,7 @@ void FilterBlockBuilder::GenerateFilter() {
 
   // Generate filter for current set of keys and append to result_.
   filter_offsets_.push_back(result_.size());
-  policy_->CreateFilter(&tmp_keys_[0], num_keys, &result_);
+  policy_->CreateFilter(&tmp_keys_[0], static_cast<int>(num_keys), &result_);
 
   tmp_keys_.clear();
   keys_.clear();
@@ -97,7 +97,7 @@ bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, const Slice& key) {
   if (index < num_) {
     uint32_t start = DecodeFixed32(offset_ + index*4);
     uint32_t limit = DecodeFixed32(offset_ + index*4 + 4);
-    if (start <= limit && limit <= (offset_ - data_)) {
+    if (start <= limit && limit <= static_cast<size_t>(offset_ - data_)) {
       Slice filter = Slice(data_ + start, limit - start);
       return policy_->KeyMayMatch(key, filter);
     } else if (start == limit) {
diff --git a/table/format.cc b/table/format.cc
index aa63144c9..24e4e0244 100644
--- a/table/format.cc
+++ b/table/format.cc
@@ -30,15 +30,14 @@ Status BlockHandle::DecodeFrom(Slice* input) {
 }
 
 void Footer::EncodeTo(std::string* dst) const {
-#ifndef NDEBUG
   const size_t original_size = dst->size();
-#endif
   metaindex_handle_.EncodeTo(dst);
   index_handle_.EncodeTo(dst);
   dst->resize(2 * BlockHandle::kMaxEncodedLength);  // Padding
   PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber & 0xffffffffu));
   PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber >> 32));
   assert(dst->size() == original_size + kEncodedLength);
+  (void)original_size;  // Disable unused variable warning.
 }
 
 Status Footer::DecodeFrom(Slice* input) {
diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h
index 9e16b3dbe..f410c3fab 100644
--- a/table/iterator_wrapper.h
+++ b/table/iterator_wrapper.h
@@ -5,6 +5,9 @@
 #ifndef STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
 #define STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
 
+#include "leveldb/iterator.h"
+#include "leveldb/slice.h"
+
 namespace leveldb {
 
 // An internal wrapper class with an interface similar to Iterator that
diff --git a/table/table.cc b/table/table.cc
index dff8a8259..decf8082c 100644
--- a/table/table.cc
+++ b/table/table.cc
@@ -82,7 +82,7 @@ Status Table::Open(const Options& options,
     *table = new Table(rep);
     (*table)->ReadMeta(footer);
   } else {
-    if (index_block) delete index_block;
+    delete index_block;
   }
 
   return s;
diff --git a/table/table_test.cc b/table/table_test.cc
index c723bf84c..abf6e246f 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -853,12 +853,20 @@ TEST(TableTest, ApproximateOffsetOfCompressed) {
   options.compression = kSnappyCompression;
   c.Finish(options, &keys, &kvmap);
 
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"),       0,      0));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"),       0,      0));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"),       0,      0));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"),    2000,   3000));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"),    2000,   3000));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"),    4000,   6000));
+  // Expected upper and lower bounds of space used by compressible strings.
+  static const int kSlop = 1000;  // Compressor effectiveness varies.
+  const int expected = 2500;  // 10000 * compression ratio (0.25)
+  const int min_z = expected - kSlop;
+  const int max_z = expected + kSlop;
+
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, kSlop));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, kSlop));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, kSlop));
+  // Have now emitted a large compressible string, so adjust expected offset.
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), min_z, max_z));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), min_z, max_z));
+  // Have now emitted two large compressible strings, so adjust expected offset.
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 2 * min_z, 2 * max_z));
 }
 
 }  // namespace leveldb
diff --git a/util/arena.cc b/util/arena.cc
index 9367f7149..74078213e 100644
--- a/util/arena.cc
+++ b/util/arena.cc
@@ -9,8 +9,7 @@ namespace leveldb {
 
 static const int kBlockSize = 4096;
 
-Arena::Arena() {
-  blocks_memory_ = 0;
+Arena::Arena() : memory_usage_(0) {
   alloc_ptr_ = NULL;  // First allocation will allocate a block
   alloc_bytes_remaining_ = 0;
 }
@@ -60,8 +59,9 @@ char* Arena::AllocateAligned(size_t bytes) {
 
 char* Arena::AllocateNewBlock(size_t block_bytes) {
   char* result = new char[block_bytes];
-  blocks_memory_ += block_bytes;
   blocks_.push_back(result);
+  memory_usage_.NoBarrier_Store(
+      reinterpret_cast<void*>(MemoryUsage() + block_bytes + sizeof(char*)));
   return result;
 }
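
The design point here: the byte count is kept in a port::AtomicPointer (as a
pointer-sized integer) so that ApproximateMemoryUsage() can be read without
external synchronization, per the memtable.h comment change above. In sketch
form, the writer and reader sides are:

    // Writer (runs under the DB mutex): bump the estimate.
    memory_usage_.NoBarrier_Store(
        reinterpret_cast<void*>(MemoryUsage() + block_bytes + sizeof(char*)));

    // Reader (any thread): load a possibly slightly stale estimate.
    size_t usage = reinterpret_cast<uintptr_t>(memory_usage_.NoBarrier_Load());
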
 
diff --git a/util/arena.h b/util/arena.h
index 73bbf1cb9..48bab3374 100644
--- a/util/arena.h
+++ b/util/arena.h
@@ -9,6 +9,7 @@
 #include <assert.h>
 #include <stddef.h>
 #include <stdint.h>
+#include "port/port.h"
 
 namespace leveldb {
 
@@ -24,10 +25,9 @@ class Arena {
   char* AllocateAligned(size_t bytes);
 
   // Returns an estimate of the total memory usage of data allocated
-  // by the arena (including space allocated but not yet used for user
-  // allocations).
+  // by the arena.
   size_t MemoryUsage() const {
-    return blocks_memory_ + blocks_.capacity() * sizeof(char*);
+    return reinterpret_cast<uintptr_t>(memory_usage_.NoBarrier_Load());
   }
 
  private:
@@ -41,8 +41,8 @@ class Arena {
   // Array of new[] allocated memory blocks
   std::vector<char*> blocks_;
 
-  // Bytes of memory in blocks allocated so far
-  size_t blocks_memory_;
+  // Total memory usage of the arena.
+  port::AtomicPointer memory_usage_;
 
   // No copying allowed
   Arena(const Arena&);
diff --git a/util/bloom.cc b/util/bloom.cc
index a27a2ace2..bf3e4ca6e 100644
--- a/util/bloom.cc
+++ b/util/bloom.cc
@@ -47,7 +47,7 @@ class BloomFilterPolicy : public FilterPolicy {
     dst->resize(init_size + bytes, 0);
     dst->push_back(static_cast<char>(k_));  // Remember # of probes in filter
     char* array = &(*dst)[init_size];
-    for (size_t i = 0; i < n; i++) {
+    for (int i = 0; i < n; i++) {
       // Use double-hashing to generate a sequence of hash values.
       // See analysis in [Kirsch,Mitzenmacher 2006].
       uint32_t h = BloomHash(keys[i]);
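
For context, the remainder of this loop (unchanged by the patch) is the
Kirsch-Mitzenmacher double-hashing construction: all k_ probe positions are
derived from one hash plus a rotated delta instead of k_ independent hashes:

    uint32_t h = BloomHash(keys[i]);
    const uint32_t delta = (h >> 17) | (h << 15);  // rotate right 17 bits
    for (size_t j = 0; j < k_; j++) {
      const uint32_t bitpos = h % bits;
      array[bitpos / 8] |= (1 << (bitpos % 8));
      h += delta;
    }
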
diff --git a/util/bloom_test.cc b/util/bloom_test.cc
index 77fb1b315..1b87a2be3 100644
--- a/util/bloom_test.cc
+++ b/util/bloom_test.cc
@@ -46,7 +46,8 @@ class BloomTest {
       key_slices.push_back(Slice(keys_[i]));
     }
     filter_.clear();
-    policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_);
+    policy_->CreateFilter(&key_slices[0], static_cast<int>(key_slices.size()),
+                          &filter_);
     keys_.clear();
     if (kVerbose >= 2) DumpFilter();
   }
diff --git a/util/cache.cc b/util/cache.cc
index 8b197bc02..ce4688617 100644
--- a/util/cache.cc
+++ b/util/cache.cc
@@ -19,6 +19,23 @@ Cache::~Cache() {
 namespace {
 
 // LRU cache implementation
+//
+// Cache entries have an "in_cache" boolean indicating whether the cache has a
+// reference on the entry.  The only ways that this can become false without the
+// entry being passed to its "deleter" are via Erase(), via Insert() when
+// an element with a duplicate key is inserted, or on destruction of the cache.
+//
+// The cache keeps two linked lists of items in the cache.  All items in the
+// cache are in one list or the other, and never both.  Items still referenced
+// by clients but erased from the cache are in neither list.  The lists are:
+// - in-use:  contains the items currently referenced by clients, in no
+//   particular order.  (This list is used for invariant checking.  If we
+//   removed the check, elements that would otherwise be on this list could be
+//   left as disconnected singleton lists.)
+// - LRU:  contains the items not currently referenced by clients, in LRU order
+// Elements are moved between these lists by the Ref() and Unref() methods,
+// when they detect an element in the cache acquiring or losing its only
+// external reference.
 
 // An entry is a variable length heap-allocated structure.  Entries
 // are kept in a circular doubly linked list ordered by access time.
@@ -30,7 +47,8 @@ struct LRUHandle {
   LRUHandle* prev;
   size_t charge;      // TODO(opt): Only allow uint32_t?
   size_t key_length;
-  uint32_t refs;
+  bool in_cache;      // Whether entry is in the cache.
+  uint32_t refs;      // References, including cache reference, if present.
   uint32_t hash;      // Hash of key(); used for fast sharding and comparisons
   char key_data[1];   // Beginning of key
 
@@ -147,49 +165,77 @@ class LRUCache {
   Cache::Handle* Lookup(const Slice& key, uint32_t hash);
   void Release(Cache::Handle* handle);
   void Erase(const Slice& key, uint32_t hash);
+  void Prune();
+  size_t TotalCharge() const {
+    MutexLock l(&mutex_);
+    return usage_;
+  }
 
  private:
   void LRU_Remove(LRUHandle* e);
-  void LRU_Append(LRUHandle* e);
+  void LRU_Append(LRUHandle* list, LRUHandle* e);
+  void Ref(LRUHandle* e);
   void Unref(LRUHandle* e);
+  bool FinishErase(LRUHandle* e);
 
   // Initialized before use.
   size_t capacity_;
 
   // mutex_ protects the following state.
-  port::Mutex mutex_;
+  mutable port::Mutex mutex_;
   size_t usage_;
 
   // Dummy head of LRU list.
   // lru.prev is newest entry, lru.next is oldest entry.
+  // Entries have refs==1 and in_cache==true.
   LRUHandle lru_;
 
+  // Dummy head of in-use list.
+  // Entries are in use by clients, and have refs >= 2 and in_cache==true.
+  LRUHandle in_use_;
+
   HandleTable table_;
 };
 
 LRUCache::LRUCache()
     : usage_(0) {
-  // Make empty circular linked list
+  // Make empty circular linked lists.
   lru_.next = &lru_;
   lru_.prev = &lru_;
+  in_use_.next = &in_use_;
+  in_use_.prev = &in_use_;
 }
 
 LRUCache::~LRUCache() {
+  assert(in_use_.next == &in_use_);  // Error if caller has an unreleased handle
   for (LRUHandle* e = lru_.next; e != &lru_; ) {
     LRUHandle* next = e->next;
-    assert(e->refs == 1);  // Error if caller has an unreleased handle
+    assert(e->in_cache);
+    e->in_cache = false;
+    assert(e->refs == 1);  // Invariant of lru_ list.
     Unref(e);
     e = next;
   }
 }
 
+void LRUCache::Ref(LRUHandle* e) {
+  if (e->refs == 1 && e->in_cache) {  // If on lru_ list, move to in_use_ list.
+    LRU_Remove(e);
+    LRU_Append(&in_use_, e);
+  }
+  e->refs++;
+}
+
 void LRUCache::Unref(LRUHandle* e) {
   assert(e->refs > 0);
   e->refs--;
-  if (e->refs <= 0) {
-    usage_ -= e->charge;
+  if (e->refs == 0) { // Deallocate.
+    assert(!e->in_cache);
     (*e->deleter)(e->key(), e->value);
     free(e);
+  } else if (e->in_cache && e->refs == 1) {  // No longer in use; move to lru_ list.
+    LRU_Remove(e);
+    LRU_Append(&lru_, e);
   }
 }
 
@@ -198,10 +244,10 @@ void LRUCache::LRU_Remove(LRUHandle* e) {
   e->prev->next = e->next;
 }
 
-void LRUCache::LRU_Append(LRUHandle* e) {
-  // Make "e" newest entry by inserting just before lru_
-  e->next = &lru_;
-  e->prev = lru_.prev;
+void LRUCache::LRU_Append(LRUHandle* list, LRUHandle* e) {
+  // Make "e" newest entry by inserting just before *list
+  e->next = list;
+  e->prev = list->prev;
   e->prev->next = e;
   e->next->prev = e;
 }
@@ -210,9 +256,7 @@ Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) {
   MutexLock l(&mutex_);
   LRUHandle* e = table_.Lookup(key, hash);
   if (e != NULL) {
-    e->refs++;
-    LRU_Remove(e);
-    LRU_Append(e);
+    Ref(e);
   }
   return reinterpret_cast<Cache::Handle*>(e);
 }
@@ -234,33 +278,57 @@ Cache::Handle* LRUCache::Insert(
   e->charge = charge;
   e->key_length = key.size();
   e->hash = hash;
-  e->refs = 2;  // One from LRUCache, one for the returned handle
+  e->in_cache = false;
+  e->refs = 1;  // for the returned handle.
   memcpy(e->key_data, key.data(), key.size());
-  LRU_Append(e);
-  usage_ += charge;
 
-  LRUHandle* old = table_.Insert(e);
-  if (old != NULL) {
-    LRU_Remove(old);
-    Unref(old);
-  }
+  if (capacity_ > 0) {
+    e->refs++;  // for the cache's reference.
+    e->in_cache = true;
+    LRU_Append(&in_use_, e);
+    usage_ += charge;
+    FinishErase(table_.Insert(e));
+  } // else don't cache.  (Tests use capacity_==0 to turn off caching.)
 
   while (usage_ > capacity_ && lru_.next != &lru_) {
     LRUHandle* old = lru_.next;
-    LRU_Remove(old);
-    table_.Remove(old->key(), old->hash);
-    Unref(old);
+    assert(old->refs == 1);
+    bool erased = FinishErase(table_.Remove(old->key(), old->hash));
+    if (!erased) {  // to avoid unused variable when compiled NDEBUG
+      assert(erased);
+    }
   }
 
   return reinterpret_cast<Cache::Handle*>(e);
 }
 
+// If e != NULL, finish removing *e from the cache; it has already been removed
+// from the hash table.  Return whether e != NULL.  Requires mutex_ held.
+bool LRUCache::FinishErase(LRUHandle* e) {
+  if (e != NULL) {
+    assert(e->in_cache);
+    LRU_Remove(e);
+    e->in_cache = false;
+    usage_ -= e->charge;
+    Unref(e);
+  }
+  return e != NULL;
+}
+
 void LRUCache::Erase(const Slice& key, uint32_t hash) {
   MutexLock l(&mutex_);
-  LRUHandle* e = table_.Remove(key, hash);
-  if (e != NULL) {
-    LRU_Remove(e);
-    Unref(e);
+  FinishErase(table_.Remove(key, hash));
+}
+
+void LRUCache::Prune() {
+  MutexLock l(&mutex_);
+  while (lru_.next != &lru_) {
+    LRUHandle* e = lru_.next;
+    assert(e->refs == 1);
+    bool erased = FinishErase(table_.Remove(e->key(), e->hash));
+    if (!erased) {  // to avoid unused variable when compiled NDEBUG
+      assert(erased);
+    }
   }
 }
 
@@ -314,6 +382,18 @@ class ShardedLRUCache : public Cache {
     MutexLock l(&id_mutex_);
     return ++(last_id_);
   }
+  virtual void Prune() {
+    for (int s = 0; s < kNumShards; s++) {
+      shard_[s].Prune();
+    }
+  }
+  virtual size_t TotalCharge() const {
+    size_t total = 0;
+    for (int s = 0; s < kNumShards; s++) {
+      total += shard_[s].TotalCharge();
+    }
+    return total;
+  }
 };
 
 }  // end anonymous namespace
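
Taken together, the rewrite above gives the cache two public additions, Prune()
and TotalCharge(), on top of the usual Insert/Lookup/Release/Erase. A hedged
usage sketch, assuming the matching declarations are added to
include/leveldb/cache.h (as the virtual overrides above imply; capacity and
charge values are illustrative):

    #include <cassert>
    #include "leveldb/cache.h"

    static void DeleteValue(const leveldb::Slice& key, void* value) {
      delete static_cast<int*>(value);
    }

    int main() {
      leveldb::Cache* cache = leveldb::NewLRUCache(1024 /* capacity, in charge units */);
      leveldb::Cache::Handle* h =
          cache->Insert("answer", new int(42), 1 /* charge */, &DeleteValue);
      // While h is held, the entry sits on the in_use_ list and cannot be evicted.
      cache->Release(h);             // entry moves to the lru_ (evictable) list
      assert(cache->TotalCharge() == 1);
      cache->Prune();                // drops every entry no client references
      assert(cache->TotalCharge() == 0);
      delete cache;
      return 0;
    }
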
diff --git a/util/cache_test.cc b/util/cache_test.cc
index 43716715a..468f7a642 100644
--- a/util/cache_test.cc
+++ b/util/cache_test.cc
@@ -59,6 +59,11 @@ class CacheTest {
                                    &CacheTest::Deleter));
   }
 
+  Cache::Handle* InsertAndReturnHandle(int key, int value, int charge = 1) {
+    return cache_->Insert(EncodeKey(key), EncodeValue(value), charge,
+                          &CacheTest::Deleter);
+  }
+
   void Erase(int key) {
     cache_->Erase(EncodeKey(key));
   }
@@ -135,8 +140,11 @@ TEST(CacheTest, EntriesArePinned) {
 TEST(CacheTest, EvictionPolicy) {
   Insert(100, 101);
   Insert(200, 201);
+  Insert(300, 301);
+  Cache::Handle* h = cache_->Lookup(EncodeKey(300));
 
-  // Frequently used entry must be kept around
+  // Frequently used entry must be kept around,
+  // as must things that are still in use.
   for (int i = 0; i < kCacheSize + 100; i++) {
     Insert(1000+i, 2000+i);
     ASSERT_EQ(2000+i, Lookup(1000+i));
@@ -144,6 +152,25 @@ TEST(CacheTest, EvictionPolicy) {
   }
   ASSERT_EQ(101, Lookup(100));
   ASSERT_EQ(-1, Lookup(200));
+  ASSERT_EQ(301, Lookup(300));
+  cache_->Release(h);
+}
+
+TEST(CacheTest, UseExceedsCacheSize) {
+  // Overfill the cache, keeping handles on all inserted entries.
+  std::vector<Cache::Handle*> h;
+  for (int i = 0; i < kCacheSize + 100; i++) {
+    h.push_back(InsertAndReturnHandle(1000+i, 2000+i));
+  }
+
+  // Check that all the entries can be found in the cache.
+  for (int i = 0; i < h.size(); i++) {
+    ASSERT_EQ(2000+i, Lookup(1000+i));
+  }
+
+  for (int i = 0; i < h.size(); i++) {
+    cache_->Release(h[i]);
+  }
 }
 
 TEST(CacheTest, HeavyEntries) {
@@ -179,6 +206,19 @@ TEST(CacheTest, NewId) {
   ASSERT_NE(a, b);
 }
 
+TEST(CacheTest, Prune) {
+  Insert(1, 100);
+  Insert(2, 200);
+
+  Cache::Handle* handle = cache_->Lookup(EncodeKey(1));
+  ASSERT_TRUE(handle);
+  cache_->Prune();
+  cache_->Release(handle);
+
+  ASSERT_EQ(100, Lookup(1));
+  ASSERT_EQ(-1, Lookup(2));
+}
+
 }  // namespace leveldb
 
 int main(int argc, char** argv) {
diff --git a/util/env.cc b/util/env.cc
index c2600e964..c58a0821e 100644
--- a/util/env.cc
+++ b/util/env.cc
@@ -9,6 +9,10 @@ namespace leveldb {
 Env::~Env() {
 }
 
+Status Env::NewAppendableFile(const std::string& fname, WritableFile** result) {
+  return Status::NotSupported("NewAppendableFile", fname);
+}
+
 SequentialFile::~SequentialFile() {
 }
 
diff --git a/util/env_posix.cc b/util/env_posix.cc
index ba2667864..e0fca52f4 100644
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
@@ -351,6 +351,19 @@ class PosixEnv : public Env {
     return s;
   }
 
+  virtual Status NewAppendableFile(const std::string& fname,
+                                   WritableFile** result) {
+    Status s;
+    FILE* f = fopen(fname.c_str(), "a");
+    if (f == NULL) {
+      *result = NULL;
+      s = IOError(fname, errno);
+    } else {
+      *result = new PosixWritableFile(fname, f);
+    }
+    return s;
+  }
+
   virtual bool FileExists(const std::string& fname) {
     return access(fname.c_str(), F_OK) == 0;
   }
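
NewAppendableFile() is what later allows log and MANIFEST files to be reused
across restarts. A hedged caller-side sketch (the helper itself is
illustrative; the base-class default above still returns Status::NotSupported
for Env subclasses that predate this method):

    #include "leveldb/env.h"

    leveldb::Status AppendRecord(leveldb::Env* env, const std::string& fname,
                                 const leveldb::Slice& record) {
      leveldb::WritableFile* file = NULL;
      leveldb::Status s = env->NewAppendableFile(fname, &file);
      if (!s.ok()) return s;  // e.g. NotSupported from an older Env subclass
      s = file->Append(record);
      if (s.ok()) s = file->Sync();
      delete file;            // the destructor closes the file if still open
      return s;
    }
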
diff --git a/util/env_win.cc b/util/env_win.cc
index e11a96b79..b074b7579 100644
--- a/util/env_win.cc
+++ b/util/env_win.cc
@@ -106,7 +106,7 @@ private:
 class Win32WritableFile : public WritableFile
 {
 public:
-    Win32WritableFile(const std::string& fname);
+    Win32WritableFile(const std::string& fname, bool append);
     ~Win32WritableFile();
 
     virtual Status Append(const Slice& data);
@@ -158,6 +158,8 @@ public:
         RandomAccessFile** result);
     virtual Status NewWritableFile(const std::string& fname,
         WritableFile** result);
+    virtual Status NewAppendableFile(const std::string& fname,
+        WritableFile** result);
 
     virtual bool FileExists(const std::string& fname);
 
@@ -423,17 +425,23 @@ void Win32RandomAccessFile::_CleanUp()
     }
 }
 
-Win32WritableFile::Win32WritableFile(const std::string& fname)
+Win32WritableFile::Win32WritableFile(const std::string& fname, bool append)
     : filename_(fname)
 {
     std::wstring path;
     ToWidePath(fname, path);
-    DWORD Flag = PathFileExistsW(path.c_str()) ? OPEN_EXISTING : CREATE_ALWAYS;
+    // NewAppendableFile: append to an existing file, or create a new one
+    //     if none exists - this is OPEN_ALWAYS behavior, with
+    //     FILE_APPEND_DATA to avoid having to manually position the file
+    //     pointer at the end of the file.
+    // NewWritableFile: create a new file, delete if it exists - this is
+    //     CREATE_ALWAYS behavior. This file is used for writing only so
+    //     use GENERIC_WRITE.
     _hFile = CreateFileW(path.c_str(),
-                         GENERIC_READ | GENERIC_WRITE,
+                         append ? FILE_APPEND_DATA : GENERIC_WRITE,
                          FILE_SHARE_READ|FILE_SHARE_DELETE|FILE_SHARE_WRITE,
                          NULL,
-                         Flag,
+                         append ? OPEN_ALWAYS : CREATE_ALWAYS,
                          FILE_ATTRIBUTE_NORMAL,
                          NULL);
     // CreateFileW returns INVALID_HANDLE_VALUE in case of error, always check isEnable() before use
@@ -823,7 +831,9 @@ Status Win32Env::NewLogger( const std::string& fname, Logger** result )
 {
     Status sRet;
     std::string path = fname;
-    Win32WritableFile* pMapFile = new Win32WritableFile(ModifyPath(path));
+    // Logs are opened with write semantics, not with append semantics
+    // (see PosixEnv::NewLogger)
+    Win32WritableFile* pMapFile = new Win32WritableFile(ModifyPath(path), false);
     if(!pMapFile->isEnable()){
         delete pMapFile;
         *result = NULL;
@@ -837,7 +847,20 @@ Status Win32Env::NewWritableFile( const std::string& fname, WritableFile** resul
 {
     Status sRet;
     std::string path = fname;
-    Win32WritableFile* pFile = new Win32WritableFile(ModifyPath(path));
+    Win32WritableFile* pFile = new Win32WritableFile(ModifyPath(path), false);
+    if(!pFile->isEnable()){
+        *result = NULL;
+        sRet = Status::IOError(fname,Win32::GetLastErrSz());
+    }else
+        *result = pFile;
+    return sRet;
+}
+
+Status Win32Env::NewAppendableFile( const std::string& fname, WritableFile** result )
+{
+    Status sRet;
+    std::string path = fname;
+    Win32WritableFile* pFile = new Win32WritableFile(ModifyPath(path), true);
     if(!pFile->isEnable()){
         *result = NULL;
         sRet = Status::IOError(fname,Win32::GetLastErrSz());
diff --git a/util/options.cc b/util/options.cc
index 76af5b930..8b618fb1a 100644
--- a/util/options.cc
+++ b/util/options.cc
@@ -22,8 +22,8 @@ Options::Options()
       block_size(4096),
       block_restart_interval(16),
       compression(kSnappyCompression),
+      reuse_logs(false),
       filter_policy(NULL) {
 }
 
-
 }  // namespace leveldb
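
The new reuse_logs option defaults to false, so behavior is unchanged unless a
caller opts in. A hedged sketch of opting in (field names as in the public
headers; the open-wrapper itself is illustrative):

    #include "leveldb/db.h"

    leveldb::Status OpenReusingLogs(const std::string& name, leveldb::DB** dbptr) {
      leveldb::Options options;
      options.create_if_missing = true;
      // Append to the existing log/MANIFEST on recovery instead of rotating
      // them; this relies on Env::NewAppendableFile() support (see the env
      // changes above).
      options.reuse_logs = true;
      return leveldb::DB::Open(options, name, dbptr);
    }
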
diff --git a/util/testutil.h b/util/testutil.h
index adad3fc1e..d7e458370 100644
--- a/util/testutil.h
+++ b/util/testutil.h
@@ -45,6 +45,16 @@ class ErrorEnv : public EnvWrapper {
     }
     return target()->NewWritableFile(fname, result);
   }
+
+  virtual Status NewAppendableFile(const std::string& fname,
+                                   WritableFile** result) {
+    if (writable_file_error_) {
+      ++num_writable_file_errors_;
+      *result = NULL;
+      return Status::IOError(fname, "fake error");
+    }
+    return target()->NewAppendableFile(fname, result);
+  }
 };
 
 }  // namespace test
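
With this, ErrorEnv can inject write failures into both file-creation paths. A
hedged, test-only sketch using the fields shown above (ErrorEnv is an internal
test helper, so this would live next to the existing tests):

    #include <cassert>
    #include "leveldb/env.h"
    #include "util/testutil.h"

    void ExpectInjectedWriteError(leveldb::test::ErrorEnv* env) {
      env->writable_file_error_ = true;
      leveldb::WritableFile* file = NULL;
      leveldb::Status s = env->NewAppendableFile("somefile.log", &file);
      // s now carries the fake error and num_writable_file_errors_ was bumped.
      assert(!s.ok());
      env->writable_file_error_ = false;
    }
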

From f67e496552ca939919a08b75784d089efee555c7 Mon Sep 17 00:00:00 2001
From: Jack Grigg 
Date: Wed, 23 Sep 2020 00:06:36 +0100
Subject: [PATCH 07/19] Squashed 'src/leveldb/' changes from
 a31c8aa40..196962ff0

196962ff0 Add AcceleratedCRC32C to port_win.h
1bdf1c34c Merge upstream LevelDB v1.20
d31721eb0 Merge #17: Fixed file sharing errors
fecd44902 Fixed file sharing error in Win32Env::GetFileSize(), Win32SequentialFile::_Init(), Win32RandomAccessFile::_Init() Fixed error checking in Win32SequentialFile::_Init()
5b7510f1b Merge #14: Merge upstream LevelDB 1.19
0d969fd57 Merge #16: [LevelDB] Do no crash if filesystem can't fsync
c8c029b5b [LevelDB] Do no crash if filesystem can't fsync
a53934a3a Increase leveldb version to 1.20.
f3f139737 Separate Env tests from PosixEnv tests.
eb4f0972f leveldb: Fix compilation warnings in port_posix_sse.cc on x86 (32-bit).
d0883b600 Fixed path to doc file: index.md.
7fa20948d Convert documentation to markdown.
ea175e28f Implement support for Intel crc32 instruction (SSE 4.2)
95cd743e5 Including  for std::numeric_limits.
646c3588d Limit the number of read-only files the POSIX Env will have open.
d40bc3fa5 Merge #13: Typo
ebbd772d3 Typo
a2fb086d0 Add option for max file size. The currend hard-coded value of 2M is inefficient in colossus.

git-subtree-dir: src/leveldb
git-subtree-split: 196962ff01c39b4705d8117df5c3f8c205349950
---
 Makefile                     |  12 +-
 README.md                    |  25 +-
 build_detect_platform        |  30 +-
 db/db_bench.cc               |  38 ++-
 db/db_impl.cc                |   1 +
 db/log_format.h              |   2 +-
 db/version_set.cc            |  57 ++--
 db/version_set.h             |   2 +-
 doc/doc.css                  |  89 ------
 doc/impl.html                | 213 --------------
 doc/impl.md                  | 170 +++++++++++
 doc/index.html               | 549 -----------------------------------
 doc/index.md                 | 523 +++++++++++++++++++++++++++++++++
 doc/log_format.md            |  75 +++++
 doc/log_format.txt           |  75 -----
 doc/table_format.md          | 107 +++++++
 doc/table_format.txt         | 104 -------
 include/leveldb/db.h         |   2 +-
 include/leveldb/options.h    |  12 +
 port/port_example.h          |   6 +
 port/port_posix.h            |   2 +
 port/port_posix_sse.cc       | 129 ++++++++
 port/port_win.h              |   2 +
 table/filter_block.cc        |   2 +-
 util/crc32c.cc               |  18 ++
 util/env_posix.cc            | 195 +++++++++----
 util/env_posix_test.cc       |  66 +++++
 util/env_posix_test_helper.h |  28 ++
 util/env_test.cc             |  18 +-
 util/env_win.cc              |  12 +-
 util/options.cc              |   1 +
 31 files changed, 1414 insertions(+), 1151 deletions(-)
 delete mode 100644 doc/doc.css
 delete mode 100644 doc/impl.html
 create mode 100644 doc/impl.md
 delete mode 100644 doc/index.html
 create mode 100644 doc/index.md
 create mode 100644 doc/log_format.md
 delete mode 100644 doc/log_format.txt
 create mode 100644 doc/table_format.md
 delete mode 100644 doc/table_format.txt
 create mode 100644 port/port_posix_sse.cc
 create mode 100644 util/env_posix_test.cc
 create mode 100644 util/env_posix_test_helper.h

diff --git a/Makefile b/Makefile
index 07a5a1ead..f7cc7d736 100644
--- a/Makefile
+++ b/Makefile
@@ -44,6 +44,7 @@ TESTS = \
 	util/cache_test \
 	util/coding_test \
 	util/crc32c_test \
+	util/env_posix_test \
 	util/env_test \
 	util/hash_test
 
@@ -121,7 +122,7 @@ SHARED_MEMENVLIB = $(SHARED_OUTDIR)/libmemenv.a
 else
 # Update db.h if you change these.
 SHARED_VERSION_MAJOR = 1
-SHARED_VERSION_MINOR = 19
+SHARED_VERSION_MINOR = 20
 SHARED_LIB1 = libleveldb.$(PLATFORM_SHARED_EXT)
 SHARED_LIB2 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR)
 SHARED_LIB3 = $(SHARED_LIB1).$(SHARED_VERSION_MAJOR).$(SHARED_VERSION_MINOR)
@@ -337,6 +338,9 @@ $(STATIC_OUTDIR)/db_test:db/db_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
 $(STATIC_OUTDIR)/dbformat_test:db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) $(LDFLAGS) $(CXXFLAGS) db/dbformat_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
 
+$(STATIC_OUTDIR)/env_posix_test:util/env_posix_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/env_posix_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
+
 $(STATIC_OUTDIR)/env_test:util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) $(LDFLAGS) $(CXXFLAGS) util/env_test.cc $(STATIC_LIBOBJECTS) $(TESTHARNESS) -o $@ $(LIBS)
 
@@ -412,3 +416,9 @@ $(SHARED_OUTDIR)/%.o: %.cc
 
 $(SHARED_OUTDIR)/%.o: %.c
 	$(CC) $(CFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@
+
+$(STATIC_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
+	$(CXX) $(CXXFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
+
+$(SHARED_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc
+	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@
diff --git a/README.md b/README.md
index c75b185e0..a010c5085 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
   * External activity (file system operations etc.) is relayed through a virtual interface so users can customize the operating system interactions.
 
 # Documentation
-  [LevelDB library documentation](https://rawgit.com/google/leveldb/master/doc/index.html) is online and bundled with the source code.
+  [LevelDB library documentation](https://github.com/google/leveldb/blob/master/doc/index.md) is online and bundled with the source code.
 
 
 # Limitations
@@ -113,29 +113,30 @@ by the one or two disk seeks needed to fetch the data from disk.
 Write performance will be mostly unaffected by whether or not the
 working set fits in memory.
 
-    readrandom   :      16.677 micros/op;  (approximately 60,000 reads per second)
-    readseq      :       0.476 micros/op;  232.3 MB/s
-    readreverse  :       0.724 micros/op;  152.9 MB/s
+    readrandom  : 16.677 micros/op;  (approximately 60,000 reads per second)
+    readseq     :  0.476 micros/op;  232.3 MB/s
+    readreverse :  0.724 micros/op;  152.9 MB/s
 
 LevelDB compacts its underlying storage data in the background to
 improve read performance.  The results listed above were done
 immediately after a lot of random writes.  The results after
 compactions (which are usually triggered automatically) are better.
 
-    readrandom   :      11.602 micros/op;  (approximately 85,000 reads per second)
-    readseq      :       0.423 micros/op;  261.8 MB/s
-    readreverse  :       0.663 micros/op;  166.9 MB/s
+    readrandom  : 11.602 micros/op;  (approximately 85,000 reads per second)
+    readseq     :  0.423 micros/op;  261.8 MB/s
+    readreverse :  0.663 micros/op;  166.9 MB/s
 
 Some of the high cost of reads comes from repeated decompression of blocks
 read from disk.  If we supply enough cache to the leveldb so it can hold the
 uncompressed blocks in memory, the read performance improves again:
 
-    readrandom   :       9.775 micros/op;  (approximately 100,000 reads per second before compaction)
-    readrandom   :       5.215 micros/op;  (approximately 190,000 reads per second after compaction)
+    readrandom  : 9.775 micros/op;  (approximately 100,000 reads per second before compaction)
+    readrandom  : 5.215 micros/op;  (approximately 190,000 reads per second after compaction)
 
 ## Repository contents
 
-See doc/index.html for more explanation. See doc/impl.html for a brief overview of the implementation.
+See [doc/index.md](doc/index.md) for more explanation. See
+[doc/impl.md](doc/impl.md) for a brief overview of the implementation.
 
 The public interface is in include/*.h.  Callers should not include or
 rely on the details of any other header files in this package.  Those
@@ -148,7 +149,7 @@ Guide to header files:
 * **include/options.h**: Control over the behavior of an entire database,
 and also control over the behavior of individual reads and writes.
 
-* **include/comparator.h**: Abstraction for user-specified comparison function. 
+* **include/comparator.h**: Abstraction for user-specified comparison function.
 If you want just bytewise comparison of keys, you can use the default
 comparator, but clients can write their own comparator implementations if they
 want custom ordering (e.g. to handle different character encodings, etc.)
@@ -165,7 +166,7 @@ length into some other byte array.
 * **include/status.h**: Status is returned from many of the public interfaces
 and is used to report success and various kinds of errors.
 
-* **include/env.h**: 
+* **include/env.h**:
 Abstraction of the OS environment.  A posix implementation of this interface is
 in util/env_posix.cc
 
diff --git a/build_detect_platform b/build_detect_platform
index d7edab1d8..4a9471590 100755
--- a/build_detect_platform
+++ b/build_detect_platform
@@ -63,6 +63,7 @@ PLATFORM_SHARED_EXT="so"
 PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
 PLATFORM_SHARED_CFLAGS="-fPIC"
 PLATFORM_SHARED_VERSIONED=true
+PLATFORM_SSEFLAGS=
 
 MEMCMP_FLAG=
 if [ "$CXX" = "g++" ]; then
@@ -77,6 +78,7 @@ case "$TARGET_OS" in
         COMMON_FLAGS="$MEMCMP_FLAG -lpthread -DOS_LINUX -DCYGWIN"
         PLATFORM_LDFLAGS="-lpthread"
         PORT_FILE=port/port_posix.cc
+        PORT_SSE_FILE=port/port_posix_sse.cc
         ;;
     Darwin)
         PLATFORM=OS_MACOSX
@@ -85,24 +87,28 @@ case "$TARGET_OS" in
         [ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
         PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name $INSTALL_PATH/"
         PORT_FILE=port/port_posix.cc
+        PORT_SSE_FILE=port/port_posix_sse.cc
         ;;
     Linux)
         PLATFORM=OS_LINUX
         COMMON_FLAGS="$MEMCMP_FLAG -pthread -DOS_LINUX"
         PLATFORM_LDFLAGS="-pthread"
         PORT_FILE=port/port_posix.cc
+        PORT_SSE_FILE=port/port_posix_sse.cc
         ;;
     SunOS)
         PLATFORM=OS_SOLARIS
         COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_SOLARIS"
         PLATFORM_LIBS="-lpthread -lrt"
         PORT_FILE=port/port_posix.cc
+        PORT_SSE_FILE=port/port_posix_sse.cc
         ;;
     FreeBSD)
         PLATFORM=OS_FREEBSD
         COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_FREEBSD"
         PLATFORM_LIBS="-lpthread"
         PORT_FILE=port/port_posix.cc
+        PORT_SSE_FILE=port/port_posix_sse.cc
         ;;
     GNU/kFreeBSD)
         PLATFORM=OS_KFREEBSD
@@ -115,24 +121,28 @@ case "$TARGET_OS" in
         COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD"
         PLATFORM_LIBS="-lpthread -lgcc_s"
         PORT_FILE=port/port_posix.cc
+        PORT_SSE_FILE=port/port_posix_sse.cc
         ;;
     OpenBSD)
         PLATFORM=OS_OPENBSD
         COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_OPENBSD"
         PLATFORM_LDFLAGS="-pthread"
         PORT_FILE=port/port_posix.cc
+        PORT_SSE_FILE=port/port_posix_sse.cc
         ;;
     DragonFly)
         PLATFORM=OS_DRAGONFLYBSD
         COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_DRAGONFLYBSD"
         PLATFORM_LIBS="-lpthread"
         PORT_FILE=port/port_posix.cc
+        PORT_SSE_FILE=port/port_posix_sse.cc
         ;;
     OS_ANDROID_CROSSCOMPILE)
         PLATFORM=OS_ANDROID
         COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
         PLATFORM_LDFLAGS=""  # All pthread features are in the Android C library
         PORT_FILE=port/port_posix.cc
+        PORT_SSE_FILE=port/port_posix_sse.cc
         CROSS_COMPILE=true
         ;;
     HP-UX)
@@ -140,6 +150,7 @@ case "$TARGET_OS" in
         COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_HPUX"
         PLATFORM_LDFLAGS="-pthread"
         PORT_FILE=port/port_posix.cc
+        PORT_SSE_FILE=port/port_posix_sse.cc
         # man ld: +h internal_name
         PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl,"
         ;;
@@ -148,6 +159,7 @@ case "$TARGET_OS" in
         COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
         [ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
         PORT_FILE=port/port_posix.cc
+        PORT_SSE_FILE=port/port_posix_sse.cc
         PLATFORM_SHARED_EXT=
         PLATFORM_SHARED_LDFLAGS=
         PLATFORM_SHARED_CFLAGS=
@@ -182,7 +194,7 @@ set +f # re-enable globbing
 
 # The sources consist of the portable files, plus the platform-specific port
 # file.
-echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT
+echo "SOURCES=$PORTABLE_FILES $PORT_FILE $PORT_SSE_FILE" >> $OUTPUT
 echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
 
 if [ "$CROSS_COMPILE" = "true" ]; then
@@ -213,6 +225,21 @@ EOF
     fi
 
     rm -f $CXXOUTPUT 2>/dev/null
+
+    # Test if gcc SSE 4.2 is supported
+    $CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -msse4.2 2>/dev/null  <<EOF
+      #include <cstdint>
+      #include <nmmintrin.h>
+      int main() {
+        volatile uint32_t x = _mm_crc32_u32(0, 0);
+      }
+EOF
+    if [ "$?" = 0 ]; then
+        PLATFORM_SSEFLAGS="-msse4.2"
+    fi
+
+    rm -f $CXXOUTPUT 2>/dev/null
+fi
+
+# Use the SSE 4.2 CRC32C intrinsics iff runtime checks indicate compiler supports them.
+if [ -n "$PLATFORM_SSEFLAGS" ]; then
+    PLATFORM_SSEFLAGS="$PLATFORM_SSEFLAGS -DLEVELDB_PLATFORM_POSIX_SSE"
 fi
 
 PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
@@ -225,6 +252,7 @@ echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
 echo "PLATFORM_LIBS=$PLATFORM_LIBS" >> $OUTPUT
 echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
 echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
+echo "PLATFORM_SSEFLAGS=$PLATFORM_SSEFLAGS" >> $OUTPUT
 echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
 echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
 echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
diff --git a/db/db_bench.cc b/db/db_bench.cc
index 7a0f5e08c..3ad19a512 100644
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -84,6 +84,14 @@ static bool FLAGS_histogram = false;
 // (initialized to default value by "main")
 static int FLAGS_write_buffer_size = 0;
 
+// Number of bytes written to each file.
+// (initialized to default value by "main")
+static int FLAGS_max_file_size = 0;
+
+// Approximate size of user data packed per block (before compression).
+// (initialized to default value by "main")
+static int FLAGS_block_size = 0;
+
 // Number of bytes to use as a cache of uncompressed data.
 // Negative means use default settings.
 static int FLAGS_cache_size = -1;
@@ -109,6 +117,7 @@ static const char* FLAGS_db = NULL;
 namespace leveldb {
 
 namespace {
+leveldb::Env* g_env = NULL;
 
 // Helper for quickly generating random data.
 class RandomGenerator {
@@ -186,7 +195,7 @@ class Stats {
     done_ = 0;
     bytes_ = 0;
     seconds_ = 0;
-    start_ = Env::Default()->NowMicros();
+    start_ = g_env->NowMicros();
     finish_ = start_;
     message_.clear();
   }
@@ -204,7 +213,7 @@ class Stats {
   }
 
   void Stop() {
-    finish_ = Env::Default()->NowMicros();
+    finish_ = g_env->NowMicros();
     seconds_ = (finish_ - start_) * 1e-6;
   }
 
@@ -214,7 +223,7 @@ class Stats {
 
   void FinishedSingleOp() {
     if (FLAGS_histogram) {
-      double now = Env::Default()->NowMicros();
+      double now = g_env->NowMicros();
       double micros = now - last_op_finish_;
       hist_.Add(micros);
       if (micros > 20000) {
@@ -404,10 +413,10 @@ class Benchmark {
     reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
     heap_counter_(0) {
    std::vector<std::string> files;
-    Env::Default()->GetChildren(FLAGS_db, &files);
+    g_env->GetChildren(FLAGS_db, &files);
     for (size_t i = 0; i < files.size(); i++) {
       if (Slice(files[i]).starts_with("heap-")) {
-        Env::Default()->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
+        g_env->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
       }
     }
     if (!FLAGS_use_existing_db) {
@@ -589,7 +598,7 @@ class Benchmark {
       arg[i].shared = &shared;
       arg[i].thread = new ThreadState(i);
       arg[i].thread->shared = &shared;
-      Env::Default()->StartThread(ThreadBody, &arg[i]);
+      g_env->StartThread(ThreadBody, &arg[i]);
     }
 
     shared.mu.Lock();
@@ -700,9 +709,12 @@ class Benchmark {
   void Open() {
     assert(db_ == NULL);
     Options options;
+    options.env = g_env;
     options.create_if_missing = !FLAGS_use_existing_db;
     options.block_cache = cache_;
     options.write_buffer_size = FLAGS_write_buffer_size;
+    options.max_file_size = FLAGS_max_file_size;
+    options.block_size = FLAGS_block_size;
     options.max_open_files = FLAGS_open_files;
     options.filter_policy = filter_policy_;
     options.reuse_logs = FLAGS_reuse_logs;
@@ -925,7 +937,7 @@ class Benchmark {
     char fname[100];
     snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db, ++heap_counter_);
     WritableFile* file;
-    Status s = Env::Default()->NewWritableFile(fname, &file);
+    Status s = g_env->NewWritableFile(fname, &file);
     if (!s.ok()) {
       fprintf(stderr, "%s\n", s.ToString().c_str());
       return;
@@ -934,7 +946,7 @@ class Benchmark {
     delete file;
     if (!ok) {
       fprintf(stderr, "heap profiling not supported\n");
-      Env::Default()->DeleteFile(fname);
+      g_env->DeleteFile(fname);
     }
   }
 };
@@ -943,6 +955,8 @@ class Benchmark {
 
 int main(int argc, char** argv) {
   FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
+  FLAGS_max_file_size = leveldb::Options().max_file_size;
+  FLAGS_block_size = leveldb::Options().block_size;
   FLAGS_open_files = leveldb::Options().max_open_files;
   std::string default_db_path;
 
@@ -973,6 +987,10 @@ int main(int argc, char** argv) {
       FLAGS_value_size = n;
     } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
       FLAGS_write_buffer_size = n;
+    } else if (sscanf(argv[i], "--max_file_size=%d%c", &n, &junk) == 1) {
+      FLAGS_max_file_size = n;
+    } else if (sscanf(argv[i], "--block_size=%d%c", &n, &junk) == 1) {
+      FLAGS_block_size = n;
     } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
       FLAGS_cache_size = n;
     } else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) {
@@ -987,9 +1005,11 @@ int main(int argc, char** argv) {
     }
   }
 
+  leveldb::g_env = leveldb::Env::Default();
+
   // Choose a location for the test database if none given with --db=
   if (FLAGS_db == NULL) {
-      leveldb::Env::Default()->GetTestDirectory(&default_db_path);
+      leveldb::g_env->GetTestDirectory(&default_db_path);
       default_db_path += "/dbbench";
       FLAGS_db = default_db_path.c_str();
   }
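
Routing everything through a single leveldb::Env* is the benchmark-side half of
making the environment injectable. A hedged sketch of the same pattern for an
ordinary caller (the wrapper name is illustrative):

    #include "leveldb/db.h"
    #include "leveldb/env.h"

    leveldb::DB* OpenWithEnv(leveldb::Env* env, const std::string& name) {
      leveldb::Options options;
      options.env = env;  // previously the code reached for Env::Default() directly
      options.create_if_missing = true;
      leveldb::DB* db = NULL;
      leveldb::Status s = leveldb::DB::Open(options, name, &db);
      return s.ok() ? db : NULL;
    }
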
diff --git a/db/db_impl.cc b/db/db_impl.cc
index 60f4e66e5..f43ad7679 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -96,6 +96,7 @@ Options SanitizeOptions(const std::string& dbname,
   result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL;
   ClipToRange(&result.max_open_files,    64 + kNumNonTableCacheFiles, 50000);
   ClipToRange(&result.write_buffer_size, 64<<10,                      1<<30);
+  ClipToRange(&result.max_file_size,     1<<20,                       1<<30);
   ClipToRange(&result.block_size,        1<<10,                       4<<20);
   if (result.info_log == NULL) {
     // Open a log file in the same directory as the db
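
For reference, ClipToRange() in db_impl.cc is a small template along these
lines (a sketch from memory, not part of this diff), so max_file_size is forced
into [1MB, 1GB] before the rest of the system sees it:

    template <class T, class V>
    static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
      if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
      if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
    }
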
diff --git a/db/log_format.h b/db/log_format.h
index a8c06efe1..356e69fca 100644
--- a/db/log_format.h
+++ b/db/log_format.h
@@ -3,7 +3,7 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 //
 // Log format information shared by reader and writer.
-// See ../doc/log_format.txt for more detail.
+// See ../doc/log_format.md for more detail.
 
 #ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
 #define STORAGE_LEVELDB_DB_LOG_FORMAT_H_
diff --git a/db/version_set.cc b/db/version_set.cc
index a5e0f77a6..b1256f90e 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -20,21 +20,29 @@
 
 namespace leveldb {
 
-static const int kTargetFileSize = 2 * 1048576;
+static int TargetFileSize(const Options* options) {
+  return options->max_file_size;
+}
 
 // Maximum bytes of overlaps in grandparent (i.e., level+2) before we
 // stop building a single file in a level->level+1 compaction.
-static const int64_t kMaxGrandParentOverlapBytes = 10 * kTargetFileSize;
+static int64_t MaxGrandParentOverlapBytes(const Options* options) {
+  return 10 * TargetFileSize(options);
+}
 
 // Maximum number of bytes in all compacted files.  We avoid expanding
 // the lower level file set of a compaction if it would make the
 // total compaction cover more than this many bytes.
-static const int64_t kExpandedCompactionByteSizeLimit = 25 * kTargetFileSize;
+static int64_t ExpandedCompactionByteSizeLimit(const Options* options) {
+  return 25 * TargetFileSize(options);
+}
 
-static double MaxBytesForLevel(int level) {
+static double MaxBytesForLevel(const Options* options, int level) {
   // Note: the result for level zero is not really used since we set
   // the level-0 compaction threshold based on number of files.
-  double result = 10 * 1048576.0;  // Result for both level-0 and level-1
+
+  // Result for both level-0 and level-1
+  double result = 10. * 1048576.0;
   while (level > 1) {
     result *= 10;
     level--;
@@ -42,8 +50,9 @@ static double MaxBytesForLevel(int level) {
   return result;
 }
 
-static uint64_t MaxFileSizeForLevel(int level) {
-  return kTargetFileSize;  // We could vary per level to reduce number of files?
+static uint64_t MaxFileSizeForLevel(const Options* options, int level) {
+  // We could vary per level to reduce number of files?
+  return TargetFileSize(options);
 }
 
 static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
@@ -508,7 +517,7 @@ int Version::PickLevelForMemTableOutput(
         // Check that file does not overlap too many grandparent bytes.
         GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
         const int64_t sum = TotalFileSize(overlaps);
-        if (sum > kMaxGrandParentOverlapBytes) {
+        if (sum > MaxGrandParentOverlapBytes(vset_->options_)) {
           break;
         }
       }
@@ -1027,7 +1036,7 @@ bool VersionSet::ReuseManifest(const std::string& dscname,
       manifest_type != kDescriptorFile ||
       !env_->GetFileSize(dscname, &manifest_size).ok() ||
       // Make new compacted MANIFEST if old one is too big
-      manifest_size >= kTargetFileSize) {
+      manifest_size >= TargetFileSize(options_)) {
     return false;
   }
 
@@ -1076,7 +1085,8 @@ void VersionSet::Finalize(Version* v) {
     } else {
       // Compute the ratio of current size to size limit.
       const uint64_t level_bytes = TotalFileSize(v->files_[level]);
-      score = static_cast<double>(level_bytes) / MaxBytesForLevel(level);
+      score =
+          static_cast<double>(level_bytes) / MaxBytesForLevel(options_, level);
     }
 
     if (score > best_score) {
@@ -1290,7 +1300,7 @@ Compaction* VersionSet::PickCompaction() {
     level = current_->compaction_level_;
     assert(level >= 0);
     assert(level+1 < config::kNumLevels);
-    c = new Compaction(level);
+    c = new Compaction(options_, level);
 
     // Pick the first file that comes after compact_pointer_[level]
     for (size_t i = 0; i < current_->files_[level].size(); i++) {
@@ -1307,7 +1317,7 @@ Compaction* VersionSet::PickCompaction() {
     }
   } else if (seek_compaction) {
     level = current_->file_to_compact_level_;
-    c = new Compaction(level);
+    c = new Compaction(options_, level);
     c->inputs_[0].push_back(current_->file_to_compact_);
   } else {
     return NULL;
@@ -1352,7 +1362,8 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
     const int64_t inputs1_size = TotalFileSize(c->inputs_[1]);
     const int64_t expanded0_size = TotalFileSize(expanded0);
     if (expanded0.size() > c->inputs_[0].size() &&
-        inputs1_size + expanded0_size < kExpandedCompactionByteSizeLimit) {
+        inputs1_size + expanded0_size <
+            ExpandedCompactionByteSizeLimit(options_)) {
       InternalKey new_start, new_limit;
       GetRange(expanded0, &new_start, &new_limit);
+      std::vector<FileMetaData*> expanded1;
@@ -1414,7 +1425,7 @@ Compaction* VersionSet::CompactRange(
   // and we must not pick one file and drop another older file if the
   // two files overlap.
   if (level > 0) {
-    const uint64_t limit = MaxFileSizeForLevel(level);
+    const uint64_t limit = MaxFileSizeForLevel(options_, level);
     uint64_t total = 0;
     for (size_t i = 0; i < inputs.size(); i++) {
       uint64_t s = inputs[i]->file_size;
@@ -1426,7 +1437,7 @@ Compaction* VersionSet::CompactRange(
     }
   }
 
-  Compaction* c = new Compaction(level);
+  Compaction* c = new Compaction(options_, level);
   c->input_version_ = current_;
   c->input_version_->Ref();
   c->inputs_[0] = inputs;
@@ -1434,9 +1445,9 @@ Compaction* VersionSet::CompactRange(
   return c;
 }
 
-Compaction::Compaction(int level)
+Compaction::Compaction(const Options* options, int level)
     : level_(level),
-      max_output_file_size_(MaxFileSizeForLevel(level)),
+      max_output_file_size_(MaxFileSizeForLevel(options, level)),
       input_version_(NULL),
       grandparent_index_(0),
       seen_key_(false),
@@ -1453,12 +1464,13 @@ Compaction::~Compaction() {
 }
 
 bool Compaction::IsTrivialMove() const {
+  const VersionSet* vset = input_version_->vset_;
   // Avoid a move if there is lots of overlapping grandparent data.
   // Otherwise, the move could create a parent file that will require
   // a very expensive merge later on.
-  return (num_input_files(0) == 1 &&
-          num_input_files(1) == 0 &&
-          TotalFileSize(grandparents_) <= kMaxGrandParentOverlapBytes);
+  return (num_input_files(0) == 1 && num_input_files(1) == 0 &&
+          TotalFileSize(grandparents_) <=
+              MaxGrandParentOverlapBytes(vset->options_));
 }
 
 void Compaction::AddInputDeletions(VersionEdit* edit) {
@@ -1491,8 +1503,9 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
 }
 
 bool Compaction::ShouldStopBefore(const Slice& internal_key) {
+  const VersionSet* vset = input_version_->vset_;
   // Scan to find earliest grandparent file that contains key.
-  const InternalKeyComparator* icmp = &input_version_->vset_->icmp_;
+  const InternalKeyComparator* icmp = &vset->icmp_;
   while (grandparent_index_ < grandparents_.size() &&
       icmp->Compare(internal_key,
                     grandparents_[grandparent_index_]->largest.Encode()) > 0) {
@@ -1503,7 +1516,7 @@ bool Compaction::ShouldStopBefore(const Slice& internal_key) {
   }
   seen_key_ = true;
 
-  if (overlapped_bytes_ > kMaxGrandParentOverlapBytes) {
+  if (overlapped_bytes_ > MaxGrandParentOverlapBytes(vset->options_)) {
     // Too much overlap for current output; start new output
     overlapped_bytes_ = 0;
     return true;
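
All of the old compile-time constants are now functions of the options, so a
database opened with a different max_file_size scales its compaction limits
proportionally. A hedged worked example (the values follow directly from the
helpers above; the free function is illustrative):

    #include "leveldb/options.h"

    // With options.max_file_size = 4 << 20 (4MB) instead of the old 2MB:
    //   TargetFileSize(&options)                  ->   4MB output tables
    //   MaxGrandParentOverlapBytes(&options)      ->  40MB (10 x target)
    //   ExpandedCompactionByteSizeLimit(&options) -> 100MB (25 x target)
    int64_t GrandParentOverlapLimit(const leveldb::Options* options) {
      return 10 * static_cast<int64_t>(options->max_file_size);
    }
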
diff --git a/db/version_set.h b/db/version_set.h
index 1dec74567..c4e7ac360 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -366,7 +366,7 @@ class Compaction {
   friend class Version;
   friend class VersionSet;
 
-  explicit Compaction(int level);
+  Compaction(const Options* options, int level);
 
   int level_;
   uint64_t max_output_file_size_;
diff --git a/doc/doc.css b/doc/doc.css
deleted file mode 100644
index 700c564e4..000000000
--- a/doc/doc.css
+++ /dev/null
@@ -1,89 +0,0 @@
-body {
-  margin-left: 0.5in;
-  margin-right: 0.5in;
-  background: white;
-  color: black;
-}
-
-h1 {
-  margin-left: -0.2in;
-  font-size: 14pt;
-}
-h2 {
-  margin-left: -0in;
-  font-size: 12pt;
-}
-h3 {
-  margin-left: -0in;
-}
-h4 {
-  margin-left: -0in;
-}
-hr {
-  margin-left: -0in;
-}
-
-/* Definition lists: definition term bold */
-dt {
-  font-weight: bold;
-}
-
-address {
-  text-align: center;
-}
-code,samp,var {
-  color: blue;
-}
-kbd {
-  color: #600000;
-}
-div.note p {
-  float: right;
-  width: 3in;
-  margin-right: 0%;
-  padding: 1px;
-  border: 2px solid #6060a0;
-  background-color: #fffff0;
-}
-
-ul {
-  margin-top: -0em;
-  margin-bottom: -0em;
-}
-
-ol {
-  margin-top: -0em;
-  margin-bottom: -0em;
-}
-
-UL.nobullets {
-  list-style-type: none;
-  list-style-image: none;
-  margin-left: -1em;
-}
-
-p {
-  margin: 1em 0 1em 0;
-  padding: 0 0 0 0;
-}
-
-pre {
-  line-height: 1.3em;
-  padding: 0.4em 0 0.8em 0;
-  margin:  0 0 0 0;
-  border:  0 0 0 0;
-  color: blue;
-}
-
-.datatable {
-  margin-left: auto;
-  margin-right: auto;
-  margin-top: 2em;
-  margin-bottom: 2em;
-  border: 1px solid;
-}
-
-.datatable td,th {
-  padding: 0 0.5em 0 0.5em;
-  text-align: right;
-}
diff --git a/doc/impl.html b/doc/impl.html
deleted file mode 100644
index 6a468be09..000000000
--- a/doc/impl.html
+++ /dev/null
@@ -1,213 +0,0 @@
[213 deleted lines elided: the HTML implementation notes (Files: log files,
sorted tables, manifest, current, info logs, others; Level 0; Compactions;
Timing; Number of files; Recovery; Garbage collection of files). The same
material reappears below in Markdown form as doc/impl.md.]
- -DeleteObsoleteFiles() is called at the end of every -compaction and at the end of recovery. It finds the names of all -files in the database. It deletes all log files that are not the -current log file. It deletes all table files that are not referenced -from some level and are not the output of an active compaction. - - - diff --git a/doc/impl.md b/doc/impl.md new file mode 100644 index 000000000..4b13f2a6b --- /dev/null +++ b/doc/impl.md @@ -0,0 +1,170 @@ +## Files + +The implementation of leveldb is similar in spirit to the representation of a +single [Bigtable tablet (section 5.3)](http://research.google.com/archive/bigtable.html). +However the organization of the files that make up the representation is +somewhat different and is explained below. + +Each database is represented by a set of files stored in a directory. There are +several different types of files as documented below: + +### Log files + +A log file (*.log) stores a sequence of recent updates. Each update is appended +to the current log file. When the log file reaches a pre-determined size +(approximately 4MB by default), it is converted to a sorted table (see below) +and a new log file is created for future updates. + +A copy of the current log file is kept in an in-memory structure (the +`memtable`). This copy is consulted on every read so that read operations +reflect all logged updates. + +## Sorted tables + +A sorted table (*.ldb) stores a sequence of entries sorted by key. Each entry is +either a value for the key, or a deletion marker for the key. (Deletion markers +are kept around to hide obsolete values present in older sorted tables). + +The set of sorted tables are organized into a sequence of levels. The sorted +table generated from a log file is placed in a special **young** level (also +called level-0). When the number of young files exceeds a certain threshold +(currently four), all of the young files are merged together with all of the +overlapping level-1 files to produce a sequence of new level-1 files (we create +a new level-1 file for every 2MB of data.) + +Files in the young level may contain overlapping keys. However files in other +levels have distinct non-overlapping key ranges. Consider level number L where +L >= 1. When the combined size of files in level-L exceeds (10^L) MB (i.e., 10MB +for level-1, 100MB for level-2, ...), one file in level-L, and all of the +overlapping files in level-(L+1) are merged to form a set of new files for +level-(L+1). These merges have the effect of gradually migrating new updates +from the young level to the largest level using only bulk reads and writes +(i.e., minimizing expensive seeks). + +### Manifest + +A MANIFEST file lists the set of sorted tables that make up each level, the +corresponding key ranges, and other important metadata. A new MANIFEST file +(with a new number embedded in the file name) is created whenever the database +is reopened. The MANIFEST file is formatted as a log, and changes made to the +serving state (as files are added or removed) are appended to this log. + +### Current + +CURRENT is a simple text file that contains the name of the latest MANIFEST +file. + +### Info logs + +Informational messages are printed to files named LOG and LOG.old. + +### Others + +Other files used for miscellaneous purposes may also be present (LOCK, *.dbtmp). 
+ +## Level 0 + +When the log file grows above a certain size (1MB by default): +Create a brand new memtable and log file and direct future updates here +In the background: +Write the contents of the previous memtable to an sstable +Discard the memtable +Delete the old log file and the old memtable +Add the new sstable to the young (level-0) level. + +## Compactions + +When the size of level L exceeds its limit, we compact it in a background +thread. The compaction picks a file from level L and all overlapping files from +the next level L+1. Note that if a level-L file overlaps only part of a +level-(L+1) file, the entire file at level-(L+1) is used as an input to the +compaction and will be discarded after the compaction. Aside: because level-0 +is special (files in it may overlap each other), we treat compactions from +level-0 to level-1 specially: a level-0 compaction may pick more than one +level-0 file in case some of these files overlap each other. + +A compaction merges the contents of the picked files to produce a sequence of +level-(L+1) files. We switch to producing a new level-(L+1) file after the +current output file has reached the target file size (2MB). We also switch to a +new output file when the key range of the current output file has grown enough +to overlap more than ten level-(L+2) files. This last rule ensures that a later +compaction of a level-(L+1) file will not pick up too much data from +level-(L+2). + +The old files are discarded and the new files are added to the serving state. + +Compactions for a particular level rotate through the key space. In more detail, +for each level L, we remember the ending key of the last compaction at level L. +The next compaction for level L will pick the first file that starts after this +key (wrapping around to the beginning of the key space if there is no such +file). + +Compactions drop overwritten values. They also drop deletion markers if there +are no higher numbered levels that contain a file whose range overlaps the +current key. + +### Timing + +Level-0 compactions will read up to four 1MB files from level-0, and at worst +all the level-1 files (10MB). I.e., we will read 14MB and write 14MB. + +Other than the special level-0 compactions, we will pick one 2MB file from level +L. In the worst case, this will overlap ~ 12 files from level L+1 (10 because +level-(L+1) is ten times the size of level-L, and another two at the boundaries +since the file ranges at level-L will usually not be aligned with the file +ranges at level-L+1). The compaction will therefore read 26MB and write 26MB. +Assuming a disk IO rate of 100MB/s (ballpark range for modern drives), the worst +compaction cost will be approximately 0.5 second. + +If we throttle the background writing to something small, say 10% of the full +100MB/s speed, a compaction may take up to 5 seconds. If the user is writing at +10MB/s, we might build up lots of level-0 files (~50 to hold the 5*10MB). This +may significantly increase the cost of reads due to the overhead of merging more +files together on every read. + +Solution 1: To reduce this problem, we might want to increase the log switching +threshold when the number of level-0 files is large. Though the downside is that +the larger this threshold, the more memory we will need to hold the +corresponding memtable. + +Solution 2: We might want to decrease write rate artificially when the number of +level-0 files goes up. + +Solution 3: We work on reducing the cost of very wide merges. 
Perhaps most of +the level-0 files will have their blocks sitting uncompressed in the cache and +we will only need to worry about the O(N) complexity in the merging iterator. + +### Number of files + +Instead of always making 2MB files, we could make larger files for larger levels +to reduce the total file count, though at the expense of more bursty +compactions. Alternatively, we could shard the set of files into multiple +directories. + +An experiment on an ext3 filesystem on Feb 04, 2011 shows the following timings +to do 100K file opens in directories with varying number of files: + + +| Files in directory | Microseconds to open a file | +|-------------------:|----------------------------:| +| 1000 | 9 | +| 10000 | 10 | +| 100000 | 16 | + +So maybe even the sharding is not necessary on modern filesystems? + +## Recovery + +* Read CURRENT to find name of the latest committed MANIFEST +* Read the named MANIFEST file +* Clean up stale files +* We could open all sstables here, but it is probably better to be lazy... +* Convert log chunk to a new level-0 sstable +* Start directing new writes to a new log file with recovered sequence# + +## Garbage collection of files + +`DeleteObsoleteFiles()` is called at the end of every compaction and at the end +of recovery. It finds the names of all files in the database. It deletes all log +files that are not the current log file. It deletes all table files that are not +referenced from some level and are not the output of an active compaction. diff --git a/doc/index.html b/doc/index.html deleted file mode 100644 index 215519258..000000000 --- a/doc/index.html +++ /dev/null @@ -1,549 +0,0 @@ - - - - -Leveldb - - - -

-[... 549 lines elided: the HTML edition of the leveldb user manual
-(doc/index.html); its content is carried over, converted to Markdown,
-in doc/index.md below ...]
diff --git a/doc/index.md b/doc/index.md
new file mode 100644
index 000000000..be8569692
--- /dev/null
+++ b/doc/index.md
@@ -0,0 +1,523 @@
+leveldb
+=======
+
+_Jeff Dean, Sanjay Ghemawat_
+
+The leveldb library provides a persistent key value store. Keys and values are
+arbitrary byte arrays. The keys are ordered within the key value store
+according to a user-specified comparator function.
+
+## Opening A Database
+
+A leveldb database has a name which corresponds to a file system directory. All
+of the contents of database are stored in this directory. The following example
+shows how to open a database, creating it if necessary:
+
+```c++
+#include <cassert>
+#include "leveldb/db.h"
+
+leveldb::DB* db;
+leveldb::Options options;
+options.create_if_missing = true;
+leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
+assert(status.ok());
+...
+```
+
+If you want to raise an error if the database already exists, add the following
+line before the `leveldb::DB::Open` call:
+
+```c++
+options.error_if_exists = true;
+```
+
+## Status
+
+You may have noticed the `leveldb::Status` type above. Values of this type are
+returned by most functions in leveldb that may encounter an error. You can check
+if such a result is ok, and also print an associated error message:
+
+```c++
+leveldb::Status s = ...;
+if (!s.ok()) cerr << s.ToString() << endl;
+```
+
+## Closing A Database
+
+When you are done with a database, just delete the database object. Example:
+
+```c++
+... open the db as described above ...
+... do something with db ...
+delete db;
+```
+
+## Reads And Writes
+
+The database provides Put, Delete, and Get methods to modify/query the database.
+For example, the following code moves the value stored under key1 to key2.
+
+```c++
+std::string value;
+leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
+if (s.ok()) s = db->Put(leveldb::WriteOptions(), key2, value);
+if (s.ok()) s = db->Delete(leveldb::WriteOptions(), key1);
+```
+
+## Atomic Updates
+
+Note that if the process dies after the Put of key2 but before the delete of
+key1, the same value may be left stored under multiple keys. Such problems can
+be avoided by using the `WriteBatch` class to atomically apply a set of updates:
+
+```c++
+#include "leveldb/write_batch.h"
+...
+std::string value;
+leveldb::Status s = db->Get(leveldb::ReadOptions(), key1, &value);
+if (s.ok()) {
+  leveldb::WriteBatch batch;
+  batch.Delete(key1);
+  batch.Put(key2, value);
+  s = db->Write(leveldb::WriteOptions(), &batch);
+}
+```
+
+The `WriteBatch` holds a sequence of edits to be made to the database, and these
+edits within the batch are applied in order. Note that we called Delete before
+Put so that if key1 is identical to key2, we do not end up erroneously dropping
+the value entirely.
+
+Apart from its atomicity benefits, `WriteBatch` may also be used to speed up
+bulk updates by placing lots of individual mutations into the same batch.
+
+## Synchronous Writes
+
+By default, each write to leveldb is asynchronous: it returns after pushing the
+write from the process into the operating system. The transfer from operating
+system memory to the underlying persistent storage happens asynchronously. The
+sync flag can be turned on for a particular write to make the write operation
+not return until the data being written has been pushed all the way to
+persistent storage. 
(On Posix systems, this is implemented by calling either +`fsync(...)` or `fdatasync(...)` or `msync(..., MS_SYNC)` before the write +operation returns.) + +```c++ +leveldb::WriteOptions write_options; +write_options.sync = true; +db->Put(write_options, ...); +``` + +Asynchronous writes are often more than a thousand times as fast as synchronous +writes. The downside of asynchronous writes is that a crash of the machine may +cause the last few updates to be lost. Note that a crash of just the writing +process (i.e., not a reboot) will not cause any loss since even when sync is +false, an update is pushed from the process memory into the operating system +before it is considered done. + +Asynchronous writes can often be used safely. For example, when loading a large +amount of data into the database you can handle lost updates by restarting the +bulk load after a crash. A hybrid scheme is also possible where every Nth write +is synchronous, and in the event of a crash, the bulk load is restarted just +after the last synchronous write finished by the previous run. (The synchronous +write can update a marker that describes where to restart on a crash.) + +`WriteBatch` provides an alternative to asynchronous writes. Multiple updates +may be placed in the same WriteBatch and applied together using a synchronous +write (i.e., `write_options.sync` is set to true). The extra cost of the +synchronous write will be amortized across all of the writes in the batch. + +## Concurrency + +A database may only be opened by one process at a time. The leveldb +implementation acquires a lock from the operating system to prevent misuse. +Within a single process, the same `leveldb::DB` object may be safely shared by +multiple concurrent threads. I.e., different threads may write into or fetch +iterators or call Get on the same database without any external synchronization +(the leveldb implementation will automatically do the required synchronization). +However other objects (like Iterator and `WriteBatch`) may require external +synchronization. If two threads share such an object, they must protect access +to it using their own locking protocol. More details are available in the public +header files. + +## Iteration + +The following example demonstrates how to print all key,value pairs in a +database. + +```c++ +leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions()); +for (it->SeekToFirst(); it->Valid(); it->Next()) { + cout << it->key().ToString() << ": " << it->value().ToString() << endl; +} +assert(it->status().ok()); // Check for any errors found during the scan +delete it; +``` + +The following variation shows how to process just the keys in the range +[start,limit): + +```c++ +for (it->Seek(start); + it->Valid() && it->key().ToString() < limit; + it->Next()) { + ... +} +``` + +You can also process entries in reverse order. (Caveat: reverse iteration may be +somewhat slower than forward iteration.) + +```c++ +for (it->SeekToLast(); it->Valid(); it->Prev()) { + ... +} +``` + +## Snapshots + +Snapshots provide consistent read-only views over the entire state of the +key-value store. `ReadOptions::snapshot` may be non-NULL to indicate that a +read should operate on a particular version of the DB state. If +`ReadOptions::snapshot` is NULL, the read will operate on an implicit snapshot +of the current state. + +Snapshots are created by the `DB::GetSnapshot()` method: + +```c++ +leveldb::ReadOptions options; +options.snapshot = db->GetSnapshot(); +... apply some updates to db ... 
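+// (Reads made with `options` below see the DB as of the GetSnapshot() call
+// above; the updates just applied are deliberately invisible to them.)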
+leveldb::Iterator* iter = db->NewIterator(options); +... read using iter to view the state when the snapshot was created ... +delete iter; +db->ReleaseSnapshot(options.snapshot); +``` + +Note that when a snapshot is no longer needed, it should be released using the +`DB::ReleaseSnapshot` interface. This allows the implementation to get rid of +state that was being maintained just to support reading as of that snapshot. + +## Slice + +The return value of the `it->key()` and `it->value()` calls above are instances +of the `leveldb::Slice` type. Slice is a simple structure that contains a length +and a pointer to an external byte array. Returning a Slice is a cheaper +alternative to returning a `std::string` since we do not need to copy +potentially large keys and values. In addition, leveldb methods do not return +null-terminated C-style strings since leveldb keys and values are allowed to +contain `'\0'` bytes. + +C++ strings and null-terminated C-style strings can be easily converted to a +Slice: + +```c++ +leveldb::Slice s1 = "hello"; + +std::string str("world"); +leveldb::Slice s2 = str; +``` + +A Slice can be easily converted back to a C++ string: + +```c++ +std::string str = s1.ToString(); +assert(str == std::string("hello")); +``` + +Be careful when using Slices since it is up to the caller to ensure that the +external byte array into which the Slice points remains live while the Slice is +in use. For example, the following is buggy: + +```c++ +leveldb::Slice slice; +if (...) { + std::string str = ...; + slice = str; +} +Use(slice); +``` + +When the if statement goes out of scope, str will be destroyed and the backing +storage for slice will disappear. + +## Comparators + +The preceding examples used the default ordering function for key, which orders +bytes lexicographically. You can however supply a custom comparator when opening +a database. For example, suppose each database key consists of two numbers and +we should sort by the first number, breaking ties by the second number. First, +define a proper subclass of `leveldb::Comparator` that expresses these rules: + +```c++ +class TwoPartComparator : public leveldb::Comparator { + public: + // Three-way comparison function: + // if a < b: negative result + // if a > b: positive result + // else: zero result + int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const { + int a1, a2, b1, b2; + ParseKey(a, &a1, &a2); + ParseKey(b, &b1, &b2); + if (a1 < b1) return -1; + if (a1 > b1) return +1; + if (a2 < b2) return -1; + if (a2 > b2) return +1; + return 0; + } + + // Ignore the following methods for now: + const char* Name() const { return "TwoPartComparator"; } + void FindShortestSeparator(std::string*, const leveldb::Slice&) const {} + void FindShortSuccessor(std::string*) const {} +}; +``` + +Now create a database using this custom comparator: + +```c++ +TwoPartComparator cmp; +leveldb::DB* db; +leveldb::Options options; +options.create_if_missing = true; +options.comparator = &cmp; +leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db); +... +``` + +### Backwards compatibility + +The result of the comparator's Name method is attached to the database when it +is created, and is checked on every subsequent database open. If the name +changes, the `leveldb::DB::Open` call will fail. Therefore, change the name if +and only if the new key format and comparison function are incompatible with +existing databases, and it is ok to discard the contents of all existing +databases. 
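+
+A minimal sketch of that failure mode (assuming `/tmp/testdb` was created with
+`TwoPartComparator` as above): reopening the database with the default bytewise
+comparator makes the stored name mismatch, so `Open` returns a non-ok status.
+
+```c++
+leveldb::DB* db;
+leveldb::Options options;  // comparator left at the default
+leveldb::Status s = leveldb::DB::Open(options, "/tmp/testdb", &db);
+assert(!s.ok());           // name check fails; s describes the mismatch
+```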
+
+You can however still gradually evolve your key format over time with a little
+bit of pre-planning. For example, you could store a version number at the end of
+each key (one byte should suffice for most uses). When you wish to switch to a
+new key format (e.g., adding an optional third part to the keys processed by
+`TwoPartComparator`), (a) keep the same comparator name, (b) increment the
+version number for new keys, and (c) change the comparator function so it uses
+the version numbers found in the keys to decide how to interpret them.
+
+## Performance
+
+Performance can be tuned by changing the default values of the types defined in
+`include/leveldb/options.h`.
+
+### Block size
+
+leveldb groups adjacent keys together into the same block and such a block is
+the unit of transfer to and from persistent storage. The default block size is
+approximately 4096 uncompressed bytes. Applications that mostly do bulk scans
+over the contents of the database may wish to increase this size. Applications
+that do a lot of point reads of small values may wish to switch to a smaller
+block size if performance measurements indicate an improvement. There isn't much
+benefit in using blocks smaller than one kilobyte, or larger than a few
+megabytes. Also note that compression will be more effective with larger block
+sizes.
+
+### Compression
+
+Each block is individually compressed before being written to persistent
+storage. Compression is on by default since the default compression method is
+very fast, and is automatically disabled for uncompressible data. In rare cases,
+applications may want to disable compression entirely, but should only do so if
+benchmarks show a performance improvement:
+
+```c++
+leveldb::Options options;
+options.compression = leveldb::kNoCompression;
+... leveldb::DB::Open(options, name, ...) ....
+```
+
+### Cache
+
+The contents of the database are stored in a set of files in the filesystem and
+each file stores a sequence of compressed blocks. If options.cache is non-NULL,
+it is used to cache frequently used uncompressed block contents.
+
+```c++
+#include "leveldb/cache.h"
+
+leveldb::Options options;
+options.cache = leveldb::NewLRUCache(100 * 1048576);  // 100MB cache
+leveldb::DB* db;
+leveldb::DB::Open(options, name, &db);
+... use the db ...
+delete db;
+delete options.cache;
+```
+
+Note that the cache holds uncompressed data, and therefore it should be sized
+according to application level data sizes, without any reduction from
+compression. (Caching of compressed blocks is left to the operating system
+buffer cache, or any custom Env implementation provided by the client.)
+
+When performing a bulk read, the application may wish to disable caching so that
+the data processed by the bulk read does not end up displacing most of the
+cached contents. A per-iterator option can be used to achieve this:
+
+```c++
+leveldb::ReadOptions options;
+options.fill_cache = false;
+leveldb::Iterator* it = db->NewIterator(options);
+for (it->SeekToFirst(); it->Valid(); it->Next()) {
+  ...
+}
+```
+
+### Key Layout
+
+Note that the unit of disk transfer and caching is a block. Adjacent keys
+(according to the database sort order) will usually be placed in the same block.
+Therefore the application can improve its performance by placing keys that are
+accessed together near each other and placing infrequently used keys in a
+separate region of the key space.
+
+For example, suppose we are implementing a simple file system on top of leveldb.
+The types of entries we might wish to store are:
+
+    filename -> permission-bits, length, list of file_block_ids
+    file_block_id -> data
+
+We might want to prefix filename keys with one letter (say '/') and the
+`file_block_id` keys with a different letter (say '0') so that scans over just
+the metadata do not force us to fetch and cache bulky file contents.
+
+### Filters
+
+Because of the way leveldb data is organized on disk, a single `Get()` call may
+involve multiple reads from disk. The optional FilterPolicy mechanism can be
+used to reduce the number of disk reads substantially.
+
+```c++
+leveldb::Options options;
+options.filter_policy = NewBloomFilterPolicy(10);
+leveldb::DB* db;
+leveldb::DB::Open(options, "/tmp/testdb", &db);
+... use the database ...
+delete db;
+delete options.filter_policy;
+```
+
+The preceding code associates a Bloom filter based filtering policy with the
+database. Bloom filter based filtering relies on keeping some number of bits of
+data in memory per key (in this case 10 bits per key since that is the argument
+we passed to `NewBloomFilterPolicy`). This filter will reduce the number of
+unnecessary disk reads needed for Get() calls by a factor of approximately 100.
+Increasing the bits per key will lead to a larger reduction at the cost
+of more memory usage. We recommend that applications whose working set does not
+fit in memory and that do a lot of random reads set a filter policy.
+
+If you are using a custom comparator, you should ensure that the filter policy
+you are using is compatible with your comparator. For example, consider a
+comparator that ignores trailing spaces when comparing keys.
+`NewBloomFilterPolicy` must not be used with such a comparator. Instead, the
+application should provide a custom filter policy that also ignores trailing
+spaces. For example:
+
+```c++
+class CustomFilterPolicy : public leveldb::FilterPolicy {
+ private:
+  FilterPolicy* builtin_policy_;
+
+ public:
+  CustomFilterPolicy() : builtin_policy_(NewBloomFilterPolicy(10)) {}
+  ~CustomFilterPolicy() { delete builtin_policy_; }
+
+  const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
+
+  void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    // Use builtin bloom filter code after removing trailing spaces
+    std::vector<Slice> trimmed(n);
+    for (int i = 0; i < n; i++) {
+      trimmed[i] = RemoveTrailingSpaces(keys[i]);
+    }
+    return builtin_policy_->CreateFilter(&trimmed[0], n, dst);
+  }
+
+  bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+    // Use builtin bloom filter code after removing trailing spaces
+    return builtin_policy_->KeyMayMatch(RemoveTrailingSpaces(key), filter);
+  }
+};
+```
+
+Advanced applications may provide a filter policy that does not use a bloom
+filter but uses some other mechanism for summarizing a set of keys. See
+`leveldb/filter_policy.h` for details.
+
+## Checksums
+
+leveldb associates checksums with all data it stores in the file system. There
+are two separate controls provided over how aggressively these checksums are
+verified:
+
+`ReadOptions::verify_checksums` may be set to true to force checksum
+verification of all data that is read from the file system on behalf of a
+particular read. By default, no such verification is done.
+
+`Options::paranoid_checks` may be set to true before opening a database to make
+the database implementation raise an error as soon as it detects an internal
+corruption. Depending on which portion of the database has been corrupted, the
+error may be raised when the database is opened, or later by another database
+operation. By default, paranoid checking is off so that the database can be used
+even if parts of its persistent storage have been corrupted.
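+
+Both controls are ordinary option fields; a minimal sketch of enabling them
+(assuming a database path and `key1` as in the earlier examples):
+
+```c++
+leveldb::Options options;
+options.paranoid_checks = true;        // surface internal corruption as errors
+leveldb::DB* db;
+leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &db);
+
+leveldb::ReadOptions read_options;
+read_options.verify_checksums = true;  // verify checksums for this read only
+std::string value;
+leveldb::Status s = db->Get(read_options, key1, &value);
+```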
+
+If a database is corrupted (perhaps it cannot be opened when paranoid checking
+is turned on), the `leveldb::RepairDB` function may be used to recover as much
+of the data as possible.
+
+## Approximate Sizes
+
+The `GetApproximateSizes` method can be used to get the approximate number of
+bytes of file system space used by one or more key ranges.
+
+```c++
+leveldb::Range ranges[2];
+ranges[0] = leveldb::Range("a", "c");
+ranges[1] = leveldb::Range("x", "z");
+uint64_t sizes[2];
+leveldb::Status s = db->GetApproximateSizes(ranges, 2, sizes);
+```
+
+The preceding call will set `sizes[0]` to the approximate number of bytes of
+file system space used by the key range `[a..c)` and `sizes[1]` to the
+approximate number of bytes used by the key range `[x..z)`.
+
+## Environment
+
+All file operations (and other operating system calls) issued by the leveldb
+implementation are routed through a `leveldb::Env` object. Sophisticated clients
+may wish to provide their own Env implementation to get better control.
+For example, an application may introduce artificial delays in the file IO
+paths to limit the impact of leveldb on other activities in the system.
+
+```c++
+class SlowEnv : public leveldb::Env {
+  ... implementation of the Env interface ...
+};
+
+SlowEnv env;
+leveldb::Options options;
+options.env = &env;
+Status s = leveldb::DB::Open(options, ...);
+```
+
+## Porting
+
+leveldb may be ported to a new platform by providing platform specific
+implementations of the types/methods/functions exported by
+`leveldb/port/port.h`. See `leveldb/port/port_example.h` for more details.
+
+In addition, the new platform may need a new default `leveldb::Env`
+implementation. See `leveldb/util/env_posix.h` for an example.
+
+## Other Information
+
+Details about the leveldb implementation may be found in the following
+documents:
+
+1. [Implementation notes](impl.md)
+2. [Format of an immutable Table file](table_format.md)
+3. [Format of a log file](log_format.md)
diff --git a/doc/log_format.md b/doc/log_format.md
new file mode 100644
index 000000000..f32cb5d7d
--- /dev/null
+++ b/doc/log_format.md
@@ -0,0 +1,75 @@
+leveldb Log format
+==================
+
+The log file contents are a sequence of 32KB blocks. The only exception is that
+the tail of the file may contain a partial block.
+
+Each block consists of a sequence of records:
+
+    block := record* trailer?
+    record :=
+      checksum: uint32     // crc32c of type and data[] ; little-endian
+      length: uint16       // little-endian
+      type: uint8          // One of FULL, FIRST, MIDDLE, LAST
+      data: uint8[length]
+
+A record never starts within the last six bytes of a block (since it won't fit).
+Any leftover bytes here form the trailer, which must consist entirely of zero
+bytes and must be skipped by readers.
+
+Aside: if exactly seven bytes are left in the current block, and a new non-zero
+length record is added, the writer must emit a FIRST record (which contains zero
+bytes of user data) to fill up the trailing seven bytes of the block and then
+emit all of the user data in subsequent blocks.
+
+More types may be added in the future. Some Readers may skip record types they
+do not understand, others may report that some data was skipped.
+
+    FULL == 1
+    FIRST == 2
+    MIDDLE == 3
+    LAST == 4
+
+The FULL record contains the contents of an entire user record.
+
+FIRST, MIDDLE, LAST are types used for user records that have been split into
+multiple fragments (typically because of block boundaries).
FIRST is the type +of the first fragment of a user record, LAST is the type of the last fragment of +a user record, and MIDDLE is the type of all interior fragments of a user +record. + +Example: consider a sequence of user records: + + A: length 1000 + B: length 97270 + C: length 8000 + +**A** will be stored as a FULL record in the first block. + +**B** will be split into three fragments: first fragment occupies the rest of +the first block, second fragment occupies the entirety of the second block, and +the third fragment occupies a prefix of the third block. This will leave six +bytes free in the third block, which will be left empty as the trailer. + +**C** will be stored as a FULL record in the fourth block. + +---- + +## Some benefits over the recordio format: + +1. We do not need any heuristics for resyncing - just go to next block boundary + and scan. If there is a corruption, skip to the next block. As a + side-benefit, we do not get confused when part of the contents of one log + file are embedded as a record inside another log file. + +2. Splitting at approximate boundaries (e.g., for mapreduce) is simple: find the + next block boundary and skip records until we hit a FULL or FIRST record. + +3. We do not need extra buffering for large records. + +## Some downsides compared to recordio format: + +1. No packing of tiny records. This could be fixed by adding a new record type, + so it is a shortcoming of the current implementation, not necessarily the + format. + +2. No compression. Again, this could be fixed by adding new record types. diff --git a/doc/log_format.txt b/doc/log_format.txt deleted file mode 100644 index 4cca5ef6e..000000000 --- a/doc/log_format.txt +++ /dev/null @@ -1,75 +0,0 @@ -The log file contents are a sequence of 32KB blocks. The only -exception is that the tail of the file may contain a partial block. - -Each block consists of a sequence of records: - block := record* trailer? - record := - checksum: uint32 // crc32c of type and data[] ; little-endian - length: uint16 // little-endian - type: uint8 // One of FULL, FIRST, MIDDLE, LAST - data: uint8[length] - -A record never starts within the last six bytes of a block (since it -won't fit). Any leftover bytes here form the trailer, which must -consist entirely of zero bytes and must be skipped by readers. - -Aside: if exactly seven bytes are left in the current block, and a new -non-zero length record is added, the writer must emit a FIRST record -(which contains zero bytes of user data) to fill up the trailing seven -bytes of the block and then emit all of the user data in subsequent -blocks. - -More types may be added in the future. Some Readers may skip record -types they do not understand, others may report that some data was -skipped. - -FULL == 1 -FIRST == 2 -MIDDLE == 3 -LAST == 4 - -The FULL record contains the contents of an entire user record. - -FIRST, MIDDLE, LAST are types used for user records that have been -split into multiple fragments (typically because of block boundaries). -FIRST is the type of the first fragment of a user record, LAST is the -type of the last fragment of a user record, and MIDDLE is the type of -all interior fragments of a user record. - -Example: consider a sequence of user records: - A: length 1000 - B: length 97270 - C: length 8000 -A will be stored as a FULL record in the first block. 
-
-B will be split into three fragments: first fragment occupies the rest
-of the first block, second fragment occupies the entirety of the
-second block, and the third fragment occupies a prefix of the third
-block. This will leave six bytes free in the third block, which will
-be left empty as the trailer.
-
-C will be stored as a FULL record in the fourth block.
-
-===================
-
-Some benefits over the recordio format:
-
-(1) We do not need any heuristics for resyncing - just go to next
-block boundary and scan. If there is a corruption, skip to the next
-block. As a side-benefit, we do not get confused when part of the
-contents of one log file are embedded as a record inside another log
-file.
-
-(2) Splitting at approximate boundaries (e.g., for mapreduce) is
-simple: find the next block boundary and skip records until we
-hit a FULL or FIRST record.
-
-(3) We do not need extra buffering for large records.
-
-Some downsides compared to recordio format:
-
-(1) No packing of tiny records. This could be fixed by adding a new
-record type, so it is a shortcoming of the current implementation,
-not necessarily the format.
-
-(2) No compression. Again, this could be fixed by adding new record types.
diff --git a/doc/table_format.md b/doc/table_format.md
new file mode 100644
index 000000000..5fe7e7241
--- /dev/null
+++ b/doc/table_format.md
@@ -0,0 +1,107 @@
+leveldb File format
+===================
+
+    <beginning_of_file>
+    [data block 1]
+    [data block 2]
+    ...
+    [data block N]
+    [meta block 1]
+    ...
+    [meta block K]
+    [metaindex block]
+    [index block]
+    [Footer]        (fixed size; starts at file_size - sizeof(Footer))
+    <end_of_file>
+
+The file contains internal pointers. Each such pointer is called
+a BlockHandle and contains the following information:
+
+    offset: varint64
+    size:   varint64
+
+See [varints](https://developers.google.com/protocol-buffers/docs/encoding#varints)
+for an explanation of varint64 format.
+
+1. The sequence of key/value pairs in the file are stored in sorted
+order and partitioned into a sequence of data blocks. These blocks
+come one after another at the beginning of the file. Each data block
+is formatted according to the code in `block_builder.cc`, and then
+optionally compressed.
+
+2. After the data blocks we store a bunch of meta blocks. The
+supported meta block types are described below. More meta block types
+may be added in the future. Each meta block is again formatted using
+`block_builder.cc` and then optionally compressed.
+
+3. A "metaindex" block. It contains one entry for every other meta
+block where the key is the name of the meta block and the value is a
+BlockHandle pointing to that meta block.
+
+4. An "index" block. This block contains one entry per data block,
+where the key is a string >= last key in that data block and before
+the first key in the successive data block. The value is the
+BlockHandle for the data block.
+
+5. At the very end of the file is a fixed length footer that contains
+the BlockHandle of the metaindex and index blocks as well as a magic number.
+
+    metaindex_handle: char[p];    // Block handle for metaindex
+    index_handle:     char[q];    // Block handle for index
+    padding:          char[40-p-q];// zeroed bytes to make fixed length
+                                   // (40==2*BlockHandle::kMaxEncodedLength)
+    magic:            fixed64;    // == 0xdb4775248b80fb57 (little-endian)
+
+## "filter" Meta Block
+
+If a `FilterPolicy` was specified when the database was opened, a
+filter block is stored in each table.
The "metaindex" block contains +an entry that maps from `filter.` to the BlockHandle for the filter +block where `` is the string returned by the filter policy's +`Name()` method. + +The filter block stores a sequence of filters, where filter i contains +the output of `FilterPolicy::CreateFilter()` on all keys that are stored +in a block whose file offset falls within the range + + [ i*base ... (i+1)*base-1 ] + +Currently, "base" is 2KB. So for example, if blocks X and Y start in +the range `[ 0KB .. 2KB-1 ]`, all of the keys in X and Y will be +converted to a filter by calling `FilterPolicy::CreateFilter()`, and the +resulting filter will be stored as the first filter in the filter +block. + +The filter block is formatted as follows: + + [filter 0] + [filter 1] + [filter 2] + ... + [filter N-1] + + [offset of filter 0] : 4 bytes + [offset of filter 1] : 4 bytes + [offset of filter 2] : 4 bytes + ... + [offset of filter N-1] : 4 bytes + + [offset of beginning of offset array] : 4 bytes + lg(base) : 1 byte + +The offset array at the end of the filter block allows efficient +mapping from a data block offset to the corresponding filter. + +## "stats" Meta Block + +This meta block contains a bunch of stats. The key is the name +of the statistic. The value contains the statistic. + +TODO(postrelease): record following stats. + + data size + index size + key size (uncompressed) + value size (uncompressed) + number of entries + number of data blocks diff --git a/doc/table_format.txt b/doc/table_format.txt deleted file mode 100644 index ca8f9b446..000000000 --- a/doc/table_format.txt +++ /dev/null @@ -1,104 +0,0 @@ -File format -=========== - - - [data block 1] - [data block 2] - ... - [data block N] - [meta block 1] - ... - [meta block K] - [metaindex block] - [index block] - [Footer] (fixed size; starts at file_size - sizeof(Footer)) - - -The file contains internal pointers. Each such pointer is called -a BlockHandle and contains the following information: - offset: varint64 - size: varint64 -See https://developers.google.com/protocol-buffers/docs/encoding#varints -for an explanation of varint64 format. - -(1) The sequence of key/value pairs in the file are stored in sorted -order and partitioned into a sequence of data blocks. These blocks -come one after another at the beginning of the file. Each data block -is formatted according to the code in block_builder.cc, and then -optionally compressed. - -(2) After the data blocks we store a bunch of meta blocks. The -supported meta block types are described below. More meta block types -may be added in the future. Each meta block is again formatted using -block_builder.cc and then optionally compressed. - -(3) A "metaindex" block. It contains one entry for every other meta -block where the key is the name of the meta block and the value is a -BlockHandle pointing to that meta block. - -(4) An "index" block. This block contains one entry per data block, -where the key is a string >= last key in that data block and before -the first key in the successive data block. The value is the -BlockHandle for the data block. - -(6) At the very end of the file is a fixed length footer that contains -the BlockHandle of the metaindex and index blocks as well as a magic number. 
- metaindex_handle: char[p]; // Block handle for metaindex - index_handle: char[q]; // Block handle for index - padding: char[40-p-q]; // zeroed bytes to make fixed length - // (40==2*BlockHandle::kMaxEncodedLength) - magic: fixed64; // == 0xdb4775248b80fb57 (little-endian) - -"filter" Meta Block -------------------- - -If a "FilterPolicy" was specified when the database was opened, a -filter block is stored in each table. The "metaindex" block contains -an entry that maps from "filter." to the BlockHandle for the filter -block where "" is the string returned by the filter policy's -"Name()" method. - -The filter block stores a sequence of filters, where filter i contains -the output of FilterPolicy::CreateFilter() on all keys that are stored -in a block whose file offset falls within the range - - [ i*base ... (i+1)*base-1 ] - -Currently, "base" is 2KB. So for example, if blocks X and Y start in -the range [ 0KB .. 2KB-1 ], all of the keys in X and Y will be -converted to a filter by calling FilterPolicy::CreateFilter(), and the -resulting filter will be stored as the first filter in the filter -block. - -The filter block is formatted as follows: - - [filter 0] - [filter 1] - [filter 2] - ... - [filter N-1] - - [offset of filter 0] : 4 bytes - [offset of filter 1] : 4 bytes - [offset of filter 2] : 4 bytes - ... - [offset of filter N-1] : 4 bytes - - [offset of beginning of offset array] : 4 bytes - lg(base) : 1 byte - -The offset array at the end of the filter block allows efficient -mapping from a data block offset to the corresponding filter. - -"stats" Meta Block ------------------- - -This meta block contains a bunch of stats. The key is the name -of the statistic. The value contains the statistic. -TODO(postrelease): record following stats. - data size - index size - key size (uncompressed) - value size (uncompressed) - number of entries - number of data blocks diff --git a/include/leveldb/db.h b/include/leveldb/db.h index 9752cbad5..bfab10a0b 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -14,7 +14,7 @@ namespace leveldb { // Update Makefile if you change these static const int kMajorVersion = 1; -static const int kMinorVersion = 19; +static const int kMinorVersion = 20; struct Options; struct ReadOptions; diff --git a/include/leveldb/options.h b/include/leveldb/options.h index 83a1ef39a..976e38122 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -112,6 +112,18 @@ struct Options { // Default: 16 int block_restart_interval; + // Leveldb will write up to this amount of bytes to a file before + // switching to a new one. + // Most clients should leave this parameter alone. However if your + // filesystem is more efficient with larger files, you could + // consider increasing the value. The downside will be longer + // compactions and hence longer latency/performance hiccups. + // Another reason to increase this parameter might be when you are + // initially populating a large database. + // + // Default: 2MB + size_t max_file_size; + // Compress blocks using the specified compression algorithm. This // parameter can be changed dynamically. // diff --git a/port/port_example.h b/port/port_example.h index ab9e489b3..97bd669a5 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -129,6 +129,12 @@ extern bool Snappy_Uncompress(const char* input_data, size_t input_length, // The concatenation of all "data[0,n-1]" fragments is the heap profile. 
extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg);
 
+// Extend the CRC to include the first n bytes of buf.
+//
+// Returns zero if the CRC cannot be extended using acceleration, else returns
+// the newly extended CRC value (which may also be zero).
+uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size);
+
 }  // namespace port
 }  // namespace leveldb
 
diff --git a/port/port_posix.h b/port/port_posix.h
index ccca9939d..7e8213b22 100644
--- a/port/port_posix.h
+++ b/port/port_posix.h
@@ -152,6 +152,8 @@ inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
   return false;
 }
 
+uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size);
+
 }  // namespace port
 }  // namespace leveldb
 
diff --git a/port/port_posix_sse.cc b/port/port_posix_sse.cc
new file mode 100644
index 000000000..1e519ba0b
--- /dev/null
+++ b/port/port_posix_sse.cc
@@ -0,0 +1,129 @@
+// Copyright 2016 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A portable implementation of crc32c, optimized to handle
+// four bytes at a time.
+//
+// In a separate source file to allow this accelerated CRC32C function to be
+// compiled with the appropriate compiler flags to enable x86 SSE 4.2
+// instructions.
+
+#include <stdint.h>
+#include <string.h>
+#include "port/port.h"
+
+#if defined(LEVELDB_PLATFORM_POSIX_SSE)
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#elif defined(__GNUC__) && defined(__SSE4_2__)
+#include <nmmintrin.h>
+#include <cpuid.h>
+#endif
+
+#endif  // defined(LEVELDB_PLATFORM_POSIX_SSE)
+
+namespace leveldb {
+namespace port {
+
+#if defined(LEVELDB_PLATFORM_POSIX_SSE)
+
+// Used to fetch a naturally-aligned 32-bit word in little endian byte-order
+static inline uint32_t LE_LOAD32(const uint8_t *p) {
+  // SSE is x86 only, so ensured that |p| is always little-endian.
+  uint32_t word;
+  memcpy(&word, p, sizeof(word));
+  return word;
+}
+
+#if defined(_M_X64) || defined(__x86_64__)  // LE_LOAD64 is only used on x64.
+
+// Used to fetch a naturally-aligned 64-bit word in little endian byte-order
+static inline uint64_t LE_LOAD64(const uint8_t *p) {
+  uint64_t dword;
+  memcpy(&dword, p, sizeof(dword));
+  return dword;
+}
+
+#endif  // defined(_M_X64) || defined(__x86_64__)
+
+static inline bool HaveSSE42() {
+#if defined(_MSC_VER)
+  int cpu_info[4];
+  __cpuid(cpu_info, 1);
+  return (cpu_info[2] & (1 << 20)) != 0;
+#elif defined(__GNUC__)
+  unsigned int eax, ebx, ecx, edx;
+  __get_cpuid(1, &eax, &ebx, &ecx, &edx);
+  return (ecx & (1 << 20)) != 0;
+#else
+  return false;
+#endif
+}
+
+#endif  // defined(LEVELDB_PLATFORM_POSIX_SSE)
+
+// For further improvements see Intel publication at:
+// http://download.intel.com/design/intarch/papers/323405.pdf
+uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) {
+#if !defined(LEVELDB_PLATFORM_POSIX_SSE)
+  return 0;
+#else
+  static bool have = HaveSSE42();
+  if (!have) {
+    return 0;
+  }
+
+  const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
+  const uint8_t *e = p + size;
+  uint32_t l = crc ^ 0xffffffffu;
+
+#define STEP1 do {                              \
+    l = _mm_crc32_u8(l, *p++);                  \
+} while (0)
+#define STEP4 do {                              \
+    l = _mm_crc32_u32(l, LE_LOAD32(p));         \
+    p += 4;                                     \
+} while (0)
+#define STEP8 do {                              \
+    l = _mm_crc32_u64(l, LE_LOAD64(p));         \
+    p += 8;                                     \
+} while (0)
+
+  if (size > 16) {
+    // Process unaligned bytes
+    for (unsigned int i = reinterpret_cast<uintptr_t>(p) % 8; i; --i) {
+      STEP1;
+    }
+
+    // _mm_crc32_u64 is only available on x64.
+#if defined(_M_X64) || defined(__x86_64__)
+    // Process 8 bytes at a time
+    while ((e-p) >= 8) {
+      STEP8;
+    }
+    // Process 4 bytes at a time
+    if ((e-p) >= 4) {
+      STEP4;
+    }
+#else  // !(defined(_M_X64) || defined(__x86_64__))
+    // Process 4 bytes at a time
+    while ((e-p) >= 4) {
+      STEP4;
+    }
+#endif  // defined(_M_X64) || defined(__x86_64__)
+  }
+  // Process the last few bytes
+  while (p != e) {
+    STEP1;
+  }
+#undef STEP8
+#undef STEP4
+#undef STEP1
+  return l ^ 0xffffffffu;
+#endif  // defined(LEVELDB_PLATFORM_POSIX_SSE)
+}
+
+}  // namespace port
+}  // namespace leveldb
diff --git a/port/port_win.h b/port/port_win.h
index 45bf2f0ea..500900482 100644
--- a/port/port_win.h
+++ b/port/port_win.h
@@ -168,6 +168,8 @@ inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
   return false;
 }

+uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size);
+
 }
 }

diff --git a/table/filter_block.cc b/table/filter_block.cc
index 4e78b954f..1ed513417 100644
--- a/table/filter_block.cc
+++ b/table/filter_block.cc
@@ -9,7 +9,7 @@

 namespace leveldb {

-// See doc/table_format.txt for an explanation of the filter block format.
+// See doc/table_format.md for an explanation of the filter block format.

 // Generate new filter every 2KB of data
 static const size_t kFilterBaseLg = 11;
diff --git a/util/crc32c.cc b/util/crc32c.cc
index 6db9e7707..edd61cfd6 100644
--- a/util/crc32c.cc
+++ b/util/crc32c.cc
@@ -8,6 +8,8 @@
 #include "util/crc32c.h"

 #include <stdint.h>
+
+#include "port/port.h"
 #include "util/coding.h"

 namespace leveldb {
@@ -283,7 +285,23 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) {
   return DecodeFixed32(reinterpret_cast<const char*>(p));
 }

+// Determine if the CPU running this program can accelerate the CRC32C
+// calculation.
+static bool CanAccelerateCRC32C() {
+  // port::AcceleretedCRC32C returns zero when unable to accelerate.
+  static const char kTestCRCBuffer[] = "TestCRCBuffer";
+  static const char kBufSize = sizeof(kTestCRCBuffer) - 1;
+  static const uint32_t kTestCRCValue = 0xdcbc59fa;
+
+  return port::AcceleratedCRC32C(0, kTestCRCBuffer, kBufSize) == kTestCRCValue;
+}
+
 uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
+  static bool accelerate = CanAccelerateCRC32C();
+  if (accelerate) {
+    return port::AcceleratedCRC32C(crc, buf, size);
+  }
+
   const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
   const uint8_t *e = p + size;
   uint32_t l = crc ^ 0xffffffffu;
diff --git a/util/env_posix.cc b/util/env_posix.cc
index e0fca52f4..dd852af35 100644
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
@@ -11,12 +11,14 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/mman.h>
+#include <sys/resource.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/types.h>
 #include <time.h>
 #include <unistd.h>
 #include <deque>
+#include <limits>
 #include <set>
 #include "leveldb/env.h"
 #include "leveldb/slice.h"
@@ -24,15 +26,70 @@
 #include "util/logging.h"
 #include "util/mutexlock.h"
 #include "util/posix_logger.h"
+#include "util/env_posix_test_helper.h"

 namespace leveldb {

 namespace {

+static int open_read_only_file_limit = -1;
+static int mmap_limit = -1;
+
 static Status IOError(const std::string& context, int err_number) {
   return Status::IOError(context, strerror(err_number));
 }

+// Helper class to limit resource usage to avoid exhaustion.
+// Currently used to limit read-only file descriptors and mmap file usage
+// so that we do not end up running out of file descriptors, virtual memory,
+// or running into kernel performance problems for very large databases.
+class Limiter {
+ public:
+  // Limit maximum number of resources to |n|.
+  Limiter(intptr_t n) {
+    SetAllowed(n);
+  }
+
+  // If another resource is available, acquire it and return true.
+  // Else return false.
+  bool Acquire() {
+    if (GetAllowed() <= 0) {
+      return false;
+    }
+    MutexLock l(&mu_);
+    intptr_t x = GetAllowed();
+    if (x <= 0) {
+      return false;
+    } else {
+      SetAllowed(x - 1);
+      return true;
+    }
+  }
+
+  // Release a resource acquired by a previous call to Acquire() that returned
+  // true.
+  void Release() {
+    MutexLock l(&mu_);
+    SetAllowed(GetAllowed() + 1);
+  }
+
+ private:
+  port::Mutex mu_;
+  port::AtomicPointer allowed_;
+
+  intptr_t GetAllowed() const {
+    return reinterpret_cast<intptr_t>(allowed_.Acquire_Load());
+  }
+
+  // REQUIRES: mu_ must be held
+  void SetAllowed(intptr_t v) {
+    allowed_.Release_Store(reinterpret_cast<void*>(v));
+  }
+
+  Limiter(const Limiter&);
+  void operator=(const Limiter&);
+};
+
 class PosixSequentialFile: public SequentialFile {
  private:
   std::string filename_;
@@ -70,87 +127,65 @@ class PosixSequentialFile: public SequentialFile {
 class PosixRandomAccessFile: public RandomAccessFile {
  private:
   std::string filename_;
+  bool temporary_fd_;  // If true, fd_ is -1 and we open on every read.
   int fd_;
+  Limiter* limiter_;

  public:
-  PosixRandomAccessFile(const std::string& fname, int fd)
-      : filename_(fname), fd_(fd) { }
-  virtual ~PosixRandomAccessFile() { close(fd_); }
+  PosixRandomAccessFile(const std::string& fname, int fd, Limiter* limiter)
+      : filename_(fname), fd_(fd), limiter_(limiter) {
+    temporary_fd_ = !limiter->Acquire();
+    if (temporary_fd_) {
+      // Open file on every access.
+      close(fd_);
+      fd_ = -1;
+    }
+  }
+
+  virtual ~PosixRandomAccessFile() {
+    if (!temporary_fd_) {
+      close(fd_);
+      limiter_->Release();
+    }
+  }

   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                       char* scratch) const {
+    int fd = fd_;
+    if (temporary_fd_) {
+      fd = open(filename_.c_str(), O_RDONLY);
+      if (fd < 0) {
+        return IOError(filename_, errno);
+      }
+    }
+
     Status s;
-    ssize_t r = pread(fd_, scratch, n, static_cast<off_t>(offset));
+    ssize_t r = pread(fd, scratch, n, static_cast<off_t>(offset));
     *result = Slice(scratch, (r < 0) ? 0 : r);
     if (r < 0) {
       // An error: return a non-ok status
       s = IOError(filename_, errno);
     }
+    if (temporary_fd_) {
+      // Close the temporary file descriptor opened earlier.
+      close(fd);
+    }
     return s;
   }
 };

-// Helper class to limit mmap file usage so that we do not end up
-// running out virtual memory or running into kernel performance
-// problems for very large databases.
-class MmapLimiter {
- public:
-  // Up to 1000 mmaps for 64-bit binaries; none for smaller pointer sizes.
-  MmapLimiter() {
-    SetAllowed(sizeof(void*) >= 8 ? 1000 : 0);
-  }
-
-  // If another mmap slot is available, acquire it and return true.
-  // Else return false.
-  bool Acquire() {
-    if (GetAllowed() <= 0) {
-      return false;
-    }
-    MutexLock l(&mu_);
-    intptr_t x = GetAllowed();
-    if (x <= 0) {
-      return false;
-    } else {
-      SetAllowed(x - 1);
-      return true;
-    }
-  }
-
-  // Release a slot acquired by a previous call to Acquire() that returned true.
- void Release() { - MutexLock l(&mu_); - SetAllowed(GetAllowed() + 1); - } - - private: - port::Mutex mu_; - port::AtomicPointer allowed_; - - intptr_t GetAllowed() const { - return reinterpret_cast(allowed_.Acquire_Load()); - } - - // REQUIRES: mu_ must be held - void SetAllowed(intptr_t v) { - allowed_.Release_Store(reinterpret_cast(v)); - } - - MmapLimiter(const MmapLimiter&); - void operator=(const MmapLimiter&); -}; - // mmap() based random-access class PosixMmapReadableFile: public RandomAccessFile { private: std::string filename_; void* mmapped_region_; size_t length_; - MmapLimiter* limiter_; + Limiter* limiter_; public: // base[0,length-1] contains the mmapped contents of the file. PosixMmapReadableFile(const std::string& fname, void* base, size_t length, - MmapLimiter* limiter) + Limiter* limiter) : filename_(fname), mmapped_region_(base), length_(length), limiter_(limiter) { } @@ -231,7 +266,7 @@ class PosixWritableFile : public WritableFile { if (fd < 0) { s = IOError(dir, errno); } else { - if (fsync(fd) < 0) { + if (fsync(fd) < 0 && errno != EINVAL) { s = IOError(dir, errno); } close(fd); @@ -333,7 +368,7 @@ class PosixEnv : public Env { mmap_limit_.Release(); } } else { - *result = new PosixRandomAccessFile(fname, fd); + *result = new PosixRandomAccessFile(fname, fd, &fd_limit_); } return s; } @@ -533,10 +568,42 @@ class PosixEnv : public Env { BGQueue queue_; PosixLockTable locks_; - MmapLimiter mmap_limit_; + Limiter mmap_limit_; + Limiter fd_limit_; }; -PosixEnv::PosixEnv() : started_bgthread_(false) { +// Return the maximum number of concurrent mmaps. +static int MaxMmaps() { + if (mmap_limit >= 0) { + return mmap_limit; + } + // Up to 1000 mmaps for 64-bit binaries; none for smaller pointer sizes. + mmap_limit = sizeof(void*) >= 8 ? 1000 : 0; + return mmap_limit; +} + +// Return the maximum number of read-only files to keep open. +static intptr_t MaxOpenFiles() { + if (open_read_only_file_limit >= 0) { + return open_read_only_file_limit; + } + struct rlimit rlim; + if (getrlimit(RLIMIT_NOFILE, &rlim)) { + // getrlimit failed, fallback to hard-coded default. + open_read_only_file_limit = 50; + } else if (rlim.rlim_cur == RLIM_INFINITY) { + open_read_only_file_limit = std::numeric_limits::max(); + } else { + // Allow use of 20% of available file descriptors for read-only files. + open_read_only_file_limit = rlim.rlim_cur / 5; + } + return open_read_only_file_limit; +} + +PosixEnv::PosixEnv() + : started_bgthread_(false), + mmap_limit_(MaxMmaps()), + fd_limit_(MaxOpenFiles()) { PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL)); PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL)); } @@ -611,6 +678,16 @@ static pthread_once_t once = PTHREAD_ONCE_INIT; static Env* default_env; static void InitDefaultEnv() { default_env = new PosixEnv; } +void EnvPosixTestHelper::SetReadOnlyFDLimit(int limit) { + assert(default_env == NULL); + open_read_only_file_limit = limit; +} + +void EnvPosixTestHelper::SetReadOnlyMMapLimit(int limit) { + assert(default_env == NULL); + mmap_limit = limit; +} + Env* Env::Default() { pthread_once(&once, InitDefaultEnv); return default_env; diff --git a/util/env_posix_test.cc b/util/env_posix_test.cc new file mode 100644 index 000000000..295f8ae44 --- /dev/null +++ b/util/env_posix_test.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
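The Limiter/open-on-read scheme above degrades gracefully: a PosixRandomAccessFile that cannot get a slot simply re-opens the file on every Read(), trading extra syscalls for bounded descriptor usage, and the env_posix_test.cc file that follows drives exactly that fallback by opening one file more times than the combined fd and mmap budgets allow. As a standalone illustration of the acquire-or-fall-back contract — a sketch only; SlotGate is a stand-in name, not code from this patch, and it uses std::atomic directly where the patch uses port::AtomicPointer plus a mutex:

    #include <atomic>
    #include <cstdio>

    // Counting gate: hand out a slot (Acquire) or tell the caller to fall
    // back to a slower strategy. Never blocks, mirroring Limiter above.
    class SlotGate {
     public:
      explicit SlotGate(long n) : allowed_(n) {}
      bool Acquire() {
        long x = allowed_.load(std::memory_order_acquire);
        while (x > 0) {
          // On failure, x is reloaded and the while condition re-checked.
          if (allowed_.compare_exchange_weak(x, x - 1)) return true;
        }
        return false;
      }
      void Release() { allowed_.fetch_add(1, std::memory_order_release); }
     private:
      std::atomic<long> allowed_;
    };

    int main() {
      SlotGate fds(2);  // pretend budget of two cached descriptors
      for (int i = 0; i < 3; i++) {
        if (fds.Acquire()) {
          std::printf("file %d keeps its descriptor open\n", i);
        } else {
          std::printf("file %d falls back to open-on-every-read\n", i);
        }
      }
      return 0;
    }

Note that exhaustion selects a slower strategy instead of queueing, which keeps Read() latency bounded even with thousands of table files.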
+ +#include "leveldb/env.h" + +#include "port/port.h" +#include "util/testharness.h" +#include "util/env_posix_test_helper.h" + +namespace leveldb { + +static const int kDelayMicros = 100000; +static const int kReadOnlyFileLimit = 4; +static const int kMMapLimit = 4; + +class EnvPosixTest { + public: + Env* env_; + EnvPosixTest() : env_(Env::Default()) { } + + static void SetFileLimits(int read_only_file_limit, int mmap_limit) { + EnvPosixTestHelper::SetReadOnlyFDLimit(read_only_file_limit); + EnvPosixTestHelper::SetReadOnlyMMapLimit(mmap_limit); + } +}; + +TEST(EnvPosixTest, TestOpenOnRead) { + // Write some test data to a single file that will be opened |n| times. + std::string test_dir; + ASSERT_OK(env_->GetTestDirectory(&test_dir)); + std::string test_file = test_dir + "/open_on_read.txt"; + + FILE* f = fopen(test_file.c_str(), "w"); + ASSERT_TRUE(f != NULL); + const char kFileData[] = "abcdefghijklmnopqrstuvwxyz"; + fputs(kFileData, f); + fclose(f); + + // Open test file some number above the sum of the two limits to force + // open-on-read behavior of POSIX Env leveldb::RandomAccessFile. + const int kNumFiles = kReadOnlyFileLimit + kMMapLimit + 5; + leveldb::RandomAccessFile* files[kNumFiles] = {0}; + for (int i = 0; i < kNumFiles; i++) { + ASSERT_OK(env_->NewRandomAccessFile(test_file, &files[i])); + } + char scratch; + Slice read_result; + for (int i = 0; i < kNumFiles; i++) { + ASSERT_OK(files[i]->Read(i, 1, &read_result, &scratch)); + ASSERT_EQ(kFileData[i], read_result[0]); + } + for (int i = 0; i < kNumFiles; i++) { + delete files[i]; + } + ASSERT_OK(env_->DeleteFile(test_file)); +} + +} // namespace leveldb + +int main(int argc, char** argv) { + // All tests currently run with the same read-only file limits. + leveldb::EnvPosixTest::SetFileLimits(leveldb::kReadOnlyFileLimit, + leveldb::kMMapLimit); + return leveldb::test::RunAllTests(); +} diff --git a/util/env_posix_test_helper.h b/util/env_posix_test_helper.h new file mode 100644 index 000000000..038696059 --- /dev/null +++ b/util/env_posix_test_helper.h @@ -0,0 +1,28 @@ +// Copyright 2017 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef STORAGE_LEVELDB_UTIL_ENV_POSIX_TEST_HELPER_H_ +#define STORAGE_LEVELDB_UTIL_ENV_POSIX_TEST_HELPER_H_ + +namespace leveldb { + +class EnvPosixTest; + +// A helper for the POSIX Env to facilitate testing. +class EnvPosixTestHelper { + private: + friend class EnvPosixTest; + + // Set the maximum number of read-only files that will be opened. + // Must be called before creating an Env. + static void SetReadOnlyFDLimit(int limit); + + // Set the maximum number of read-only files that will be mapped via mmap. + // Must be called before creating an Env. 
+ static void SetReadOnlyMMapLimit(int limit); +}; + +} // namespace leveldb + +#endif // STORAGE_LEVELDB_UTIL_ENV_POSIX_TEST_HELPER_H_ diff --git a/util/env_test.cc b/util/env_test.cc index b72cb4438..839ae56a1 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -10,29 +10,31 @@ namespace leveldb { static const int kDelayMicros = 100000; +static const int kReadOnlyFileLimit = 4; +static const int kMMapLimit = 4; -class EnvPosixTest { +class EnvTest { private: port::Mutex mu_; std::string events_; public: Env* env_; - EnvPosixTest() : env_(Env::Default()) { } + EnvTest() : env_(Env::Default()) { } }; static void SetBool(void* ptr) { reinterpret_cast(ptr)->NoBarrier_Store(ptr); } -TEST(EnvPosixTest, RunImmediately) { +TEST(EnvTest, RunImmediately) { port::AtomicPointer called (NULL); env_->Schedule(&SetBool, &called); - Env::Default()->SleepForMicroseconds(kDelayMicros); + env_->SleepForMicroseconds(kDelayMicros); ASSERT_TRUE(called.NoBarrier_Load() != NULL); } -TEST(EnvPosixTest, RunMany) { +TEST(EnvTest, RunMany) { port::AtomicPointer last_id (NULL); struct CB { @@ -59,7 +61,7 @@ TEST(EnvPosixTest, RunMany) { env_->Schedule(&CB::Run, &cb3); env_->Schedule(&CB::Run, &cb4); - Env::Default()->SleepForMicroseconds(kDelayMicros); + env_->SleepForMicroseconds(kDelayMicros); void* cur = last_id.Acquire_Load(); ASSERT_EQ(4, reinterpret_cast(cur)); } @@ -78,7 +80,7 @@ static void ThreadBody(void* arg) { s->mu.Unlock(); } -TEST(EnvPosixTest, StartThread) { +TEST(EnvTest, StartThread) { State state; state.val = 0; state.num_running = 3; @@ -92,7 +94,7 @@ TEST(EnvPosixTest, StartThread) { if (num == 0) { break; } - Env::Default()->SleepForMicroseconds(kDelayMicros); + env_->SleepForMicroseconds(kDelayMicros); } ASSERT_EQ(state.val, 3); } diff --git a/util/env_win.cc b/util/env_win.cc index b074b7579..d32c4e676 100644 --- a/util/env_win.cc +++ b/util/env_win.cc @@ -1,7 +1,7 @@ // This file contains source that originates from: // http://code.google.com/p/leveldbwin/source/browse/trunk/win32_impl_src/env_win32.h // http://code.google.com/p/leveldbwin/source/browse/trunk/win32_impl_src/port_win32.cc -// Those files dont' have any explict license headers but the +// Those files don't have any explicit license headers but the // project (http://code.google.com/p/leveldbwin/) lists the 'New BSD License' // as the license. #if defined(LEVELDB_PLATFORM_WINDOWS) @@ -355,11 +355,13 @@ BOOL Win32SequentialFile::_Init() ToWidePath(_filename, path); _hFile = CreateFileW(path.c_str(), GENERIC_READ, - FILE_SHARE_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, - FILE_ATTRIBUTE_NORMAL, + FILE_ATTRIBUTE_NORMAL | FILE_FLAG_SEQUENTIAL_SCAN, NULL); + if (_hFile == INVALID_HANDLE_VALUE) + _hFile = NULL; return _hFile ? 
TRUE : FALSE; } @@ -403,7 +405,7 @@ BOOL Win32RandomAccessFile::_Init( LPCWSTR path ) { BOOL bRet = FALSE; if(!_hFile) - _hFile = ::CreateFileW(path,GENERIC_READ,FILE_SHARE_READ,NULL,OPEN_EXISTING, + _hFile = ::CreateFileW(path,GENERIC_READ,FILE_SHARE_READ|FILE_SHARE_WRITE,NULL,OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL | FILE_FLAG_RANDOM_ACCESS,NULL); if(!_hFile || _hFile == INVALID_HANDLE_VALUE ) _hFile = NULL; @@ -669,7 +671,7 @@ Status Win32Env::GetFileSize( const std::string& fname, uint64_t* file_size ) ToWidePath(ModifyPath(path), wpath); HANDLE file = ::CreateFileW(wpath.c_str(), - GENERIC_READ,FILE_SHARE_READ,NULL,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,NULL); + GENERIC_READ,FILE_SHARE_READ|FILE_SHARE_WRITE,NULL,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,NULL); LARGE_INTEGER li; if(::GetFileSizeEx(file,&li)){ *file_size = (uint64_t)li.QuadPart; diff --git a/util/options.cc b/util/options.cc index 8b618fb1a..b5e622761 100644 --- a/util/options.cc +++ b/util/options.cc @@ -21,6 +21,7 @@ Options::Options() block_cache(NULL), block_size(4096), block_restart_interval(16), + max_file_size(2<<20), compression(kSnappyCompression), reuse_logs(false), filter_policy(NULL) { From f261701265e140e21b2ed8c7b2787fb1f2fbfbef Mon Sep 17 00:00:00 2001 From: Cory Fields Date: Thu, 13 Apr 2017 13:09:19 -0400 Subject: [PATCH 08/19] leveldb: enable runtime-detected crc32 instructions --- configure.ac | 8 ++++++++ src/Makefile.am | 1 + src/Makefile.bench.include | 1 + src/Makefile.gtest.include | 1 + src/Makefile.leveldb.include | 11 +++++++++++ src/Makefile.test.include | 2 +- 6 files changed, 23 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 54f1729a7..88648c154 100644 --- a/configure.ac +++ b/configure.ac @@ -292,6 +292,12 @@ if test "x$CXXFLAGS_overridden" = "xno"; then AX_CHECK_COMPILE_FLAG([-Wself-assign],[CXXFLAGS="$CXXFLAGS -Wno-self-assign"],,[[$CXXFLAG_WERROR]]) AX_CHECK_COMPILE_FLAG([-Wunused-local-typedef],[CXXFLAGS="$CXXFLAGS -Wno-unused-local-typedef"],,[[$CXXFLAG_WERROR]]) AX_CHECK_COMPILE_FLAG([-Wdeprecated-register],[CXXFLAGS="$CXXFLAGS -Wno-deprecated-register"],,[[$CXXFLAG_WERROR]]) + + # Check for optional instruction set support. Enabling these does _not_ imply that all code will + # be compiled with them, rather that specific objects/libs may use them after checking for runtime + # compatibility. 
+ AX_CHECK_COMPILE_FLAG([-msse4.2],[[enable_sse42=yes; SSE42_CXXFLAGS="-msse4.2"]],,[[$CXXFLAG_WERROR]]) + fi CPPFLAGS="$CPPFLAGS -DHAVE_BUILD_INFO -D__STDC_FORMAT_MACROS" @@ -895,6 +901,7 @@ AM_CONDITIONAL([ENABLE_BENCH],[test x$use_bench = xyes]) AM_CONDITIONAL([USE_LCOV],[test x$use_lcov = xyes]) AM_CONDITIONAL([GLIBC_BACK_COMPAT],[test x$use_glibc_compat = xyes]) AM_CONDITIONAL([HARDEN],[test x$use_hardening = xyes]) +AM_CONDITIONAL([ENABLE_SSE42],[test x$enable_sse42 = xyes]) AC_DEFINE(CLIENT_VERSION_MAJOR, _CLIENT_VERSION_MAJOR, [Major version]) AC_DEFINE(CLIENT_VERSION_MINOR, _CLIENT_VERSION_MINOR, [Minor version]) @@ -927,6 +934,7 @@ AC_SUBST(PIC_FLAGS) AC_SUBST(PIE_FLAGS) AC_SUBST(SANITIZER_CXXFLAGS) AC_SUBST(SANITIZER_LDFLAGS) +AC_SUBST(SSE42_CXXFLAGS) AC_SUBST(LIBTOOL_APP_LDFLAGS) AC_SUBST(BOOST_LIBS) AC_SUBST(TESTDEFS) diff --git a/src/Makefile.am b/src/Makefile.am index d2b5633f2..05b2a53eb 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -473,6 +473,7 @@ zcashd_LDADD = \ $(LIBZCASH) \ $(LIBRUSTZCASH) \ $(LIBLEVELDB) \ + $(LIBLEVELDB_SSE42) \ $(LIBMEMENV) \ $(LIBSECP256K1) diff --git a/src/Makefile.bench.include b/src/Makefile.bench.include index 9a4f3ccd5..44d0c073f 100644 --- a/src/Makefile.bench.include +++ b/src/Makefile.bench.include @@ -30,6 +30,7 @@ bench_bench_bitcoin_LDADD = \ $(LIBBITCOIN_UTIL) \ $(LIBBITCOIN_CRYPTO) \ $(LIBLEVELDB) \ + $(LIBLEVELDB_SSE42) \ $(LIBMEMENV) \ $(LIBSECP256K1) \ $(LIBZCASH) \ diff --git a/src/Makefile.gtest.include b/src/Makefile.gtest.include index 3a36ca598..347209290 100644 --- a/src/Makefile.gtest.include +++ b/src/Makefile.gtest.include @@ -74,6 +74,7 @@ zcash_gtest_LDADD = \ $(LIBBITCOIN_CRYPTO) \ $(LIBUNIVALUE) \ $(LIBLEVELDB) \ + $(LIBLEVELDB_SSE42) \ $(LIBMEMENV) \ $(LIBSECP256K1) diff --git a/src/Makefile.leveldb.include b/src/Makefile.leveldb.include index 358f39cbe..b61088ef3 100644 --- a/src/Makefile.leveldb.include +++ b/src/Makefile.leveldb.include @@ -4,12 +4,15 @@ LIBLEVELDB_INT = leveldb/libleveldb.a LIBMEMENV_INT = leveldb/libmemenv.a +LIBLEVELDB_SSE42_INT = leveldb/libleveldb_sse42.a EXTRA_LIBRARIES += $(LIBLEVELDB_INT) EXTRA_LIBRARIES += $(LIBMEMENV_INT) +EXTRA_LIBRARIES += $(LIBLEVELDB_SSE42_INT) LIBLEVELDB += $(LIBLEVELDB_INT) LIBMEMENV += $(LIBMEMENV_INT) +LIBLEVELDB_SSE42 = $(LIBLEVELDB_SSE42_INT) LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/include LEVELDB_CPPFLAGS += -I$(srcdir)/leveldb/helpers/memenv @@ -135,3 +138,11 @@ leveldb_libmemenv_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS) leveldb_libmemenv_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS) leveldb_libmemenv_a_SOURCES = leveldb/helpers/memenv/memenv.cc leveldb_libmemenv_a_SOURCES += leveldb/helpers/memenv/memenv.h + +leveldb_libleveldb_sse42_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS) +leveldb_libleveldb_sse42_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS) +if ENABLE_SSE42 +leveldb_libleveldb_sse42_a_CPPFLAGS += -DLEVELDB_PLATFORM_POSIX_SSE +leveldb_libleveldb_sse42_a_CXXFLAGS += $(SSE42_CXXFLAGS) +endif +leveldb_libleveldb_sse42_a_SOURCES = leveldb/port/port_posix_sse.cc diff --git a/src/Makefile.test.include b/src/Makefile.test.include index 0cd0326a8..df67a6399 100644 --- a/src/Makefile.test.include +++ b/src/Makefile.test.include @@ -127,7 +127,7 @@ if ENABLE_WALLET test_test_bitcoin_LDADD += $(LIBBITCOIN_WALLET) endif test_test_bitcoin_LDADD += $(LIBBITCOIN_SERVER) $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) \ - $(LIBLEVELDB) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) 
$(EVENT_LIBS) $(EVENT_PTHREADS_LIBS) + $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS) test_test_bitcoin_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS) test_test_bitcoin_LDADD += $(LIBZCASH_CONSENSUS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(LIBZCASH) $(LIBRUSTZCASH) $(LIBZCASH_LIBS) From f204f575aa4c7e082a8f18ddd1d565816e4551c3 Mon Sep 17 00:00:00 2001 From: MarcoFalke Date: Fri, 9 Jun 2017 13:20:42 -0700 Subject: [PATCH 09/19] Add extra LevelDB source to Makefile --- src/Makefile.leveldb.include | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Makefile.leveldb.include b/src/Makefile.leveldb.include index b61088ef3..ac38141f4 100644 --- a/src/Makefile.leveldb.include +++ b/src/Makefile.leveldb.include @@ -77,6 +77,7 @@ leveldb_libleveldb_a_SOURCES += leveldb/table/merger.h leveldb_libleveldb_a_SOURCES += leveldb/table/format.h leveldb_libleveldb_a_SOURCES += leveldb/table/iterator_wrapper.h leveldb_libleveldb_a_SOURCES += leveldb/util/crc32c.h +leveldb_libleveldb_a_SOURCES += leveldb/util/env_posix_test_helper.h leveldb_libleveldb_a_SOURCES += leveldb/util/arena.h leveldb_libleveldb_a_SOURCES += leveldb/util/random.h leveldb_libleveldb_a_SOURCES += leveldb/util/posix_logger.h From b892fe13f8b6d32588304f164129aafb2720beb9 Mon Sep 17 00:00:00 2001 From: Dimitris Tsapakidis Date: Tue, 20 Jun 2017 01:57:31 +0300 Subject: [PATCH 10/19] Fixed multiple typos A few "a->an" and "an->a". "Shows, if the supplied default SOCKS5 proxy" -> "Shows if the supplied default SOCKS5 proxy". Change made on 3 occurrences. "without fully understanding the ramification of a command" -> "without fully understanding the ramifications of a command". Removed duplicate words such as "the the". Zcash: Only the changes to files and code that we have. 
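To make the two-library arrangement from the previous patches concrete: only leveldb/port/port_posix_sse.cc is compiled with $(SSE42_CXXFLAGS), and the rest of the tree reaches it solely through AcceleratedCRC32C(), which is gated on a CPUID feature test, so SSE4.2 instructions never execute on CPUs that lack them. A minimal sketch of that gate — illustrative only, the names below are not from the patch:

    #include <cstdint>
    #include <cstdio>
    #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
    #include <cpuid.h>
    #endif

    // True when the CPU advertises SSE4.2 (CPUID leaf 1, ECX bit 20),
    // mirroring the HaveSSE42() check inside the sse42 object.
    static bool CpuHasSse42() {
    #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
      unsigned int eax, ebx, ecx, edx;
      if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return false;
      return (ecx & (1u << 20)) != 0;
    #else
      return false;
    #endif
    }

    int main() {
      std::printf("crc32c path: %s\n",
                  CpuHasSse42() ? "hardware (sse4.2 object)"
                                : "portable table-driven");
      return 0;
    }

Keeping the -msse4.2 flag confined to one static library is what makes this dispatch safe: the compiler may only emit SSE4.2 instructions inside code that is provably behind the runtime check.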
--- qa/rpc-tests/rest.py | 22 +++++++++++----------- qa/rpc-tests/smartfees.py | 2 +- src/httpserver.h | 4 ++-- src/leveldb/db/version_set.h | 2 +- src/pow.cpp | 2 +- src/rpc/net.cpp | 16 ++++++++-------- src/test/arith_uint256_tests.cpp | 2 +- src/torcontrol.cpp | 2 +- src/utilstrencodings.cpp | 4 ++-- src/wallet/wallet.cpp | 2 +- 10 files changed, 29 insertions(+), 29 deletions(-) diff --git a/qa/rpc-tests/rest.py b/qa/rpc-tests/rest.py index 79dbe0921..acc5d943f 100755 --- a/qa/rpc-tests/rest.py +++ b/qa/rpc-tests/rest.py @@ -94,9 +94,9 @@ class RESTTest (BitcoinTestFramework): n = vout['n'] - ###################################### - # GETUTXOS: query a unspent outpoint # - ###################################### + ####################################### + # GETUTXOS: query an unspent outpoint # + ####################################### json_request = '/checkmempool/'+txid+'-'+str(n) json_string = http_get_call(url.hostname, url.port, '/rest/getutxos'+json_request+self.FORMAT_SEPARATOR+'json') json_obj = json.loads(json_string) @@ -109,9 +109,9 @@ class RESTTest (BitcoinTestFramework): assert_equal(json_obj['utxos'][0]['value'], 0.1) - ################################################ - # GETUTXOS: now query a already spent outpoint # - ################################################ + ################################################# + # GETUTXOS: now query an already spent outpoint # + ################################################# json_request = '/checkmempool/'+vintx+'-0' json_string = http_get_call(url.hostname, url.port, '/rest/getutxos'+json_request+self.FORMAT_SEPARATOR+'json') json_obj = json.loads(json_string) @@ -173,24 +173,24 @@ class RESTTest (BitcoinTestFramework): json_request = '/'+txid+'-'+str(n) json_string = http_get_call(url.hostname, url.port, '/rest/getutxos'+json_request+self.FORMAT_SEPARATOR+'json') json_obj = json.loads(json_string) - assert_equal(len(json_obj['utxos']), 0) # there should be a outpoint because it has just added to the mempool + assert_equal(len(json_obj['utxos']), 0) # there should be an outpoint because it has just added to the mempool json_request = '/checkmempool/'+txid+'-'+str(n) json_string = http_get_call(url.hostname, url.port, '/rest/getutxos'+json_request+self.FORMAT_SEPARATOR+'json') json_obj = json.loads(json_string) - assert_equal(len(json_obj['utxos']), 1) # there should be a outpoint because it has just added to the mempool + assert_equal(len(json_obj['utxos']), 1) # there should be an outpoint because it has just added to the mempool # do some invalid requests json_request = '{"checkmempool' response = http_post_call(url.hostname, url.port, '/rest/getutxos'+self.FORMAT_SEPARATOR+'json', json_request, True) - assert_equal(response.status, 500) # must be a 500 because we send a invalid json request + assert_equal(response.status, 500) # must be a 500 because we send an invalid json request json_request = '{"checkmempool' response = http_post_call(url.hostname, url.port, '/rest/getutxos'+self.FORMAT_SEPARATOR+'bin', json_request, True) - assert_equal(response.status, 500) # must be a 500 because we send a invalid bin request + assert_equal(response.status, 500) # must be a 500 because we send an invalid bin request response = http_post_call(url.hostname, url.port, '/rest/getutxos/checkmempool'+self.FORMAT_SEPARATOR+'bin', '', True) - assert_equal(response.status, 500) # must be a 500 because we send a invalid bin request + assert_equal(response.status, 500) # must be a 500 because we send an invalid bin request # test limits 
json_request = '/checkmempool/' diff --git a/qa/rpc-tests/smartfees.py b/qa/rpc-tests/smartfees.py index 18ac087be..8caab8808 100755 --- a/qa/rpc-tests/smartfees.py +++ b/qa/rpc-tests/smartfees.py @@ -15,7 +15,7 @@ import random from decimal import Decimal, ROUND_DOWN # Construct 2 trivial P2SH's and the ScriptSigs that spend them -# So we can create many many transactions without needing to spend +# So we can create many transactions without needing to spend # time signing. P2SH_1 = "2MySexEGVzZpRgNQ1JdjdP5bRETznm3roQ2" # P2SH of "OP_1 OP_DROP" P2SH_2 = "2NBdpwq8Aoo1EEKEXPNrKvr5xQr3M9UfcZA" # P2SH of "OP_2 OP_DROP" diff --git a/src/httpserver.h b/src/httpserver.h index 4a775f757..0a7c9e43e 100644 --- a/src/httpserver.h +++ b/src/httpserver.h @@ -86,7 +86,7 @@ public: /** * Get the request header specified by hdr, or an empty string. - * Return an pair (isPresent,string). + * Return a pair (isPresent,string). */ virtual std::pair GetHeader(const std::string& hdr); @@ -125,7 +125,7 @@ public: virtual ~HTTPClosure() {} }; -/** Event class. This can be used either as an cross-thread trigger or as a timer. +/** Event class. This can be used either as a cross-thread trigger or as a timer. */ class HTTPEvent { diff --git a/src/leveldb/db/version_set.h b/src/leveldb/db/version_set.h index c4e7ac360..7935a965a 100644 --- a/src/leveldb/db/version_set.h +++ b/src/leveldb/db/version_set.h @@ -376,7 +376,7 @@ class Compaction { // Each compaction reads inputs from "level_" and "level_+1" std::vector inputs_[2]; // The two sets of inputs - // State used to check for number of of overlapping grandparent files + // State used to check for number of overlapping grandparent files // (parent == level_ + 1, grandparent == level_ + 2) std::vector grandparents_; size_t grandparent_index_; // Index in grandparent_starts_ diff --git a/src/pow.cpp b/src/pow.cpp index 5e4afaa22..e8ebcd304 100644 --- a/src/pow.cpp +++ b/src/pow.cpp @@ -138,7 +138,7 @@ arith_uint256 GetBlockProof(const CBlockIndex& block) if (fNegative || fOverflow || bnTarget == 0) return 0; // We need to compute 2**256 / (bnTarget+1), but we can't represent 2**256 - // as it's too large for a arith_uint256. However, as 2**256 is at least as large + // as it's too large for an arith_uint256. However, as 2**256 is at least as large // as bnTarget+1, it is equal to ((2**256 - bnTarget - 1) / (bnTarget+1)) + 1, // or ~bnTarget / (bnTarget+1) + 1. return (~bnTarget / (bnTarget + 1)) + 1; diff --git a/src/rpc/net.cpp b/src/rpc/net.cpp index 9f8b4cf72..279f04cae 100644 --- a/src/rpc/net.cpp +++ b/src/rpc/net.cpp @@ -84,7 +84,7 @@ UniValue getpeerinfo(const UniValue& params, bool fHelp) "[\n" " {\n" " \"id\": n, (numeric) Peer index\n" - " \"addr\":\"host:port\", (string) The ip address and port of the peer\n" + " \"addr\":\"host:port\", (string) The IP address and port of the peer\n" " \"addrlocal\":\"ip:port\", (string) local address\n" " \"services\":\"xxxxxxxxxxxxxxxx\", (string) The services offered\n" " \"lastsend\": ttt, (numeric) The time in seconds since epoch (Jan 1 1970 GMT) of the last send\n" @@ -173,7 +173,7 @@ UniValue addnode(const UniValue& params, bool fHelp) (strCommand != "onetry" && strCommand != "add" && strCommand != "remove")) throw runtime_error( "addnode \"node\" \"add|remove|onetry\"\n" - "\nAttempts add or remove a node from the addnode list.\n" + "\nAttempts to add or remove a node from the addnode list.\n" "Or try a connection to a node once.\n" "\nArguments:\n" "1. 
\"node\" (string, required) The node (see getpeerinfo for nodes)\n" @@ -251,7 +251,7 @@ UniValue getaddednodeinfo(const UniValue& params, bool fHelp) "\nResult:\n" "[\n" " {\n" - " \"addednode\" : \"192.168.0.201\", (string) The node ip address\n" + " \"addednode\" : \"192.168.0.201\", (string) The node IP address\n" " \"connected\" : true|false, (boolean) If connected\n" " \"addresses\" : [\n" " {\n" @@ -501,12 +501,12 @@ UniValue setban(const UniValue& params, bool fHelp) (strCommand != "add" && strCommand != "remove")) throw runtime_error( "setban \"ip(/netmask)\" \"add|remove\" (bantime) (absolute)\n" - "\nAttempts add or remove a IP/Subnet from the banned list.\n" + "\nAttempts to add or remove an IP/Subnet from the banned list.\n" "\nArguments:\n" - "1. \"ip(/netmask)\" (string, required) The IP/Subnet (see getpeerinfo for nodes ip) with a optional netmask (default is /32 = single ip)\n" - "2. \"command\" (string, required) 'add' to add a IP/Subnet to the list, 'remove' to remove a IP/Subnet from the list\n" - "3. \"bantime\" (numeric, optional) time in seconds how long (or until when if [absolute] is set) the ip is banned (0 or empty means using the default time of 24h which can also be overwritten by the -bantime startup argument)\n" - "4. \"absolute\" (boolean, optional) If set, the bantime must be a absolute timestamp in seconds since epoch (Jan 1 1970 GMT)\n" + "1. \"ip(/netmask)\" (string, required) The IP/Subnet (see getpeerinfo for nodes IP) with an optional netmask (default is /32 = single IP)\n" + "2. \"command\" (string, required) 'add' to add an IP/Subnet to the list, 'remove' to remove an IP/Subnet from the list\n" + "3. \"bantime\" (numeric, optional) time in seconds how long (or until when if [absolute] is set) the IP is banned (0 or empty means using the default time of 24h which can also be overwritten by the -bantime startup argument)\n" + "4. \"absolute\" (boolean, optional) If set, the bantime must be an absolute timestamp in seconds since epoch (Jan 1 1970 GMT)\n" "\nExamples:\n" + HelpExampleCli("setban", "\"192.168.0.6\" \"add\" 86400") + HelpExampleCli("setban", "\"192.168.0.0/24\" \"add\"") diff --git a/src/test/arith_uint256_tests.cpp b/src/test/arith_uint256_tests.cpp index ba0b6141c..a1b7e2625 100644 --- a/src/test/arith_uint256_tests.cpp +++ b/src/test/arith_uint256_tests.cpp @@ -219,7 +219,7 @@ BOOST_AUTO_TEST_CASE( unaryOperators ) // ! ~ - // Check if doing _A_ _OP_ _B_ results in the same as applying _OP_ onto each -// element of Aarray and Barray, and then converting the result into a arith_uint256. +// element of Aarray and Barray, and then converting the result into an arith_uint256. #define CHECKBITWISEOPERATOR(_A_,_B_,_OP_) \ for (unsigned int i = 0; i < 32; ++i) { TmpArray[i] = _A_##Array[i] _OP_ _B_##Array[i]; } \ BOOST_CHECK(arith_uint256V(std::vector(TmpArray,TmpArray+32)) == (_A_##L _OP_ _B_##L)); diff --git a/src/torcontrol.cpp b/src/torcontrol.cpp index 6a4f80078..ef3cf73c0 100644 --- a/src/torcontrol.cpp +++ b/src/torcontrol.cpp @@ -406,7 +406,7 @@ static bool WriteBinaryFile(const std::string &filename, const std::string &data /****** Bitcoin specific TorController implementation ********/ /** Controller that connects to Tor control socket, authenticate, then create - * and maintain a ephemeral hidden service. + * and maintain an ephemeral hidden service. 
*/ class TorController { diff --git a/src/utilstrencodings.cpp b/src/utilstrencodings.cpp index 1abbde4ad..6528b6cd4 100644 --- a/src/utilstrencodings.cpp +++ b/src/utilstrencodings.cpp @@ -282,7 +282,7 @@ bool ParseInt32(const std::string& str, int32_t *out) errno = 0; // strtol will not set errno if valid long int n = strtol(str.c_str(), &endp, 10); if(out) *out = (int32_t)n; - // Note that strtol returns a *long int*, so even if strtol doesn't report a over/underflow + // Note that strtol returns a *long int*, so even if strtol doesn't report an over/underflow // we still have to check that the returned value is within the range of an *int32_t*. On 64-bit // platforms the size of these types may be different. return endp && *endp == 0 && !errno && @@ -298,7 +298,7 @@ bool ParseInt64(const std::string& str, int64_t *out) errno = 0; // strtoll will not set errno if valid long long int n = strtoll(str.c_str(), &endp, 10); if(out) *out = (int64_t)n; - // Note that strtoll returns a *long long int*, so even if strtol doesn't report a over/underflow + // Note that strtoll returns a *long long int*, so even if strtol doesn't report an over/underflow // we still have to check that the returned value is within the range of an *int64_t*. return endp && *endp == 0 && !errno && n >= std::numeric_limits::min() && diff --git a/src/wallet/wallet.cpp b/src/wallet/wallet.cpp index d8eb2d7cc..874c4983b 100644 --- a/src/wallet/wallet.cpp +++ b/src/wallet/wallet.cpp @@ -4769,7 +4769,7 @@ bool CWallet::InitLoadWallet(bool clearWitnessCaches) if (chainActive.Tip() && chainActive.Tip() != pindexRescan) { // We can't rescan beyond non-pruned blocks, stop and throw an error. - // This might happen if a user uses a old wallet within a pruned node, + // This might happen if a user uses an old wallet within a pruned node, // or if they ran -disablewallet for a longer time, then decided to re-enable. if (fPruneMode) { From 55b4aa19421e1cfa052237d95c18f5150166bc0a Mon Sep 17 00:00:00 2001 From: Cory Fields Date: Wed, 12 Jul 2017 12:58:20 -0400 Subject: [PATCH 11/19] build: verify that the assembler can handle crc32 functions Also, enable crc32 even if -msse4.2 wasn't added by us, as long as it works. This allows custom flags (such as -march=native) to work as expected. --- configure.ac | 27 +++++++++++++++++++++++++-- src/Makefile.leveldb.include | 2 +- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index 88648c154..476a632e1 100644 --- a/configure.ac +++ b/configure.ac @@ -296,9 +296,32 @@ if test "x$CXXFLAGS_overridden" = "xno"; then # Check for optional instruction set support. Enabling these does _not_ imply that all code will # be compiled with them, rather that specific objects/libs may use them after checking for runtime # compatibility. 
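The check added below compiles — but never runs — a small probe with the candidate flags; if the crc32 intrinsics assemble, hardware CRC32C is enabled even when SSE4.2 capability came from user-supplied flags such as -march=native rather than from our own -msse4.2. Written out as a standalone file, the probe is roughly the following (a sketch; the real source text is assembled by AC_LANG_PROGRAM):

    // probe.cpp -- succeeds with: c++ -msse4.2 -c probe.cpp
    #include <stdint.h>
    #if defined(_MSC_VER)
    #include <intrin.h>
    #elif defined(__GNUC__) && defined(__SSE4_2__)
    #include <nmmintrin.h>
    #endif

    int main() {
      uint64_t l = 0;
      l = _mm_crc32_u8(l, 0);   // byte-wide crc32 instruction
      l = _mm_crc32_u32(l, 0);  // 32-bit step
      l = _mm_crc32_u64(l, 0);  // 64-bit step; only needs to assemble here
      return static_cast<int>(l);
    }

A successful compile corresponds to the AC_MSG_RESULT(yes) branch and sets enable_hwcrc32.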
-  AX_CHECK_COMPILE_FLAG([-msse4.2],[[enable_sse42=yes; SSE42_CXXFLAGS="-msse4.2"]],,[[$CXXFLAG_WERROR]])
+  AX_CHECK_COMPILE_FLAG([-msse4.2],[[SSE42_CXXFLAGS="-msse4.2"]],,[[$CXXFLAG_WERROR]])

 fi
+
+TEMP_CXXFLAGS="$CXXFLAGS"
+CXXFLAGS="$CXXFLAGS $SSE42_CXXFLAGS"
+AC_MSG_CHECKING(for assembler crc32 support)
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+  #include <stdint.h>
+  #if defined(_MSC_VER)
+  #include <intrin.h>
+  #elif defined(__GNUC__) && defined(__SSE4_2__)
+  #include <nmmintrin.h>
+  #endif
+  ]],[[
+  uint64_t l = 0;
+  l = _mm_crc32_u8(l, 0);
+  l = _mm_crc32_u32(l, 0);
+  l = _mm_crc32_u64(l, 0);
+  return l;
+  ]])],
+ [ AC_MSG_RESULT(yes); enable_hwcrc32=yes],
+ [ AC_MSG_RESULT(no)]
+)
+CXXFLAGS="$TEMP_CXXFLAGS"
+
 CPPFLAGS="$CPPFLAGS -DHAVE_BUILD_INFO -D__STDC_FORMAT_MACROS"

 AC_ARG_WITH([utils],
@@ -901,7 +924,7 @@ AM_CONDITIONAL([ENABLE_BENCH],[test x$use_bench = xyes])
 AM_CONDITIONAL([USE_LCOV],[test x$use_lcov = xyes])
 AM_CONDITIONAL([GLIBC_BACK_COMPAT],[test x$use_glibc_compat = xyes])
 AM_CONDITIONAL([HARDEN],[test x$use_hardening = xyes])
-AM_CONDITIONAL([ENABLE_SSE42],[test x$enable_sse42 = xyes])
+AM_CONDITIONAL([ENABLE_HWCRC32],[test x$enable_hwcrc32 = xyes])

 AC_DEFINE(CLIENT_VERSION_MAJOR, _CLIENT_VERSION_MAJOR, [Major version])
 AC_DEFINE(CLIENT_VERSION_MINOR, _CLIENT_VERSION_MINOR, [Minor version])
diff --git a/src/Makefile.leveldb.include b/src/Makefile.leveldb.include
index ac38141f4..833f3d2a1 100644
--- a/src/Makefile.leveldb.include
+++ b/src/Makefile.leveldb.include
@@ -142,7 +142,7 @@ leveldb_libmemenv_a_SOURCES += leveldb/helpers/memenv/memenv.h

 leveldb_libleveldb_sse42_a_CPPFLAGS = $(leveldb_libleveldb_a_CPPFLAGS)
 leveldb_libleveldb_sse42_a_CXXFLAGS = $(leveldb_libleveldb_a_CXXFLAGS)
-if ENABLE_SSE42
+if ENABLE_HWCRC32
 leveldb_libleveldb_sse42_a_CPPFLAGS += -DLEVELDB_PLATFORM_POSIX_SSE
 leveldb_libleveldb_sse42_a_CXXFLAGS += $(SSE42_CXXFLAGS)
 endif

From b3b77fb7581b437b0fd80405eef1d2da0a1eb7b9 Mon Sep 17 00:00:00 2001
From: Jack Grigg
Date: Wed, 23 Sep 2020 00:08:27 +0100
Subject: [PATCH 12/19] Squashed 'src/leveldb/' changes from
 196962ff0..c521b3ac6

c521b3ac6 Merge #11: fixup define checks. Cleans up some oopses from #5.
8b1cd3753 fixup define checks. Cleans up some oopses from #5.
6b1508d6d Merge #6: Fixes typo
fceb80542 Merge #10: Clean up compile-time warnings (gcc 7.1)
0ec2a343f Clean up compile-time warnings (gcc 7.1)
d4c268a35 Merge #5: Move helper functions out of sse4.2 object
8d4eb0847 Add HasAcceleratedCRC32C to port_win.h
77cfbfd25 crc32: move helper functions out of port_posix_sse.cc
4c1e9e016 silence compiler warnings about uninitialized variables
495316485 Merge #2: Prefer std::atomic over MemoryBarrier
2953978ef Fixes typo
f134284a1 Merge #1: Merge upstream LevelDB 1.20
ba8a445fd Prefer std::atomic over MemoryBarrier

git-subtree-dir: src/leveldb
git-subtree-split: c521b3ac654cfbe009c575eacf7e5a6e189bb5bb
---
 db/memtable.cc         |  2 +-
 db/version_set.cc      |  2 +-
 db/version_set.h       |  2 +-
 port/atomic_pointer.h  | 47 ++++++++++++++++++++++--------------------
 port/port_example.h    |  4 ++++
 port/port_posix.cc     | 14 +++++++++++++
 port/port_posix.h      |  1 +
 port/port_posix_sse.cc | 19 -----------------
 port/port_win.cc       | 11 ++++++++++
 port/port_win.h        |  1 +
 util/crc32c.cc         |  4 ++++
 util/logging.cc        |  2 +-
 12 files changed, 64 insertions(+), 45 deletions(-)

diff --git a/db/memtable.cc b/db/memtable.cc
index bfec0a7e7..287afdbdc 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -101,7 +101,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
   p += 8;
   p = EncodeVarint32(p, val_size);
   memcpy(p, value.data(), val_size);
-  assert((p + val_size) - buf == encoded_len);
+  assert(p + val_size == buf + encoded_len);
   table_.Insert(buf);
 }

diff --git a/db/version_set.cc b/db/version_set.cc
index b1256f90e..2cb6d80ed 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -20,7 +20,7 @@

 namespace leveldb {

-static int TargetFileSize(const Options* options) {
+static size_t TargetFileSize(const Options* options) {
   return options->max_file_size;
 }

diff --git a/db/version_set.h b/db/version_set.h
index c4e7ac360..7935a965a 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -376,7 +376,7 @@ class Compaction {
   // Each compaction reads inputs from "level_" and "level_+1"
   std::vector<FileMetaData*> inputs_[2];      // The two sets of inputs

-  // State used to check for number of of overlapping grandparent files
+  // State used to check for number of overlapping grandparent files
   // (parent == level_ + 1, grandparent == level_ + 2)
   std::vector<FileMetaData*> grandparents_;
   size_t grandparent_index_;  // Index in grandparent_starts_
diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h
index 1c4c7aafc..d79a02230 100644
--- a/port/atomic_pointer.h
+++ b/port/atomic_pointer.h
@@ -46,6 +46,30 @@
 namespace leveldb {
 namespace port {

+// AtomicPointer based on <atomic> if available
+#if defined(LEVELDB_ATOMIC_PRESENT)
+class AtomicPointer {
+ private:
+  std::atomic<void*> rep_;
+ public:
+  AtomicPointer() { }
+  explicit AtomicPointer(void* v) : rep_(v) { }
+  inline void* Acquire_Load() const {
return rep_.load(std::memory_order_acquire); - } - inline void Release_Store(void* v) { - rep_.store(v, std::memory_order_release); - } - inline void* NoBarrier_Load() const { - return rep_.load(std::memory_order_relaxed); - } - inline void NoBarrier_Store(void* v) { - rep_.store(v, std::memory_order_relaxed); - } -}; - // Atomic pointer based on sparc memory barriers #elif defined(__sparcv9) && defined(__GNUC__) class AtomicPointer { @@ -228,6 +230,7 @@ class AtomicPointer { #else #error Please implement AtomicPointer for this platform. +#endif #endif #undef LEVELDB_HAVE_MEMORY_BARRIER diff --git a/port/port_example.h b/port/port_example.h index 97bd669a5..5b1d027de 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -129,6 +129,10 @@ extern bool Snappy_Uncompress(const char* input_data, size_t input_length, // The concatenation of all "data[0,n-1]" fragments is the heap profile. extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg); +// Determine whether a working accelerated crc32 implementation exists +// Returns true if AcceleratedCRC32C is safe to call +bool HasAcceleratedCRC32C(); + // Extend the CRC to include the first n bytes of buf. // // Returns zero if the CRC cannot be extended using acceleration, else returns diff --git a/port/port_posix.cc b/port/port_posix.cc index 30e8007ae..ec39e9219 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -8,6 +8,10 @@ #include #include +#if (defined(__x86_64__) || defined(__i386__)) && defined(__GNUC__) +#include +#endif + namespace leveldb { namespace port { @@ -49,5 +53,15 @@ void InitOnce(OnceType* once, void (*initializer)()) { PthreadCall("once", pthread_once(once, initializer)); } +bool HasAcceleratedCRC32C() { +#if (defined(__x86_64__) || defined(__i386__)) && defined(__GNUC__) + unsigned int eax, ebx, ecx, edx; + __get_cpuid(1, &eax, &ebx, &ecx, &edx); + return (ecx & (1 << 20)) != 0; +#else + return false; +#endif +} + } // namespace port } // namespace leveldb diff --git a/port/port_posix.h b/port/port_posix.h index 7e8213b22..d85fa5d63 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -152,6 +152,7 @@ inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { return false; } +bool HasAcceleratedCRC32C(); uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size); } // namespace port diff --git a/port/port_posix_sse.cc b/port/port_posix_sse.cc index 1e519ba0b..2d49c21dd 100644 --- a/port/port_posix_sse.cc +++ b/port/port_posix_sse.cc @@ -19,7 +19,6 @@ #include #elif defined(__GNUC__) && defined(__SSE4_2__) #include -#include #endif #endif // defined(LEVELDB_PLATFORM_POSIX_SSE) @@ -48,20 +47,6 @@ static inline uint64_t LE_LOAD64(const uint8_t *p) { #endif // defined(_M_X64) || defined(__x86_64__) -static inline bool HaveSSE42() { -#if defined(_MSC_VER) - int cpu_info[4]; - __cpuid(cpu_info, 1); - return (cpu_info[2] & (1 << 20)) != 0; -#elif defined(__GNUC__) - unsigned int eax, ebx, ecx, edx; - __get_cpuid(1, &eax, &ebx, &ecx, &edx); - return (ecx & (1 << 20)) != 0; -#else - return false; -#endif -} - #endif // defined(LEVELDB_PLATFORM_POSIX_SSE) // For further improvements see Intel publication at: @@ -70,10 +55,6 @@ uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) { #if !defined(LEVELDB_PLATFORM_POSIX_SSE) return 0; #else - static bool have = HaveSSE42(); - if (!have) { - return 0; - } const uint8_t *p = reinterpret_cast(buf); const uint8_t *e = p + size; diff --git a/port/port_win.cc b/port/port_win.cc index 
1b0f060a1..1be9e8d5b 100644 --- a/port/port_win.cc +++ b/port/port_win.cc @@ -32,6 +32,7 @@ #include #include +#include namespace leveldb { namespace port { @@ -143,5 +144,15 @@ void AtomicPointer::NoBarrier_Store(void* v) { rep_ = v; } +bool HasAcceleratedCRC32C() { +#if defined(__x86_64__) || defined(__i386__) + int cpu_info[4]; + __cpuid(cpu_info, 1); + return (cpu_info[2] & (1 << 20)) != 0; +#else + return false; +#endif +} + } } diff --git a/port/port_win.h b/port/port_win.h index 500900482..e8bf46ef2 100644 --- a/port/port_win.h +++ b/port/port_win.h @@ -168,6 +168,7 @@ inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) { return false; } +bool HasAcceleratedCRC32C(); uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size); } diff --git a/util/crc32c.cc b/util/crc32c.cc index edd61cfd6..b3f40eeee 100644 --- a/util/crc32c.cc +++ b/util/crc32c.cc @@ -288,6 +288,10 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) { // Determine if the CPU running this program can accelerate the CRC32C // calculation. static bool CanAccelerateCRC32C() { + if (!port::HasAcceleratedCRC32C()) + return false; + + // Double-check that the accelerated implementation functions correctly. // port::AcceleretedCRC32C returns zero when unable to accelerate. static const char kTestCRCBuffer[] = "TestCRCBuffer"; static const char kBufSize = sizeof(kTestCRCBuffer) - 1; diff --git a/util/logging.cc b/util/logging.cc index ca6b32440..db6160c8f 100644 --- a/util/logging.cc +++ b/util/logging.cc @@ -49,7 +49,7 @@ bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { uint64_t v = 0; int digits = 0; while (!in->empty()) { - char c = (*in)[0]; + unsigned char c = (*in)[0]; if (c >= '0' && c <= '9') { ++digits; const int delta = (c - '0'); From b3f8c6e5f45c2c51013f150279bc844919d8c183 Mon Sep 17 00:00:00 2001 From: Jack Grigg Date: Wed, 23 Sep 2020 00:09:29 +0100 Subject: [PATCH 13/19] Squashed 'src/leveldb/' changes from c521b3ac6..64052c76c 64052c76c Merge #15: Add filename to corruption errors 135ed0fb4 Add filename to corruption errors git-subtree-dir: src/leveldb git-subtree-split: 64052c76c567cff3dad32d1db0ef969d97b5882f --- db/db_impl.cc | 2 +- db/leveldbutil.cc | 1 + db/log_reader.cc | 2 +- db/repair.cc | 2 +- helpers/memenv/memenv.cc | 3 +++ include/leveldb/env.h | 9 +++++++++ table/format.cc | 10 +++++----- util/env_posix.cc | 8 ++++++++ util/env_win.cc | 3 +++ 9 files changed, 32 insertions(+), 8 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index f43ad7679..3bb58e560 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -414,7 +414,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log, status.ok()) { if (record.size() < 12) { reporter.Corruption( - record.size(), Status::Corruption("log record too small")); + record.size(), Status::Corruption("log record too small", fname)); continue; } WriteBatchInternal::SetContents(&batch, record); diff --git a/db/leveldbutil.cc b/db/leveldbutil.cc index 9f4b7dd70..d06d64d64 100644 --- a/db/leveldbutil.cc +++ b/db/leveldbutil.cc @@ -19,6 +19,7 @@ class StdoutPrinter : public WritableFile { virtual Status Close() { return Status::OK(); } virtual Status Flush() { return Status::OK(); } virtual Status Sync() { return Status::OK(); } + virtual std::string GetName() const { return "[stdout]"; } }; bool HandleDumpCommand(Env* env, char** files, int num) { diff --git a/db/log_reader.cc b/db/log_reader.cc index a6d304545..8b6ad136d 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -186,7 +186,7 @@ uint64_t 
Reader::LastRecordOffset() { } void Reader::ReportCorruption(uint64_t bytes, const char* reason) { - ReportDrop(bytes, Status::Corruption(reason)); + ReportDrop(bytes, Status::Corruption(reason, file_->GetName())); } void Reader::ReportDrop(uint64_t bytes, const Status& reason) { diff --git a/db/repair.cc b/db/repair.cc index 4cd4bb047..7281e3d34 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -203,7 +203,7 @@ class Repairer { while (reader.ReadRecord(&record, &scratch)) { if (record.size() < 12) { reporter.Corruption( - record.size(), Status::Corruption("log record too small")); + record.size(), Status::Corruption("log record too small", logname)); continue; } WriteBatchInternal::SetContents(&batch, record); diff --git a/helpers/memenv/memenv.cc b/helpers/memenv/memenv.cc index 9a98884da..68c0614a5 100644 --- a/helpers/memenv/memenv.cc +++ b/helpers/memenv/memenv.cc @@ -176,6 +176,7 @@ class SequentialFileImpl : public SequentialFile { return Status::OK(); } + virtual std::string GetName() const { return "[memenv]"; } private: FileState* file_; uint64_t pos_; @@ -196,6 +197,7 @@ class RandomAccessFileImpl : public RandomAccessFile { return file_->Read(offset, n, result, scratch); } + virtual std::string GetName() const { return "[memenv]"; } private: FileState* file_; }; @@ -218,6 +220,7 @@ class WritableFileImpl : public WritableFile { virtual Status Flush() { return Status::OK(); } virtual Status Sync() { return Status::OK(); } + virtual std::string GetName() const { return "[memenv]"; } private: FileState* file_; }; diff --git a/include/leveldb/env.h b/include/leveldb/env.h index 99b6c2141..275d441ea 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -191,6 +191,9 @@ class SequentialFile { // REQUIRES: External synchronization virtual Status Skip(uint64_t n) = 0; + // Get a name for the file, only for error reporting + virtual std::string GetName() const = 0; + private: // No copying allowed SequentialFile(const SequentialFile&); @@ -215,6 +218,9 @@ class RandomAccessFile { virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const = 0; + // Get a name for the file, only for error reporting + virtual std::string GetName() const = 0; + private: // No copying allowed RandomAccessFile(const RandomAccessFile&); @@ -234,6 +240,9 @@ class WritableFile { virtual Status Flush() = 0; virtual Status Sync() = 0; + // Get a name for the file, only for error reporting + virtual std::string GetName() const = 0; + private: // No copying allowed WritableFile(const WritableFile&); diff --git a/table/format.cc b/table/format.cc index 24e4e0244..285e1c0de 100644 --- a/table/format.cc +++ b/table/format.cc @@ -82,7 +82,7 @@ Status ReadBlock(RandomAccessFile* file, } if (contents.size() != n + kBlockTrailerSize) { delete[] buf; - return Status::Corruption("truncated block read"); + return Status::Corruption("truncated block read", file->GetName()); } // Check the crc of the type and the block contents @@ -92,7 +92,7 @@ Status ReadBlock(RandomAccessFile* file, const uint32_t actual = crc32c::Value(data, n + 1); if (actual != crc) { delete[] buf; - s = Status::Corruption("block checksum mismatch"); + s = Status::Corruption("block checksum mismatch", file->GetName()); return s; } } @@ -119,13 +119,13 @@ Status ReadBlock(RandomAccessFile* file, size_t ulength = 0; if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) { delete[] buf; - return Status::Corruption("corrupted compressed block contents"); + return Status::Corruption("corrupted compressed block contents", 
+                                file->GetName());
     }
     char* ubuf = new char[ulength];
     if (!port::Snappy_Uncompress(data, n, ubuf)) {
       delete[] buf;
       delete[] ubuf;
-      return Status::Corruption("corrupted compressed block contents");
+      return Status::Corruption("corrupted compressed block contents",
+                                file->GetName());
     }
     delete[] buf;
     result->data = Slice(ubuf, ulength);
@@ -135,7 +135,7 @@ Status ReadBlock(RandomAccessFile* file,
     }
     default:
       delete[] buf;
-      return Status::Corruption("bad block type");
+      return Status::Corruption("bad block type", file->GetName());
   }

   return Status::OK();

diff --git a/util/env_posix.cc b/util/env_posix.cc
index dd852af35..4676bc224 100644
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
@@ -121,6 +121,8 @@ class PosixSequentialFile: public SequentialFile {
     }
     return Status::OK();
   }
+
+  virtual std::string GetName() const { return filename_; }
 };

 // pread() based random-access
@@ -172,6 +174,8 @@ class PosixRandomAccessFile: public RandomAccessFile {
     }
     return s;
   }
+
+  virtual std::string GetName() const { return filename_; }
 };

 // mmap() based random-access
@@ -206,6 +210,8 @@ class PosixMmapReadableFile: public RandomAccessFile {
     }
     return s;
   }
+
+  virtual std::string GetName() const { return filename_; }
 };

 class PosixWritableFile : public WritableFile {
@@ -287,6 +293,8 @@ class PosixWritableFile : public WritableFile {
     }
     return s;
   }
+
+  virtual std::string GetName() const { return filename_; }
 };

 static int LockOrUnlock(int fd, bool lock) {
diff --git a/util/env_win.cc b/util/env_win.cc
index d32c4e676..81380216b 100644
--- a/util/env_win.cc
+++ b/util/env_win.cc
@@ -78,6 +78,7 @@ public:
     virtual Status Read(size_t n, Slice* result, char* scratch);
     virtual Status Skip(uint64_t n);
     BOOL isEnable();
+    virtual std::string GetName() const { return _filename; }
 private:
     BOOL _Init();
     void _CleanUp();
@@ -94,6 +95,7 @@ public:
     virtual ~Win32RandomAccessFile();
     virtual Status Read(uint64_t offset, size_t n, Slice* result,char* scratch) const;
     BOOL isEnable();
+    virtual std::string GetName() const { return _filename; }
 private:
     BOOL _Init(LPCWSTR path);
     void _CleanUp();
@@ -114,6 +116,7 @@ public:
     virtual Status Flush();
     virtual Status Sync();
     BOOL isEnable();
+    virtual std::string GetName() const { return filename_; }
 private:
     std::string filename_;
     ::HANDLE _hFile;

From 55f90917f519aa124455bd81ab81ff67a1c7fc84 Mon Sep 17 00:00:00 2001
From: Jack Grigg
Date: Wed, 23 Sep 2020 00:10:53 +0100
Subject: [PATCH 14/19] Squashed 'src/leveldb/' changes from 64052c76c..524b7e36a

524b7e36a Merge #19: Increase maximum read-only mmap()s used from 1000 to 4096 on 64-bit systems
4874cb8d3 Increase maximum number of read-only mmap()s used from 1000 to 4096 on 64 bit systems.

git-subtree-dir: src/leveldb
git-subtree-split: 524b7e36a8e3bce6fcbcd1b5df09024283f325ba
---
 util/env_posix.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/util/env_posix.cc b/util/env_posix.cc
index 4676bc224..f77918313 100644
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
@@ -585,8 +585,8 @@ static int MaxMmaps() {
   if (mmap_limit >= 0) {
     return mmap_limit;
   }
-  // Up to 1000 mmaps for 64-bit binaries; none for smaller pointer sizes.
-  mmap_limit = sizeof(void*) >= 8 ? 1000 : 0;
+  // Up to 4096 mmaps for 64-bit binaries; none for smaller pointer sizes.
+  mmap_limit = sizeof(void*) >= 8 ? 4096 : 0;
   return mmap_limit;
 }

From dd6b34250c396d3715c098a18a42c37e988a4a9f Mon Sep 17 00:00:00 2001
From: Jack Grigg
Date: Wed, 23 Sep 2020 00:11:45 +0100
Subject: [PATCH 15/19] Squashed 'src/leveldb/' changes from 524b7e36a..f545dfabf

f545dfabf Merge #18: Use utf-8 to decode filename
f8e797a05 Use utf-8 to decode filename
2fc114812 Merge #14: Fixes to allow building with msvc.
d6eab9313 Fixes to allow building with msvc.

git-subtree-dir: src/leveldb
git-subtree-split: f545dfabff4c2e9836efed094dba99a34fbc6b88
---
 db/c.cc         |  2 ++
 port/port_win.h |  7 +++++++
 util/env_win.cc | 25 +++++++++++++------------
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/db/c.cc b/db/c.cc
index 08ff0ad90..b23e3dcc9 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -5,7 +5,9 @@
 #include "leveldb/c.h"

 #include <stdlib.h>
+#ifndef WIN32
 #include <unistd.h>
+#endif
 #include "leveldb/cache.h"
 #include "leveldb/comparator.h"
 #include "leveldb/db.h"
diff --git a/port/port_win.h b/port/port_win.h
index e8bf46ef2..989c15cd9 100644
--- a/port/port_win.h
+++ b/port/port_win.h
@@ -32,9 +32,16 @@
 #define STORAGE_LEVELDB_PORT_PORT_WIN_H_

 #ifdef _MSC_VER
+#if !(_MSC_VER >= 1900)
 #define snprintf _snprintf
+#endif
 #define close _close
 #define fread_unlocked _fread_nolock
+#ifdef _WIN64
+#define ssize_t int64_t
+#else
+#define ssize_t int32_t
+#endif
 #endif

 #include <string>
diff --git a/util/env_win.cc b/util/env_win.cc
index 81380216b..830332abe 100644
--- a/util/env_win.cc
+++ b/util/env_win.cc
@@ -203,24 +203,16 @@ public:

 void ToWidePath(const std::string& value, std::wstring& target) {
     wchar_t buffer[MAX_PATH];
-    MultiByteToWideChar(CP_ACP, 0, value.c_str(), -1, buffer, MAX_PATH);
+    MultiByteToWideChar(CP_UTF8, 0, value.c_str(), -1, buffer, MAX_PATH);
     target = buffer;
 }

 void ToNarrowPath(const std::wstring& value, std::string& target) {
     char buffer[MAX_PATH];
-    WideCharToMultiByte(CP_ACP, 0, value.c_str(), -1, buffer, MAX_PATH, NULL, NULL);
+    WideCharToMultiByte(CP_UTF8, 0, value.c_str(), -1, buffer, MAX_PATH, NULL, NULL);
     target = buffer;
 }

-std::string GetCurrentDir()
-{
-    CHAR path[MAX_PATH];
-    ::GetModuleFileNameA(::GetModuleHandleA(NULL),path,MAX_PATH);
-    *strrchr(path,'\\') = 0;
-    return std::string(path);
-}
-
 std::wstring GetCurrentDirW()
 {
     WCHAR path[MAX_PATH];
@@ -229,6 +221,13 @@ std::wstring GetCurrentDirW()
     return std::wstring(path);
 }

+std::string GetCurrentDir()
+{
+    std::string path;
+    ToNarrowPath(GetCurrentDirW(), path);
+    return path;
+}
+
 std::string& ModifyPath(std::string& path)
 {
     if(path[0] == '/' || path[0] == '\\'){
@@ -764,14 +763,16 @@ uint64_t Win32Env::NowMicros()
 static Status CreateDirInner( const std::string& dirname )
 {
     Status sRet;
-    DWORD attr = ::GetFileAttributes(dirname.c_str());
+    std::wstring dirnameW;
+    ToWidePath(dirname, dirnameW);
+    DWORD attr = ::GetFileAttributesW(dirnameW.c_str());
     if (attr == INVALID_FILE_ATTRIBUTES) { // doesn't exist:
       std::size_t slash = dirname.find_last_of("\\");
       if (slash != std::string::npos){
         sRet = CreateDirInner(dirname.substr(0, slash));
         if (!sRet.ok()) return sRet;
       }
-    BOOL result = ::CreateDirectory(dirname.c_str(), NULL);
+    BOOL result = ::CreateDirectoryW(dirnameW.c_str(), NULL);
     if (result == FALSE) {
       sRet = Status::IOError(dirname, "Could not create directory.");
       return sRet;

From 3d79da1beb2a1c931effe493f0b960f2019aa26e Mon Sep 17 00:00:00 2001
From: Jack Grigg
Date: Fri, 25 Sep 2020 15:39:08 +0100
Subject: [PATCH 16/19] build: out-of-tree fixups

Accidentally excluded from 9cc749769290af1a51f2fe98f80931e49ad11d71 when
backporting upstream commit 0cb0f2626e1553426e16a52fc6928d35824827f5.
---
 src/Makefile.am | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/Makefile.am b/src/Makefile.am
index 05b2a53eb..442fb9440 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -595,11 +595,10 @@ CLEANFILES = *.gcda *.gcno */*.gcno wallet/*/*.gcno

 DISTCLEANFILES = obj/build.h

-EXTRA_DIST = leveldb $(CTAES_DIST) rust
+EXTRA_DIST = $(CTAES_DIST) rust

 clean-local:
 	rm -f $(top_srcdir)/.cargo/config $(top_srcdir)/.cargo/.configured-for-*
-	-$(MAKE) -C leveldb clean
 	-$(MAKE) -C secp256k1 clean
 	-$(MAKE) -C univalue clean
 	rm -f leveldb/*/*.gcno leveldb/helpers/memenv/*.gcno

From f5582677a6a0ea05da52ab96569b5609f9bf8f76 Mon Sep 17 00:00:00 2001
From: Jack Grigg
Date: Fri, 25 Sep 2020 15:59:05 +0100
Subject: [PATCH 17/19] leveldb: Assert that ssize_t is the same size as size_t on Windows

Co-authored-by: Daira Hopwood
---
 src/leveldb/port/port_win.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/leveldb/port/port_win.h b/src/leveldb/port/port_win.h
index 989c15cd9..ba7b17faf 100644
--- a/src/leveldb/port/port_win.h
+++ b/src/leveldb/port/port_win.h
@@ -46,6 +46,7 @@

 #include <string>
 #include <stdint.h>
+static_assert(sizeof(ssize_t) == sizeof(size_t), "ssize_t should be the same size as size_t");
 #ifdef SNAPPY
 #include <snappy.h>
 #endif

From e2eeabe9e06e66466300677fe8c8fc25b5a5c6b2 Mon Sep 17 00:00:00 2001
From: str4d
Date: Fri, 25 Sep 2020 16:02:25 +0100
Subject: [PATCH 18/19] Update license headers

Co-authored-by: Daira Hopwood
---
 src/Makefile.bench.include   | 3 ++-
 src/Makefile.leveldb.include | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/Makefile.bench.include b/src/Makefile.bench.include
index 44d0c073f..bea170507 100644
--- a/src/Makefile.bench.include
+++ b/src/Makefile.bench.include
@@ -1,6 +1,7 @@
+# Copyright (c) 2020 The Zcash developers
 # Copyright (c) 2015-2016 The Bitcoin Core developers
 # Distributed under the MIT software license, see the accompanying
-# file COPYING or http://www.opensource.org/licenses/mit-license.php.
+# file COPYING or https://www.opensource.org/licenses/mit-license.php .

 bin_PROGRAMS += bench/bench_bitcoin
 BENCH_SRCDIR = bench
diff --git a/src/Makefile.leveldb.include b/src/Makefile.leveldb.include
index 833f3d2a1..2d9567343 100644
--- a/src/Makefile.leveldb.include
+++ b/src/Makefile.leveldb.include
@@ -1,6 +1,7 @@
-# Copyright (c) 2016 The Bitcoin Core developers
+# Copyright (c) 2020 The Zcash developers
+# Copyright (c) 2016-2017 The Bitcoin Core developers
 # Distributed under the MIT software license, see the accompanying
-# file COPYING or http://www.opensource.org/licenses/mit-license.php.
+# file COPYING or https://www.opensource.org/licenses/mit-license.php .

 LIBLEVELDB_INT = leveldb/libleveldb.a
 LIBMEMENV_INT = leveldb/libmemenv.a

From f4c78c45b56bdb95f79e1156b552fb619de87301 Mon Sep 17 00:00:00 2001
From: str4d
Date: Fri, 25 Sep 2020 16:02:52 +0100
Subject: [PATCH 19/19] leveldb: Fix typo

Co-authored-by: Dimitris Apostolou
---
 src/leveldb/db/db_bench.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/leveldb/db/db_bench.cc b/src/leveldb/db/db_bench.cc
index 3ad19a512..9ac04c416 100644
--- a/src/leveldb/db/db_bench.cc
+++ b/src/leveldb/db/db_bench.cc
@@ -88,7 +88,7 @@ static int FLAGS_write_buffer_size = 0;
 // (initialized to default value by "main")
 static int FLAGS_max_file_size = 0;

-// Approximate size of user data packed per block (before compression.
+// Approximate size of user data packed per block (before compression).
 // (initialized to default value by "main")
 static int FLAGS_block_size = 0;
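
Editor's note: the GetName() additions in the squashed leveldb changes above all follow one pattern: each concrete file class reports the name it was opened with, and the corruption paths thread that name into Status::Corruption so the damaged .ldb file can be identified. The following self-contained C++ sketch illustrates only the shape of that pattern; RandomAccessFile, NamedFile, and Corruption here are simplified stand-ins invented for illustration, not leveldb's real classes or signatures.

#include <iostream>
#include <string>
#include <utility>

// Simplified stand-in for leveldb's file abstraction: every concrete
// file can report the name it was opened with.
class RandomAccessFile {
 public:
  virtual ~RandomAccessFile() {}
  virtual std::string GetName() const = 0;
};

// Plays the role of PosixRandomAccessFile / Win32RandomAccessFile:
// stores the filename at construction and returns it from GetName().
class NamedFile : public RandomAccessFile {
 public:
  explicit NamedFile(std::string name) : filename_(std::move(name)) {}
  std::string GetName() const override { return filename_; }

 private:
  std::string filename_;
};

// Stand-in for Status::Corruption(msg, context): the second argument
// carries the filename into the error message.
std::string Corruption(const std::string& msg, const std::string& context) {
  return "Corruption: " + msg + ": " + context;
}

int main() {
  NamedFile file("000123.ldb");
  // Mirrors the patched call sites, e.g.
  // Status::Corruption("bad block type", file->GetName()).
  std::cout << Corruption("bad block type", file.GetName()) << std::endl;
  return 0;
}

Compiled and run, this prints "Corruption: bad block type: 000123.ldb", which is the shape of message the patched ReadBlock in table/format.cc now produces instead of a bare "bad block type".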