// Package compactindex implements an immutable hash table index format inspired by djb's constant database (cdb).
//
// # Design
//
// Compactindex is used to create secondary indexes over arbitrary flat files.
// Each index is a single, immutable flat file.
//
// Index files consist of a space-optimized and query-optimized key-value-like table.
//
// Instead of storing actual keys, the format stores FKS dynamic perfect hashes.
// And instead of storing values, the format contains offsets into some file.
//
// As a result, the database effectively only supports two operations, similarly to cdb.
// (Note that the actual Go interface is a bit more flexible).
//
//	func Create(kv map[[]byte]uint64) *Index
//	func (*Index) Lookup(key []byte) (value uint64, exist bool)
//
// # Buckets
//
// The set of items is split into buckets of approx 10000 records.
// The number of buckets is unlimited.
//
// The key-to-bucket assignment is determined by xxHash64 using uniform discrete hashing over the key space.
//
// The index file header also mentions the number of buckets and the file offset of each bucket.
//
// # Tables
//
// Each bucket contains a table of entries, indexed by a collision-free hash function.
//
// The hash function used in the entry table is xxHash.
// A 32-bit hash domain is prefixed to mine collision-free sets of hashes (FKS scheme).
// This hash domain is also recorded at the bucket header.
//
// Each bucket entry is a constant-size record consisting of a 3-byte hash and an offset to the value.
// The size of the offset integer is the minimal byte-aligned integer width that can represent the target file size.
//
// # Querying
//
// The query interface (DB) is backend-agnostic, supporting any storage medium that provides random reads.
// To name a few: Memory buffers, local files, arbitrary embedded buffers, HTTP range requests, plan9, etc...
//
// The DB struct itself performs zero memory allocations, and consequently does not cache.
// It is therefore recommended to provide an io.ReaderAt backed by a cache to improve performance.
//
// Given a key, the query strategy is simple:
//
//  1. Hash key to bucket using global hash function
//  2. Retrieve bucket offset from bucket header table
//  3. Hash key to entry using per-bucket hash function
//  4. Search for entry in bucket (binary search)
//
// The search strategy for locating entries in buckets can be adjusted to fit the latency/bandwidth profile of the underlying storage medium.
//
// For example, the fastest lookup strategy in memory is a binary search retrieving double cache lines at a time.
// When doing range requests against high-latency remote storage (e.g. S3 buckets),
// it is typically faster to retrieve and scan through large parts of a bucket (multiple kilobytes) at once.
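//
// Schematically, a lookup composes the primitives defined in this file.
// The sketch below is illustrative only; the I/O helpers named here are
// hypothetical, and the real entry point is the DB query interface:
//
//	bucket := header.BucketHash(key)              // 1. key -> bucket index
//	desc := readBucketDescriptor(r, bucket)       // 2. (hypothetical I/O helper)
//	target := desc.Hash(key)                      // 3. key -> per-bucket hash
//	entries := readEntries(r, desc)               //    (hypothetical I/O helper)
//	entry := SearchSortedEntries(entries, target) // 4. binary search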
//
// # Construction
//
// Constructing a compactindex requires upfront knowledge of the number of items and highest possible target offset (read: target file size).
//
// The process requires scratch space of around 16 bytes per entry. During generation, data is offloaded to disk for memory efficiency.
//
// The process works as follows:
//
//  1. Determine the number of buckets and the offset integer width
//     based on known input params (item count and target file size).
//  2. Linear pass over input data, populating temporary files that
//     contain the unsorted entries of each bucket.
//  3. For each bucket, brute force a perfect hash function that
//     defines a bijection between hash values and keys in the bucket.
//  4. For each bucket, sort by hash values.
//  5. Store to index.
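//
// Schematically, that flow maps onto a builder like so (identifiers here
// are illustrative, not the exact builder API):
//
//	b, _ := NewBuilder(scratchDir, numItems, targetFileSize)
//	for _, kv := range pairs {
//		b.Insert(kv.Key, kv.Value) // step 2: append to per-bucket scratch file
//	}
//	b.Seal(out) // steps 3-5: mine hash domains, sort, write the index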
//
// An alternative construction approach is available when the number of items or target file size is unknown.
// In this case, a set of keys is first serialized to a flat file.
package compactindex

import (
	"encoding/binary"
	"fmt"
	"math"
	"math/bits"
	"sort"

	"github.com/cespare/xxhash/v2"
)

// Magic are the first eight bytes of an index.
var Magic = [8]byte{'r', 'd', 'c', 'e', 'c', 'i', 'd', 'x'}

// Header occurs once at the beginning of the index.
type Header struct {
	FileSize   uint64
	NumBuckets uint32
}

// headerSize is the size of the header at the beginning of the file.
const headerSize = 32

// Load checks the Magic sequence and loads the header fields.
func (h *Header) Load(buf *[headerSize]byte) error {
	// Use a magic byte sequence to bail fast when user passes a corrupted/unrelated stream.
	if *(*[8]byte)(buf[:8]) != Magic {
		return fmt.Errorf("not a radiance compactindex file")
	}
	*h = Header{
		FileSize:   binary.LittleEndian.Uint64(buf[8:16]),
		NumBuckets: binary.LittleEndian.Uint32(buf[16:20]),
	}
	// 12 bytes to spare for now. Might use it in the future.
	// Force to zero for now.
	for _, b := range buf[20:32] {
		if b != 0x00 {
			return fmt.Errorf("unsupported index version")
		}
	}
	return nil
}
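
// Store writes the header fields and zeroes the reserved area.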
func (h *Header) Store(buf *[headerSize]byte) {
	copy(buf[0:8], Magic[:])
	binary.LittleEndian.PutUint64(buf[8:16], h.FileSize)
	binary.LittleEndian.PutUint32(buf[16:20], h.NumBuckets)
	for i := 20; i < 32; i++ {
		buf[i] = 0
	}
}

// BucketHash returns the bucket index for the given key.
//
// Uses a truncated xxHash64 rotated until the result fits.
// For example, with NumBuckets = 10 the mask is 0xF; a masked sum of
// 10..15 misses, so the sum is rotated and masked again until a value
// below 10 comes up.
func (h *Header) BucketHash(key []byte) uint {
	xsum := xxhash.Sum64(key)
	mask := maxCls64(uint64(h.NumBuckets))
	for {
		index := xsum & mask
		if index < uint64(h.NumBuckets) {
			return uint(index)
		}
		xsum = bits.RotateLeft64(xsum, bits.LeadingZeros64(mask))
	}
}

// BucketHeader occurs at the beginning of each bucket.
type BucketHeader struct {
	HashDomain uint32
	NumEntries uint32
	HashLen    uint8
	FileOffset uint64
}

// bucketHdrLen is the size of the header preceding the hash table entries.
const bucketHdrLen = 16
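
// Store serializes the bucket header; the file offset is packed into
// six little-endian bytes, with one reserved byte left zero.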
func (b *BucketHeader) Store(buf *[bucketHdrLen]byte) {
	binary.LittleEndian.PutUint32(buf[0:4], b.HashDomain)
	binary.LittleEndian.PutUint32(buf[4:8], b.NumEntries)
	buf[8] = b.HashLen
	buf[9] = 0
	putUintLe(buf[10:16], b.FileOffset)
}
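
// Load deserializes a bucket header written by Store.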
func (b *BucketHeader) Load(buf *[bucketHdrLen]byte) {
	b.HashDomain = binary.LittleEndian.Uint32(buf[0:4])
	b.NumEntries = binary.LittleEndian.Uint32(buf[4:8])
	b.HashLen = buf[8]
	b.FileOffset = uintLe(buf[10:16])
}

// Hash returns the per-bucket hash of a key.
func (b *BucketHeader) Hash(key []byte) uint64 {
	xsum := EntryHash64(b.HashDomain, key)
	// Mask the sum down to the low HashLen*8 bits; with the usual
	// HashLen of 3 this keeps 24 bits, matching the 3-byte per-entry
	// hash described in the package docs.
	return xsum & (math.MaxUint64 >> (64 - b.HashLen*8))
}

// BucketDescriptor extends a bucket header with the on-disk layout
// parameters needed to (de)serialize its entries.
type BucketDescriptor struct {
	BucketHeader
	Stride      uint8 // size of one entry in bucket
	OffsetWidth uint8 // width of offset field in bucket
}

func (b *BucketDescriptor) unmarshalEntry(buf []byte) (e Entry) {
	e.Hash = uintLe(buf[0:b.HashLen])
	e.Value = uintLe(buf[b.HashLen : b.HashLen+b.OffsetWidth])
	return
}

func (b *BucketDescriptor) marshalEntry(buf []byte, e Entry) {
	if len(buf) < int(b.Stride) {
		panic("marshalEntry: buf too small")
	}
	putUintLe(buf[0:b.HashLen], e.Hash)
	putUintLe(buf[b.HashLen:b.HashLen+b.OffsetWidth], e.Value)
}

// SearchSortedEntries performs an in-memory binary search for a given hash.
func SearchSortedEntries(entries []Entry, hash uint64) *Entry {
	i, found := sort.Find(len(entries), func(i int) int {
		other := entries[i].Hash
		// Note: This is safe because neither side exceeds 2^24.
		return int(hash) - int(other)
	})
	if !found {
		return nil
	}
	if i >= len(entries) || entries[i].Hash != hash {
		return nil
	}
	return &entries[i]
}
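
// lookupInBucket is an illustrative sketch, not part of the package API:
// given an already-decoded bucket, it runs steps 3 and 4 of the query
// strategy from the package docs. Loading the bucket descriptor and its
// entries (steps 1 and 2) is left to the caller.
func lookupInBucket(desc BucketDescriptor, entries []Entry, key []byte) (uint64, bool) {
	// Step 3: hash the key with the bucket's collision-free hash domain.
	target := desc.Hash(key)
	// Step 4: binary search the sorted entry table.
	if e := SearchSortedEntries(entries, target); e != nil {
		return e.Value, true // e.Value is an offset into the target file
	}
	return 0, false
}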

// EntryHash64 is a xxHash-based hash function using an arbitrary prefix.
func EntryHash64(prefix uint32, key []byte) uint64 {
	// Pad the 4-byte hash domain to one full 32-byte xxHash block,
	// presumably so the prefix is fully absorbed into the digest state
	// before any key bytes are written.
	const blockSize = 32
	var prefixBlock [blockSize]byte
	binary.LittleEndian.PutUint32(prefixBlock[:4], prefix)

	var digest xxhash.Digest
	digest.Reset() // a zero-value Digest must be Reset before use
	digest.Write(prefixBlock[:])
	digest.Write(key)
	return digest.Sum64()
}

// Entry is a single element in a hash table.
type Entry struct {
	Hash  uint64
	Value uint64
}

// intWidth returns the number of bytes minimally required to represent the given integer.
// For example, intWidth(0xFF) == 1, intWidth(0x100) == 2, and any offset
// below 16 MiB fits in intWidth(0xFFFFFF) == 3 bytes.
func intWidth(n uint64) uint8 {
	msb := 64 - bits.LeadingZeros64(n)
	return uint8((msb + 7) / 8)
}

// maxCls64 returns the max integer that has the same number of leading zeros as n.
func maxCls64(n uint64) uint64 {
	return math.MaxUint64 >> bits.LeadingZeros64(n)
}

// uintLe decodes an unsigned little-endian integer without bounds assertions.
// Bytes beyond the end of the buffer are treated as zero.
func uintLe(buf []byte) uint64 {
	var full [8]byte
	copy(full[:], buf)
	return binary.LittleEndian.Uint64(full[:])
}

// putUintLe encodes an unsigned little-endian integer without bounds assertions.
// It returns true if the integer fully fits in the provided buffer.
func putUintLe(buf []byte, x uint64) bool {
	var full [8]byte
	binary.LittleEndian.PutUint64(full[:], x)
	copy(buf, full[:])
	return int(intWidth(x)) <= len(buf)
}