compactindex: add binary search query strategy
This commit is contained in:
parent
dd5fa9a000
commit
a5fada037d
|
@ -137,7 +137,7 @@ func (b *Builder) sealBucket(ctx context.Context, i int, f *os.File) error {
|
||||||
wr := bufio.NewWriter(f)
|
wr := bufio.NewWriter(f)
|
||||||
entryBuf := make([]byte, desc.HashLen+intWidth(b.FileSize)) // TODO remove hardcoded constant
|
entryBuf := make([]byte, desc.HashLen+intWidth(b.FileSize)) // TODO remove hardcoded constant
|
||||||
for _, entry := range entries {
|
for _, entry := range entries {
|
||||||
desc.storeEntry(entryBuf, entry)
|
desc.marshalEntry(entryBuf, entry)
|
||||||
if _, err := wr.Write(entryBuf[:]); err != nil {
|
if _, err := wr.Write(entryBuf[:]); err != nil {
|
||||||
return fmt.Errorf("failed to write record to index: %w", err)
|
return fmt.Errorf("failed to write record to index: %w", err)
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,6 +14,7 @@ import (
|
||||||
|
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
|
"github.com/vbauerster/mpb/v8/decor"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestBuilder(t *testing.T) {
|
func TestBuilder(t *testing.T) {
|
||||||
|
@ -181,7 +182,7 @@ func TestBuilder_Random(t *testing.T) {
|
||||||
const numKeys = uint(500000)
|
const numKeys = uint(500000)
|
||||||
const keySize = uint(16)
|
const keySize = uint(16)
|
||||||
const maxOffset = uint64(1000000)
|
const maxOffset = uint64(1000000)
|
||||||
const queries = int(1000)
|
const queries = int(10000)
|
||||||
|
|
||||||
// Create new builder session.
|
// Create new builder session.
|
||||||
builder, err := NewBuilder("", numKeys, maxOffset)
|
builder, err := NewBuilder("", numKeys, maxOffset)
|
||||||
|
@ -221,7 +222,7 @@ func TestBuilder_Random(t *testing.T) {
|
||||||
// Print some stats.
|
// Print some stats.
|
||||||
targetStat, err := targetFile.Stat()
|
targetStat, err := targetFile.Stat()
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
t.Logf("Index size: %d", targetStat.Size())
|
t.Logf("Index size: %d (% .2f)", targetStat.Size(), decor.SizeB1000(targetStat.Size()))
|
||||||
t.Logf("Bytes per entry: %f", float64(targetStat.Size())/float64(numKeys))
|
t.Logf("Bytes per entry: %f", float64(targetStat.Size())/float64(numKeys))
|
||||||
t.Logf("Indexing speed: %f/s", float64(numKeys)/time.Since(preInsert).Seconds())
|
t.Logf("Indexing speed: %f/s", float64(numKeys)/time.Since(preInsert).Seconds())
|
||||||
|
|
||||||
|
@ -237,16 +238,12 @@ func TestBuilder_Random(t *testing.T) {
|
||||||
keyN := uint64(rand.Int63n(int64(numKeys)))
|
keyN := uint64(rand.Int63n(int64(numKeys)))
|
||||||
binary.LittleEndian.PutUint64(key, keyN)
|
binary.LittleEndian.PutUint64(key, keyN)
|
||||||
|
|
||||||
bucket, err := db.FindBucket(key)
|
bucket, err := db.LookupBucket(key)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
hash := EntryHash64(bucket.HashDomain, key) & 0xFFFFFF // TODO fix mask
|
value, err := bucket.Lookup(key)
|
||||||
entries, err := bucket.Load(1024)
|
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
require.True(t, value > 0)
|
||||||
// XXX use external binary search here
|
|
||||||
entry := SearchSortedEntries(entries, hash)
|
|
||||||
require.NotNil(t, entry)
|
|
||||||
}
|
}
|
||||||
t.Logf("Queried %d items", queries)
|
t.Logf("Queried %d items", queries)
|
||||||
t.Logf("Query speed: %f/s", float64(queries)/time.Since(preQuery).Seconds())
|
t.Logf("Query speed: %f/s", float64(queries)/time.Since(preQuery).Seconds())
|
||||||
|
|
|
@ -183,13 +183,13 @@ type BucketDescriptor struct {
|
||||||
OffsetWidth uint8 // with of offset field in bucket
|
OffsetWidth uint8 // with of offset field in bucket
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b *BucketDescriptor) loadEntry(buf []byte) (e Entry) {
|
func (b *BucketDescriptor) unmarshalEntry(buf []byte) (e Entry) {
|
||||||
e.Hash = uintLe(buf[0:b.HashLen])
|
e.Hash = uintLe(buf[0:b.HashLen])
|
||||||
e.Value = uintLe(buf[b.HashLen : b.HashLen+b.OffsetWidth])
|
e.Value = uintLe(buf[b.HashLen : b.HashLen+b.OffsetWidth])
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b *BucketDescriptor) storeEntry(buf []byte, e Entry) {
|
func (b *BucketDescriptor) marshalEntry(buf []byte, e Entry) {
|
||||||
if len(buf) < int(b.Stride) {
|
if len(buf) < int(b.Stride) {
|
||||||
panic("serializeEntry: buf too small")
|
panic("serializeEntry: buf too small")
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,8 +33,19 @@ func Open(stream io.ReaderAt) (*DB, error) {
|
||||||
return db, nil
|
return db, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// FindBucket returns a handle to the bucket that might contain the given key.
|
// Lookup queries for a key in the index and returns the value (offset), if any.
|
||||||
func (db *DB) FindBucket(key []byte) (*Bucket, error) {
|
//
|
||||||
|
// Returns ErrNotFound if the key is unknown.
|
||||||
|
func (db *DB) Lookup(key []byte) (uint64, error) {
|
||||||
|
bucket, err := db.LookupBucket(key)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
return bucket.Lookup(key)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LookupBucket returns a handle to the bucket that might contain the given key.
|
||||||
|
func (db *DB) LookupBucket(key []byte) (*Bucket, error) {
|
||||||
return db.GetBucket(db.Header.BucketHash(key))
|
return db.GetBucket(db.Header.BucketHash(key))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -102,7 +113,7 @@ const targetEntriesPerBucket = 10000
|
||||||
// Load retrieves all entries in the hashtable.
|
// Load retrieves all entries in the hashtable.
|
||||||
func (b *Bucket) Load(batchSize int) ([]Entry, error) {
|
func (b *Bucket) Load(batchSize int) ([]Entry, error) {
|
||||||
if batchSize <= 0 {
|
if batchSize <= 0 {
|
||||||
batchSize = 1
|
batchSize = 512 // default to reasonable batch size
|
||||||
}
|
}
|
||||||
// TODO bounds check
|
// TODO bounds check
|
||||||
if b.NumEntries > maxEntriesPerBucket {
|
if b.NumEntries > maxEntriesPerBucket {
|
||||||
|
@ -119,7 +130,7 @@ func (b *Bucket) Load(batchSize int) ([]Entry, error) {
|
||||||
// Decode all entries in it.
|
// Decode all entries in it.
|
||||||
sub := buf[:n]
|
sub := buf[:n]
|
||||||
for len(sub) >= stride {
|
for len(sub) >= stride {
|
||||||
entries = append(entries, b.loadEntry(sub))
|
entries = append(entries, b.unmarshalEntry(sub))
|
||||||
sub = sub[stride:]
|
sub = sub[stride:]
|
||||||
off += int64(stride)
|
off += int64(stride)
|
||||||
}
|
}
|
||||||
|
@ -134,4 +145,42 @@ func (b *Bucket) Load(batchSize int) ([]Entry, error) {
|
||||||
return entries, nil
|
return entries, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: This binary search algo is not optimized for high-latency remotes yet.
|
||||||
|
|
||||||
|
// Lookup queries for a key using binary search.
|
||||||
|
func (b *Bucket) Lookup(key []byte) (uint64, error) {
|
||||||
|
return b.binarySearch(b.Hash(key))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *Bucket) binarySearch(target uint64) (uint64, error) {
|
||||||
|
low := 0
|
||||||
|
high := int(b.NumEntries)
|
||||||
|
for low <= high {
|
||||||
|
median := (low + high) / 2
|
||||||
|
entry, err := b.loadEntry(median)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
if entry.Hash == target {
|
||||||
|
return entry.Value, nil
|
||||||
|
} else if entry.Hash < target {
|
||||||
|
low = median + 1
|
||||||
|
} else {
|
||||||
|
high = median - 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, ErrNotFound
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *Bucket) loadEntry(i int) (Entry, error) {
|
||||||
|
off := int64(i) * int64(b.Stride)
|
||||||
|
buf := make([]byte, b.Stride)
|
||||||
|
n, err := b.Entries.ReadAt(buf, off)
|
||||||
|
if n != len(buf) {
|
||||||
|
return Entry{}, err
|
||||||
|
}
|
||||||
|
return b.unmarshalEntry(buf), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ErrNotFound marks a missing entry.
|
||||||
var ErrNotFound = errors.New("not found")
|
var ErrNotFound = errors.New("not found")
|
||||||
|
|
Loading…
Reference in New Issue