mirror of https://github.com/etcd-io/bbolt.git
Merge pull request #181 from benbjohnson/split-merge
Allow split nodes to be merged with the next node.
commit 510143d852
bucket.go

@@ -613,6 +613,7 @@ func (b *Bucket) rebalance() {
 // node creates a node from a page and associates it with a given parent.
 func (b *Bucket) node(pgid pgid, parent *node) *node {
+	_assert(b.nodes != nil, "nodes map expected")

 	// Retrieve node if it's already been created.
 	if n := b.nodes[pgid]; n != nil {
 		return n
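The one added line asserts that the bucket's node cache exists before use. As a point of reference, here is a minimal sketch of an _assert-style helper; bolt defines its own internal version, so treat this exact signature as an assumption:

    package main

    import "fmt"

    // _assert panics with a formatted message when the invariant does not hold.
    // Sketch only; bolt's internal helper may differ in detail.
    func _assert(condition bool, msg string, v ...interface{}) {
    	if !condition {
    		panic(fmt.Sprintf("assertion failed: "+msg, v...))
    	}
    }

    func main() {
    	nodes := map[int]string{}
    	_assert(nodes != nil, "nodes map expected") // passes: the map is non-nil
    	fmt.Println("ok")
    }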
bucket_test.go

@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"errors"
 	"fmt"
+	"math/rand"
 	"os"
 	"strconv"
 	"strings"
@@ -560,35 +561,36 @@ func TestBucket_Put_KeyTooLarge(t *testing.T) {
 // Ensure a bucket can calculate stats.
 func TestBucket_Stats(t *testing.T) {
 	withOpenDB(func(db *DB, path string) {
+		// Add bucket with fewer keys but one big value.
 		big_key := []byte("really-big-value")
+		for i := 0; i < 500; i++ {
+			db.Update(func(tx *Tx) error {
+				b, _ := tx.CreateBucketIfNotExists([]byte("woojits"))
+				return b.Put([]byte(fmt.Sprintf("%03d", i)), []byte(strconv.Itoa(i)))
+			})
+		}
 		db.Update(func(tx *Tx) error {
-			// Add bucket with fewer keys but one big value.
-			b, err := tx.CreateBucket([]byte("woojits"))
-			assert.NoError(t, err)
-			for i := 0; i < 500; i++ {
-				b.Put([]byte(fmt.Sprintf("%03d", i)), []byte(strconv.Itoa(i)))
-			}
-			b.Put(big_key, []byte(strings.Repeat("*", 10000)))
-
-			return nil
+			b, _ := tx.CreateBucketIfNotExists([]byte("woojits"))
+			return b.Put(big_key, []byte(strings.Repeat("*", 10000)))
 		})
+
 		mustCheck(db)
 		db.View(func(tx *Tx) error {
 			b := tx.Bucket([]byte("woojits"))
 			stats := b.Stats()
 			assert.Equal(t, 1, stats.BranchPageN, "BranchPageN")
 			assert.Equal(t, 0, stats.BranchOverflowN, "BranchOverflowN")
-			assert.Equal(t, 6, stats.LeafPageN, "LeafPageN")
+			assert.Equal(t, 7, stats.LeafPageN, "LeafPageN")
 			assert.Equal(t, 2, stats.LeafOverflowN, "LeafOverflowN")
 			assert.Equal(t, 501, stats.KeyN, "KeyN")
 			assert.Equal(t, 2, stats.Depth, "Depth")

 			branchInuse := pageHeaderSize            // branch page header
-			branchInuse += 6 * branchPageElementSize // branch elements
-			branchInuse += 6 * 3                     // branch keys (6 3-byte keys)
+			branchInuse += 7 * branchPageElementSize // branch elements
+			branchInuse += 7 * 3                     // branch keys (7 3-byte keys)
 			assert.Equal(t, branchInuse, stats.BranchInuse, "BranchInuse")

-			leafInuse := 6 * pageHeaderSize          // leaf page header
+			leafInuse := 7 * pageHeaderSize          // leaf page header
 			leafInuse += 501 * leafPageElementSize   // leaf elements
 			leafInuse += 500*3 + len(big_key)        // leaf keys
 			leafInuse += 1*10 + 2*90 + 3*400 + 10000 // leaf values
@@ -597,7 +599,7 @@ func TestBucket_Stats(t *testing.T) {
 			if os.Getpagesize() == 4096 {
 				// Incompatible page size
 				assert.Equal(t, 4096, stats.BranchAlloc, "BranchAlloc")
-				assert.Equal(t, 32768, stats.LeafAlloc, "LeafAlloc")
+				assert.Equal(t, 36864, stats.LeafAlloc, "LeafAlloc")
 			}

 			assert.Equal(t, 1, stats.BucketN, "BucketN")
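The updated Alloc expectation is consistent with the page counts asserted above, since every page, overflow pages included, occupies one 4096-byte OS page (the size this block guards on). A quick standalone check of the arithmetic:

    package main

    import "fmt"

    func main() {
    	const pageSize = 4096
    	fmt.Println(1 * pageSize)       // 4096: one branch page -> BranchAlloc
    	fmt.Println((7 + 2) * pageSize) // 36864: 7 leaf pages + 2 overflow pages -> LeafAlloc
    }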
@@ -608,6 +610,53 @@ func TestBucket_Stats(t *testing.T) {
 	})
 }

+// Ensure a bucket with random insertion utilizes fill percentage correctly.
+func TestBucket_Stats_RandomFill(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+	if os.Getpagesize() != 4096 {
+		t.Skip("invalid page size for test")
+	}
+
+	withOpenDB(func(db *DB, path string) {
+		db.FillPercent = 0.9
+
+		// Add a set of values in random order. It will be the same random
+		// order so we can maintain consistency between test runs.
+		var count int
+		r := rand.New(rand.NewSource(42))
+		for _, i := range r.Perm(1000) {
+			db.Update(func(tx *Tx) error {
+				b, _ := tx.CreateBucketIfNotExists([]byte("woojits"))
+				for _, j := range r.Perm(100) {
+					index := (j * 10000) + i
+					b.Put([]byte(fmt.Sprintf("%d000000000000000", index)), []byte("0000000000"))
+					count++
+				}
+				return nil
+			})
+		}
+		mustCheck(db)
+
+		db.View(func(tx *Tx) error {
+			s := tx.Bucket([]byte("woojits")).Stats()
+			assert.Equal(t, 100000, s.KeyN, "KeyN")
+
+			assert.Equal(t, 22, s.BranchPageN, "BranchPageN")
+			assert.Equal(t, 0, s.BranchOverflowN, "BranchOverflowN")
+			assert.Equal(t, 61708, s.BranchInuse, "BranchInuse")
+			assert.Equal(t, 90112, s.BranchAlloc, "BranchAlloc")
+
+			assert.Equal(t, 1643, s.LeafPageN, "LeafPageN")
+			assert.Equal(t, 0, s.LeafOverflowN, "LeafOverflowN")
+			assert.Equal(t, 4714178, s.LeafInuse, "LeafInuse")
+			assert.Equal(t, 6729728, s.LeafAlloc, "LeafAlloc")
+			return nil
+		})
+	})
+}
+
 // Ensure a bucket can calculate stats.
 func TestBucket_Stats_Small(t *testing.T) {
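The new test gets reproducible "randomness" by seeding math/rand explicitly: rand.New(rand.NewSource(42)) yields the same permutations on every run, which is what lets it assert exact page counts. A minimal demonstration of the pattern:

    package main

    import (
    	"fmt"
    	"math/rand"
    )

    func main() {
    	// A fixed seed makes Perm return the same permutation on every run,
    	// so a test that depends on insertion order stays reproducible.
    	r := rand.New(rand.NewSource(42))
    	fmt.Println(r.Perm(10)) // same 10-element ordering every time
    }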
@@ -750,11 +799,11 @@ func TestBucket_Stats_Large(t *testing.T) {

 	withOpenDB(func(db *DB, path string) {
 		var index int
-		for i := 0; i < 1000; i++ {
+		for i := 0; i < 10000; i++ {
 			db.Update(func(tx *Tx) error {
 				// Add bucket with lots of keys.
 				b, _ := tx.CreateBucketIfNotExists([]byte("widgets"))
-				for i := 0; i < 100; i++ {
+				for i := 0; i < 10; i++ {
 					b.Put([]byte(strconv.Itoa(index)), []byte(strconv.Itoa(index)))
 					index++
 				}
@@ -766,18 +815,18 @@ func TestBucket_Stats_Large(t *testing.T) {
 		db.View(func(tx *Tx) error {
 			b := tx.Bucket([]byte("widgets"))
 			stats := b.Stats()
-			assert.Equal(t, 19, stats.BranchPageN, "BranchPageN")
+			assert.Equal(t, 13, stats.BranchPageN, "BranchPageN")
 			assert.Equal(t, 0, stats.BranchOverflowN, "BranchOverflowN")
-			assert.Equal(t, 1291, stats.LeafPageN, "LeafPageN")
+			assert.Equal(t, 1195, stats.LeafPageN, "LeafPageN")
 			assert.Equal(t, 0, stats.LeafOverflowN, "LeafOverflowN")
 			assert.Equal(t, 100000, stats.KeyN, "KeyN")
 			assert.Equal(t, 3, stats.Depth, "Depth")
-			assert.Equal(t, 27007, stats.BranchInuse, "BranchInuse")
-			assert.Equal(t, 2598436, stats.LeafInuse, "LeafInuse")
+			assert.Equal(t, 25208, stats.BranchInuse, "BranchInuse")
+			assert.Equal(t, 2596900, stats.LeafInuse, "LeafInuse")
 			if os.Getpagesize() == 4096 {
 				// Incompatible page size
-				assert.Equal(t, 77824, stats.BranchAlloc, "BranchAlloc")
-				assert.Equal(t, 5287936, stats.LeafAlloc, "LeafAlloc")
+				assert.Equal(t, 53248, stats.BranchAlloc, "BranchAlloc")
+				assert.Equal(t, 4894720, stats.LeafAlloc, "LeafAlloc")
 			}
 			assert.Equal(t, 1, stats.BucketN, "BucketN")
 			assert.Equal(t, 0, stats.InlineBucketN, "InlineBucketN")
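For context, the fields these assertions exercise are the same bucket stats available to any caller of the public API. A hypothetical helper in the same package as the tests that reports them (printStats is illustrative, not part of bolt):

    // Hypothetical helper: print the page-level stats the test asserts on.
    // Field names are taken from the assertions above.
    func printStats(db *DB) {
    	db.View(func(tx *Tx) error {
    		s := tx.Bucket([]byte("widgets")).Stats()
    		fmt.Printf("branch: %d pages, %d bytes in use, %d bytes allocated\n",
    			s.BranchPageN, s.BranchInuse, s.BranchAlloc)
    		fmt.Printf("leaf:   %d pages, %d bytes in use, %d bytes allocated\n",
    			s.LeafPageN, s.LeafInuse, s.LeafAlloc)
    		return nil
    	})
    }

The checks here verify that Alloc is page count times page size: 13 * 4096 = 53248 and 1195 * 4096 = 4894720.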
db_test.go

@@ -7,6 +7,8 @@ import (
 	"io/ioutil"
 	"os"
 	"regexp"
+	"sort"
+	"strings"
 	"testing"
 	"time"
 	"unsafe"
@@ -520,6 +522,38 @@ func mustCheck(db *DB) {
 	}
 }

+// mustContainKeys checks that a bucket contains a given set of keys.
+func mustContainKeys(b *Bucket, m map[string]string) {
+	found := make(map[string]string)
+	b.ForEach(func(k, _ []byte) error {
+		found[string(k)] = ""
+		return nil
+	})
+
+	// Check for keys found in bucket that shouldn't be there.
+	var keys []string
+	for k, _ := range found {
+		if _, ok := m[string(k)]; !ok {
+			keys = append(keys, k)
+		}
+	}
+	if len(keys) > 0 {
+		sort.Strings(keys)
+		panic(fmt.Sprintf("keys found(%d): %s", len(keys), strings.Join(keys, ",")))
+	}
+
+	// Check for keys not found in bucket that should be there.
+	for k, _ := range m {
+		if _, ok := found[string(k)]; !ok {
+			keys = append(keys, k)
+		}
+	}
+	if len(keys) > 0 {
+		sort.Strings(keys)
+		panic(fmt.Sprintf("keys not found(%d): %s", len(keys), strings.Join(keys, ",")))
+	}
+}
+
 func trunc(b []byte, length int) []byte {
 	if length < len(b) {
 		return b[:length]
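A hypothetical call site for the new helper, to show how it reads in a test; the bucket name and keys here are made up for illustration:

    // After writing "foo" and "bar", assert exactly those keys are present.
    // The map values are unused; only the key set matters.
    db.Update(func(tx *Tx) error {
    	b, _ := tx.CreateBucketIfNotExists([]byte("widgets"))
    	b.Put([]byte("foo"), []byte("0"))
    	b.Put([]byte("bar"), []byte("1"))
    	mustContainKeys(b, map[string]string{"foo": "", "bar": ""})
    	return nil
    })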
node.go

@@ -14,7 +14,7 @@ type node struct {
 	key      []byte
 	pgid     pgid
 	parent   *node
-	children []*node
+	children nodes
 	inodes   inodes
 }
@@ -205,15 +205,14 @@ func (n *node) write(p *page) {
 	// DEBUG ONLY: n.dump()
 }

-// split breaks up a node into smaller nodes, if appropriate.
+// split breaks up a node into two smaller nodes, if appropriate.
 // This should only be called from the spill() function.
 func (n *node) split(pageSize int) []*node {
-	var nodes = []*node{n}
-
 	// Ignore the split if the page doesn't have at least enough nodes for
-	// multiple pages or if the data can fit on a single page.
-	if len(n.inodes) <= (minKeysPerPage*2) || n.size() < pageSize {
-		return nodes
+	// two pages or if the data can fit on a single page.
+	sz := n.size()
+	if len(n.inodes) <= (minKeysPerPage*2) || sz < pageSize {
+		return []*node{n}
 	}

 	// Determine the threshold before starting a new node.
@@ -225,43 +224,60 @@ func (n *node) split(pageSize int) []*node {
 	}
 	threshold := int(float64(pageSize) * fillPercent)

-	// Group into smaller pages and target a given fill size.
-	size := pageHeaderSize
-	internalNodes := n.inodes
-	current := n
-	current.inodes = nil
+	// Determine split position and sizes of the two pages.
+	splitIndex, sz0 := n.splitIndex(threshold)
+	sz1 := pageHeaderSize + (sz - sz0)

-	// Loop over every inode and split once we reach our threshold.
-	for i, inode := range internalNodes {
-		elemSize := n.pageElementSize() + len(inode.key) + len(inode.value)
-
-		// Split once we reach our threshold split size. However, this should
-		// only be done if we have enough keys for this node and we will have
-		// enough keys for the next node.
-		if len(current.inodes) >= minKeysPerPage && i < len(internalNodes)-minKeysPerPage && size+elemSize > threshold {
-			// If there's no parent then we need to create one.
-			if n.parent == nil {
-				n.parent = &node{bucket: n.bucket, children: []*node{n}}
-			}
-
-			// Create a new node and add it to the parent.
-			current = &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
-			n.parent.children = append(n.parent.children, current)
-			nodes = append(nodes, current)
-
-			// Reset our running total back to zero (plus header size).
-			size = pageHeaderSize
-
-			// Update the statistics.
-			n.bucket.tx.stats.Split++
-		}
-
-		// Increase our running total of the size and append the inode.
-		size += elemSize
-		current.inodes = append(current.inodes, inode)
+	// If we can fit our extra keys on the next page then merge into it.
+	if next := n.nextSibling(); next != nil && next.size()+sz1 < threshold {
+		next.inodes = append(n.inodes[splitIndex:], next.inodes...)
+		n.inodes = n.inodes[:splitIndex]
+		return []*node{n}
 	}

-	return nodes
+	// Otherwise split node into two separate nodes. If there's no parent then
+	// we'll need to create one.
+	if n.parent == nil {
+		n.parent = &node{bucket: n.bucket, children: []*node{n}}
+	}
+
+	// Create a new node and add it to the parent.
+	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
+	n.parent.children = append(n.parent.children, next)
+
+	// Split inodes across two nodes.
+	next.inodes = n.inodes[splitIndex:]
+	n.inodes = n.inodes[:splitIndex]
+
+	// Update the statistics.
+	n.bucket.tx.stats.Split++
+
+	return []*node{n, next}
 }

+// splitIndex finds the position where a page will fill a given threshold.
+// It returns the index as well as the size of the first page.
+// This is only called from split().
+func (n *node) splitIndex(threshold int) (index, sz int) {
+	sz = pageHeaderSize
+
+	// Loop until we only have the minimum number of keys required for the second page.
+	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
+		index = i
+		inode := n.inodes[i]
+		elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
+
+		// If we have at least the minimum number of keys and adding another
+		// node would put us over the threshold then exit and return.
+		if i >= minKeysPerPage && sz+elsize > threshold {
+			break
+		}
+
+		// Add the element size to the total size.
+		sz += elsize
+	}
+
+	return
+}
+
 // spill writes the nodes to dirty pages and splits nodes as it goes.
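To see the accumulate-until-threshold scan in isolation, here is a small self-contained sketch of the same idea as splitIndex(), using a plain element type instead of bolt's inodes; the constants and names are illustrative, not bolt's actual values:

    package main

    import "fmt"

    const (
    	pageHeaderSize = 16 // illustrative; bolt derives this from its page struct
    	elemHeaderSize = 16 // illustrative per-element overhead
    	minKeysPerPage = 2
    )

    type elem struct{ key, value []byte }

    // splitIndex mirrors the scan above: accumulate element sizes until adding
    // one more would cross the threshold, while guaranteeing both halves keep
    // at least minKeysPerPage elements.
    func splitIndex(elems []elem, threshold int) (index, sz int) {
    	sz = pageHeaderSize
    	for i := 0; i < len(elems)-minKeysPerPage; i++ {
    		index = i
    		elsize := elemHeaderSize + len(elems[i].key) + len(elems[i].value)
    		if i >= minKeysPerPage && sz+elsize > threshold {
    			break
    		}
    		sz += elsize
    	}
    	return
    }

    func main() {
    	var elems []elem
    	for i := 0; i < 10; i++ {
    		elems = append(elems, elem{key: []byte("key"), value: make([]byte, 100)})
    	}
    	idx, sz := splitIndex(elems, 500)
    	fmt.Println(idx, sz) // index where the second page starts, size of the first
    }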
@@ -269,22 +285,29 @@ func (n *node) split(pageSize int) []*node {
 func (n *node) spill() error {
 	var tx = n.bucket.tx

-	// Spill child nodes first.
-	for _, child := range n.children {
-		if err := child.spill(); err != nil {
+	// Spill child nodes first. Child nodes can materialize sibling nodes in
+	// the case of split-merge so we cannot use a range loop. We have to check
+	// the children size on every loop iteration.
+	sort.Sort(n.children)
+	for i := 0; i < len(n.children); i++ {
+		if err := n.children[i].spill(); err != nil {
 			return err
 		}
 	}

-	// Add node's page to the freelist if it's not new.
-	if n.pgid > 0 {
-		tx.db.freelist.free(tx.id(), tx.page(n.pgid))
-		n.pgid = 0
-	}
+	// We no longer need the child list because it's only used for spill tracking.
+	n.children = nil

-	// Spill nodes by deepest first.
+	// Spill nodes by deepest first. The first node returned from split() will
+	// always be "n".
 	var nodes = n.split(tx.db.pageSize)
 	for _, node := range nodes {
+		// Add node's page to the freelist if it's not new.
+		if node.pgid > 0 {
+			tx.db.freelist.free(tx.id(), tx.page(node.pgid))
+			node.pgid = 0
+		}
+
 		// Allocate contiguous space for the node.
 		p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
 		if err != nil {
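The rewritten loop comment is the crux of the fix: a recursive spill can append newly created siblings to n.children, and a range loop would miss them because it captures the slice length once at loop entry. A standalone illustration of that Go behavior, independent of bolt:

    package main

    import "fmt"

    func main() {
    	// Range captures the slice length once, so an element appended during
    	// iteration is skipped.
    	a := []int{0, 1, 2}
    	seenRange := 0
    	for i := range a {
    		if i == 0 {
    			a = append(a, 3) // new element is NOT visited by this loop
    		}
    		seenRange++
    	}

    	// An index loop re-evaluates len(b) every iteration, so it sees growth.
    	b := []int{0, 1, 2}
    	seenIndex := 0
    	for i := 0; i < len(b); i++ {
    		if i == 0 {
    			b = append(b, 3) // new element IS visited
    		}
    		seenIndex++
    	}

    	fmt.Println(seenRange, seenIndex) // 3 4
    }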
@@ -550,6 +573,12 @@ func (n *node) dump() {
 	}
 	*/

+type nodes []*node
+
+func (s nodes) Len() int           { return len(s) }
+func (s nodes) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
+func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
+
 // inode represents an internal node inside of a node.
 // It can be used to point to elements in a page or point
 // to an element which hasn't been added to a page yet.
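The new nodes type implements sort.Interface keyed on each node's first inode key, which is what lets spill() call sort.Sort(n.children) and visit children in key order. The same pattern on plain byte slices, as a minimal runnable example:

    package main

    import (
    	"bytes"
    	"fmt"
    	"sort"
    )

    // byKey mirrors the nodes type: a slice ordered by byte-wise key comparison.
    type byKey [][]byte

    func (s byKey) Len() int           { return len(s) }
    func (s byKey) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
    func (s byKey) Less(i, j int) bool { return bytes.Compare(s[i], s[j]) == -1 }

    func main() {
    	keys := byKey{[]byte("m"), []byte("a"), []byte("z")}
    	sort.Sort(keys)
    	for _, k := range keys {
    		fmt.Println(string(k)) // a, m, z
    	}
    }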