From a96185e8b69725985e48926b7b28747ddbbfe9b6 Mon Sep 17 00:00:00 2001
From: Ben Johnson <benbjohnson@yahoo.com>
Date: Mon, 2 Jun 2014 15:26:58 -0600
Subject: [PATCH 1/2] Allow split nodes to be merged with the next node.

This commit changes the node.split() functionality to check if the next node has
available space and, if so, it will merge the newly split keys into the next node.

Previously, keys could be continually put into the left side of a split causing that
first half to split off small right side nodes. This was especially problematic with
databases with a high fill percent.
---
 bucket_test.go | 78 ++++++++++++++++++++++++++++++++---------
 freelist.go    |  5 +++
 node.go        | 95 +++++++++++++++++++++++++++++---------------------
 3 files changed, 121 insertions(+), 57 deletions(-)

diff --git a/bucket_test.go b/bucket_test.go
index 9ed63ee..803e043 100644
--- a/bucket_test.go
+++ b/bucket_test.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"errors"
 	"fmt"
+	"math/rand"
 	"os"
 	"strconv"
 	"strings"
@@ -560,18 +561,19 @@ func TestBucket_Put_KeyTooLarge(t *testing.T) {
 // Ensure a bucket can calculate stats.
 func TestBucket_Stats(t *testing.T) {
 	withOpenDB(func(db *DB, path string) {
+		// Add bucket with fewer keys but one big value.
 		big_key := []byte("really-big-value")
+		for i := 0; i < 500; i++ {
+			db.Update(func(tx *Tx) error {
+				b, _ := tx.CreateBucketIfNotExists([]byte("woojits"))
+				return b.Put([]byte(fmt.Sprintf("%03d", i)), []byte(strconv.Itoa(i)))
+			})
+		}
 		db.Update(func(tx *Tx) error {
-			// Add bucket with fewer keys but one big value.
-			b, err := tx.CreateBucket([]byte("woojits"))
-			assert.NoError(t, err)
-			for i := 0; i < 500; i++ {
-				b.Put([]byte(fmt.Sprintf("%03d", i)), []byte(strconv.Itoa(i)))
-			}
-			b.Put(big_key, []byte(strings.Repeat("*", 10000)))
-
-			return nil
+			b, _ := tx.CreateBucketIfNotExists([]byte("woojits"))
+			return b.Put(big_key, []byte(strings.Repeat("*", 10000)))
 		})
+
 		mustCheck(db)
 		db.View(func(tx *Tx) error {
 			b := tx.Bucket([]byte("woojits"))
@@ -608,6 +610,48 @@ func TestBucket_Stats(t *testing.T) {
 	})
 }
 
+// Ensure a bucket with random insertion utilizes fill percentage correctly.
+func TestBucket_Stats_RandomFill(t *testing.T) {
+	if os.Getpagesize() != 4096 {
+		t.Skip("invalid page size for test")
+	}
+
+	withOpenDB(func(db *DB, path string) {
+		db.FillPercent = 0.9
+
+		// Add a set of values in random order. It will be the same random
+		// order so we can maintain consistency between test runs.
+		r := rand.New(rand.NewSource(42))
+		for _, i := range r.Perm(1000) {
+			db.Update(func(tx *Tx) error {
+				b, _ := tx.CreateBucketIfNotExists([]byte("woojits"))
+				for _, j := range r.Perm(100) {
+					index := (j * 10000) + i
+					b.Put([]byte(fmt.Sprintf("%d000000000000000", index)), []byte("0000000000"))
+				}
+				return nil
+			})
+			mustCheck(db)
+		}
+
+		db.View(func(tx *Tx) error {
+			s := tx.Bucket([]byte("woojits")).Stats()
+			assert.Equal(t, 100000, s.KeyN, "KeyN")
+
+			assert.Equal(t, 22, s.BranchPageN, "BranchPageN")
+			assert.Equal(t, 0, s.BranchOverflowN, "BranchOverflowN")
+			assert.Equal(t, 62963, s.BranchInuse, "BranchInuse")
+			assert.Equal(t, 90112, s.BranchAlloc, "BranchAlloc")
+
+			assert.Equal(t, 1677, s.LeafPageN, "LeafPageN")
+			assert.Equal(t, 0, s.LeafOverflowN, "LeafOverflowN")
+			assert.Equal(t, 4714722, s.LeafInuse, "LeafInuse")
+			assert.Equal(t, 6868992, s.LeafAlloc, "LeafAlloc")
+			return nil
+		})
+	})
+}
+
 // Ensure a bucket can calculate stats.
 func TestBucket_Stats_Small(t *testing.T) {
 
@@ -750,11 +794,11 @@ func TestBucket_Stats_Large(t *testing.T) {
 
 	withOpenDB(func(db *DB, path string) {
 		var index int
-		for i := 0; i < 1000; i++ {
+		for i := 0; i < 10000; i++ {
 			db.Update(func(tx *Tx) error {
 				// Add bucket with lots of keys.
 				b, _ := tx.CreateBucketIfNotExists([]byte("widgets"))
-				for i := 0; i < 100; i++ {
+				for i := 0; i < 10; i++ {
 					b.Put([]byte(strconv.Itoa(index)), []byte(strconv.Itoa(index)))
 					index++
 				}
@@ -766,18 +810,18 @@ func TestBucket_Stats_Large(t *testing.T) {
 		db.View(func(tx *Tx) error {
 			b := tx.Bucket([]byte("widgets"))
 			stats := b.Stats()
-			assert.Equal(t, 19, stats.BranchPageN, "BranchPageN")
+			assert.Equal(t, 13, stats.BranchPageN, "BranchPageN")
 			assert.Equal(t, 0, stats.BranchOverflowN, "BranchOverflowN")
-			assert.Equal(t, 1291, stats.LeafPageN, "LeafPageN")
+			assert.Equal(t, 1195, stats.LeafPageN, "LeafPageN")
 			assert.Equal(t, 0, stats.LeafOverflowN, "LeafOverflowN")
 			assert.Equal(t, 100000, stats.KeyN, "KeyN")
 			assert.Equal(t, 3, stats.Depth, "Depth")
-			assert.Equal(t, 27007, stats.BranchInuse, "BranchInuse")
-			assert.Equal(t, 2598436, stats.LeafInuse, "LeafInuse")
+			assert.Equal(t, 25208, stats.BranchInuse, "BranchInuse")
+			assert.Equal(t, 2596900, stats.LeafInuse, "LeafInuse")
 			if os.Getpagesize() == 4096 {
 				// Incompatible page size
-				assert.Equal(t, 77824, stats.BranchAlloc, "BranchAlloc")
-				assert.Equal(t, 5287936, stats.LeafAlloc, "LeafAlloc")
+				assert.Equal(t, 53248, stats.BranchAlloc, "BranchAlloc")
+				assert.Equal(t, 4894720, stats.LeafAlloc, "LeafAlloc")
 			}
 			assert.Equal(t, 1, stats.BucketN, "BucketN")
 			assert.Equal(t, 0, stats.InlineBucketN, "InlineBucketN")
diff --git a/freelist.go b/freelist.go
index a236079..2fa4a94 100644
--- a/freelist.go
+++ b/freelist.go
@@ -70,12 +70,14 @@ func (f *freelist) allocate(n int) pgid {
 // free releases a page and its overflow for a given transaction id.
 // If the page is already free then a panic will occur.
 func (f *freelist) free(txid txid, p *page) {
+	warn("free:", txid, p.id, p.overflow)
 	_assert(p.id > 1, "cannot free page 0 or 1: %d", p.id)
 
 	// Verify that page is not already free.
 	minid, maxid := p.id, p.id+pgid(p.overflow)
 	for _, id := range f.ids {
 		if id >= minid && id <= maxid {
+			warn("  ‡", id, "|", minid, maxid)
 			panic(fmt.Sprintf("page %d already freed in tx", id))
 		}
 	}
@@ -90,6 +92,9 @@ func (f *freelist) free(txid txid, p *page) {
 	// Free page and all its overflow pages.
 	var ids = f.pending[txid]
 	for i := 0; i < int(p.overflow+1); i++ {
+		if p.id+pgid(i) == 55 {
+			warn("  •", txid, p.id+pgid(i))
+		}
 		ids = append(ids, p.id+pgid(i))
 	}
 	f.pending[txid] = ids
diff --git a/node.go b/node.go
index 1502be0..57a50e9 100644
--- a/node.go
+++ b/node.go
@@ -205,15 +205,14 @@ func (n *node) write(p *page) {
 	// DEBUG ONLY: n.dump()
 }
 
-// split breaks up a node into smaller nodes, if appropriate.
+// split breaks up a node into two smaller nodes, if appropriate.
 // This should only be called from the spill() function.
 func (n *node) split(pageSize int) []*node {
-	var nodes = []*node{n}
-
 	// Ignore the split if the page doesn't have at least enough nodes for
-	// multiple pages or if the data can fit on a single page.
-	if len(n.inodes) <= (minKeysPerPage*2) || n.size() < pageSize {
-		return nodes
+	// two pages or if the data can fit on a single page.
+	sz := n.size()
+	if len(n.inodes) <= (minKeysPerPage*2) || sz < pageSize {
+		return []*node{n}
 	}
 
 	// Determine the threshold before starting a new node.
@@ -225,43 +224,59 @@ func (n *node) split(pageSize int) []*node {
 	}
 	threshold := int(float64(pageSize) * fillPercent)
 
-	// Group into smaller pages and target a given fill size.
-	size := pageHeaderSize
-	internalNodes := n.inodes
-	current := n
-	current.inodes = nil
+	// Determine split position and sizes of the two pages.
+	splitIndex, sz0 := n.splitIndex(threshold)
+	sz1 := pageHeaderSize + (sz - sz0)
 
-	// Loop over every inode and split once we reach our threshold.
-	for i, inode := range internalNodes {
-		elemSize := n.pageElementSize() + len(inode.key) + len(inode.value)
-
-		// Split once we reach our threshold split size. However, this should
-		// only be done if we have enough keys for this node and we will have
-		// enough keys for the next node.
-		if len(current.inodes) >= minKeysPerPage && i < len(internalNodes)-minKeysPerPage && size+elemSize > threshold {
-			// If there's no parent then we need to create one.
-			if n.parent == nil {
-				n.parent = &node{bucket: n.bucket, children: []*node{n}}
-			}
-
-			// Create a new node and add it to the parent.
-			current = &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
-			n.parent.children = append(n.parent.children, current)
-			nodes = append(nodes, current)
-
-			// Reset our running total back to zero (plus header size).
-			size = pageHeaderSize
-
-			// Update the statistics.
-			n.bucket.tx.stats.Split++
-		}
-
-		// Increase our running total of the size and append the inode.
-		size += elemSize
-		current.inodes = append(current.inodes, inode)
+	// If we can fit our extra keys on the next page then merge into it.
+	if next := n.nextSibling(); next != nil && next.size()+sz1 < threshold {
+		next.inodes = append(n.inodes[splitIndex:], next.inodes...)
+		return []*node{n}
 	}
 
-	return nodes
+	// Otherwise split node into two separate nodes. If there's no parent then
+	// we'll need to create one.
+	if n.parent == nil {
+		n.parent = &node{bucket: n.bucket, children: []*node{n}}
+	}
+
+	// Create a new node and add it to the parent.
+	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
+	n.parent.children = append(n.parent.children, next)
+
+	// Split inodes across two nodes.
+	next.inodes = n.inodes[splitIndex:]
+	n.inodes = n.inodes[:splitIndex]
+
+	// Update the statistics.
+	n.bucket.tx.stats.Split++
+
+	return []*node{n, next}
+}
+
+// splitIndex finds the position where a page will fill a given threshold.
+// It returns the index as well as the size of the first page.
+// This is only be called from split().
+func (n *node) splitIndex(threshold int) (index, sz int) {
+	sz = pageHeaderSize
+
+	// Loop until we only have the minimum number of keys required for the second page.
+	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
+		index = i
+		inode := n.inodes[i]
+		elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
+
+		// If we have at least the minimum number of keys and adding another
+		// node would put us over the threshold then exit and return.
+		if i >= minKeysPerPage && sz+elsize > threshold {
+			break
+		}
+
+		// Add the element size to the total size.
+		sz += elsize
+	}
+
+	return
 }
 
 // spill writes the nodes to dirty pages and splits nodes as it goes.

From 54cad40a785deddda2d4b903f048fdfb43a94927 Mon Sep 17 00:00:00 2001
From: Ben Johnson <benbjohnson@yahoo.com>
Date: Tue, 3 Jun 2014 13:21:28 -0600
Subject: [PATCH 2/2] Fix merge-split spill issues.

---
 bucket.go      |  1 +
 bucket_test.go | 25 +++++++++++++++----------
 db_test.go     | 34 ++++++++++++++++++++++++++++++++++
 freelist.go    |  5 -----
 node.go        | 34 ++++++++++++++++++++++++----------
 5 files changed, 74 insertions(+), 25 deletions(-)

diff --git a/bucket.go b/bucket.go
index 2338a8b..e66d41c 100644
--- a/bucket.go
+++ b/bucket.go
@@ -613,6 +613,7 @@ func (b *Bucket) rebalance() {
 // node creates a node from a page and associates it with a given parent.
 func (b *Bucket) node(pgid pgid, parent *node) *node {
 	_assert(b.nodes != nil, "nodes map expected")
+
 	// Retrieve node if it's already been created.
 	if n := b.nodes[pgid]; n != nil {
 		return n
diff --git a/bucket_test.go b/bucket_test.go
index 803e043..bae3941 100644
--- a/bucket_test.go
+++ b/bucket_test.go
@@ -580,17 +580,17 @@ func TestBucket_Stats(t *testing.T) {
 			stats := b.Stats()
 			assert.Equal(t, 1, stats.BranchPageN, "BranchPageN")
 			assert.Equal(t, 0, stats.BranchOverflowN, "BranchOverflowN")
-			assert.Equal(t, 6, stats.LeafPageN, "LeafPageN")
+			assert.Equal(t, 7, stats.LeafPageN, "LeafPageN")
 			assert.Equal(t, 2, stats.LeafOverflowN, "LeafOverflowN")
 			assert.Equal(t, 501, stats.KeyN, "KeyN")
 			assert.Equal(t, 2, stats.Depth, "Depth")
 
 			branchInuse := pageHeaderSize            // branch page header
-			branchInuse += 6 * branchPageElementSize // branch elements
-			branchInuse += 6 * 3                     // branch keys (6 3-byte keys)
+			branchInuse += 7 * branchPageElementSize // branch elements
+			branchInuse += 7 * 3                     // branch keys (6 3-byte keys)
 			assert.Equal(t, branchInuse, stats.BranchInuse, "BranchInuse")
 
-			leafInuse := 6 * pageHeaderSize          // leaf page header
+			leafInuse := 7 * pageHeaderSize          // leaf page header
 			leafInuse += 501 * leafPageElementSize   // leaf elements
 			leafInuse += 500*3 + len(big_key)        // leaf keys
 			leafInuse += 1*10 + 2*90 + 3*400 + 10000 // leaf values
@@ -599,7 +599,7 @@ func TestBucket_Stats(t *testing.T) {
 			if os.Getpagesize() == 4096 {
 				// Incompatible page size
 				assert.Equal(t, 4096, stats.BranchAlloc, "BranchAlloc")
-				assert.Equal(t, 32768, stats.LeafAlloc, "LeafAlloc")
+				assert.Equal(t, 36864, stats.LeafAlloc, "LeafAlloc")
 			}
 
 			assert.Equal(t, 1, stats.BucketN, "BucketN")
@@ -612,6 +612,9 @@ func TestBucket_Stats(t *testing.T) {
 
 // Ensure a bucket with random insertion utilizes fill percentage correctly.
 func TestBucket_Stats_RandomFill(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
 	if os.Getpagesize() != 4096 {
 		t.Skip("invalid page size for test")
 	}
@@ -621,6 +624,7 @@ func TestBucket_Stats_RandomFill(t *testing.T) {
 
 		// Add a set of values in random order. It will be the same random
 		// order so we can maintain consistency between test runs.
+		var count int
 		r := rand.New(rand.NewSource(42))
 		for _, i := range r.Perm(1000) {
 			db.Update(func(tx *Tx) error {
@@ -628,11 +632,12 @@ func TestBucket_Stats_RandomFill(t *testing.T) {
 				for _, j := range r.Perm(100) {
 					index := (j * 10000) + i
 					b.Put([]byte(fmt.Sprintf("%d000000000000000", index)), []byte("0000000000"))
+					count++
 				}
 				return nil
 			})
-			mustCheck(db)
 		}
+		mustCheck(db)
 
 		db.View(func(tx *Tx) error {
 			s := tx.Bucket([]byte("woojits")).Stats()
@@ -640,13 +645,13 @@ func TestBucket_Stats_RandomFill(t *testing.T) {
 
 			assert.Equal(t, 22, s.BranchPageN, "BranchPageN")
 			assert.Equal(t, 0, s.BranchOverflowN, "BranchOverflowN")
-			assert.Equal(t, 62963, s.BranchInuse, "BranchInuse")
+			assert.Equal(t, 61708, s.BranchInuse, "BranchInuse")
 			assert.Equal(t, 90112, s.BranchAlloc, "BranchAlloc")
 
-			assert.Equal(t, 1677, s.LeafPageN, "LeafPageN")
+			assert.Equal(t, 1643, s.LeafPageN, "LeafPageN")
 			assert.Equal(t, 0, s.LeafOverflowN, "LeafOverflowN")
-			assert.Equal(t, 4714722, s.LeafInuse, "LeafInuse")
-			assert.Equal(t, 6868992, s.LeafAlloc, "LeafAlloc")
+			assert.Equal(t, 4714178, s.LeafInuse, "LeafInuse")
+			assert.Equal(t, 6729728, s.LeafAlloc, "LeafAlloc")
 			return nil
 		})
 	})
diff --git a/db_test.go b/db_test.go
index 691c8fb..9fd4662 100644
--- a/db_test.go
+++ b/db_test.go
@@ -7,6 +7,8 @@ import (
 	"io/ioutil"
 	"os"
 	"regexp"
+	"sort"
+	"strings"
 	"testing"
 	"time"
 	"unsafe"
@@ -520,6 +522,38 @@ func mustCheck(db *DB) {
 	}
 }
 
+// mustContainKeys checks that a bucket contains a given set of keys.
+func mustContainKeys(b *Bucket, m map[string]string) {
+	found := make(map[string]string)
+	b.ForEach(func(k, _ []byte) error {
+		found[string(k)] = ""
+		return nil
+	})
+
+	// Check for keys found in bucket that shouldn't be there.
+	var keys []string
+	for k, _ := range found {
+		if _, ok := m[string(k)]; !ok {
+			keys = append(keys, k)
+		}
+	}
+	if len(keys) > 0 {
+		sort.Strings(keys)
+		panic(fmt.Sprintf("keys found(%d): %s", len(keys), strings.Join(keys, ",")))
+	}
+
+	// Check for keys not found in bucket that should be there.
+	for k, _ := range m {
+		if _, ok := found[string(k)]; !ok {
+			keys = append(keys, k)
+		}
+	}
+	if len(keys) > 0 {
+		sort.Strings(keys)
+		panic(fmt.Sprintf("keys not found(%d): %s", len(keys), strings.Join(keys, ",")))
+	}
+}
+
 func trunc(b []byte, length int) []byte {
 	if length < len(b) {
 		return b[:length]
diff --git a/freelist.go b/freelist.go
index 2fa4a94..a236079 100644
--- a/freelist.go
+++ b/freelist.go
@@ -70,14 +70,12 @@ func (f *freelist) allocate(n int) pgid {
 // free releases a page and its overflow for a given transaction id.
 // If the page is already free then a panic will occur.
 func (f *freelist) free(txid txid, p *page) {
-	warn("free:", txid, p.id, p.overflow)
 	_assert(p.id > 1, "cannot free page 0 or 1: %d", p.id)
 
 	// Verify that page is not already free.
 	minid, maxid := p.id, p.id+pgid(p.overflow)
 	for _, id := range f.ids {
 		if id >= minid && id <= maxid {
-			warn("  ‡", id, "|", minid, maxid)
 			panic(fmt.Sprintf("page %d already freed in tx", id))
 		}
 	}
@@ -92,9 +90,6 @@ func (f *freelist) free(txid txid, p *page) {
 	// Free page and all its overflow pages.
 	var ids = f.pending[txid]
 	for i := 0; i < int(p.overflow+1); i++ {
-		if p.id+pgid(i) == 55 {
-			warn("  •", txid, p.id+pgid(i))
-		}
 		ids = append(ids, p.id+pgid(i))
 	}
 	f.pending[txid] = ids
diff --git a/node.go b/node.go
index 57a50e9..f0978ca 100644
--- a/node.go
+++ b/node.go
@@ -14,7 +14,7 @@ type node struct {
 	key        []byte
 	pgid       pgid
 	parent     *node
-	children   []*node
+	children   nodes
 	inodes     inodes
 }
 
@@ -231,6 +231,7 @@ func (n *node) split(pageSize int) []*node {
 	// If we can fit our extra keys on the next page then merge into it.
 	if next := n.nextSibling(); next != nil && next.size()+sz1 < threshold {
 		next.inodes = append(n.inodes[splitIndex:], next.inodes...)
+		n.inodes = n.inodes[:splitIndex]
 		return []*node{n}
 	}
 
@@ -284,22 +285,29 @@ func (n *node) splitIndex(threshold int) (index, sz int) {
 func (n *node) spill() error {
 	var tx = n.bucket.tx
 
-	// Spill child nodes first.
-	for _, child := range n.children {
-		if err := child.spill(); err != nil {
+	// Spill child nodes first. Child nodes can materialize sibling nodes in
+	// the case of split-merge so we cannot use a range loop. We have to check
+	// the children size on every loop iteration.
+	sort.Sort(n.children)
+	for i := 0; i < len(n.children); i++ {
+		if err := n.children[i].spill(); err != nil {
 			return err
 		}
 	}
 
-	// Add node's page to the freelist if it's not new.
-	if n.pgid > 0 {
-		tx.db.freelist.free(tx.id(), tx.page(n.pgid))
-		n.pgid = 0
-	}
+	// We no longer need the child list because it's only used for spill tracking.
+	n.children = nil
 
-	// Spill nodes by deepest first.
+	// Spill nodes by deepest first. The first node returned from split() will
+	// always be "n".
 	var nodes = n.split(tx.db.pageSize)
 	for _, node := range nodes {
+		// Add node's page to the freelist if it's not new.
+		if node.pgid > 0 {
+			tx.db.freelist.free(tx.id(), tx.page(node.pgid))
+			node.pgid = 0
+		}
+
 		// Allocate contiguous space for the node.
 		p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
 		if err != nil {
@@ -565,6 +573,12 @@ func (n *node) dump() {
 }
 */
 
+type nodes []*node
+
+func (s nodes) Len() int           { return len(s) }
+func (s nodes) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
+func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
+
 // inode represents an internal node inside of a node.
 // It can be used to point to elements in a page or point
 // to an element which hasn't been added to a page yet.