diff --git a/bucket.go b/bucket.go
index 2338a8b..e66d41c 100644
--- a/bucket.go
+++ b/bucket.go
@@ -613,6 +613,7 @@ func (b *Bucket) rebalance() {
 // node creates a node from a page and associates it with a given parent.
 func (b *Bucket) node(pgid pgid, parent *node) *node {
 	_assert(b.nodes != nil, "nodes map expected")
+	// Retrieve node if it's already been created.
 	if n := b.nodes[pgid]; n != nil {
 		return n
diff --git a/bucket_test.go b/bucket_test.go
index 9ed63ee..bae3941 100644
--- a/bucket_test.go
+++ b/bucket_test.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"errors"
 	"fmt"
+	"math/rand"
 	"os"
 	"strconv"
 	"strings"
@@ -560,35 +561,36 @@ func TestBucket_Put_KeyTooLarge(t *testing.T) {
 // Ensure a bucket can calculate stats.
 func TestBucket_Stats(t *testing.T) {
 	withOpenDB(func(db *DB, path string) {
+		// Add bucket with 500 small keys and one big value.
 		big_key := []byte("really-big-value")
+		for i := 0; i < 500; i++ {
+			db.Update(func(tx *Tx) error {
+				b, _ := tx.CreateBucketIfNotExists([]byte("woojits"))
+				return b.Put([]byte(fmt.Sprintf("%03d", i)), []byte(strconv.Itoa(i)))
+			})
+		}
		db.Update(func(tx *Tx) error {
-			// Add bucket with fewer keys but one big value.
-			b, err := tx.CreateBucket([]byte("woojits"))
-			assert.NoError(t, err)
-			for i := 0; i < 500; i++ {
-				b.Put([]byte(fmt.Sprintf("%03d", i)), []byte(strconv.Itoa(i)))
-			}
-			b.Put(big_key, []byte(strings.Repeat("*", 10000)))
-
-			return nil
+			b, _ := tx.CreateBucketIfNotExists([]byte("woojits"))
+			return b.Put(big_key, []byte(strings.Repeat("*", 10000)))
 		})
+		mustCheck(db)
 		db.View(func(tx *Tx) error {
 			b := tx.Bucket([]byte("woojits"))
 			stats := b.Stats()
 			assert.Equal(t, 1, stats.BranchPageN, "BranchPageN")
 			assert.Equal(t, 0, stats.BranchOverflowN, "BranchOverflowN")
-			assert.Equal(t, 6, stats.LeafPageN, "LeafPageN")
+			assert.Equal(t, 7, stats.LeafPageN, "LeafPageN")
 			assert.Equal(t, 2, stats.LeafOverflowN, "LeafOverflowN")
 			assert.Equal(t, 501, stats.KeyN, "KeyN")
 			assert.Equal(t, 2, stats.Depth, "Depth")
 
 			branchInuse := pageHeaderSize            // branch page header
-			branchInuse += 6 * branchPageElementSize // branch elements
-			branchInuse += 6 * 3                     // branch keys (6 3-byte keys)
+			branchInuse += 7 * branchPageElementSize // branch elements
+			branchInuse += 7 * 3                     // branch keys (7 3-byte keys)
 			assert.Equal(t, branchInuse, stats.BranchInuse, "BranchInuse")
 
-			leafInuse := 6 * pageHeaderSize          // leaf page header
+			leafInuse := 7 * pageHeaderSize          // leaf page header
 			leafInuse += 501 * leafPageElementSize   // leaf elements
 			leafInuse += 500*3 + len(big_key)        // leaf keys
 			leafInuse += 1*10 + 2*90 + 3*400 + 10000 // leaf values
@@ -597,7 +599,7 @@ func TestBucket_Stats(t *testing.T) {
 
 			if os.Getpagesize() == 4096 { // Incompatible page size
 				assert.Equal(t, 4096, stats.BranchAlloc, "BranchAlloc")
-				assert.Equal(t, 32768, stats.LeafAlloc, "LeafAlloc")
+				assert.Equal(t, 36864, stats.LeafAlloc, "LeafAlloc")
 			}
 
 			assert.Equal(t, 1, stats.BucketN, "BucketN")
@@ -608,6 +610,53 @@ func TestBucket_Stats(t *testing.T) {
 	})
 }
 
+// Ensure a bucket with random insertion utilizes fill percentage correctly.
+func TestBucket_Stats_RandomFill(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+	if os.Getpagesize() != 4096 {
+		t.Skip("invalid page size for test")
+	}
+
+	withOpenDB(func(db *DB, path string) {
+		db.FillPercent = 0.9
+
+		// Add a set of values in random order. It will be the same random
+		// order so we can maintain consistency between test runs.
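+		// (The seed is fixed at 42 below, so r.Perm returns the same
+		// permutations, and therefore the same page layout, on every run.)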
+		var count int
+		r := rand.New(rand.NewSource(42))
+		for _, i := range r.Perm(1000) {
+			db.Update(func(tx *Tx) error {
+				b, _ := tx.CreateBucketIfNotExists([]byte("woojits"))
+				for _, j := range r.Perm(100) {
+					index := (j * 10000) + i
+					b.Put([]byte(fmt.Sprintf("%d000000000000000", index)), []byte("0000000000"))
+					count++
+				}
+				return nil
+			})
+		}
+		mustCheck(db)
+
+		db.View(func(tx *Tx) error {
+			s := tx.Bucket([]byte("woojits")).Stats()
+			assert.Equal(t, 100000, s.KeyN, "KeyN")
+
+			assert.Equal(t, 22, s.BranchPageN, "BranchPageN")
+			assert.Equal(t, 0, s.BranchOverflowN, "BranchOverflowN")
+			assert.Equal(t, 61708, s.BranchInuse, "BranchInuse")
+			assert.Equal(t, 90112, s.BranchAlloc, "BranchAlloc")
+
+			assert.Equal(t, 1643, s.LeafPageN, "LeafPageN")
+			assert.Equal(t, 0, s.LeafOverflowN, "LeafOverflowN")
+			assert.Equal(t, 4714178, s.LeafInuse, "LeafInuse")
+			assert.Equal(t, 6729728, s.LeafAlloc, "LeafAlloc")
+			return nil
+		})
+	})
+}
+
 // Ensure a bucket can calculate stats.
 func TestBucket_Stats_Small(t *testing.T) {
@@ -750,11 +799,11 @@ func TestBucket_Stats_Large(t *testing.T) {
 	withOpenDB(func(db *DB, path string) {
 		var index int
-		for i := 0; i < 1000; i++ {
+		for i := 0; i < 10000; i++ {
 			db.Update(func(tx *Tx) error {
 				// Add bucket with lots of keys.
 				b, _ := tx.CreateBucketIfNotExists([]byte("widgets"))
-				for i := 0; i < 100; i++ {
+				for i := 0; i < 10; i++ {
 					b.Put([]byte(strconv.Itoa(index)), []byte(strconv.Itoa(index)))
 					index++
 				}
@@ -766,18 +815,18 @@ func TestBucket_Stats_Large(t *testing.T) {
 		db.View(func(tx *Tx) error {
 			b := tx.Bucket([]byte("widgets"))
 			stats := b.Stats()
-			assert.Equal(t, 19, stats.BranchPageN, "BranchPageN")
+			assert.Equal(t, 13, stats.BranchPageN, "BranchPageN")
 			assert.Equal(t, 0, stats.BranchOverflowN, "BranchOverflowN")
-			assert.Equal(t, 1291, stats.LeafPageN, "LeafPageN")
+			assert.Equal(t, 1195, stats.LeafPageN, "LeafPageN")
 			assert.Equal(t, 0, stats.LeafOverflowN, "LeafOverflowN")
 			assert.Equal(t, 100000, stats.KeyN, "KeyN")
 			assert.Equal(t, 3, stats.Depth, "Depth")
-			assert.Equal(t, 27007, stats.BranchInuse, "BranchInuse")
-			assert.Equal(t, 2598436, stats.LeafInuse, "LeafInuse")
+			assert.Equal(t, 25208, stats.BranchInuse, "BranchInuse")
+			assert.Equal(t, 2596900, stats.LeafInuse, "LeafInuse")
 			if os.Getpagesize() == 4096 { // Incompatible page size
-				assert.Equal(t, 77824, stats.BranchAlloc, "BranchAlloc")
-				assert.Equal(t, 5287936, stats.LeafAlloc, "LeafAlloc")
+				assert.Equal(t, 53248, stats.BranchAlloc, "BranchAlloc")
+				assert.Equal(t, 4894720, stats.LeafAlloc, "LeafAlloc")
 			}
 			assert.Equal(t, 1, stats.BucketN, "BucketN")
 			assert.Equal(t, 0, stats.InlineBucketN, "InlineBucketN")
diff --git a/db_test.go b/db_test.go
index 691c8fb..9fd4662 100644
--- a/db_test.go
+++ b/db_test.go
@@ -7,6 +7,8 @@ import (
 	"io/ioutil"
 	"os"
 	"regexp"
+	"sort"
+	"strings"
 	"testing"
 	"time"
 	"unsafe"
@@ -520,6 +522,38 @@ func mustCheck(db *DB) {
 	}
 }
 
+// mustContainKeys checks that a bucket contains exactly the given set of keys.
+func mustContainKeys(b *Bucket, m map[string]string) {
+	found := make(map[string]string)
+	b.ForEach(func(k, _ []byte) error {
+		found[string(k)] = ""
+		return nil
+	})
+
+	// Check for keys found in bucket that shouldn't be there.
+	var keys []string
+	for k := range found {
+		if _, ok := m[k]; !ok {
+			keys = append(keys, k)
+		}
+	}
+	if len(keys) > 0 {
+		sort.Strings(keys)
+		panic(fmt.Sprintf("keys found(%d): %s", len(keys), strings.Join(keys, ",")))
+	}
+
+	// Check for keys not found in bucket that should be there.
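+	// (keys is empty again here: if the previous check appended anything,
+	// the function already panicked above.)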
+	for k := range m {
+		if _, ok := found[k]; !ok {
+			keys = append(keys, k)
+		}
+	}
+	if len(keys) > 0 {
+		sort.Strings(keys)
+		panic(fmt.Sprintf("keys not found(%d): %s", len(keys), strings.Join(keys, ",")))
+	}
+}
+
 func trunc(b []byte, length int) []byte {
 	if length < len(b) {
 		return b[:length]
diff --git a/node.go b/node.go
index 1502be0..f0978ca 100644
--- a/node.go
+++ b/node.go
@@ -14,7 +14,7 @@ type node struct {
 	key      []byte
 	pgid     pgid
 	parent   *node
-	children []*node
+	children nodes
 	inodes   inodes
 }
 
@@ -205,15 +205,14 @@ func (n *node) write(p *page) {
 	// DEBUG ONLY: n.dump()
 }
 
-// split breaks up a node into smaller nodes, if appropriate.
+// split breaks up a node into two smaller nodes, if appropriate.
 // This should only be called from the spill() function.
 func (n *node) split(pageSize int) []*node {
-	var nodes = []*node{n}
-
 	// Ignore the split if the page doesn't have at least enough nodes for
-	// multiple pages or if the data can fit on a single page.
-	if len(n.inodes) <= (minKeysPerPage*2) || n.size() < pageSize {
-		return nodes
+	// two pages or if the data can fit on a single page.
+	sz := n.size()
+	if len(n.inodes) <= (minKeysPerPage*2) || sz < pageSize {
+		return []*node{n}
 	}
 
 	// Determine the threshold before starting a new node.
@@ -225,43 +224,60 @@ func (n *node) split(pageSize int) []*node {
 	}
 	threshold := int(float64(pageSize) * fillPercent)
 
-	// Group into smaller pages and target a given fill size.
-	size := pageHeaderSize
-	internalNodes := n.inodes
-	current := n
-	current.inodes = nil
+	// Determine split position and sizes of the two pages.
+	splitIndex, sz0 := n.splitIndex(threshold)
+	sz1 := pageHeaderSize + (sz - sz0)
 
-	// Loop over every inode and split once we reach our threshold.
-	for i, inode := range internalNodes {
-		elemSize := n.pageElementSize() + len(inode.key) + len(inode.value)
-
-		// Split once we reach our threshold split size. However, this should
-		// only be done if we have enough keys for this node and we will have
-		// enough keys for the next node.
-		if len(current.inodes) >= minKeysPerPage && i < len(internalNodes)-minKeysPerPage && size+elemSize > threshold {
-			// If there's no parent then we need to create one.
-			if n.parent == nil {
-				n.parent = &node{bucket: n.bucket, children: []*node{n}}
-			}
-
-			// Create a new node and add it to the parent.
-			current = &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
-			n.parent.children = append(n.parent.children, current)
-			nodes = append(nodes, current)
-
-			// Reset our running total back to zero (plus header size).
-			size = pageHeaderSize
-
-			// Update the statistics.
-			n.bucket.tx.stats.Split++
-		}
-
-		// Increase our running total of the size and append the inode.
-		size += elemSize
-		current.inodes = append(current.inodes, inode)
+	// If our extra keys can fit on the next sibling then merge into it.
+	if next := n.nextSibling(); next != nil && next.size()+sz1 < threshold {
+		next.inodes = append(n.inodes[splitIndex:], next.inodes...)
+		n.inodes = n.inodes[:splitIndex]
+		return []*node{n}
 	}
 
-	return nodes
+	// Otherwise split node into two separate nodes. If there's no parent then
+	// we'll need to create one.
+	if n.parent == nil {
+		n.parent = &node{bucket: n.bucket, children: []*node{n}}
+	}
+
+	// Create a new node and add it to the parent.
+	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
+	n.parent.children = append(n.parent.children, next)
+
+	// Split inodes across two nodes.
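+	// Both halves share the original backing array: n keeps inodes[:splitIndex]
+	// and next takes the remainder.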
+	next.inodes = n.inodes[splitIndex:]
+	n.inodes = n.inodes[:splitIndex]
+
+	// Update the statistics.
+	n.bucket.tx.stats.Split++
+
+	return []*node{n, next}
+}
+
+// splitIndex finds the position where a page will fill a given threshold.
+// It returns the index as well as the size of the first page.
+// This should only be called from split().
+func (n *node) splitIndex(threshold int) (index, sz int) {
+	sz = pageHeaderSize
+
+	// Loop until we only have the minimum number of keys required for the second page.
+	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
+		index = i
+		inode := n.inodes[i]
+		elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
+
+		// If we have at least the minimum number of keys and adding another
+		// element would put us over the threshold then exit and return.
+		if i >= minKeysPerPage && sz+elsize > threshold {
+			break
+		}
+
+		// Add the element size to the total size.
+		sz += elsize
+	}
+
+	return
 }
 
 // spill writes the nodes to dirty pages and splits nodes as it goes.
@@ -269,22 +285,29 @@ func (n *node) split(pageSize int) []*node {
 func (n *node) spill() error {
 	var tx = n.bucket.tx
 
-	// Spill child nodes first.
-	for _, child := range n.children {
-		if err := child.spill(); err != nil {
+	// Spill child nodes first. Child nodes can materialize sibling nodes in
+	// the case of split-merge so we cannot use a range loop. We have to check
+	// the length of the children slice on every loop iteration.
+	sort.Sort(n.children)
+	for i := 0; i < len(n.children); i++ {
+		if err := n.children[i].spill(); err != nil {
 			return err
 		}
 	}
 
-	// Add node's page to the freelist if it's not new.
-	if n.pgid > 0 {
-		tx.db.freelist.free(tx.id(), tx.page(n.pgid))
-		n.pgid = 0
-	}
+	// We no longer need the child list because it's only used for spill tracking.
+	n.children = nil
 
-	// Spill nodes by deepest first.
+	// Spill nodes by deepest first. The first node returned from split() will
+	// always be "n".
 	var nodes = n.split(tx.db.pageSize)
 	for _, node := range nodes {
+		// Add node's page to the freelist if it's not new.
+		if node.pgid > 0 {
+			tx.db.freelist.free(tx.id(), tx.page(node.pgid))
+			node.pgid = 0
+		}
+
 		// Allocate contiguous space for the node.
 		p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
 		if err != nil {
@@ -550,6 +573,12 @@ func (n *node) dump() {
 }
 */
 
+type nodes []*node
+
+func (s nodes) Len() int           { return len(s) }
+func (s nodes) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
+func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
+
 // inode represents an internal node inside of a node.
 // It can be used to point to elements in a page or point
 // to an element which hasn't been added to a page yet.
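
A minimal, standalone sketch of the threshold walk that the new splitIndex performs. The constants and element sizes below are illustrative stand-ins rather than Bolt's real page-layout values; the helper only mirrors the shape of the loop added in node.go.

	package main

	import "fmt"

	// Illustrative stand-ins for Bolt's page constants; the real values
	// come from the page layout.
	const (
		pageHeaderSize = 16
		minKeysPerPage = 2
	)

	// splitIndex accumulates element sizes until adding one more would
	// cross the threshold, while keeping at least minKeysPerPage elements
	// on each side of the split. It returns the split index and the size
	// of the first page.
	func splitIndex(elemSizes []int, threshold int) (index, sz int) {
		sz = pageHeaderSize
		for i := 0; i < len(elemSizes)-minKeysPerPage; i++ {
			index = i
			if i >= minKeysPerPage && sz+elemSizes[i] > threshold {
				break
			}
			sz += elemSizes[i]
		}
		return
	}

	func main() {
		// Ten 100-byte elements against a 450-byte threshold: the first
		// page keeps four elements (16 + 4*100 = 416 bytes) and the
		// remaining six move to the second page.
		sizes := make([]int, 10)
		for i := range sizes {
			sizes[i] = 100
		}
		index, sz := splitIndex(sizes, 450)
		fmt.Println(index, sz) // prints: 4 416
	}

With a 0.9 fill percent on a 4096-byte page, the same walk runs with threshold = 3686, which is why the randomized-insert test above ends up with far fewer, fuller leaf pages than a naive half-split would produce.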