From 55e71b090259eb775c1bb74a2c3ec23bdfba8db5 Mon Sep 17 00:00:00 2001
From: Ben Johnson
Date: Mon, 5 May 2014 10:27:02 -0600
Subject: [PATCH 1/4] Add inline bucket support.

This commit adds support for writing small buckets directly inline into
their value in their parent's leaf node. Previously, a subbucket simply
had a bucket header stored in its parent bucket which pointed to its
root page. This required every bucket to use at least one page, which
is a high overhead for buckets with only one or two small items.

The inline bucket check inspects each subbucket to see whether it holds
only a small amount of data (about 1KB) and no nested subbuckets. If
these conditions are met, the bucket's root node is written to a fake
page that sits immediately after the bucket header within the parent's
value.

Fixes #124.
---
 bucket.go      | 161 +++++++++++++++++++++++++++++++++++++++++--------
 bucket_test.go |  20 +++---
 db.go          |   6 +-
 db_test.go     |  10 +--
 freelist.go    |   5 +-
 node.go        |  34 ++++++++++-
 tx_test.go     |   2 +-
 7 files changed, 189 insertions(+), 49 deletions(-)

diff --git a/bucket.go b/bucket.go
index 5dd79dd..6672356 100644
--- a/bucket.go
+++ b/bucket.go
@@ -52,11 +52,14 @@ const (
 	minInt = -maxInt - 1
 )
 
+const bucketHeaderSize = int(unsafe.Sizeof(bucket{}))
+
 // Bucket represents a collection of key/value pairs inside the database.
 type Bucket struct {
 	*bucket
 	tx       *Tx
 	buckets  map[string]*Bucket
+	page     *page
 	rootNode *node
 	nodes    map[pgid]*node
 }
@@ -128,6 +131,11 @@ func (b *Bucket) Bucket(name []byte) *Bucket {
 	*child.bucket = *(*bucket)(unsafe.Pointer(&v[0]))
 	b.buckets[string(name)] = &child
 
+	// Save a reference to the inline page if the bucket is inline.
+	if child.root == 0 {
+		child.page = (*page)(unsafe.Pointer(&v[bucketHeaderSize]))
+	}
+
 	return &child
 }
 
@@ -155,22 +163,19 @@ func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
 		}
 	}
 
-	// Create a blank root leaf page.
-	p, err := b.tx.allocate(1)
-	if err != nil {
-		return nil, err
-	}
-	p.flags = leafPageFlag
-
-	// Insert key/value.
-	value := make([]byte, unsafe.Sizeof(bucket{}))
-	bucket := (*bucket)(unsafe.Pointer(&value[0]))
-	bucket.root = p.id
+	// Create empty, inline bucket.
+	var bucket = Bucket{bucket: &bucket{}, rootNode: &node{isLeaf: true}}
+	var value = bucket.write()
 
 	// Insert into node.
 	key = cloneBytes(key)
 	c.node().put(key, key, value, 0, bucketLeafFlag)
 
+	// De-inline the parent bucket, if necessary.
+	if b.root == 0 {
+		b.page = nil
+	}
+
 	return b.Bucket(key), nil
 }
 
@@ -224,9 +229,9 @@ func (b *Bucket) DeleteBucket(key []byte) error {
 	delete(b.buckets, string(key))
 
 	// Release all bucket pages to freelist.
-	b.tx.forEachPage(child.root, 0, func(p *page, _ int) {
-		b.tx.db.freelist.free(b.tx.id(), p)
-	})
+	child.nodes = nil
+	child.rootNode = nil
+	child.free()
 
 	// Delete the node if we have a matching key.
 	c.node().del(key)
@@ -348,7 +353,7 @@ func (b *Bucket) ForEach(fn func(k, v []byte) error) error {
 func (b *Bucket) Stats() BucketStats {
 	var s BucketStats
 	pageSize := b.tx.db.pageSize
-	b.tx.forEachPage(b.root, 0, func(p *page, depth int) {
+	b.forEachPage(func(p *page, depth int) {
 		if (p.flags & leafPageFlag) != 0 {
 			s.LeafPageN++
 			s.KeyN += int(p.count)
@@ -375,21 +380,47 @@
 	return s
 }
 
+// forEachPage iterates over every page in a bucket, including inline pages.
+func (b *Bucket) forEachPage(fn func(*page, int)) {
+	// If we have an inline page then just use that.
+	if b.page != nil {
+		fn(b.page, 0)
+		return
+	}
+
+	// Otherwise traverse the page hierarchy.
+	b.tx.forEachPage(b.root, 0, fn)
+}
+
 // spill writes all the nodes for this bucket to dirty pages.
 func (b *Bucket) spill() error {
 	// Spill all child buckets first.
 	for name, child := range b.buckets {
-		if err := child.spill(); err != nil {
-			return err
+		// If the child bucket is small enough and it has no child buckets then
+		// write it inline into the parent bucket's page. Otherwise spill it
+		// like a normal bucket and make the parent value a pointer to the page.
+		var value []byte
+		if child.inlineable() {
+			child.free()
+			value = child.write()
+		} else {
+			if err := child.spill(); err != nil {
+				return err
+			}
+
+			// Update the child bucket header in this bucket.
+			value = make([]byte, unsafe.Sizeof(bucket{}))
+			bucket := (*bucket)(unsafe.Pointer(&value[0]))
+			*bucket = *child.bucket
 		}
 
-		// Update the child bucket header in this bucket.
-		value := make([]byte, unsafe.Sizeof(bucket{}))
-		bucket := (*bucket)(unsafe.Pointer(&value[0]))
-		*bucket = *child.bucket
+		// Skip writing the bucket if there are no materialized nodes.
+		if child.rootNode == nil {
+			continue
+		}
 
 		// Update parent node.
-		c := b.Cursor()
+		var c = b.Cursor()
 		k, _, flags := c.seek([]byte(name))
 		_assert(bytes.Equal([]byte(name), k), "misplaced bucket header: %x -> %x", []byte(name), k)
 		_assert(flags&bucketLeafFlag != 0, "unexpected bucket header flag: %x", flags)
@@ -413,6 +444,54 @@ func (b *Bucket) spill() error {
 	return nil
 }
 
+// inlineable returns true if a bucket is small enough to be written inline
+// and if it contains no subbuckets. Otherwise returns false.
+func (b *Bucket) inlineable() bool {
+	var n = b.rootNode
+
+	// Bucket must only contain a single leaf node.
+	if n == nil || !n.isLeaf {
+		return false
+	}
+
+	// Bucket is not inlineable if it contains subbuckets or if it goes beyond
+	// our threshold for inline bucket size.
+	var size = pageHeaderSize
+	for _, inode := range n.inodes {
+		size += leafPageElementSize + len(inode.key) + len(inode.value)
+
+		if inode.flags&bucketLeafFlag != 0 {
+			return false
+		} else if size > b.maxInlineBucketSize() {
+			return false
+		}
+	}
+
+	return true
+}
+
+// Returns the maximum total size of a bucket to make it a candidate for inlining.
+func (b *Bucket) maxInlineBucketSize() int {
+	return b.tx.db.pageSize / 4
+}
+
+// write allocates and writes a bucket to a byte slice.
+func (b *Bucket) write() []byte {
+	// Allocate the appropriate size.
+	var n = b.rootNode
+	var value = make([]byte, bucketHeaderSize+pageHeaderSize+n.size())
+
+	// Write a bucket header.
+	var bucket = (*bucket)(unsafe.Pointer(&value[0]))
+	*bucket = *b.bucket
+
+	// Convert byte slice to a fake page and write the root node.
+	var p = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
+	n.write(p)
+
+	return value
+}
+
 // rebalance attempts to balance all nodes.
 func (b *Bucket) rebalance() {
 	for _, n := range b.nodes {
@@ -431,14 +510,22 @@ func (b *Bucket) node(pgid pgid, parent *node) *node {
 		return n
 	}
 
-	// Otherwise create a branch node and cache it.
+	// Otherwise create a node and cache it.
 	n := &node{bucket: b, parent: parent}
 	if parent == nil {
 		b.rootNode = n
 	} else {
 		parent.children = append(parent.children, n)
 	}
-	n.read(b.tx.page(pgid))
+
+	// Use the inline page if this is an inline bucket.
+	var p = b.page
+	if p == nil {
+		p = b.tx.page(pgid)
+	}
+
+	// Read the page into the node and cache it.
+	n.read(p)
 	b.nodes[pgid] = n
 
 	// Update statistics.
@@ -447,6 +534,19 @@ func (b *Bucket) node(pgid pgid, parent *node) *node {
 	return n
 }
 
+// free recursively frees all pages in the bucket.
+func (b *Bucket) free() {
+	if b.root == 0 {
+		return
+	}
+
+	var tx = b.tx
+	tx.forEachPage(b.root, 0, func(p *page, _ int) {
+		tx.db.freelist.free(tx.id(), p)
+	})
+	b.root = 0
+}
+
 // dereference removes all references to the old mmap.
 func (b *Bucket) dereference() {
 	for _, n := range b.nodes {
@@ -464,11 +564,24 @@
 // pageNode returns the in-memory node, if it exists.
 // Otherwise returns the underlying page.
 func (b *Bucket) pageNode(id pgid) (*page, *node) {
+	// Inline buckets have a fake page embedded in their value so treat them
+	// differently. We'll return the rootNode (if available) or the fake page.
+	if b.root == 0 {
+		_assert(id == 0, "inline bucket non-zero page access(2): %d != 0", id)
+		if b.rootNode != nil {
+			return nil, b.rootNode
+		}
+		return b.page, nil
+	}
+
+	// Check the node cache for non-inline buckets.
 	if b.nodes != nil {
 		if n := b.nodes[id]; n != nil {
 			return nil, n
 		}
 	}
+
+	// Finally lookup the page from the transaction if no node is materialized.
 	return b.tx.page(id), nil
 }
 
diff --git a/bucket_test.go b/bucket_test.go
index dc89bcc..8fcf84c 100644
--- a/bucket_test.go
+++ b/bucket_test.go
@@ -583,18 +583,18 @@ func TestBucket_Stats_Small(t *testing.T) {
 	db.View(func(tx *Tx) error {
 		b := tx.Bucket([]byte("whozawhats"))
 		stats := b.Stats()
-		assert.Equal(t, stats.BranchPageN, 0)
-		assert.Equal(t, stats.BranchOverflowN, 0)
-		assert.Equal(t, stats.LeafPageN, 1)
-		assert.Equal(t, stats.LeafOverflowN, 0)
-		assert.Equal(t, stats.KeyN, 1)
-		assert.Equal(t, stats.Depth, 1)
+		assert.Equal(t, 0, stats.BranchPageN)
+		assert.Equal(t, 0, stats.BranchOverflowN)
+		assert.Equal(t, 1, stats.LeafPageN)
+		assert.Equal(t, 0, stats.LeafOverflowN)
+		assert.Equal(t, 1, stats.KeyN)
+		assert.Equal(t, 1, stats.Depth)
 		if os.Getpagesize() != 4096 {
 			// Incompatible page size
-			assert.Equal(t, stats.BranchInuse, 0)
-			assert.Equal(t, stats.BranchAlloc, 0)
-			assert.Equal(t, stats.LeafInuse, 38)
-			assert.Equal(t, stats.LeafAlloc, 4096)
+			assert.Equal(t, 0, stats.BranchInuse)
+			assert.Equal(t, 0, stats.BranchAlloc)
+			assert.Equal(t, 38, stats.LeafInuse)
+			assert.Equal(t, 4096, stats.LeafAlloc)
 		}
 		return nil
 	})
diff --git a/db.go b/db.go
index 9754f21..d96a161 100644
--- a/db.go
+++ b/db.go
@@ -563,6 +563,11 @@ func (db *DB) Check() error {
 }
 
 func (db *DB) checkBucket(b *Bucket, reachable map[pgid]*page, errors *ErrorList) {
+	// Ignore inline buckets.
+	if b.root == 0 {
+		return
+	}
+
 	// Check every page used by this bucket.
 	b.tx.forEachPage(b.root, 0, func(p *page, _ int) {
 		// Ensure each page is only referenced once.
@@ -576,7 +581,6 @@ func (db *DB) checkBucket(b *Bucket, reachable map[pgid]*page, errors *ErrorList
 
 		// Retrieve page info.
 		info, err := b.tx.Page(int(p.id))
-		// warnf("[page] %d + %d (%s)", p.id, p.overflow, info.Type)
 		if err != nil {
 			*errors = append(*errors, err)
 		} else if info == nil {
diff --git a/db_test.go b/db_test.go
index b7e2a2f..903f65e 100644
--- a/db_test.go
+++ b/db_test.go
@@ -275,7 +275,7 @@ func TestDB_Stats(t *testing.T) {
 			return err
 		})
 		stats := db.Stats()
-		assert.Equal(t, 3, stats.TxStats.PageCount)
+		assert.Equal(t, 2, stats.TxStats.PageCount)
 	})
 }
 
@@ -324,13 +324,7 @@ func TestDB_Consistency(t *testing.T) {
 			if p, _ := tx.Page(5); assert.NotNil(t, p) {
 				assert.Equal(t, "leaf", p.Type) // root leaf
 			}
-			if p, _ := tx.Page(6); assert.NotNil(t, p) {
-				assert.Equal(t, "leaf", p.Type)
-			}
-			if p, _ := tx.Page(7); assert.NotNil(t, p) {
-				assert.Equal(t, "free", p.Type)
-			}
-			p, _ := tx.Page(8)
+			p, _ := tx.Page(6)
 			assert.Nil(t, p)
 			return nil
 		})
diff --git a/freelist.go b/freelist.go
index ebe2810..6c527fe 100644
--- a/freelist.go
+++ b/freelist.go
@@ -1,6 +1,7 @@
 package bolt
 
 import (
+	"fmt"
 	"sort"
 	"unsafe"
 )
@@ -63,7 +64,7 @@ func (f *freelist) free(txid txid, p *page) {
 	}
 	f.pending[txid] = ids
 
-	// DEBUG ONLY: f.check()
+	f.check() // DEBUG ONLY:
 }
 
 // release moves all page ids for a transaction id (or older) to the freelist.
@@ -114,7 +115,6 @@ func (f *freelist) write(p *page) {
 // check verifies there are no double free pages.
 // This is slow so it should only be used while debugging.
 // If errors are found then a panic invoked.
-/*
 func (f *freelist) check() {
 	var lookup = make(map[pgid]txid)
 	for _, id := range f.ids {
@@ -132,7 +132,6 @@
 		}
 	}
 }
-*/
 
 type reverseSortedPgids []pgid
diff --git a/node.go b/node.go
index fd737ea..72145c4 100644
--- a/node.go
+++ b/node.go
@@ -2,6 +2,8 @@ package bolt
 
 import (
 	"bytes"
+	"fmt"
+	"io"
 	"sort"
 	"unsafe"
 )
@@ -191,6 +193,8 @@ func (n *node) write(p *page) {
 		copy(b[0:], item.value)
 		b = b[len(item.value):]
 	}
+
+	// DEBUG ONLY: n.dump(os.Stderr)
 }
 
 // split breaks up a node into smaller nodes, if appropriate.
@@ -261,6 +265,7 @@
 	// Add node's page to the freelist if it's not new.
 	if n.pgid > 0 {
 		tx.db.freelist.free(tx.id(), tx.page(n.pgid))
+		n.pgid = 0
 	}
 
 	// Spill nodes by deepest first.
@@ -273,8 +278,8 @@
 		}
 
 		// Write the node.
-		node.write(p)
 		node.pgid = p.id
+		node.write(p)
 
 		// Insert into parent inodes.
 		if node.parent != nil {
@@ -302,8 +307,8 @@
 		}
 
 		// Write the new root.
-		parent.write(p)
 		parent.pgid = p.id
+		parent.write(p)
 	}
 
 	return nil
@@ -477,6 +482,31 @@ func (n *node) free() {
 	}
 }
 
+// dump writes the contents of the node for debugging purposes.
+func (n *node) dump(w io.Writer) {
+	// Write node header.
+	var typ = "branch"
+	if n.isLeaf {
+		typ = "leaf"
+	}
+	fmt.Fprintf(w, "[NODE %d {type=%s count=%d}]\n", n.pgid, typ, len(n.inodes))
+
+	// Write out abbreviated version of each item.
+	for _, item := range n.inodes {
+		if n.isLeaf {
+			if item.flags&bucketLeafFlag != 0 {
+				bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
+				fmt.Fprintf(w, "[L \"%s\" -> (bucket root=%d)]\n", item.key, bucket.root)
+			} else {
+				fmt.Fprintf(w, "[L \"%s\" -> \"%s\"]\n", item.key, item.value)
+			}
+		} else {
+			fmt.Fprintf(w, "[B \"%s\" -> pgid=%d]\n", item.key, item.pgid)
+		}
+	}
+	fmt.Fprint(w, "\n")
+}
+
 // inode represents an internal node inside of a node.
 // It can be used to point to elements in a page or point
 // to an element which hasn't been added to a page yet.
diff --git a/tx_test.go b/tx_test.go
index 61fa5d9..e466765 100644
--- a/tx_test.go
+++ b/tx_test.go
@@ -219,7 +219,7 @@ func TestTx_DeleteBucket(t *testing.T) {
 
 	db.Update(func(tx *Tx) error {
 		// Verify that the bucket's page is free.
-		assert.Equal(t, []pgid{7, 6, root, 2}, db.freelist.all())
+		assert.Equal(t, []pgid{5, 4}, db.freelist.all())
 
 		// Create the bucket again and make sure there's not a phantom value.
 		b, err := tx.CreateBucket([]byte("widgets"))

From 0966dde0d44d5b42b132e3f2b8e8c6e3dc2c93fe Mon Sep 17 00:00:00 2001
From: Ben Johnson
Date: Wed, 7 May 2014 10:37:50 -0600
Subject: [PATCH 2/4] Fix bucket free.

---
 bucket.go      | 42 ++++++++++++++++++++++++++++++++++++++++--
 bucket_test.go | 27 +++++++++++++++++++++++++++
 cursor.go      |  2 +-
 freelist.go    |  5 +++--
 node.go        | 12 +++++++++++-
 5 files changed, 82 insertions(+), 6 deletions(-)

diff --git a/bucket.go b/bucket.go
index 6672356..4ad62fe 100644
--- a/bucket.go
+++ b/bucket.go
@@ -392,6 +392,40 @@ func (b *Bucket) forEachPage(fn func(*page, int)) {
 	b.tx.forEachPage(b.root, 0, fn)
 }
 
+// forEachPageNode iterates over every page (or node) in a bucket.
+// This also includes inline pages.
+func (b *Bucket) forEachPageNode(fn func(*page, *node, int)) {
+	// If we have an inline page or root node then just use that.
+	if b.page != nil {
+		fn(b.page, nil, 0)
+		return
+	}
+	b._forEachPageNode(b.root, 0, fn)
+}
+
+func (b *Bucket) _forEachPageNode(pgid pgid, depth int, fn func(*page, *node, int)) {
+	var p, n = b.pageNode(pgid)
+
+	// Execute function.
+	fn(p, n, depth)
+
+	// Recursively loop over children.
+	if p != nil {
+		if (p.flags & branchPageFlag) != 0 {
+			for i := 0; i < int(p.count); i++ {
+				elem := p.branchPageElement(uint16(i))
+				b._forEachPageNode(elem.pgid, depth+1, fn)
+			}
+		}
+	} else {
+		if !n.isLeaf {
+			for _, inode := range n.inodes {
+				b._forEachPageNode(inode.pgid, depth+1, fn)
+			}
+		}
+	}
+}
+
 // spill writes all the nodes for this bucket to dirty pages.
 func (b *Bucket) spill() error {
 	// Spill all child buckets first.
@@ -541,8 +575,12 @@ func (b *Bucket) free() {
 	}
 
 	var tx = b.tx
-	tx.forEachPage(b.root, 0, func(p *page, _ int) {
-		tx.db.freelist.free(tx.id(), p)
+	b.forEachPageNode(func(p *page, n *node, _ int) {
+		if p != nil {
+			tx.db.freelist.free(tx.id(), p)
+		} else {
+			n.free()
+		}
 	})
 	b.root = 0
 }
diff --git a/bucket_test.go b/bucket_test.go
index 8fcf84c..b87031b 100644
--- a/bucket_test.go
+++ b/bucket_test.go
@@ -161,6 +161,33 @@ func TestBucket_Delete(t *testing.T) {
 	})
 }
 
+// Ensure that deleting a large set of keys will work correctly.
+func TestBucket_Delete_Large(t *testing.T) {
+	withOpenDB(func(db *DB, path string) {
+		db.Update(func(tx *Tx) error {
+			var b, _ = tx.CreateBucket([]byte("widgets"))
+			for i := 0; i < 100; i++ {
+				assert.NoError(t, b.Put([]byte(strconv.Itoa(i)), []byte(strings.Repeat("*", 1024))))
+			}
+			return nil
+		})
+		db.Update(func(tx *Tx) error {
+			var b = tx.Bucket([]byte("widgets"))
+			for i := 0; i < 100; i++ {
+				assert.NoError(t, b.Delete([]byte(strconv.Itoa(i))))
+			}
+			return nil
+		})
+		db.View(func(tx *Tx) error {
+			var b = tx.Bucket([]byte("widgets"))
+			for i := 0; i < 100; i++ {
+				assert.Nil(t, b.Get([]byte(strconv.Itoa(i))))
+			}
+			return nil
+		})
+	})
+}
+
 // Ensure that accessing and updating nested buckets is ok across transactions.
 func TestBucket_Nested(t *testing.T) {
 	withOpenDB(func(db *DB, path string) {
diff --git a/cursor.go b/cursor.go
index 56948fc..31a3d5f 100644
--- a/cursor.go
+++ b/cursor.go
@@ -148,7 +148,7 @@ func (c *Cursor) seek(seek []byte) (key []byte, value []byte, flags uint32) {
 func (c *Cursor) first() {
 	for {
 		// Exit when we hit a leaf page.
-		ref := &c.stack[len(c.stack)-1]
+		var ref = &c.stack[len(c.stack)-1]
 		if ref.isLeaf() {
 			break
 		}
diff --git a/freelist.go b/freelist.go
index 6c527fe..ebe2810 100644
--- a/freelist.go
+++ b/freelist.go
@@ -1,7 +1,6 @@
 package bolt
 
 import (
-	"fmt"
 	"sort"
 	"unsafe"
 )
@@ -64,7 +63,7 @@ func (f *freelist) free(txid txid, p *page) {
 	}
 	f.pending[txid] = ids
 
-	f.check() // DEBUG ONLY:
+	// DEBUG ONLY: f.check()
 }
 
 // release moves all page ids for a transaction id (or older) to the freelist.
@@ -115,6 +114,7 @@ func (f *freelist) write(p *page) {
 // check verifies there are no double free pages.
 // This is slow so it should only be used while debugging.
 // If errors are found then a panic invoked.
+/*
 func (f *freelist) check() {
 	var lookup = make(map[pgid]txid)
 	for _, id := range f.ids {
@@ -132,6 +132,7 @@
 		}
 	}
 }
+*/
 
 type reverseSortedPgids []pgid
diff --git a/node.go b/node.go
index 72145c4..1bc889d 100644
--- a/node.go
+++ b/node.go
@@ -336,7 +336,7 @@ func (n *node) rebalance() {
 	// If root node is a branch and only has one node then collapse it.
 	if !n.isLeaf && len(n.inodes) == 1 {
 		// Move root's child up.
-		child := n.bucket.nodes[n.inodes[0].pgid]
+		child := n.bucket.node(n.inodes[0].pgid, n)
 		n.isLeaf = child.isLeaf
 		n.inodes = child.inodes[:]
 		n.children = child.children
@@ -357,6 +357,16 @@ func (n *node) rebalance() {
 		return
 	}
 
+	// If node has no keys then just remove it.
+	if n.numChildren() == 0 {
+		n.parent.del(n.key)
+		n.parent.removeChild(n)
+		delete(n.bucket.nodes, n.pgid)
+		n.free()
+		n.parent.rebalance()
+		return
+	}
+
 	_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
 
 	// Destination node is right sibling if idx == 0, otherwise left sibling.

From e99a24eb44bc1da2fc6773a9c8dcf8862746f028 Mon Sep 17 00:00:00 2001
From: Ben Johnson
Date: Wed, 7 May 2014 11:23:30 -0600
Subject: [PATCH 3/4] Improve bucket code documentation.

---
 bucket.go | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/bucket.go b/bucket.go
index 4ad62fe..b8dbf25 100644
--- a/bucket.go
+++ b/bucket.go
@@ -57,17 +57,20 @@ const bucketHeaderSize = int(unsafe.Sizeof(bucket{}))
 // Bucket represents a collection of key/value pairs inside the database.
 type Bucket struct {
 	*bucket
-	tx       *Tx
-	buckets  map[string]*Bucket
-	page     *page
-	rootNode *node
-	nodes    map[pgid]*node
+	tx       *Tx                // the associated transaction
+	buckets  map[string]*Bucket // subbucket cache
+	page     *page              // inline page reference
+	rootNode *node              // materialized node for the root page.
+	nodes    map[pgid]*node     // node cache
 }
 
 // bucket represents the on-file representation of a bucket.
+// This is stored as the "value" of a bucket key. If the bucket is small enough,
+// then its root page can be stored inline in the "value", after the bucket
+// header. In the case of inline buckets, the "root" will be 0.
 type bucket struct {
-	root     pgid
-	sequence uint64
+	root     pgid   // page id of the bucket's root-level page
+	sequence uint64 // monotonically incrementing, used by NextSequence()
 }
 
 // newBucket returns a new bucket associated with a transaction.
@@ -171,10 +174,10 @@ func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
 	key = cloneBytes(key)
 	c.node().put(key, key, value, 0, bucketLeafFlag)
 
-	// De-inline the parent bucket, if necessary.
-	if b.root == 0 {
-		b.page = nil
-	}
+	// Since subbuckets are not allowed on inline buckets, we need to
+	// dereference the inline page, if it exists. This will cause the bucket
+	// to be treated as a regular, non-inline bucket for the rest of the tx.
+	b.page = nil
 
 	return b.Bucket(key), nil
 }

From d52d142275c783208a74dc90c81551a51d1e55c4 Mon Sep 17 00:00:00 2001
From: Ben Johnson
Date: Wed, 7 May 2014 12:06:22 -0600
Subject: [PATCH 4/4] Minor fixes.

---
 bucket.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bucket.go b/bucket.go
index b8dbf25..25ad1ba 100644
--- a/bucket.go
+++ b/bucket.go
@@ -447,7 +447,7 @@ func (b *Bucket) spill() error {
 
 			// Update the child bucket header in this bucket.
 			value = make([]byte, unsafe.Sizeof(bucket{}))
-			bucket := (*bucket)(unsafe.Pointer(&value[0]))
+			var bucket = (*bucket)(unsafe.Pointer(&value[0]))
 			*bucket = *child.bucket
 		}
 
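
The standalone sketch below is not part of the patch series; it only illustrates the inline value layout that PATCH 1/4 describes: a bucket header followed immediately by a fake leaf page inside the parent's leaf value, with root == 0 marking the bucket as inline. The inlineBucketHeader and fakePage structs and the leafFlag constant are simplified, assumed stand-ins for Bolt's unexported bucket and page types, not the library's actual definitions.

package main

import (
	"fmt"
	"unsafe"
)

// inlineBucketHeader is a simplified stand-in for Bolt's unexported bucket
// header (assumed layout: root page id followed by a sequence counter).
type inlineBucketHeader struct {
	root     uint64 // 0 marks the bucket as inline
	sequence uint64
}

// fakePage is a simplified stand-in for Bolt's unexported page header.
type fakePage struct {
	id       uint64
	flags    uint16
	count    uint16
	overflow uint32
}

const leafFlag = 0x02 // assumed value of the leaf page flag

func main() {
	headerSize := int(unsafe.Sizeof(inlineBucketHeader{}))
	pageHeaderSize := int(unsafe.Sizeof(fakePage{}))

	// An inline bucket's value holds the bucket header, then a fake leaf
	// page header, then the serialized leaf elements (omitted here).
	value := make([]byte, headerSize+pageHeaderSize)

	// Write the bucket header with root == 0 to mark the bucket as inline.
	h := (*inlineBucketHeader)(unsafe.Pointer(&value[0]))
	h.root = 0

	// Overlay the fake page directly after the bucket header, mirroring
	// Bucket.write's (*page)(unsafe.Pointer(&value[bucketHeaderSize])).
	p := (*fakePage)(unsafe.Pointer(&value[headerSize]))
	p.flags = leafFlag
	p.count = 0

	fmt.Printf("inline value: header=%d bytes, fake page at offset %d, flags=%#x\n",
		headerSize, headerSize, p.flags)
}

Reading works the same way in reverse: when the stored root is 0, the bytes after the bucket header are treated as the bucket's root leaf page, which is what the new Bucket.pageNode and forEachPage paths in the patches do.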