mirror of https://github.com/etcd-io/bbolt.git
Refactor split/spill.
parent a18135e055
commit 25fea2fd9f

bucket.go (94 lines changed)
@@ -4,7 +4,6 @@ import (
 	"bytes"
 	"errors"
 	"fmt"
-	"sort"
 	"unsafe"
 )
 
@@ -41,10 +40,10 @@ var (
 // Bucket represents a collection of key/value pairs inside the database.
 type Bucket struct {
 	*bucket
 	tx       *Tx
 	buckets  map[string]*Bucket
-	nodes    map[pgid]*node
-	pending  []*node
+	rootNode *node
+	nodes    map[pgid]*node
 }
 
 // bucket represents the on-file representation of a bucket.
@@ -382,76 +381,19 @@ func (b *Bucket) spill() error {
 		c.node().put([]byte(name), []byte(name), value, 0, bucketLeafFlag)
 	}
 
-	// Ignore if there are no nodes to spill.
-	if len(b.nodes) == 0 {
+	// Ignore if there's not a materialized root node.
+	if b.rootNode == nil {
 		return nil
 	}
 
-	// Sort nodes by highest depth first.
-	nodes := make(nodesByDepth, 0, len(b.nodes))
-	for _, n := range b.nodes {
-		nodes = append(nodes, n)
-	}
-	sort.Sort(nodes)
-
-	// Spill nodes by deepest first.
-	for i := 0; i < len(nodes); i++ {
-		n := nodes[i]
-
-		// Split nodes into appropriate sized nodes.
-		// The first node in this list will be a reference to n to preserve ancestry.
-		newNodes := n.split(b.tx.db.pageSize)
-		b.pending = newNodes
-
-		// If this is a root node that split then create a parent node.
-		if n.parent == nil && len(newNodes) > 1 {
-			n.parent = &node{bucket: b, isLeaf: false}
-			nodes = append(nodes, n.parent)
-		}
-
-		// Add node's page to the freelist.
-		if n.pgid > 0 {
-			b.tx.db.freelist.free(b.tx.id(), b.tx.page(n.pgid))
-		}
-
-		// Write nodes to dirty pages.
-		for i, newNode := range newNodes {
-			// Allocate contiguous space for the node.
-			p, err := b.tx.allocate((newNode.size() / b.tx.db.pageSize) + 1)
-			if err != nil {
-				return err
-			}
-
-			// Write the node to the page.
-			newNode.write(p)
-			newNode.pgid = p.id
-			newNode.parent = n.parent
-
-			// The first node should use the existing entry, other nodes are inserts.
-			var oldKey []byte
-			if i == 0 {
-				oldKey = n.key
-			} else {
-				oldKey = newNode.inodes[0].key
-			}
-
-			// Update the parent entry.
-			if newNode.parent != nil {
-				newNode.parent.put(oldKey, newNode.inodes[0].key, nil, newNode.pgid, 0)
-			}
-
-			// Update the statistics.
-			b.tx.stats.Spill++
-		}
-
-		b.pending = nil
-	}
-
-	// Clear out nodes now that they are all spilled.
-	b.nodes = make(map[pgid]*node)
+	// Spill nodes.
+	if err := b.rootNode.spill(); err != nil {
+		return err
+	}
+	b.rootNode = b.rootNode.root()
 
 	// Update the root node for this bucket.
-	b.root = nodes[len(nodes)-1].pgid
+	b.root = b.rootNode.pgid
 
 	return nil
 }
@@ -474,10 +416,12 @@ func (b *Bucket) node(pgid pgid, parent *node) *node {
 		return n
 	}
 
-	// Otherwise create a branch and cache it.
+	// Otherwise create a branch node and cache it.
 	n := &node{bucket: b, parent: parent}
-	if n.parent != nil {
-		n.depth = n.parent.depth + 1
+	if parent == nil {
+		b.rootNode = n
+	} else {
+		parent.children = append(parent.children, n)
 	}
 	n.read(b.tx.page(pgid))
 	b.nodes[pgid] = n
@@ -494,16 +438,12 @@ func (b *Bucket) dereference() {
 		n.dereference()
 	}
 
-	for _, n := range b.pending {
-		n.dereference()
-	}
-
 	for _, child := range b.buckets {
 		child.dereference()
 	}
 
 	// Update statistics
-	b.tx.stats.NodeDeref += len(b.nodes) + len(b.pending)
+	b.tx.stats.NodeDeref += len(b.nodes)
}
 
 // pageNode returns the in-memory node, if it exists.
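The core of the refactor is visible in spill(): instead of collecting every cached node into a nodesByDepth slice, sorting it, and writing the batch in a flat loop, the bucket now keeps a rootNode wired up with parent/children pointers and simply recurses. Children are spilled before their parents, so the depth sort becomes unnecessary. A self-contained toy sketch of that ordering (illustrative types, not bolt's):

package main

import "fmt"

// toyNode mirrors only the parent/children linkage this commit introduces.
type toyNode struct {
	id       int
	children []*toyNode
}

// spill visits every child before the node itself, so the deepest nodes are
// written first, the same guarantee the old sort.Sort(nodesByDepth) provided.
func (n *toyNode) spill() error {
	for _, child := range n.children {
		if err := child.spill(); err != nil {
			return err
		}
	}
	fmt.Println("spilled node", n.id)
	return nil
}

func main() {
	root := &toyNode{id: 1, children: []*toyNode{
		{id: 2, children: []*toyNode{{id: 4}, {id: 5}}},
		{id: 3},
	}}
	root.spill() // prints 4, 5, 2, 3, 1
}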
bucket_test.go (103 lines changed)
@@ -161,6 +161,64 @@ func TestBucket_Delete(t *testing.T) {
 	})
 }
 
+// Ensure that accessing and updating nested buckets is ok across transactions.
+func TestBucket_Nested(t *testing.T) {
+	withOpenDB(func(db *DB, path string) {
+		db.Update(func(tx *Tx) error {
+			// Create a widgets bucket.
+			b, err := tx.CreateBucket([]byte("widgets"))
+			assert.NoError(t, err)
+
+			// Create a widgets/foo bucket.
+			_, err = b.CreateBucket([]byte("foo"))
+			assert.NoError(t, err)
+
+			// Create a widgets/bar key.
+			assert.NoError(t, b.Put([]byte("bar"), []byte("0000")))
+
+			return nil
+		})
+		mustCheck(db)
+
+		// Update widgets/bar.
+		db.Update(func(tx *Tx) error {
+			var b = tx.Bucket([]byte("widgets"))
+			assert.NoError(t, b.Put([]byte("bar"), []byte("xxxx")))
+			return nil
+		})
+		mustCheck(db)
+
+		// Cause a split.
+		db.Update(func(tx *Tx) error {
+			var b = tx.Bucket([]byte("widgets"))
+			for i := 0; i < 10000; i++ {
+				assert.NoError(t, b.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i))))
+			}
+			return nil
+		})
+		mustCheck(db)
+
+		// Insert into widgets/foo/baz.
+		db.Update(func(tx *Tx) error {
+			var b = tx.Bucket([]byte("widgets"))
+			assert.NoError(t, b.Bucket([]byte("foo")).Put([]byte("baz"), []byte("yyyy")))
+			return nil
+		})
+		mustCheck(db)
+
+		// Verify.
+		db.View(func(tx *Tx) error {
+			var b = tx.Bucket([]byte("widgets"))
+			assert.Equal(t, []byte("yyyy"), b.Bucket([]byte("foo")).Get([]byte("baz")))
+			assert.Equal(t, []byte("xxxx"), b.Get([]byte("bar")))
+			for i := 0; i < 10000; i++ {
+				assert.Equal(t, []byte(strconv.Itoa(i)), b.Get([]byte(strconv.Itoa(i))))
+			}
+			return nil
+		})
+	})
+}
+
 // Ensure that deleting a bucket using Delete() returns an error.
 func TestBucket_Delete_Bucket(t *testing.T) {
 	withOpenDB(func(db *DB, path string) {
@@ -550,31 +608,35 @@ func TestBucket_Stats_Large(t *testing.T) {
 	}
 
 	withOpenDB(func(db *DB, path string) {
-		db.Update(func(tx *Tx) error {
-			// Add bucket with lots of keys.
-			tx.CreateBucket([]byte("widgets"))
-			b := tx.Bucket([]byte("widgets"))
-			for i := 0; i < 100000; i++ {
-				b.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i)))
-			}
-			return nil
-		})
+		var index int
+		for i := 0; i < 1000; i++ {
+			db.Update(func(tx *Tx) error {
+				// Add bucket with lots of keys.
+				b, _ := tx.CreateBucketIfNotExists([]byte("widgets"))
+				for i := 0; i < 100; i++ {
+					b.Put([]byte(strconv.Itoa(index)), []byte(strconv.Itoa(index)))
+					index++
+				}
+				return nil
+			})
+		}
 		mustCheck(db)
 
 		db.View(func(tx *Tx) error {
 			b := tx.Bucket([]byte("widgets"))
 			stats := b.Stats()
-			assert.Equal(t, stats.BranchPageN, 15)
-			assert.Equal(t, stats.BranchOverflowN, 0)
-			assert.Equal(t, stats.LeafPageN, 1281)
-			assert.Equal(t, stats.LeafOverflowN, 0)
-			assert.Equal(t, stats.KeyN, 100000)
-			assert.Equal(t, stats.Depth, 3)
+			assert.Equal(t, 19, stats.BranchPageN)
+			assert.Equal(t, 0, stats.BranchOverflowN)
+			assert.Equal(t, 1291, stats.LeafPageN)
+			assert.Equal(t, 0, stats.LeafOverflowN)
+			assert.Equal(t, 100000, stats.KeyN)
+			assert.Equal(t, 3, stats.Depth)
 			if os.Getpagesize() != 4096 {
 				// Incompatible page size
-				assert.Equal(t, stats.BranchInuse, 27289)
-				assert.Equal(t, stats.BranchAlloc, 61440)
-				assert.Equal(t, stats.LeafInuse, 2598276)
-				assert.Equal(t, stats.LeafAlloc, 5246976)
+				assert.Equal(t, 27289, stats.BranchInuse)
+				assert.Equal(t, 61440, stats.BranchAlloc)
+				assert.Equal(t, 2598276, stats.LeafInuse)
+				assert.Equal(t, 5246976, stats.LeafAlloc)
 			}
 			return nil
 		})
@@ -703,13 +765,12 @@ func TestBucket_Delete_Quick(t *testing.T) {
 			db.View(func(tx *Tx) error {
 				b := tx.Bucket([]byte("widgets"))
 				for j, exp := range items {
+					value := b.Get(exp.Key)
 					if j > i {
-						value := b.Get(exp.Key)
 						if !assert.Equal(t, exp.Value, value) {
 							t.FailNow()
 						}
 					} else {
-						value := b.Get(exp.Key)
 						if !assert.Nil(t, value) {
 							t.FailNow()
 						}
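The new TestBucket_Nested pins down what the refactor must preserve: a sub-bucket is stored under a key of its parent bucket, and both levels must survive page splits across transactions. The same pattern from the caller's side, written against today's bbolt API rather than the 2014 code in this diff (the import path and option argument are the modern mirror's, an assumption here):

package main

import (
	"log"

	bolt "go.etcd.io/bbolt" // today's path for this mirror; the original used github.com/boltdb/bolt
)

func main() {
	db, err := bolt.Open("nested.db", 0600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// A sub-bucket lives under a key of its parent; values can be written
	// at both levels, which is what TestBucket_Nested verifies.
	if err := db.Update(func(tx *bolt.Tx) error {
		b, err := tx.CreateBucketIfNotExists([]byte("widgets"))
		if err != nil {
			return err
		}
		foo, err := b.CreateBucketIfNotExists([]byte("foo")) // widgets/foo
		if err != nil {
			return err
		}
		if err := b.Put([]byte("bar"), []byte("xxxx")); err != nil { // widgets/bar
			return err
		}
		return foo.Put([]byte("baz"), []byte("yyyy")) // widgets/foo/baz
	}); err != nil {
		log.Fatal(err)
	}
}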
cursor.go

@@ -192,7 +192,7 @@ func (c *Cursor) last() {
 func (c *Cursor) search(key []byte, pgid pgid) {
 	p, n := c.bucket.pageNode(pgid)
 	if p != nil {
-		_assert((p.flags&(branchPageFlag|leafPageFlag)) != 0, "invalid page type: "+p.typ())
+		_assert((p.flags&(branchPageFlag|leafPageFlag)) != 0, "invalid page type: %d: %s", p.id, p.typ())
 	}
 	e := elemRef{page: p, node: n}
 	c.stack = append(c.stack, e)
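The assertion call switches from a preformatted string to printf-style arguments, which lets the message include the page id and defers formatting to the helper. A minimal helper compatible with the new call site, as a hedged sketch rather than bolt's verified source:

package main

import "fmt"

// _assert is a sketch of a printf-style assertion helper matching the new
// call site; a hypothetical reconstruction, not necessarily bolt's exact code.
func _assert(condition bool, msg string, v ...interface{}) {
	if !condition {
		panic(fmt.Sprintf("assertion failed: "+msg, v...))
	}
}

func main() {
	pgid, typ := 12, "leaf"
	_assert(typ == "branch" || typ == "leaf", "invalid page type: %d: %s", pgid, typ) // passes silently
}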
node.go (141 lines changed)
@@ -12,12 +12,20 @@ type node struct {
 	isLeaf     bool
 	unbalanced bool
 	key        []byte
-	depth      int
 	pgid       pgid
 	parent     *node
+	children   []*node
 	inodes     inodes
 }
 
+// root returns the top-level node this node is attached to.
+func (n *node) root() *node {
+	if n.parent == nil {
+		return n
+	}
+	return n.parent.root()
+}
+
 // minKeys returns the minimum number of inodes this node should have.
 func (n *node) minKeys() int {
 	if n.isLeaf {
@@ -185,12 +193,15 @@ func (n *node) write(p *page) {
 	}
 }
 
-// split divides up the node into appropriately sized nodes.
+// split breaks up a node into smaller nodes, if appropriate.
+// This should only be called from the spill() function.
 func (n *node) split(pageSize int) []*node {
+	var nodes = []*node{n}
+
 	// Ignore the split if the page doesn't have at least enough nodes for
 	// multiple pages or if the data can fit on a single page.
 	if len(n.inodes) <= (minKeysPerPage*2) || n.size() < pageSize {
-		return []*node{n}
+		return nodes
 	}
 
 	// Set fill threshold to 50%.
@@ -198,28 +209,106 @@ func (n *node) split(pageSize int) []*node {
 
 	// Group into smaller pages and target a given fill size.
 	size := pageHeaderSize
-	inodes := n.inodes
+	internalNodes := n.inodes
 	current := n
 	current.inodes = nil
-	var nodes []*node
 
-	for i, inode := range inodes {
+	// Loop over every inode and split once we reach our threshold.
+	for i, inode := range internalNodes {
 		elemSize := n.pageElementSize() + len(inode.key) + len(inode.value)
 
-		if len(current.inodes) >= minKeysPerPage && i < len(inodes)-minKeysPerPage && size+elemSize > threshold {
-			size = pageHeaderSize
-			nodes = append(nodes, current)
-			current = &node{bucket: n.bucket, isLeaf: n.isLeaf}
+		// Split once we reach our threshold split size. However, this should
+		// only be done if we have enough keys for this node and we will have
+		// enough keys for the next node.
+		if len(current.inodes) >= minKeysPerPage && i < len(internalNodes)-minKeysPerPage && size+elemSize > threshold {
+			// If there's no parent then we need to create one.
+			if n.parent == nil {
+				n.parent = &node{bucket: n.bucket, children: []*node{n}}
+			}
+
+			// Create a new node and add it to the parent.
+			current = &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
+			n.parent.children = append(n.parent.children, current)
+			nodes = append(nodes, current)
+
+			// Reset our running total back to zero (plus header size).
+			size = pageHeaderSize
+
+			// Update the statistics.
+			n.bucket.tx.stats.Split++
 		}
 
+		// Increase our running total of the size and append the inode.
 		size += elemSize
 		current.inodes = append(current.inodes, inode)
 	}
-	nodes = append(nodes, current)
 
 	return nodes
 }
 
+// spill writes the nodes to dirty pages and splits nodes as it goes.
+// Returns an error if dirty pages cannot be allocated.
+func (n *node) spill() error {
+	var tx = n.bucket.tx
+
+	// Spill child nodes first.
+	for _, child := range n.children {
+		if err := child.spill(); err != nil {
+			return err
+		}
+	}
+
+	// Add node's page to the freelist if it's not new.
+	if n.pgid > 0 {
+		tx.db.freelist.free(tx.id(), tx.page(n.pgid))
+	}
+
+	// Spill nodes by deepest first.
+	var nodes = n.split(tx.db.pageSize)
+	for _, node := range nodes {
+		// Allocate contiguous space for the node.
+		p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
+		if err != nil {
+			return err
+		}
+
+		// Write the node.
+		node.write(p)
+		node.pgid = p.id
+
+		// Insert into parent inodes.
+		if node.parent != nil {
+			var key = node.key
+			if key == nil {
+				key = node.inodes[0].key
+			}
+
+			node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
+			node.key = node.inodes[0].key
+		}
+
+		// Update the statistics.
+		tx.stats.Spill++
+	}
+
+	// This is a special case where we need to write the parent if it is new
+	// and caused by a split in the root.
+	var parent = n.parent
+	if parent != nil && parent.pgid == 0 {
+		// Allocate contiguous space for the node.
+		p, err := tx.allocate((parent.size() / tx.db.pageSize) + 1)
+		if err != nil {
+			return err
+		}
+
+		// Write the new root.
+		parent.write(p)
+		parent.pgid = p.id
+	}
+
+	return nil
+}
+
 // rebalance attempts to combine the node with sibling nodes if the node fill
 // size is below a threshold or if there are not enough keys.
 func (n *node) rebalance() {
@@ -241,10 +330,11 @@ func (n *node) rebalance() {
 	if n.parent == nil {
 		// If root node is a branch and only has one node then collapse it.
 		if !n.isLeaf && len(n.inodes) == 1 {
-			// Move child's children up.
+			// Move root's child up.
 			child := n.bucket.nodes[n.inodes[0].pgid]
 			n.isLeaf = child.isLeaf
 			n.inodes = child.inodes[:]
+			n.children = child.children
 
 			// Reparent all child nodes being moved.
 			for _, inode := range n.inodes {
@@ -278,7 +368,9 @@ func (n *node) rebalance() {
 		if useNextSibling {
 			// Reparent and move node.
 			if child, ok := n.bucket.nodes[target.inodes[0].pgid]; ok {
+				child.parent.removeChild(child)
 				child.parent = n
+				child.parent.children = append(child.parent.children, child)
 			}
 			n.inodes = append(n.inodes, target.inodes[0])
 			target.inodes = target.inodes[1:]
@@ -289,7 +381,9 @@ func (n *node) rebalance() {
 		} else {
 			// Reparent and move node.
 			if child, ok := n.bucket.nodes[target.inodes[len(target.inodes)-1].pgid]; ok {
+				child.parent.removeChild(child)
 				child.parent = n
+				child.parent.children = append(child.parent.children, child)
 			}
 			n.inodes = append(n.inodes, inode{})
 			copy(n.inodes[1:], n.inodes)
@@ -309,26 +403,32 @@ func (n *node) rebalance() {
 		// Reparent all child nodes being moved.
 		for _, inode := range target.inodes {
 			if child, ok := n.bucket.nodes[inode.pgid]; ok {
+				child.parent.removeChild(child)
 				child.parent = n
+				child.parent.children = append(child.parent.children, child)
 			}
 		}
 
 		// Copy over inodes from target and remove target.
 		n.inodes = append(n.inodes, target.inodes...)
 		n.parent.del(target.key)
+		n.parent.removeChild(target)
 		delete(n.bucket.nodes, target.pgid)
 		target.free()
 	} else {
 		// Reparent all child nodes being moved.
 		for _, inode := range n.inodes {
 			if child, ok := n.bucket.nodes[inode.pgid]; ok {
+				child.parent.removeChild(child)
 				child.parent = target
+				child.parent.children = append(child.parent.children, child)
 			}
 		}
 
 		// Copy over inodes to target and remove node.
 		target.inodes = append(target.inodes, n.inodes...)
 		n.parent.del(n.key)
+		n.parent.removeChild(n)
 		n.parent.put(target.key, target.inodes[0].key, nil, target.pgid, 0)
 		delete(n.bucket.nodes, n.pgid)
 		n.free()
@@ -338,6 +438,17 @@ func (n *node) rebalance() {
 	n.parent.rebalance()
 }
 
+// removes a node from the list of in-memory children.
+// This does not affect the inodes.
+func (n *node) removeChild(target *node) {
+	for i, child := range n.children {
+		if child == target {
+			n.children = append(n.children[:i], n.children[i+1:]...)
+			return
+		}
+	}
+}
+
 // dereference causes the node to copy all its inode key/value references to heap memory.
 // This is required when the mmap is reallocated so inodes are not pointing to stale data.
 func (n *node) dereference() {
@@ -362,16 +473,10 @@ func (n *node) dereference() {
 func (n *node) free() {
 	if n.pgid != 0 {
 		n.bucket.tx.db.freelist.free(n.bucket.tx.id(), n.bucket.tx.page(n.pgid))
+		n.pgid = 0
 	}
 }
 
-// nodesByDepth sorts a list of branches by deepest first.
-type nodesByDepth []*node
-
-func (s nodesByDepth) Len() int           { return len(s) }
-func (s nodesByDepth) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
-func (s nodesByDepth) Less(i, j int) bool { return s[i].depth > s[j].depth }
-
 // inode represents an internal node inside of a node.
 // It can be used to point to elements in a page or point
 // to an element which hasn't been added to a page yet.
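split() keeps its grouping rule: walk the inodes, accumulate element sizes, and start a new node once the running total would cross the fill threshold, but only while both the current group and the remainder retain at least minKeysPerPage keys. The rule in isolation, as a runnable toy with illustrative sizes and constants:

package main

import "fmt"

// group splits a sequence of element sizes into chunks, starting a new chunk
// when the running total would exceed the threshold, provided the current
// chunk has minKeys elements and at least minKeys elements remain.
func group(sizes []int, threshold, minKeys int) [][]int {
	var groups [][]int
	var current []int
	total := 0
	for i, sz := range sizes {
		if len(current) >= minKeys && i < len(sizes)-minKeys && total+sz > threshold {
			groups = append(groups, current)
			current, total = nil, 0
		}
		total += sz
		current = append(current, sz)
	}
	return append(groups, current)
}

func main() {
	// Mirrors the "Split between 2 & 3" case in TestNode_split.
	fmt.Println(group([]int{40, 40, 40, 40, 40}, 100, 2)) // [[40 40] [40 40 40]]
}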
node_test.go (25 lines changed)
@@ -85,7 +85,7 @@ func TestNode_write_LeafPage(t *testing.T) {
 // Ensure that a node can split into appropriate subgroups.
 func TestNode_split(t *testing.T) {
 	// Create a node.
-	n := &node{inodes: make(inodes, 0)}
+	n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{}}}
 	n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0)
 	n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0)
 	n.put([]byte("00000003"), []byte("00000003"), []byte("0123456701234567"), 0, 0)
@@ -93,30 +93,30 @@ func TestNode_split(t *testing.T) {
 	n.put([]byte("00000005"), []byte("00000005"), []byte("0123456701234567"), 0, 0)
 
 	// Split between 2 & 3.
-	nodes := n.split(100)
+	n.split(100)
 
-	assert.Equal(t, len(nodes), 2)
-	assert.Equal(t, len(nodes[0].inodes), 2)
-	assert.Equal(t, len(nodes[1].inodes), 3)
+	var parent = n.parent
+	assert.Equal(t, len(parent.children), 2)
+	assert.Equal(t, len(parent.children[0].inodes), 2)
+	assert.Equal(t, len(parent.children[1].inodes), 3)
 }
 
 // Ensure that a page with the minimum number of inodes just returns a single node.
 func TestNode_split_MinKeys(t *testing.T) {
 	// Create a node.
-	n := &node{inodes: make(inodes, 0)}
+	n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{}}}
 	n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0)
 	n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0)
 
 	// Split.
-	nodes := n.split(20)
-	assert.Equal(t, len(nodes), 1)
-	assert.Equal(t, len(nodes[0].inodes), 2)
+	n.split(20)
+	assert.Nil(t, n.parent)
 }
 
 // Ensure that a node that has keys that all fit on a page just returns one leaf.
 func TestNode_split_SinglePage(t *testing.T) {
 	// Create a node.
-	n := &node{inodes: make(inodes, 0)}
+	n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{}}}
 	n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0)
 	n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0)
 	n.put([]byte("00000003"), []byte("00000003"), []byte("0123456701234567"), 0, 0)
@@ -124,7 +124,6 @@ func TestNode_split_SinglePage(t *testing.T) {
 	n.put([]byte("00000005"), []byte("00000005"), []byte("0123456701234567"), 0, 0)
 
 	// Split.
-	nodes := n.split(4096)
-	assert.Equal(t, len(nodes), 1)
-	assert.Equal(t, len(nodes[0].inodes), 5)
+	n.split(4096)
+	assert.Nil(t, n.parent)
 }
tx.go (7 lines changed)
@@ -363,8 +363,9 @@ type TxStats struct {
 	Rebalance     int           // number of node rebalances
 	RebalanceTime time.Duration // total time spent rebalancing
 
-	// Spill statistics.
-	Spill     int           // number of node spilled
+	// Split/Spill statistics.
+	Split     int           // number of nodes split
+	Spill     int           // number of nodes spilled
 	SpillTime time.Duration // total time spent spilling
 
 	// Write statistics.
@@ -380,6 +381,7 @@ func (s *TxStats) add(other *TxStats) {
 	s.NodeDeref += other.NodeDeref
 	s.Rebalance += other.Rebalance
 	s.RebalanceTime += other.RebalanceTime
+	s.Split += other.Split
 	s.Spill += other.Spill
 	s.SpillTime += other.SpillTime
 	s.Write += other.Write
@@ -398,6 +400,7 @@ func (s *TxStats) Sub(other *TxStats) TxStats {
 	diff.NodeDeref = s.NodeDeref - other.NodeDeref
 	diff.Rebalance = s.Rebalance - other.Rebalance
 	diff.RebalanceTime = s.RebalanceTime - other.RebalanceTime
+	diff.Split = s.Split - other.Split
 	diff.Spill = s.Spill - other.Spill
 	diff.SpillTime = s.SpillTime - other.SpillTime
 	diff.Write = s.Write - other.Write
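With Split tracked alongside Spill, the add/Sub pair supports interval reporting: snapshot the stats, run a workload, subtract. A self-contained sketch of that pattern, with a trimmed struct standing in for the real TxStats (only the fields this commit touches):

package main

import (
	"fmt"
	"time"
)

// txStats mirrors just the counters this commit touches; the real TxStats
// carries many more fields.
type txStats struct {
	Split     int
	Spill     int
	SpillTime time.Duration
}

// sub computes the per-field difference, the same shape as TxStats.Sub above.
func (s txStats) sub(other txStats) txStats {
	return txStats{
		Split:     s.Split - other.Split,
		Spill:     s.Spill - other.Spill,
		SpillTime: s.SpillTime - other.SpillTime,
	}
}

func main() {
	before := txStats{Split: 10, Spill: 12, SpillTime: 50 * time.Millisecond}
	after := txStats{Split: 25, Spill: 30, SpillTime: 120 * time.Millisecond}
	fmt.Printf("%+v\n", after.sub(before)) // {Split:15 Spill:18 SpillTime:70ms}
}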