mirror of
https://github.com/etcd-io/bbolt.git
synced 2025-05-02 05:29:42 +00:00
This commit reverts merge-split and fixes the node.split() to do a multi-page split. This issue caused problems with bulk loading because it would split into a small page and a very large page. The very large page, in turn, would be an arbitrary size so when it was freed later it would be difficult to reuse and would cause serious fragmentation issues.
607 lines
16 KiB
Go
607 lines
16 KiB
Go
package bolt
|
|
|
|
import (
|
|
"bytes"
|
|
"sort"
|
|
"unsafe"
|
|
)
|
|
|
|
// node represents an in-memory, deserialized page.
|
|
type node struct {
|
|
bucket *Bucket
|
|
dirty bool
|
|
isLeaf bool
|
|
unbalanced bool
|
|
key []byte
|
|
pgid pgid
|
|
parent *node
|
|
children nodes
|
|
inodes inodes
|
|
}
|
|
|
|
// root returns the top-level node this node is attached to.
|
|
func (n *node) root() *node {
|
|
if n.parent == nil {
|
|
return n
|
|
}
|
|
return n.parent.root()
|
|
}
|
|
|
|
// minKeys returns the minimum number of inodes this node should have.
|
|
func (n *node) minKeys() int {
|
|
if n.isLeaf {
|
|
return 1
|
|
}
|
|
return 2
|
|
}
|
|
|
|
// size returns the size of the node after serialization.
|
|
func (n *node) size() int {
|
|
var elementSize = n.pageElementSize()
|
|
|
|
var size = pageHeaderSize
|
|
for _, item := range n.inodes {
|
|
size += elementSize + len(item.key) + len(item.value)
|
|
}
|
|
return size
|
|
}
|
|
|
|
// pageElementSize returns the size of each page element based on the type of node.
|
|
func (n *node) pageElementSize() int {
|
|
if n.isLeaf {
|
|
return leafPageElementSize
|
|
}
|
|
return branchPageElementSize
|
|
}
|
|
|
|
// childAt returns the child node at a given index.
|
|
func (n *node) childAt(index int) *node {
|
|
_assert(!n.isLeaf, "invalid childAt(%d) on a leaf node", index)
|
|
return n.bucket.node(n.inodes[index].pgid, n)
|
|
}
|
|
|
|
// childIndex returns the index of a given child node.
|
|
func (n *node) childIndex(child *node) int {
|
|
index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
|
|
return index
|
|
}
|
|
|
|
// numChildren returns the number of children.
|
|
func (n *node) numChildren() int {
|
|
return len(n.inodes)
|
|
}
|
|
|
|
// nextSibling returns the next node with the same parent.
|
|
func (n *node) nextSibling() *node {
|
|
if n.parent == nil {
|
|
return nil
|
|
}
|
|
index := n.parent.childIndex(n)
|
|
if index >= n.parent.numChildren()-1 {
|
|
return nil
|
|
}
|
|
return n.parent.childAt(index + 1)
|
|
}
|
|
|
|
// prevSibling returns the previous node with the same parent.
|
|
func (n *node) prevSibling() *node {
|
|
if n.parent == nil {
|
|
return nil
|
|
}
|
|
index := n.parent.childIndex(n)
|
|
if index == 0 {
|
|
return nil
|
|
}
|
|
return n.parent.childAt(index - 1)
|
|
}
|
|
|
|
// put inserts a key/value.
|
|
func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
|
|
_assert(pgid < n.bucket.tx.meta.pgid, "pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid)
|
|
_assert(len(oldKey) > 0, "put: zero-length old key")
|
|
_assert(len(newKey) > 0, "put: zero-length new key")
|
|
|
|
// Find insertion index.
|
|
index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
|
|
|
|
// Add capacity and shift nodes if we don't have an exact match and need to insert.
|
|
exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
|
|
if !exact {
|
|
n.inodes = append(n.inodes, inode{})
|
|
copy(n.inodes[index+1:], n.inodes[index:])
|
|
}
|
|
|
|
inode := &n.inodes[index]
|
|
inode.flags = flags
|
|
inode.key = newKey
|
|
inode.value = value
|
|
inode.pgid = pgid
|
|
_assert(len(inode.key) > 0, "put: zero-length inode key")
|
|
}
|
|
|
|
// del removes a key from the node.
|
|
func (n *node) del(key []byte) {
|
|
// Find index of key.
|
|
index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
|
|
|
|
// Exit if the key isn't found.
|
|
if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
|
|
return
|
|
}
|
|
|
|
// Delete inode from the node.
|
|
n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
|
|
|
|
// Mark the node as needing rebalancing.
|
|
n.unbalanced = true
|
|
}
|
|
|
|
// read initializes the node from a page.
|
|
func (n *node) read(p *page) {
|
|
n.pgid = p.id
|
|
n.isLeaf = ((p.flags & leafPageFlag) != 0)
|
|
n.inodes = make(inodes, int(p.count))
|
|
|
|
for i := 0; i < int(p.count); i++ {
|
|
inode := &n.inodes[i]
|
|
if n.isLeaf {
|
|
elem := p.leafPageElement(uint16(i))
|
|
inode.flags = elem.flags
|
|
inode.key = elem.key()
|
|
inode.value = elem.value()
|
|
} else {
|
|
elem := p.branchPageElement(uint16(i))
|
|
inode.pgid = elem.pgid
|
|
inode.key = elem.key()
|
|
}
|
|
_assert(len(inode.key) > 0, "read: zero-length inode key")
|
|
}
|
|
|
|
// Save first key so we can find the node in the parent when we spill.
|
|
if len(n.inodes) > 0 {
|
|
n.key = n.inodes[0].key
|
|
_assert(len(n.key) > 0, "read: zero-length node key")
|
|
} else {
|
|
n.key = nil
|
|
}
|
|
}
|
|
|
|
// write writes the items onto one or more pages.
|
|
func (n *node) write(p *page) {
|
|
// Initialize page.
|
|
if n.isLeaf {
|
|
p.flags |= leafPageFlag
|
|
} else {
|
|
p.flags |= branchPageFlag
|
|
}
|
|
p.count = uint16(len(n.inodes))
|
|
|
|
// Loop over each item and write it to the page.
|
|
b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
|
|
for i, item := range n.inodes {
|
|
_assert(len(item.key) > 0, "write: zero-length inode key")
|
|
|
|
// Write the page element.
|
|
if n.isLeaf {
|
|
elem := p.leafPageElement(uint16(i))
|
|
elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
|
|
elem.flags = item.flags
|
|
elem.ksize = uint32(len(item.key))
|
|
elem.vsize = uint32(len(item.value))
|
|
} else {
|
|
elem := p.branchPageElement(uint16(i))
|
|
elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
|
|
elem.ksize = uint32(len(item.key))
|
|
elem.pgid = item.pgid
|
|
_assert(elem.pgid != p.id, "write: circular dependency occurred")
|
|
}
|
|
|
|
// Write data for the element to the end of the page.
|
|
copy(b[0:], item.key)
|
|
b = b[len(item.key):]
|
|
copy(b[0:], item.value)
|
|
b = b[len(item.value):]
|
|
}
|
|
|
|
// DEBUG ONLY: n.dump()
|
|
}
|
|
|
|
// split breaks up a node into multiple smaller nodes, if appropriate.
|
|
// This should only be called from the spill() function.
|
|
func (n *node) split(pageSize int) []*node {
|
|
var nodes []*node
|
|
|
|
node := n
|
|
for {
|
|
// Split node into two.
|
|
a, b := node.splitTwo(pageSize)
|
|
nodes = append(nodes, a)
|
|
|
|
// If we can't split then exit the loop.
|
|
if b == nil {
|
|
break
|
|
}
|
|
|
|
// Set node to b so it gets split on the next iteration.
|
|
node = b
|
|
}
|
|
|
|
return nodes
|
|
}
|
|
|
|
// splitTwo breaks up a node into two smaller nodes, if appropriate.
|
|
// This should only be called from the split() function.
|
|
func (n *node) splitTwo(pageSize int) (*node, *node) {
|
|
// Ignore the split if the page doesn't have at least enough nodes for
|
|
// two pages or if the data can fit on a single page.
|
|
sz := n.size()
|
|
if len(n.inodes) <= (minKeysPerPage*2) || sz < pageSize {
|
|
return n, nil
|
|
}
|
|
|
|
// Determine the threshold before starting a new node.
|
|
var fillPercent = n.bucket.tx.db.FillPercent
|
|
if fillPercent < minFillPercent {
|
|
fillPercent = minFillPercent
|
|
} else if fillPercent > maxFillPercent {
|
|
fillPercent = maxFillPercent
|
|
}
|
|
threshold := int(float64(pageSize) * fillPercent)
|
|
|
|
// Determine split position and sizes of the two pages.
|
|
splitIndex, _ := n.splitIndex(threshold)
|
|
|
|
// Split node into two separate nodes.
|
|
// If there's no parent then we'll need to create one.
|
|
if n.parent == nil {
|
|
n.parent = &node{bucket: n.bucket, children: []*node{n}}
|
|
}
|
|
|
|
// Create a new node and add it to the parent.
|
|
next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
|
|
n.parent.children = append(n.parent.children, next)
|
|
|
|
// Split inodes across two nodes.
|
|
next.inodes = n.inodes[splitIndex:]
|
|
n.inodes = n.inodes[:splitIndex]
|
|
|
|
// Update the statistics.
|
|
n.bucket.tx.stats.Split++
|
|
|
|
return n, next
|
|
}
|
|
|
|
// splitIndex finds the position where a page will fill a given threshold.
|
|
// It returns the index as well as the size of the first page.
|
|
// This is only be called from split().
|
|
func (n *node) splitIndex(threshold int) (index, sz int) {
|
|
sz = pageHeaderSize
|
|
|
|
// Loop until we only have the minimum number of keys required for the second page.
|
|
for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
|
|
index = i
|
|
inode := n.inodes[i]
|
|
elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
|
|
|
|
// If we have at least the minimum number of keys and adding another
|
|
// node would put us over the threshold then exit and return.
|
|
if i >= minKeysPerPage && sz+elsize > threshold {
|
|
break
|
|
}
|
|
|
|
// Add the element size to the total size.
|
|
sz += elsize
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// spill writes the nodes to dirty pages and splits nodes as it goes.
|
|
// Returns an error if dirty pages cannot be allocated.
|
|
func (n *node) spill() error {
|
|
var tx = n.bucket.tx
|
|
|
|
// Spill child nodes first. Child nodes can materialize sibling nodes in
|
|
// the case of split-merge so we cannot use a range loop. We have to check
|
|
// the children size on every loop iteration.
|
|
sort.Sort(n.children)
|
|
for i := 0; i < len(n.children); i++ {
|
|
if err := n.children[i].spill(); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// We no longer need the child list because it's only used for spill tracking.
|
|
n.children = nil
|
|
|
|
// Split nodes into appropriate sizes. The first node will always be n.
|
|
var nodes = n.split(tx.db.pageSize)
|
|
for _, node := range nodes {
|
|
// Add node's page to the freelist if it's not new.
|
|
if node.pgid > 0 {
|
|
tx.db.freelist.free(tx.id(), tx.page(node.pgid))
|
|
node.pgid = 0
|
|
}
|
|
|
|
// Allocate contiguous space for the node.
|
|
p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Write the node.
|
|
_assert(p.id < tx.meta.pgid, "pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid)
|
|
node.pgid = p.id
|
|
node.write(p)
|
|
|
|
// Insert into parent inodes.
|
|
if node.parent != nil {
|
|
var key = node.key
|
|
if key == nil {
|
|
key = node.inodes[0].key
|
|
}
|
|
|
|
node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
|
|
node.key = node.inodes[0].key
|
|
_assert(len(node.key) > 0, "spill: zero-length node key")
|
|
}
|
|
|
|
// Update the statistics.
|
|
tx.stats.Spill++
|
|
}
|
|
|
|
// This is a special case where we need to write the parent if it is new
|
|
// and caused by a split in the root.
|
|
var parent = n.parent
|
|
if parent != nil && parent.pgid == 0 {
|
|
// Allocate contiguous space for the node.
|
|
p, err := tx.allocate((parent.size() / tx.db.pageSize) + 1)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Write the new root.
|
|
_assert(p.id < tx.meta.pgid, "pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid)
|
|
parent.pgid = p.id
|
|
parent.write(p)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// rebalance attempts to combine the node with sibling nodes if the node fill
|
|
// size is below a threshold or if there are not enough keys.
|
|
func (n *node) rebalance() {
|
|
if !n.unbalanced {
|
|
return
|
|
}
|
|
n.unbalanced = false
|
|
|
|
// Update statistics.
|
|
n.bucket.tx.stats.Rebalance++
|
|
|
|
// Ignore if node is above threshold (25%) and has enough keys.
|
|
var threshold = n.bucket.tx.db.pageSize / 4
|
|
if n.size() > threshold && len(n.inodes) > n.minKeys() {
|
|
return
|
|
}
|
|
|
|
// Root node has special handling.
|
|
if n.parent == nil {
|
|
// If root node is a branch and only has one node then collapse it.
|
|
if !n.isLeaf && len(n.inodes) == 1 {
|
|
// Move root's child up.
|
|
child := n.bucket.node(n.inodes[0].pgid, n)
|
|
n.isLeaf = child.isLeaf
|
|
n.inodes = child.inodes[:]
|
|
n.children = child.children
|
|
|
|
// Reparent all child nodes being moved.
|
|
for _, inode := range n.inodes {
|
|
if child, ok := n.bucket.nodes[inode.pgid]; ok {
|
|
child.parent = n
|
|
}
|
|
}
|
|
|
|
// Remove old child.
|
|
child.parent = nil
|
|
delete(n.bucket.nodes, child.pgid)
|
|
child.free()
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// If node has no keys then just remove it.
|
|
if n.numChildren() == 0 {
|
|
n.parent.del(n.key)
|
|
n.parent.removeChild(n)
|
|
delete(n.bucket.nodes, n.pgid)
|
|
n.free()
|
|
n.parent.rebalance()
|
|
return
|
|
}
|
|
|
|
_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
|
|
|
|
// Destination node is right sibling if idx == 0, otherwise left sibling.
|
|
var target *node
|
|
var useNextSibling = (n.parent.childIndex(n) == 0)
|
|
if useNextSibling {
|
|
target = n.nextSibling()
|
|
} else {
|
|
target = n.prevSibling()
|
|
}
|
|
|
|
// If target node has extra nodes then just move one over.
|
|
if target.numChildren() > target.minKeys() {
|
|
if useNextSibling {
|
|
// Reparent and move node.
|
|
if child, ok := n.bucket.nodes[target.inodes[0].pgid]; ok {
|
|
child.parent.removeChild(child)
|
|
child.parent = n
|
|
child.parent.children = append(child.parent.children, child)
|
|
}
|
|
n.inodes = append(n.inodes, target.inodes[0])
|
|
target.inodes = target.inodes[1:]
|
|
|
|
// Update target key on parent.
|
|
target.parent.put(target.key, target.inodes[0].key, nil, target.pgid, 0)
|
|
target.key = target.inodes[0].key
|
|
_assert(len(target.key) > 0, "rebalance(1): zero-length node key")
|
|
} else {
|
|
// Reparent and move node.
|
|
if child, ok := n.bucket.nodes[target.inodes[len(target.inodes)-1].pgid]; ok {
|
|
child.parent.removeChild(child)
|
|
child.parent = n
|
|
child.parent.children = append(child.parent.children, child)
|
|
}
|
|
n.inodes = append(n.inodes, inode{})
|
|
copy(n.inodes[1:], n.inodes)
|
|
n.inodes[0] = target.inodes[len(target.inodes)-1]
|
|
target.inodes = target.inodes[:len(target.inodes)-1]
|
|
}
|
|
|
|
// Update parent key for node.
|
|
n.parent.put(n.key, n.inodes[0].key, nil, n.pgid, 0)
|
|
n.key = n.inodes[0].key
|
|
_assert(len(n.key) > 0, "rebalance(2): zero-length node key")
|
|
|
|
return
|
|
}
|
|
|
|
// If both this node and the target node are too small then merge them.
|
|
if useNextSibling {
|
|
// Reparent all child nodes being moved.
|
|
for _, inode := range target.inodes {
|
|
if child, ok := n.bucket.nodes[inode.pgid]; ok {
|
|
child.parent.removeChild(child)
|
|
child.parent = n
|
|
child.parent.children = append(child.parent.children, child)
|
|
}
|
|
}
|
|
|
|
// Copy over inodes from target and remove target.
|
|
n.inodes = append(n.inodes, target.inodes...)
|
|
n.parent.del(target.key)
|
|
n.parent.removeChild(target)
|
|
delete(n.bucket.nodes, target.pgid)
|
|
target.free()
|
|
} else {
|
|
// Reparent all child nodes being moved.
|
|
for _, inode := range n.inodes {
|
|
if child, ok := n.bucket.nodes[inode.pgid]; ok {
|
|
child.parent.removeChild(child)
|
|
child.parent = target
|
|
child.parent.children = append(child.parent.children, child)
|
|
}
|
|
}
|
|
|
|
// Copy over inodes to target and remove node.
|
|
target.inodes = append(target.inodes, n.inodes...)
|
|
n.parent.del(n.key)
|
|
n.parent.removeChild(n)
|
|
delete(n.bucket.nodes, n.pgid)
|
|
n.free()
|
|
}
|
|
|
|
// Either this node or the target node was deleted from the parent so rebalance it.
|
|
n.parent.rebalance()
|
|
}
|
|
|
|
// removes a node from the list of in-memory children.
|
|
// This does not affect the inodes.
|
|
func (n *node) removeChild(target *node) {
|
|
for i, child := range n.children {
|
|
if child == target {
|
|
n.children = append(n.children[:i], n.children[i+1:]...)
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// dereference causes the node to copy all its inode key/value references to heap memory.
|
|
// This is required when the mmap is reallocated so inodes are not pointing to stale data.
|
|
func (n *node) dereference() {
|
|
if n.key != nil {
|
|
key := make([]byte, len(n.key))
|
|
copy(key, n.key)
|
|
n.key = key
|
|
_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
|
|
}
|
|
|
|
for i := range n.inodes {
|
|
inode := &n.inodes[i]
|
|
|
|
key := make([]byte, len(inode.key))
|
|
copy(key, inode.key)
|
|
inode.key = key
|
|
_assert(len(inode.key) > 0, "dereference: zero-length inode key")
|
|
|
|
value := make([]byte, len(inode.value))
|
|
copy(value, inode.value)
|
|
inode.value = value
|
|
}
|
|
|
|
// Recursively dereference children.
|
|
for _, child := range n.children {
|
|
child.dereference()
|
|
}
|
|
|
|
// Update statistics.
|
|
n.bucket.tx.stats.NodeDeref++
|
|
}
|
|
|
|
// free adds the node's underlying page to the freelist.
|
|
func (n *node) free() {
|
|
if n.pgid != 0 {
|
|
n.bucket.tx.db.freelist.free(n.bucket.tx.id(), n.bucket.tx.page(n.pgid))
|
|
n.pgid = 0
|
|
}
|
|
}
|
|
|
|
// dump writes the contents of the node to STDERR for debugging purposes.
|
|
/*
|
|
func (n *node) dump() {
|
|
// Write node header.
|
|
var typ = "branch"
|
|
if n.isLeaf {
|
|
typ = "leaf"
|
|
}
|
|
warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
|
|
|
|
// Write out abbreviated version of each item.
|
|
for _, item := range n.inodes {
|
|
if n.isLeaf {
|
|
if item.flags&bucketLeafFlag != 0 {
|
|
bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
|
|
warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
|
|
} else {
|
|
warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
|
|
}
|
|
} else {
|
|
warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
|
|
}
|
|
}
|
|
warn("")
|
|
}
|
|
*/
|
|
|
|
type nodes []*node
|
|
|
|
func (s nodes) Len() int { return len(s) }
|
|
func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
|
|
func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
|
|
|
|
// inode represents an internal node inside of a node.
|
|
// It can be used to point to elements in a page or point
|
|
// to an element which hasn't been added to a page yet.
|
|
type inode struct {
|
|
flags uint32
|
|
pgid pgid
|
|
key []byte
|
|
value []byte
|
|
}
|
|
|
|
// inodes is the list of elements held by a node.
type inodes []inode
|