mirror of
https://github.com/etcd-io/bbolt.git
synced 2025-05-02 13:39:45 +00:00
This commit fixes a rare issue where a page can become accessible when it has already been freed. This occurs when the first two child pages of a parent both have deletions and the first page has 1 remaining child and the second page has 2 remaining children. During rebalancing the first page pulls an element from the second page and then the second page pulls the same element back from the first. The child page was not being freed properly. I resolved this issue by removing this part of the rebalancing. I made this choice for two reasons: 1. Moving a single item between pages has negligible benefit. The page will eventually be cleaned up when it reaches zero elements. 2. This is an infrequently executed branch of code which increases the likelihood of bugs occurring and it makes it more difficult to test properly. Fixes #348
600 lines
16 KiB
Go
600 lines
16 KiB
Go
package bolt
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"sort"
|
|
"unsafe"
|
|
)
|
|
|
|
// node represents an in-memory, deserialized page.
|
|
type node struct {
|
|
bucket *Bucket
|
|
isLeaf bool
|
|
unbalanced bool
|
|
spilled bool
|
|
key []byte
|
|
pgid pgid
|
|
parent *node
|
|
children nodes
|
|
inodes inodes
|
|
}
|
|
|
|
// root returns the top-level node this node is attached to.
|
|
func (n *node) root() *node {
|
|
if n.parent == nil {
|
|
return n
|
|
}
|
|
return n.parent.root()
|
|
}
|
|
|
|
// minKeys returns the minimum number of inodes this node should have.
|
|
func (n *node) minKeys() int {
|
|
if n.isLeaf {
|
|
return 1
|
|
}
|
|
return 2
|
|
}
|
|
|
|
// size returns the size of the node after serialization.
|
|
func (n *node) size() int {
|
|
sz, elsz := pageHeaderSize, n.pageElementSize()
|
|
for i := 0; i < len(n.inodes); i++ {
|
|
item := &n.inodes[i]
|
|
sz += elsz + len(item.key) + len(item.value)
|
|
}
|
|
return sz
|
|
}
|
|
|
|
// sizeLessThan returns true if the node is less than a given size.
|
|
// This is an optimization to avoid calculating a large node when we only need
|
|
// to know if it fits inside a certain page size.
|
|
func (n *node) sizeLessThan(v int) bool {
|
|
sz, elsz := pageHeaderSize, n.pageElementSize()
|
|
for i := 0; i < len(n.inodes); i++ {
|
|
item := &n.inodes[i]
|
|
sz += elsz + len(item.key) + len(item.value)
|
|
if sz >= v {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// pageElementSize returns the size of each page element based on the type of node.
|
|
func (n *node) pageElementSize() int {
|
|
if n.isLeaf {
|
|
return leafPageElementSize
|
|
}
|
|
return branchPageElementSize
|
|
}
|
|
|
|
// childAt returns the child node at a given index.
|
|
func (n *node) childAt(index int) *node {
|
|
if n.isLeaf {
|
|
panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
|
|
}
|
|
return n.bucket.node(n.inodes[index].pgid, n)
|
|
}
|
|
|
|
// childIndex returns the index of a given child node.
|
|
func (n *node) childIndex(child *node) int {
|
|
index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
|
|
return index
|
|
}
|
|
|
|
// numChildren returns the number of children.
|
|
func (n *node) numChildren() int {
|
|
return len(n.inodes)
|
|
}
|
|
|
|
// nextSibling returns the next node with the same parent.
|
|
func (n *node) nextSibling() *node {
|
|
if n.parent == nil {
|
|
return nil
|
|
}
|
|
index := n.parent.childIndex(n)
|
|
if index >= n.parent.numChildren()-1 {
|
|
return nil
|
|
}
|
|
return n.parent.childAt(index + 1)
|
|
}
|
|
|
|
// prevSibling returns the previous node with the same parent.
|
|
func (n *node) prevSibling() *node {
|
|
if n.parent == nil {
|
|
return nil
|
|
}
|
|
index := n.parent.childIndex(n)
|
|
if index == 0 {
|
|
return nil
|
|
}
|
|
return n.parent.childAt(index - 1)
|
|
}
|
|
|
|
// put inserts a key/value.
|
|
func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
|
|
if pgid >= n.bucket.tx.meta.pgid {
|
|
panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
|
|
} else if len(oldKey) <= 0 {
|
|
panic("put: zero-length old key")
|
|
} else if len(newKey) <= 0 {
|
|
panic("put: zero-length new key")
|
|
}
|
|
|
|
// Find insertion index.
|
|
index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
|
|
|
|
// Add capacity and shift nodes if we don't have an exact match and need to insert.
|
|
exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
|
|
if !exact {
|
|
n.inodes = append(n.inodes, inode{})
|
|
copy(n.inodes[index+1:], n.inodes[index:])
|
|
}
|
|
|
|
inode := &n.inodes[index]
|
|
inode.flags = flags
|
|
inode.key = newKey
|
|
inode.value = value
|
|
inode.pgid = pgid
|
|
_assert(len(inode.key) > 0, "put: zero-length inode key")
|
|
}
|
|
|
|
// del removes a key from the node.
|
|
func (n *node) del(key []byte) {
|
|
// Find index of key.
|
|
index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
|
|
|
|
// Exit if the key isn't found.
|
|
if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
|
|
return
|
|
}
|
|
|
|
// Delete inode from the node.
|
|
n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
|
|
|
|
// Mark the node as needing rebalancing.
|
|
n.unbalanced = true
|
|
}
|
|
|
|
// read initializes the node from a page.
|
|
func (n *node) read(p *page) {
|
|
n.pgid = p.id
|
|
n.isLeaf = ((p.flags & leafPageFlag) != 0)
|
|
n.inodes = make(inodes, int(p.count))
|
|
|
|
for i := 0; i < int(p.count); i++ {
|
|
inode := &n.inodes[i]
|
|
if n.isLeaf {
|
|
elem := p.leafPageElement(uint16(i))
|
|
inode.flags = elem.flags
|
|
inode.key = elem.key()
|
|
inode.value = elem.value()
|
|
} else {
|
|
elem := p.branchPageElement(uint16(i))
|
|
inode.pgid = elem.pgid
|
|
inode.key = elem.key()
|
|
}
|
|
_assert(len(inode.key) > 0, "read: zero-length inode key")
|
|
}
|
|
|
|
// Save first key so we can find the node in the parent when we spill.
|
|
if len(n.inodes) > 0 {
|
|
n.key = n.inodes[0].key
|
|
_assert(len(n.key) > 0, "read: zero-length node key")
|
|
} else {
|
|
n.key = nil
|
|
}
|
|
}
|
|
|
|
// write writes the items onto one or more pages.
|
|
func (n *node) write(p *page) {
|
|
// Initialize page.
|
|
if n.isLeaf {
|
|
p.flags |= leafPageFlag
|
|
} else {
|
|
p.flags |= branchPageFlag
|
|
}
|
|
|
|
if len(n.inodes) >= 0xFFFF {
|
|
panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
|
|
}
|
|
p.count = uint16(len(n.inodes))
|
|
|
|
// Loop over each item and write it to the page.
|
|
b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
|
|
for i, item := range n.inodes {
|
|
_assert(len(item.key) > 0, "write: zero-length inode key")
|
|
|
|
// Write the page element.
|
|
if n.isLeaf {
|
|
elem := p.leafPageElement(uint16(i))
|
|
elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
|
|
elem.flags = item.flags
|
|
elem.ksize = uint32(len(item.key))
|
|
elem.vsize = uint32(len(item.value))
|
|
} else {
|
|
elem := p.branchPageElement(uint16(i))
|
|
elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
|
|
elem.ksize = uint32(len(item.key))
|
|
elem.pgid = item.pgid
|
|
_assert(elem.pgid != p.id, "write: circular dependency occurred")
|
|
}
|
|
|
|
// If the length of key+value is larger than the max allocation size
|
|
// then we need to reallocate the byte array pointer.
|
|
//
|
|
// See: https://github.com/boltdb/bolt/pull/335
|
|
klen, vlen := len(item.key), len(item.value)
|
|
if len(b) < klen+vlen {
|
|
b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
|
|
}
|
|
|
|
// Write data for the element to the end of the page.
|
|
copy(b[0:], item.key)
|
|
b = b[klen:]
|
|
copy(b[0:], item.value)
|
|
b = b[vlen:]
|
|
}
|
|
|
|
// DEBUG ONLY: n.dump()
|
|
}
|
|
|
|
// split breaks up a node into multiple smaller nodes, if appropriate.
|
|
// This should only be called from the spill() function.
|
|
func (n *node) split(pageSize int) []*node {
|
|
var nodes []*node
|
|
|
|
node := n
|
|
for {
|
|
// Split node into two.
|
|
a, b := node.splitTwo(pageSize)
|
|
nodes = append(nodes, a)
|
|
|
|
// If we can't split then exit the loop.
|
|
if b == nil {
|
|
break
|
|
}
|
|
|
|
// Set node to b so it gets split on the next iteration.
|
|
node = b
|
|
}
|
|
|
|
return nodes
|
|
}
|
|
|
|
// splitTwo breaks up a node into two smaller nodes, if appropriate.
|
|
// This should only be called from the split() function.
|
|
func (n *node) splitTwo(pageSize int) (*node, *node) {
|
|
// Ignore the split if the page doesn't have at least enough nodes for
|
|
// two pages or if the nodes can fit in a single page.
|
|
if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
|
|
return n, nil
|
|
}
|
|
|
|
// Determine the threshold before starting a new node.
|
|
var fillPercent = n.bucket.FillPercent
|
|
if fillPercent < minFillPercent {
|
|
fillPercent = minFillPercent
|
|
} else if fillPercent > maxFillPercent {
|
|
fillPercent = maxFillPercent
|
|
}
|
|
threshold := int(float64(pageSize) * fillPercent)
|
|
|
|
// Determine split position and sizes of the two pages.
|
|
splitIndex, _ := n.splitIndex(threshold)
|
|
|
|
// Split node into two separate nodes.
|
|
// If there's no parent then we'll need to create one.
|
|
if n.parent == nil {
|
|
n.parent = &node{bucket: n.bucket, children: []*node{n}}
|
|
}
|
|
|
|
// Create a new node and add it to the parent.
|
|
next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
|
|
n.parent.children = append(n.parent.children, next)
|
|
|
|
// Split inodes across two nodes.
|
|
next.inodes = n.inodes[splitIndex:]
|
|
n.inodes = n.inodes[:splitIndex]
|
|
|
|
// Update the statistics.
|
|
n.bucket.tx.stats.Split++
|
|
|
|
return n, next
|
|
}
|
|
|
|
// splitIndex finds the position where a page will fill a given threshold.
|
|
// It returns the index as well as the size of the first page.
|
|
// This is only be called from split().
|
|
func (n *node) splitIndex(threshold int) (index, sz int) {
|
|
sz = pageHeaderSize
|
|
|
|
// Loop until we only have the minimum number of keys required for the second page.
|
|
for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
|
|
index = i
|
|
inode := n.inodes[i]
|
|
elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
|
|
|
|
// If we have at least the minimum number of keys and adding another
|
|
// node would put us over the threshold then exit and return.
|
|
if i >= minKeysPerPage && sz+elsize > threshold {
|
|
break
|
|
}
|
|
|
|
// Add the element size to the total size.
|
|
sz += elsize
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// spill writes the nodes to dirty pages and splits nodes as it goes.
|
|
// Returns an error if dirty pages cannot be allocated.
|
|
func (n *node) spill() error {
|
|
var tx = n.bucket.tx
|
|
if n.spilled {
|
|
return nil
|
|
}
|
|
|
|
// Spill child nodes first. Child nodes can materialize sibling nodes in
|
|
// the case of split-merge so we cannot use a range loop. We have to check
|
|
// the children size on every loop iteration.
|
|
sort.Sort(n.children)
|
|
for i := 0; i < len(n.children); i++ {
|
|
if err := n.children[i].spill(); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// We no longer need the child list because it's only used for spill tracking.
|
|
n.children = nil
|
|
|
|
// Split nodes into appropriate sizes. The first node will always be n.
|
|
var nodes = n.split(tx.db.pageSize)
|
|
for _, node := range nodes {
|
|
// Add node's page to the freelist if it's not new.
|
|
if node.pgid > 0 {
|
|
tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
|
|
node.pgid = 0
|
|
}
|
|
|
|
// Allocate contiguous space for the node.
|
|
p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Write the node.
|
|
if p.id >= tx.meta.pgid {
|
|
panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
|
|
}
|
|
node.pgid = p.id
|
|
node.write(p)
|
|
node.spilled = true
|
|
|
|
// Insert into parent inodes.
|
|
if node.parent != nil {
|
|
var key = node.key
|
|
if key == nil {
|
|
key = node.inodes[0].key
|
|
}
|
|
|
|
node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
|
|
node.key = node.inodes[0].key
|
|
_assert(len(node.key) > 0, "spill: zero-length node key")
|
|
}
|
|
|
|
// Update the statistics.
|
|
tx.stats.Spill++
|
|
}
|
|
|
|
// If the root node split and created a new root then we need to spill that
|
|
// as well. We'll clear out the children to make sure it doesn't try to respill.
|
|
if n.parent != nil && n.parent.pgid == 0 {
|
|
n.children = nil
|
|
return n.parent.spill()
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// rebalance attempts to combine the node with sibling nodes if the node fill
|
|
// size is below a threshold or if there are not enough keys.
|
|
func (n *node) rebalance() {
|
|
if !n.unbalanced {
|
|
return
|
|
}
|
|
n.unbalanced = false
|
|
|
|
// Update statistics.
|
|
n.bucket.tx.stats.Rebalance++
|
|
|
|
// Ignore if node is above threshold (25%) and has enough keys.
|
|
var threshold = n.bucket.tx.db.pageSize / 4
|
|
if n.size() > threshold && len(n.inodes) > n.minKeys() {
|
|
return
|
|
}
|
|
|
|
// Root node has special handling.
|
|
if n.parent == nil {
|
|
// If root node is a branch and only has one node then collapse it.
|
|
if !n.isLeaf && len(n.inodes) == 1 {
|
|
// Move root's child up.
|
|
child := n.bucket.node(n.inodes[0].pgid, n)
|
|
n.isLeaf = child.isLeaf
|
|
n.inodes = child.inodes[:]
|
|
n.children = child.children
|
|
|
|
// Reparent all child nodes being moved.
|
|
for _, inode := range n.inodes {
|
|
if child, ok := n.bucket.nodes[inode.pgid]; ok {
|
|
child.parent = n
|
|
}
|
|
}
|
|
|
|
// Remove old child.
|
|
child.parent = nil
|
|
delete(n.bucket.nodes, child.pgid)
|
|
child.free()
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// If node has no keys then just remove it.
|
|
if n.numChildren() == 0 {
|
|
n.parent.del(n.key)
|
|
n.parent.removeChild(n)
|
|
delete(n.bucket.nodes, n.pgid)
|
|
n.free()
|
|
n.parent.rebalance()
|
|
return
|
|
}
|
|
|
|
_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
|
|
|
|
// Destination node is right sibling if idx == 0, otherwise left sibling.
|
|
var target *node
|
|
var useNextSibling = (n.parent.childIndex(n) == 0)
|
|
if useNextSibling {
|
|
target = n.nextSibling()
|
|
} else {
|
|
target = n.prevSibling()
|
|
}
|
|
|
|
// If both this node and the target node are too small then merge them.
|
|
if useNextSibling {
|
|
// Reparent all child nodes being moved.
|
|
for _, inode := range target.inodes {
|
|
if child, ok := n.bucket.nodes[inode.pgid]; ok {
|
|
child.parent.removeChild(child)
|
|
child.parent = n
|
|
child.parent.children = append(child.parent.children, child)
|
|
}
|
|
}
|
|
|
|
// Copy over inodes from target and remove target.
|
|
n.inodes = append(n.inodes, target.inodes...)
|
|
n.parent.del(target.key)
|
|
n.parent.removeChild(target)
|
|
delete(n.bucket.nodes, target.pgid)
|
|
target.free()
|
|
} else {
|
|
// Reparent all child nodes being moved.
|
|
for _, inode := range n.inodes {
|
|
if child, ok := n.bucket.nodes[inode.pgid]; ok {
|
|
child.parent.removeChild(child)
|
|
child.parent = target
|
|
child.parent.children = append(child.parent.children, child)
|
|
}
|
|
}
|
|
|
|
// Copy over inodes to target and remove node.
|
|
target.inodes = append(target.inodes, n.inodes...)
|
|
n.parent.del(n.key)
|
|
n.parent.removeChild(n)
|
|
delete(n.bucket.nodes, n.pgid)
|
|
n.free()
|
|
}
|
|
|
|
// Either this node or the target node was deleted from the parent so rebalance it.
|
|
n.parent.rebalance()
|
|
}
|
|
|
|
// removes a node from the list of in-memory children.
|
|
// This does not affect the inodes.
|
|
func (n *node) removeChild(target *node) {
|
|
for i, child := range n.children {
|
|
if child == target {
|
|
n.children = append(n.children[:i], n.children[i+1:]...)
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// dereference causes the node to copy all its inode key/value references to heap memory.
|
|
// This is required when the mmap is reallocated so inodes are not pointing to stale data.
|
|
func (n *node) dereference() {
|
|
if n.key != nil {
|
|
key := make([]byte, len(n.key))
|
|
copy(key, n.key)
|
|
n.key = key
|
|
_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
|
|
}
|
|
|
|
for i := range n.inodes {
|
|
inode := &n.inodes[i]
|
|
|
|
key := make([]byte, len(inode.key))
|
|
copy(key, inode.key)
|
|
inode.key = key
|
|
_assert(len(inode.key) > 0, "dereference: zero-length inode key")
|
|
|
|
value := make([]byte, len(inode.value))
|
|
copy(value, inode.value)
|
|
inode.value = value
|
|
}
|
|
|
|
// Recursively dereference children.
|
|
for _, child := range n.children {
|
|
child.dereference()
|
|
}
|
|
|
|
// Update statistics.
|
|
n.bucket.tx.stats.NodeDeref++
|
|
}
|
|
|
|
// free adds the node's underlying page to the freelist.
|
|
func (n *node) free() {
|
|
if n.pgid != 0 {
|
|
n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
|
|
n.pgid = 0
|
|
}
|
|
}
|
|
|
|
// dump writes the contents of the node to STDERR for debugging purposes.
|
|
/*
|
|
func (n *node) dump() {
|
|
// Write node header.
|
|
var typ = "branch"
|
|
if n.isLeaf {
|
|
typ = "leaf"
|
|
}
|
|
warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
|
|
|
|
// Write out abbreviated version of each item.
|
|
for _, item := range n.inodes {
|
|
if n.isLeaf {
|
|
if item.flags&bucketLeafFlag != 0 {
|
|
bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
|
|
warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
|
|
} else {
|
|
warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
|
|
}
|
|
} else {
|
|
warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
|
|
}
|
|
}
|
|
warn("")
|
|
}
|
|
*/
|
|
|
|
type nodes []*node
|
|
|
|
func (s nodes) Len() int { return len(s) }
|
|
func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
|
|
func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
|
|
|
|
// inode represents an internal node inside of a node.
|
|
// It can be used to point to elements in a page or point
|
|
// to an element which hasn't been added to a page yet.
|
|
type inode struct {
|
|
flags uint32
|
|
pgid pgid
|
|
key []byte
|
|
value []byte
|
|
}
|
|
|
|
type inodes []inode
|