Merge pull request #407 from ahrtr/refactor_inode_20230215

Refactor: move inode to internal/common package
Piotr Tabor 2023-03-06 18:07:54 +01:00 committed by GitHub
commit 17b18580c7
36 changed files with 1519 additions and 1351 deletions
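
For orientation, the sketch below summarizes the renaming this refactor applies, based only on identifiers visible in the diff: the unexported pgid, page, meta and bucket types move into the internal/common package as Pgid, Page, Meta and InBucket, and the on-file bucket header is now accessed through methods instead of raw fields. This is an illustrative reconstruction, not the actual contents of internal/common.

// Illustrative sketch only (not the real internal/common source): a rough
// mapping of the identifiers moved by this PR, inferred from the diff below.
package common

// Pgid corresponds to bbolt's previously unexported pgid (a page identifier).
type Pgid uint64

// InBucket replaces bbolt's unexported bucket struct: the on-file
// representation of a bucket, stored as the "value" of a bucket key.
type InBucket struct {
	root     Pgid   // page id of the bucket's root-level page
	sequence uint64 // monotonically incrementing, used by NextSequence()
}

// Accessors used by bucket.go after the refactor.
func (b *InBucket) RootPage() Pgid         { return b.root }
func (b *InBucket) SetRootPage(id Pgid)    { b.root = id }
func (b *InBucket) InSequence() uint64     { return b.sequence }
func (b *InBucket) SetInSequence(v uint64) { b.sequence = v }
func (b *InBucket) IncSequence()           { b.sequence++ }

The diff itself then mechanically replaces the old unexported identifiers (pgid, page, meta, bucket, the ErrXxx variables, _assert) with their common.* counterparts.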

@@ -2,20 +2,22 @@ package bbolt
 import (
 	"testing"
+	"go.etcd.io/bbolt/internal/common"
 )
 func TestTx_allocatePageStats(t *testing.T) {
 	f := newTestFreelist()
-	ids := []pgid{2, 3}
+	ids := []common.Pgid{2, 3}
 	f.readIDs(ids)
 	tx := &Tx{
 		db: &DB{
 			freelist: f,
-			pageSize: defaultPageSize,
+			pageSize: common.DefaultPageSize,
 		},
-		meta: &meta{},
-		pages: make(map[pgid]*page),
+		meta: &common.Meta{},
+		pages: make(map[common.Pgid]*common.Page),
 	}
 	txStats := tx.Stats()

@@ -10,6 +10,8 @@ import (
 	"unsafe"
 	"golang.org/x/sys/unix"
+	"go.etcd.io/bbolt/internal/common"
 )
 // flock acquires an advisory lock on a file descriptor.
@@ -36,7 +38,7 @@ func flock(db *DB, exclusive bool, timeout time.Duration) error {
 		// If we timed out then return an error.
 		if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout {
-			return ErrTimeout
+			return common.ErrTimeout
 		}
 		// Wait for a bit and try again.

@@ -8,6 +8,8 @@ import (
 	"unsafe"
 	"golang.org/x/sys/windows"
+	"go.etcd.io/bbolt/internal/common"
 )
 // fdatasync flushes written data to a file descriptor.
@@ -42,7 +44,7 @@ func flock(db *DB, exclusive bool, timeout time.Duration) error {
 		// If we timed out then return an error.
 		if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout {
-			return ErrTimeout
+			return common.ErrTimeout
 		}
 		// Wait for a bit and try again.
@@ -93,7 +95,7 @@ func mmap(db *DB, sz int) error {
 	}
 	// Convert to a byte array.
-	db.data = ((*[maxMapSize]byte)(unsafe.Pointer(addr)))
+	db.data = (*[maxMapSize]byte)(unsafe.Pointer(addr))
 	db.datasz = sz
 	return nil

bucket.go

@@ -4,6 +4,8 @@ import (
 	"bytes"
 	"fmt"
 	"unsafe"
+	"go.etcd.io/bbolt/internal/common"
 )
 const (
@@ -14,8 +16,6 @@ const (
 	MaxValueSize = (1 << 31) - 2
 )
-const bucketHeaderSize = int(unsafe.Sizeof(bucket{}))
 const (
 	minFillPercent = 0.1
 	maxFillPercent = 1.0
@@ -27,12 +27,12 @@ const DefaultFillPercent = 0.5
 // Bucket represents a collection of key/value pairs inside the database.
 type Bucket struct {
-	*bucket
+	*common.InBucket
 	tx *Tx // the associated transaction
 	buckets map[string]*Bucket // subbucket cache
-	page *page // inline page reference
+	page *common.Page // inline page reference
 	rootNode *node // materialized node for the root page.
-	nodes map[pgid]*node // node cache
+	nodes map[common.Pgid]*node // node cache
 	// Sets the threshold for filling nodes when they split. By default,
 	// the bucket will fill to 50% but it can be useful to increase this
@@ -42,21 +42,12 @@ type Bucket struct {
 	FillPercent float64
 }
-// bucket represents the on-file representation of a bucket.
-// This is stored as the "value" of a bucket key. If the bucket is small enough,
-// then its root page can be stored inline in the "value", after the bucket
-// header. In the case of inline buckets, the "root" will be 0.
-type bucket struct {
-	root pgid // page id of the bucket's root-level page
-	sequence uint64 // monotonically incrementing, used by NextSequence()
-}
 // newBucket returns a new bucket associated with a transaction.
 func newBucket(tx *Tx) Bucket {
 	var b = Bucket{tx: tx, FillPercent: DefaultFillPercent}
 	if tx.writable {
 		b.buckets = make(map[string]*Bucket)
-		b.nodes = make(map[pgid]*node)
+		b.nodes = make(map[common.Pgid]*node)
 	}
 	return b
 }
@@ -67,8 +58,8 @@ func (b *Bucket) Tx() *Tx {
 }
 // Root returns the root of the bucket.
-func (b *Bucket) Root() pgid {
-	return b.root
+func (b *Bucket) Root() common.Pgid {
+	return b.RootPage()
 }
 // Writable returns whether the bucket is writable.
@@ -105,7 +96,7 @@ func (b *Bucket) Bucket(name []byte) *Bucket {
 	k, v, flags := c.seek(name)
 	// Return nil if the key doesn't exist or it is not a bucket.
-	if !bytes.Equal(name, k) || (flags&bucketLeafFlag) == 0 {
+	if !bytes.Equal(name, k) || (flags&common.BucketLeafFlag) == 0 {
 		return nil
 	}
@@ -125,8 +116,8 @@ func (b *Bucket) openBucket(value []byte) *Bucket {
 	// Unaligned access requires a copy to be made.
 	const unalignedMask = unsafe.Alignof(struct {
-		bucket
-		page
+		common.InBucket
+		common.Page
 	}{}) - 1
 	unaligned := uintptr(unsafe.Pointer(&value[0]))&unalignedMask != 0
 	if unaligned {
@@ -136,15 +127,15 @@ func (b *Bucket) openBucket(value []byte) *Bucket {
 	// If this is a writable transaction then we need to copy the bucket entry.
 	// Read-only transactions can point directly at the mmap entry.
 	if b.tx.writable && !unaligned {
-		child.bucket = &bucket{}
-		*child.bucket = *(*bucket)(unsafe.Pointer(&value[0]))
+		child.InBucket = &common.InBucket{}
+		*child.InBucket = *(*common.InBucket)(unsafe.Pointer(&value[0]))
 	} else {
-		child.bucket = (*bucket)(unsafe.Pointer(&value[0]))
+		child.InBucket = (*common.InBucket)(unsafe.Pointer(&value[0]))
 	}
 	// Save a reference to the inline page if the bucket is inline.
-	if child.root == 0 {
-		child.page = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
+	if child.RootPage() == 0 {
+		child.page = (*common.Page)(unsafe.Pointer(&value[common.BucketHeaderSize]))
 	}
 	return &child
@@ -155,11 +146,11 @@ func (b *Bucket) openBucket(value []byte) *Bucket {
 // The bucket instance is only valid for the lifetime of the transaction.
 func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
 	if b.tx.db == nil {
-		return nil, ErrTxClosed
+		return nil, common.ErrTxClosed
 	} else if !b.tx.writable {
-		return nil, ErrTxNotWritable
+		return nil, common.ErrTxNotWritable
 	} else if len(key) == 0 {
-		return nil, ErrBucketNameRequired
+		return nil, common.ErrBucketNameRequired
 	}
 	// Move cursor to correct position.
@@ -168,15 +159,15 @@ func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
 	// Return an error if there is an existing key.
 	if bytes.Equal(key, k) {
-		if (flags & bucketLeafFlag) != 0 {
-			return nil, ErrBucketExists
+		if (flags & common.BucketLeafFlag) != 0 {
+			return nil, common.ErrBucketExists
 		}
-		return nil, ErrIncompatibleValue
+		return nil, common.ErrIncompatibleValue
 	}
 	// Create empty, inline bucket.
 	var bucket = Bucket{
-		bucket: &bucket{},
+		InBucket: &common.InBucket{},
 		rootNode: &node{isLeaf: true},
 		FillPercent: DefaultFillPercent,
 	}
@@ -184,7 +175,7 @@ func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
 	// Insert into node.
 	key = cloneBytes(key)
-	c.node().put(key, key, value, 0, bucketLeafFlag)
+	c.node().put(key, key, value, 0, common.BucketLeafFlag)
 	// Since subbuckets are not allowed on inline buckets, we need to
 	// dereference the inline page, if it exists. This will cause the bucket
@@ -199,7 +190,7 @@ func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
 // The bucket instance is only valid for the lifetime of the transaction.
 func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) {
 	child, err := b.CreateBucket(key)
-	if err == ErrBucketExists {
+	if err == common.ErrBucketExists {
 		return b.Bucket(key), nil
 	} else if err != nil {
 		return nil, err
@@ -211,9 +202,9 @@ func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) {
 // Returns an error if the bucket does not exist, or if the key represents a non-bucket value.
 func (b *Bucket) DeleteBucket(key []byte) error {
 	if b.tx.db == nil {
-		return ErrTxClosed
+		return common.ErrTxClosed
 	} else if !b.Writable() {
-		return ErrTxNotWritable
+		return common.ErrTxNotWritable
 	}
 	// Move cursor to correct position.
@@ -222,9 +213,9 @@ func (b *Bucket) DeleteBucket(key []byte) error {
 	// Return an error if bucket doesn't exist or is not a bucket.
 	if !bytes.Equal(key, k) {
-		return ErrBucketNotFound
-	} else if (flags & bucketLeafFlag) == 0 {
-		return ErrIncompatibleValue
+		return common.ErrBucketNotFound
+	} else if (flags & common.BucketLeafFlag) == 0 {
+		return common.ErrIncompatibleValue
 	}
 	// Recursively delete all child buckets.
@@ -260,7 +251,7 @@ func (b *Bucket) Get(key []byte) []byte {
 	k, v, flags := b.Cursor().seek(key)
 	// Return nil if this is a bucket.
-	if (flags & bucketLeafFlag) != 0 {
+	if (flags & common.BucketLeafFlag) != 0 {
 		return nil
 	}
@@ -277,15 +268,15 @@ func (b *Bucket) Get(key []byte) []byte {
 // Returns an error if the bucket was created from a read-only transaction, if the key is blank, if the key is too large, or if the value is too large.
 func (b *Bucket) Put(key []byte, value []byte) error {
 	if b.tx.db == nil {
-		return ErrTxClosed
+		return common.ErrTxClosed
 	} else if !b.Writable() {
-		return ErrTxNotWritable
+		return common.ErrTxNotWritable
 	} else if len(key) == 0 {
-		return ErrKeyRequired
+		return common.ErrKeyRequired
 	} else if len(key) > MaxKeySize {
-		return ErrKeyTooLarge
+		return common.ErrKeyTooLarge
 	} else if int64(len(value)) > MaxValueSize {
-		return ErrValueTooLarge
+		return common.ErrValueTooLarge
 	}
 	// Move cursor to correct position.
@@ -293,8 +284,8 @@ func (b *Bucket) Put(key []byte, value []byte) error {
 	k, _, flags := c.seek(key)
 	// Return an error if there is an existing key with a bucket value.
-	if bytes.Equal(key, k) && (flags&bucketLeafFlag) != 0 {
-		return ErrIncompatibleValue
+	if bytes.Equal(key, k) && (flags&common.BucketLeafFlag) != 0 {
+		return common.ErrIncompatibleValue
 	}
 	// Insert into node.
@@ -309,9 +300,9 @@ func (b *Bucket) Put(key []byte, value []byte) error {
 // Returns an error if the bucket was created from a read-only transaction.
 func (b *Bucket) Delete(key []byte) error {
 	if b.tx.db == nil {
-		return ErrTxClosed
+		return common.ErrTxClosed
 	} else if !b.Writable() {
-		return ErrTxNotWritable
+		return common.ErrTxNotWritable
 	}
 	// Move cursor to correct position.
@@ -324,8 +315,8 @@ func (b *Bucket) Delete(key []byte) error {
 	}
 	// Return an error if there is already existing bucket value.
-	if (flags & bucketLeafFlag) != 0 {
-		return ErrIncompatibleValue
+	if (flags & common.BucketLeafFlag) != 0 {
+		return common.ErrIncompatibleValue
 	}
 	// Delete the node if we have a matching key.
@@ -335,44 +326,46 @@ func (b *Bucket) Delete(key []byte) error {
 }
 // Sequence returns the current integer for the bucket without incrementing it.
-func (b *Bucket) Sequence() uint64 { return b.bucket.sequence }
+func (b *Bucket) Sequence() uint64 {
+	return b.InSequence()
+}
 // SetSequence updates the sequence number for the bucket.
 func (b *Bucket) SetSequence(v uint64) error {
 	if b.tx.db == nil {
-		return ErrTxClosed
+		return common.ErrTxClosed
 	} else if !b.Writable() {
-		return ErrTxNotWritable
+		return common.ErrTxNotWritable
 	}
 	// Materialize the root node if it hasn't been already so that the
 	// bucket will be saved during commit.
 	if b.rootNode == nil {
-		_ = b.node(b.root, nil)
+		_ = b.node(b.RootPage(), nil)
 	}
 	// Set the sequence.
-	b.bucket.sequence = v
+	b.SetInSequence(v)
 	return nil
 }
 // NextSequence returns an autoincrementing integer for the bucket.
 func (b *Bucket) NextSequence() (uint64, error) {
 	if b.tx.db == nil {
-		return 0, ErrTxClosed
+		return 0, common.ErrTxClosed
 	} else if !b.Writable() {
-		return 0, ErrTxNotWritable
+		return 0, common.ErrTxNotWritable
 	}
 	// Materialize the root node if it hasn't been already so that the
 	// bucket will be saved during commit.
 	if b.rootNode == nil {
-		_ = b.node(b.root, nil)
+		_ = b.node(b.RootPage(), nil)
 	}
 	// Increment and return the sequence.
-	b.bucket.sequence++
-	return b.bucket.sequence, nil
+	b.IncSequence()
+	return b.Sequence(), nil
 }
 // ForEach executes a function for each key/value pair in a bucket.
@@ -382,7 +375,7 @@ func (b *Bucket) NextSequence() (uint64, error) {
 // the bucket; this will result in undefined behavior.
 func (b *Bucket) ForEach(fn func(k, v []byte) error) error {
 	if b.tx.db == nil {
-		return ErrTxClosed
+		return common.ErrTxClosed
 	}
 	c := b.Cursor()
 	for k, v := c.First(); k != nil; k, v = c.Next() {
@@ -395,11 +388,11 @@ func (b *Bucket) ForEach(fn func(k, v []byte) error) error {
 func (b *Bucket) ForEachBucket(fn func(k []byte) error) error {
 	if b.tx.db == nil {
-		return ErrTxClosed
+		return common.ErrTxClosed
 	}
 	c := b.Cursor()
 	for k, _, flags := c.first(); k != nil; k, _, flags = c.next() {
-		if flags&bucketLeafFlag != 0 {
+		if flags&common.BucketLeafFlag != 0 {
 			if err := fn(k); err != nil {
 				return err
 			}
@@ -413,64 +406,64 @@ func (b *Bucket) Stats() BucketStats {
 	var s, subStats BucketStats
 	pageSize := b.tx.db.pageSize
 	s.BucketN += 1
-	if b.root == 0 {
+	if b.RootPage() == 0 {
 		s.InlineBucketN += 1
 	}
-	b.forEachPage(func(p *page, depth int, pgstack []pgid) {
-		if (p.flags & leafPageFlag) != 0 {
-			s.KeyN += int(p.count)
+	b.forEachPage(func(p *common.Page, depth int, pgstack []common.Pgid) {
+		if (p.Flags() & common.LeafPageFlag) != 0 {
+			s.KeyN += int(p.Count())
 			// used totals the used bytes for the page
-			used := pageHeaderSize
-			if p.count != 0 {
+			used := common.PageHeaderSize
+			if p.Count() != 0 {
 				// If page has any elements, add all element headers.
-				used += leafPageElementSize * uintptr(p.count-1)
+				used += common.LeafPageElementSize * uintptr(p.Count()-1)
 				// Add all element key, value sizes.
 				// The computation takes advantage of the fact that the position
 				// of the last element's key/value equals to the total of the sizes
 				// of all previous elements' keys and values.
 				// It also includes the last element's header.
-				lastElement := p.leafPageElement(p.count - 1)
-				used += uintptr(lastElement.pos + lastElement.ksize + lastElement.vsize)
+				lastElement := p.LeafPageElement(p.Count() - 1)
+				used += uintptr(lastElement.Pos() + lastElement.Ksize() + lastElement.Vsize())
 			}
-			if b.root == 0 {
+			if b.RootPage() == 0 {
 				// For inlined bucket just update the inline stats
 				s.InlineBucketInuse += int(used)
 			} else {
 				// For non-inlined bucket update all the leaf stats
 				s.LeafPageN++
 				s.LeafInuse += int(used)
-				s.LeafOverflowN += int(p.overflow)
+				s.LeafOverflowN += int(p.Overflow())
 				// Collect stats from sub-buckets.
 				// Do that by iterating over all element headers
 				// looking for the ones with the bucketLeafFlag.
-				for i := uint16(0); i < p.count; i++ {
-					e := p.leafPageElement(i)
-					if (e.flags & bucketLeafFlag) != 0 {
+				for i := uint16(0); i < p.Count(); i++ {
+					e := p.LeafPageElement(i)
+					if (e.Flags() & common.BucketLeafFlag) != 0 {
 						// For any bucket element, open the element value
 						// and recursively call Stats on the contained bucket.
-						subStats.Add(b.openBucket(e.value()).Stats())
+						subStats.Add(b.openBucket(e.Value()).Stats())
 					}
 				}
 			}
-		} else if (p.flags & branchPageFlag) != 0 {
+		} else if (p.Flags() & common.BranchPageFlag) != 0 {
 			s.BranchPageN++
-			lastElement := p.branchPageElement(p.count - 1)
+			lastElement := p.BranchPageElement(p.Count() - 1)
 			// used totals the used bytes for the page
 			// Add header and all element headers.
-			used := pageHeaderSize + (branchPageElementSize * uintptr(p.count-1))
+			used := common.PageHeaderSize + (common.BranchPageElementSize * uintptr(p.Count()-1))
 			// Add size of all keys and values.
 			// Again, use the fact that last element's position equals to
 			// the total of key, value sizes of all previous elements.
-			used += uintptr(lastElement.pos + lastElement.ksize)
+			used += uintptr(lastElement.Pos() + lastElement.Ksize())
 			s.BranchInuse += int(used)
-			s.BranchOverflowN += int(p.overflow)
+			s.BranchOverflowN += int(p.Overflow())
 		}
 		// Keep track of maximum page depth.
@@ -491,29 +484,29 @@ func (b *Bucket) Stats() BucketStats {
 }
 // forEachPage iterates over every page in a bucket, including inline pages.
-func (b *Bucket) forEachPage(fn func(*page, int, []pgid)) {
+func (b *Bucket) forEachPage(fn func(*common.Page, int, []common.Pgid)) {
 	// If we have an inline page then just use that.
 	if b.page != nil {
-		fn(b.page, 0, []pgid{b.root})
+		fn(b.page, 0, []common.Pgid{b.RootPage()})
 		return
 	}
 	// Otherwise traverse the page hierarchy.
-	b.tx.forEachPage(b.root, fn)
+	b.tx.forEachPage(b.RootPage(), fn)
 }
 // forEachPageNode iterates over every page (or node) in a bucket.
 // This also includes inline pages.
-func (b *Bucket) forEachPageNode(fn func(*page, *node, int)) {
+func (b *Bucket) forEachPageNode(fn func(*common.Page, *node, int)) {
 	// If we have an inline page or root node then just use that.
 	if b.page != nil {
 		fn(b.page, nil, 0)
 		return
 	}
-	b._forEachPageNode(b.root, 0, fn)
+	b._forEachPageNode(b.RootPage(), 0, fn)
 }
-func (b *Bucket) _forEachPageNode(pgId pgid, depth int, fn func(*page, *node, int)) {
+func (b *Bucket) _forEachPageNode(pgId common.Pgid, depth int, fn func(*common.Page, *node, int)) {
 	var p, n = b.pageNode(pgId)
 	// Execute function.
@@ -521,16 +514,16 @@ func (b *Bucket) _forEachPageNode(pgId pgid, depth int, fn func(*page, *node, in
 	// Recursively loop over children.
 	if p != nil {
-		if (p.flags & branchPageFlag) != 0 {
-			for i := 0; i < int(p.count); i++ {
-				elem := p.branchPageElement(uint16(i))
-				b._forEachPageNode(elem.pgid, depth+1, fn)
+		if (p.Flags() & common.BranchPageFlag) != 0 {
+			for i := 0; i < int(p.Count()); i++ {
+				elem := p.BranchPageElement(uint16(i))
+				b._forEachPageNode(elem.Pgid(), depth+1, fn)
 			}
 		}
 	} else {
 		if !n.isLeaf {
 			for _, inode := range n.inodes {
-				b._forEachPageNode(inode.pgid, depth+1, fn)
+				b._forEachPageNode(inode.Pgid(), depth+1, fn)
 			}
 		}
 	}
@@ -553,9 +546,9 @@ func (b *Bucket) spill() error {
 			}
 			// Update the child bucket header in this bucket.
-			value = make([]byte, unsafe.Sizeof(bucket{}))
-			var bucket = (*bucket)(unsafe.Pointer(&value[0]))
-			*bucket = *child.bucket
+			value = make([]byte, unsafe.Sizeof(common.InBucket{}))
+			var bucket = (*common.InBucket)(unsafe.Pointer(&value[0]))
+			*bucket = *child.InBucket
 		}
 		// Skip writing the bucket if there are no materialized nodes.
@@ -569,10 +562,10 @@ func (b *Bucket) spill() error {
 		if !bytes.Equal([]byte(name), k) {
 			panic(fmt.Sprintf("misplaced bucket header: %x -> %x", []byte(name), k))
 		}
-		if flags&bucketLeafFlag == 0 {
+		if flags&common.BucketLeafFlag == 0 {
 			panic(fmt.Sprintf("unexpected bucket header flag: %x", flags))
 		}
-		c.node().put([]byte(name), []byte(name), value, 0, bucketLeafFlag)
+		c.node().put([]byte(name), []byte(name), value, 0, common.BucketLeafFlag)
 	}
 	// Ignore if there's not a materialized root node.
@@ -587,16 +580,16 @@ func (b *Bucket) spill() error {
 	b.rootNode = b.rootNode.root()
 	// Update the root node for this bucket.
-	if b.rootNode.pgid >= b.tx.meta.pgid {
-		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", b.rootNode.pgid, b.tx.meta.pgid))
+	if b.rootNode.pgid >= b.tx.meta.Pgid() {
+		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", b.rootNode.pgid, b.tx.meta.Pgid()))
 	}
-	b.root = b.rootNode.pgid
+	b.SetRootPage(b.rootNode.pgid)
 	return nil
 }
 // inlineable returns true if a bucket is small enough to be written inline
-// and if it contains no subbuckets. Otherwise returns false.
+// and if it contains no subbuckets. Otherwise, returns false.
 func (b *Bucket) inlineable() bool {
 	var n = b.rootNode
@@ -607,11 +600,11 @@ func (b *Bucket) inlineable() bool {
 	// Bucket is not inlineable if it contains subbuckets or if it goes beyond
 	// our threshold for inline bucket size.
-	var size = pageHeaderSize
+	var size = common.PageHeaderSize
 	for _, inode := range n.inodes {
-		size += leafPageElementSize + uintptr(len(inode.key)) + uintptr(len(inode.value))
-		if inode.flags&bucketLeafFlag != 0 {
+		size += common.LeafPageElementSize + uintptr(len(inode.Key())) + uintptr(len(inode.Value()))
+		if inode.Flags()&common.BucketLeafFlag != 0 {
 			return false
 		} else if size > b.maxInlineBucketSize() {
 			return false
@@ -630,14 +623,14 @@ func (b *Bucket) maxInlineBucketSize() uintptr {
 func (b *Bucket) write() []byte {
 	// Allocate the appropriate size.
 	var n = b.rootNode
-	var value = make([]byte, bucketHeaderSize+n.size())
+	var value = make([]byte, common.BucketHeaderSize+n.size())
 	// Write a bucket header.
-	var bucket = (*bucket)(unsafe.Pointer(&value[0]))
-	*bucket = *b.bucket
+	var bucket = (*common.InBucket)(unsafe.Pointer(&value[0]))
+	*bucket = *b.InBucket
 	// Convert byte slice to a fake page and write the root node.
-	var p = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
+	var p = (*common.Page)(unsafe.Pointer(&value[common.BucketHeaderSize]))
 	n.write(p)
 	return value
@@ -654,8 +647,8 @@ func (b *Bucket) rebalance() {
 }
 // node creates a node from a page and associates it with a given parent.
-func (b *Bucket) node(pgId pgid, parent *node) *node {
-	_assert(b.nodes != nil, "nodes map expected")
+func (b *Bucket) node(pgId common.Pgid, parent *node) *node {
+	common.Assert(b.nodes != nil, "nodes map expected")
 	// Retrieve node if it's already been created.
 	if n := b.nodes[pgId]; n != nil {
@@ -688,19 +681,19 @@ func (b *Bucket) node(pgId pgid, parent *node) *node {
 // free recursively frees all pages in the bucket.
 func (b *Bucket) free() {
-	if b.root == 0 {
+	if b.RootPage() == 0 {
 		return
 	}
 	var tx = b.tx
-	b.forEachPageNode(func(p *page, n *node, _ int) {
+	b.forEachPageNode(func(p *common.Page, n *node, _ int) {
 		if p != nil {
-			tx.db.freelist.free(tx.meta.txid, p)
+			tx.db.freelist.free(tx.meta.Txid(), p)
 		} else {
 			n.free()
 		}
 	})
-	b.root = 0
+	b.SetRootPage(0)
 }
 // dereference removes all references to the old mmap.
@@ -715,11 +708,11 @@ func (b *Bucket) dereference() {
 }
 // pageNode returns the in-memory node, if it exists.
-// Otherwise returns the underlying page.
-func (b *Bucket) pageNode(id pgid) (*page, *node) {
+// Otherwise, returns the underlying page.
+func (b *Bucket) pageNode(id common.Pgid) (*common.Page, *node) {
 	// Inline buckets have a fake page embedded in their value so treat them
 	// differently. We'll return the rootNode (if available) or the fake page.
-	if b.root == 0 {
+	if b.RootPage() == 0 {
 		if id != 0 {
 			panic(fmt.Sprintf("inline bucket non-zero page access(2): %d != 0", id))
 		}

@@ -18,6 +18,7 @@ import (
 	bolt "go.etcd.io/bbolt"
 	"go.etcd.io/bbolt/internal/btesting"
+	"go.etcd.io/bbolt/internal/common"
 )
 // Ensure that a bucket that gets a non-existent key returns nil.
@@ -246,7 +247,7 @@ func TestBucket_Put_IncompatibleValue(t *testing.T) {
 		if _, err := tx.Bucket([]byte("widgets")).CreateBucket([]byte("foo")); err != nil {
 			t.Fatal(err)
 		}
-		if err := b0.Put([]byte("foo"), []byte("bar")); err != bolt.ErrIncompatibleValue {
+		if err := b0.Put([]byte("foo"), []byte("bar")); err != common.ErrIncompatibleValue {
 			t.Fatalf("unexpected error: %s", err)
 		}
 		return nil
@@ -272,7 +273,7 @@ func TestBucket_Put_Closed(t *testing.T) {
 		t.Fatal(err)
 	}
-	if err := b.Put([]byte("foo"), []byte("bar")); err != bolt.ErrTxClosed {
+	if err := b.Put([]byte("foo"), []byte("bar")); err != common.ErrTxClosed {
 		t.Fatalf("unexpected error: %s", err)
 	}
 }
@@ -292,7 +293,7 @@ func TestBucket_Put_ReadOnly(t *testing.T) {
 	if err := db.View(func(tx *bolt.Tx) error {
 		b := tx.Bucket([]byte("widgets"))
-		if err := b.Put([]byte("foo"), []byte("bar")); err != bolt.ErrTxNotWritable {
+		if err := b.Put([]byte("foo"), []byte("bar")); err != common.ErrTxNotWritable {
 			t.Fatalf("unexpected error: %s", err)
 		}
 		return nil
@@ -560,7 +561,7 @@ func TestBucket_Delete_Bucket(t *testing.T) {
 		if _, err := b.CreateBucket([]byte("foo")); err != nil {
 			t.Fatal(err)
 		}
-		if err := b.Delete([]byte("foo")); err != bolt.ErrIncompatibleValue {
+		if err := b.Delete([]byte("foo")); err != common.ErrIncompatibleValue {
 			t.Fatalf("unexpected error: %s", err)
 		}
 		return nil
@@ -583,7 +584,7 @@ func TestBucket_Delete_ReadOnly(t *testing.T) {
 	}
 	if err := db.View(func(tx *bolt.Tx) error {
-		if err := tx.Bucket([]byte("widgets")).Delete([]byte("foo")); err != bolt.ErrTxNotWritable {
+		if err := tx.Bucket([]byte("widgets")).Delete([]byte("foo")); err != common.ErrTxNotWritable {
			t.Fatalf("unexpected error: %s", err)
 		}
 		return nil
@@ -609,7 +610,7 @@ func TestBucket_Delete_Closed(t *testing.T) {
 	if err := tx.Rollback(); err != nil {
 		t.Fatal(err)
 	}
-	if err := b.Delete([]byte("foo")); err != bolt.ErrTxClosed {
+	if err := b.Delete([]byte("foo")); err != common.ErrTxClosed {
 		t.Fatalf("unexpected error: %s", err)
 	}
 }
@@ -780,7 +781,7 @@ func TestBucket_CreateBucket_IncompatibleValue(t *testing.T) {
 		if err := widgets.Put([]byte("foo"), []byte("bar")); err != nil {
 			t.Fatal(err)
 		}
-		if _, err := widgets.CreateBucket([]byte("foo")); err != bolt.ErrIncompatibleValue {
+		if _, err := widgets.CreateBucket([]byte("foo")); err != common.ErrIncompatibleValue {
 			t.Fatalf("unexpected error: %s", err)
 		}
 		return nil
@@ -801,7 +802,7 @@ func TestBucket_DeleteBucket_IncompatibleValue(t *testing.T) {
 		if err := widgets.Put([]byte("foo"), []byte("bar")); err != nil {
 			t.Fatal(err)
 		}
-		if err := tx.Bucket([]byte("widgets")).DeleteBucket([]byte("foo")); err != bolt.ErrIncompatibleValue {
+		if err := tx.Bucket([]byte("widgets")).DeleteBucket([]byte("foo")); err != common.ErrIncompatibleValue {
 			t.Fatalf("unexpected error: %s", err)
 		}
 		return nil
@@ -943,7 +944,7 @@ func TestBucket_NextSequence_ReadOnly(t *testing.T) {
 	if err := db.View(func(tx *bolt.Tx) error {
 		_, err := tx.Bucket([]byte("widgets")).NextSequence()
-		if err != bolt.ErrTxNotWritable {
+		if err != common.ErrTxNotWritable {
 			t.Fatalf("unexpected error: %s", err)
 		}
 		return nil
@@ -966,7 +967,7 @@ func TestBucket_NextSequence_Closed(t *testing.T) {
 	if err := tx.Rollback(); err != nil {
 		t.Fatal(err)
 	}
-	if _, err := b.NextSequence(); err != bolt.ErrTxClosed {
+	if _, err := b.NextSequence(); err != common.ErrTxClosed {
 		t.Fatal(err)
 	}
 }
@@ -1158,7 +1159,7 @@ func TestBucket_ForEach_Closed(t *testing.T) {
 		t.Fatal(err)
 	}
-	if err := b.ForEach(func(k, v []byte) error { return nil }); err != bolt.ErrTxClosed {
+	if err := b.ForEach(func(k, v []byte) error { return nil }); err != common.ErrTxClosed {
 		t.Fatalf("unexpected error: %s", err)
 	}
 }
@@ -1172,10 +1173,10 @@ func TestBucket_Put_EmptyKey(t *testing.T) {
 		if err != nil {
 			t.Fatal(err)
 		}
-		if err := b.Put([]byte(""), []byte("bar")); err != bolt.ErrKeyRequired {
+		if err := b.Put([]byte(""), []byte("bar")); err != common.ErrKeyRequired {
 			t.Fatalf("unexpected error: %s", err)
 		}
-		if err := b.Put(nil, []byte("bar")); err != bolt.ErrKeyRequired {
+		if err := b.Put(nil, []byte("bar")); err != common.ErrKeyRequired {
 			t.Fatalf("unexpected error: %s", err)
 		}
 		return nil
@@ -1192,7 +1193,7 @@ func TestBucket_Put_KeyTooLarge(t *testing.T) {
 		if err != nil {
 			t.Fatal(err)
 		}
-		if err := b.Put(make([]byte, 32769), []byte("bar")); err != bolt.ErrKeyTooLarge {
+		if err := b.Put(make([]byte, 32769), []byte("bar")); err != common.ErrKeyTooLarge {
 			t.Fatalf("unexpected error: %s", err)
 		}
 		return nil
@@ -1215,7 +1216,7 @@ func TestBucket_Put_ValueTooLarge(t *testing.T) {
 		if err != nil {
 			t.Fatal(err)
 		}
-		if err := b.Put([]byte("foo"), make([]byte, bolt.MaxValueSize+1)); err != bolt.ErrValueTooLarge {
+		if err := b.Put([]byte("foo"), make([]byte, bolt.MaxValueSize+1)); err != common.ErrValueTooLarge {
 			t.Fatalf("unexpected error: %s", err)
 		}
 		return nil

@@ -18,11 +18,10 @@ import (
 	"time"
 	"unicode"
 	"unicode/utf8"
-	"unsafe"
-	"go.etcd.io/bbolt/internal/guts_cli"
 	bolt "go.etcd.io/bbolt"
+	"go.etcd.io/bbolt/internal/common"
+	"go.etcd.io/bbolt/internal/guts_cli"
 )
 var (
@@ -52,12 +51,6 @@ var (
 	// ErrBucketRequired is returned when a bucket is not specified.
 	ErrBucketRequired = errors.New("bucket required")
-	// ErrBucketNotFound is returned when a bucket is not found.
-	ErrBucketNotFound = errors.New("bucket not found")
-	// ErrKeyRequired is returned when a key is not specified.
-	ErrKeyRequired = errors.New("key required")
 	// ErrKeyNotFound is returned when a key is not found.
 	ErrKeyNotFound = errors.New("key not found")
 )
@@ -509,16 +502,17 @@ func (cmd *pageItemCommand) Run(args ...string) error {
 	return nil
 }
-// leafPageElement retrieves a leaf page element.
-func (cmd *pageItemCommand) leafPageElement(pageBytes []byte, index uint16) (*guts_cli.LeafPageElement, error) {
-	p := (*guts_cli.Page)(unsafe.Pointer(&pageBytes[0]))
+func (cmd *pageItemCommand) leafPageElement(pageBytes []byte, index uint16) ([]byte, []byte, error) {
+	p := common.LoadPage(pageBytes)
 	if index >= p.Count() {
-		return nil, fmt.Errorf("leafPageElement: expected item index less than %d, but got %d.", p.Count(), index)
+		return nil, nil, fmt.Errorf("leafPageElement: expected item index less than %d, but got %d", p.Count(), index)
 	}
-	if p.Type() != "leaf" {
-		return nil, fmt.Errorf("leafPageElement: expected page type of 'leaf', but got '%s'", p.Type())
+	if p.Typ() != "leaf" {
+		return nil, nil, fmt.Errorf("leafPageElement: expected page type of 'leaf', but got '%s'", p.Typ())
 	}
-	return p.LeafPageElement(index), nil
+	e := p.LeafPageElement(index)
+	return e.Key(), e.Value(), nil
 }
 const FORMAT_MODES = "auto|ascii-encoded|hex|bytes|redacted"
@@ -568,20 +562,21 @@ func writelnBytes(w io.Writer, b []byte, format string) error {
 // PrintLeafItemKey writes the bytes of a leaf element's key.
 func (cmd *pageItemCommand) PrintLeafItemKey(w io.Writer, pageBytes []byte, index uint16, format string) error {
-	e, err := cmd.leafPageElement(pageBytes, index)
+	k, _, err := cmd.leafPageElement(pageBytes, index)
 	if err != nil {
 		return err
 	}
-	return writelnBytes(w, e.Key(), format)
+	return writelnBytes(w, k, format)
 }
-// PrintLeafItemKey writes the bytes of a leaf element's value.
+// PrintLeafItemValue writes the bytes of a leaf element's value.
 func (cmd *pageItemCommand) PrintLeafItemValue(w io.Writer, pageBytes []byte, index uint16, format string) error {
-	e, err := cmd.leafPageElement(pageBytes, index)
+	_, v, err := cmd.leafPageElement(pageBytes, index)
 	if err != nil {
 		return err
 	}
-	return writelnBytes(w, e.Value(), format)
+	return writelnBytes(w, v, format)
 }
 // Usage returns the help message.
@@ -931,12 +926,12 @@ func (cmd *keysCommand) Run(args ...string) error {
 		// Find bucket.
 		var lastbucket *bolt.Bucket = tx.Bucket([]byte(buckets[0]))
 		if lastbucket == nil {
-			return ErrBucketNotFound
+			return common.ErrBucketNotFound
 		}
 		for _, bucket := range buckets[1:] {
 			lastbucket = lastbucket.Bucket([]byte(bucket))
 			if lastbucket == nil {
-				return ErrBucketNotFound
+				return common.ErrBucketNotFound
 			}
 		}
@@ -1007,7 +1002,7 @@ func (cmd *getCommand) Run(args ...string) error {
 	} else if len(buckets) == 0 {
 		return ErrBucketRequired
 	} else if len(key) == 0 {
-		return ErrKeyRequired
+		return common.ErrKeyRequired
 	}
 	// Open database.
@@ -1022,12 +1017,12 @@ func (cmd *getCommand) Run(args ...string) error {
 		// Find bucket.
 		var lastbucket *bolt.Bucket = tx.Bucket([]byte(buckets[0]))
 		if lastbucket == nil {
-			return ErrBucketNotFound
+			return common.ErrBucketNotFound
 		}
 		for _, bucket := range buckets[1:] {
 			lastbucket = lastbucket.Bucket([]byte(bucket))
 			if lastbucket == nil {
-				return ErrBucketNotFound
+				return common.ErrBucketNotFound
 			}
 		}

@@ -8,6 +8,7 @@ import (
 	"os"
 	"strings"
+	"go.etcd.io/bbolt/internal/common"
 	"go.etcd.io/bbolt/internal/guts_cli"
 )
@@ -113,12 +114,12 @@ func (cmd *pageCommand) printPage(path string, pageID uint64, formatValue string
 	// Print basic page info.
 	fmt.Fprintf(cmd.Stdout, "Page ID: %d\n", p.Id())
-	fmt.Fprintf(cmd.Stdout, "Page Type: %s\n", p.Type())
+	fmt.Fprintf(cmd.Stdout, "Page Type: %s\n", p.Typ())
 	fmt.Fprintf(cmd.Stdout, "Total Size: %d bytes\n", len(buf))
 	fmt.Fprintf(cmd.Stdout, "Overflow pages: %d\n", p.Overflow())
 	// Print type-specific data.
-	switch p.Type() {
+	switch p.Typ() {
 	case "meta":
 		err = cmd.PrintMeta(cmd.Stdout, buf)
 	case "leaf":
@@ -136,14 +137,14 @@ func (cmd *pageCommand) printPage(path string, pageID uint64, formatValue string
 // PrintMeta prints the data from the meta page.
 func (cmd *pageCommand) PrintMeta(w io.Writer, buf []byte) error {
-	m := guts_cli.LoadPageMeta(buf)
+	m := common.LoadPageMeta(buf)
 	m.Print(w)
 	return nil
 }
 // PrintLeaf prints the data for a leaf page.
 func (cmd *pageCommand) PrintLeaf(w io.Writer, buf []byte, formatValue string) error {
-	p := guts_cli.LoadPage(buf)
+	p := common.LoadPage(buf)
 	// Print number of items.
 	fmt.Fprintf(w, "Item Count: %d\n", p.Count())
@@ -182,7 +183,7 @@ func (cmd *pageCommand) PrintLeaf(w io.Writer, buf []byte, formatValue string) e
 // PrintBranch prints the data for a leaf page.
 func (cmd *pageCommand) PrintBranch(w io.Writer, buf []byte) error {
-	p := guts_cli.LoadPage(buf)
+	p := common.LoadPage(buf)
 	// Print number of items.
 	fmt.Fprintf(w, "Item Count: %d\n", p.Count())
@@ -200,7 +201,7 @@ func (cmd *pageCommand) PrintBranch(w io.Writer, buf []byte) error {
 			k = fmt.Sprintf("%x", string(e.Key()))
 		}
-		fmt.Fprintf(w, "%s: <pgid=%d>\n", k, e.PgId())
+		fmt.Fprintf(w, "%s: <pgid=%d>\n", k, e.Pgid())
 	}
 	fmt.Fprintf(w, "\n")
 	return nil
@@ -208,16 +209,17 @@ func (cmd *pageCommand) PrintBranch(w io.Writer, buf []byte) error {
 // PrintFreelist prints the data for a freelist page.
 func (cmd *pageCommand) PrintFreelist(w io.Writer, buf []byte) error {
-	p := guts_cli.LoadPage(buf)
+	p := common.LoadPage(buf)
 	// Print number of items.
-	fmt.Fprintf(w, "Item Count: %d\n", p.FreelistPageCount())
+	_, cnt := p.FreelistPageCount()
+	fmt.Fprintf(w, "Item Count: %d\n", cnt)
 	fmt.Fprintf(w, "Overflow: %d\n", p.Overflow())
 	fmt.Fprintf(w, "\n")
 	// Print each page in the freelist.
-	ids := p.FreelistPagePages()
+	ids := p.FreelistPageIds()
 	for _, ids := range ids {
 		fmt.Fprintf(w, "%d\n", ids)
 	}
@@ -244,7 +246,7 @@ func (cmd *pageCommand) PrintPage(w io.Writer, r io.ReaderAt, pageID int, pageSi
 	for offset := 0; offset < pageSize; offset += bytesPerLineN {
 		// Retrieve current 16-byte line.
 		line := buf[offset : offset+bytesPerLineN]
-		isLastLine := (offset == (pageSize - bytesPerLineN))
+		isLastLine := offset == (pageSize - bytesPerLineN)
 		// If it's the same as the previous line then print a skip.
 		if bytes.Equal(line, prev) && !isLastLine {

@@ -9,7 +9,7 @@ import (
 	"strconv"
 	"strings"
-	"go.etcd.io/bbolt/internal/guts_cli"
+	"go.etcd.io/bbolt/internal/common"
 	"go.etcd.io/bbolt/internal/surgeon"
 )
@@ -224,7 +224,7 @@ func (cmd *copyPageCommand) Run(args ...string) error {
 	}
 	// copy the page
-	if err := surgeon.CopyPage(cmd.dstPath, guts_cli.Pgid(srcPageId), guts_cli.Pgid(dstPageId)); err != nil {
+	if err := surgeon.CopyPage(cmd.dstPath, common.Pgid(srcPageId), common.Pgid(dstPageId)); err != nil {
 		return fmt.Errorf("copyPageCommand failed: %w", err)
 	}
@@ -279,7 +279,7 @@ func (cmd *clearPageCommand) Run(args ...string) error {
 		return err
 	}
-	if err := surgeon.ClearPage(cmd.dstPath, guts_cli.Pgid(pageId)); err != nil {
+	if err := surgeon.ClearPage(cmd.dstPath, common.Pgid(pageId)); err != nil {
 		return fmt.Errorf("clearPageCommand failed: %w", err)
 	}

@@ -11,7 +11,7 @@ import (
 	bolt "go.etcd.io/bbolt"
 	"go.etcd.io/bbolt/internal/btesting"
-	"go.etcd.io/bbolt/internal/guts_cli"
+	"go.etcd.io/bbolt/internal/common"
 )
 func TestSurgery_RevertMetaPage(t *testing.T) {
@@ -28,8 +28,8 @@ func TestSurgery_RevertMetaPage(t *testing.T) {
 	// Read both meta0 and meta1 from srcFile
 	srcBuf0 := readPage(t, srcPath, 0, pageSize)
 	srcBuf1 := readPage(t, srcPath, 1, pageSize)
-	meta0Page := guts_cli.LoadPageMeta(srcBuf0)
-	meta1Page := guts_cli.LoadPageMeta(srcBuf1)
+	meta0Page := common.LoadPageMeta(srcBuf0)
+	meta1Page := common.LoadPageMeta(srcBuf1)
 	// Get the non-active meta page
 	nonActiveSrcBuf := srcBuf0
@@ -115,7 +115,7 @@ func TestSurgery_ClearPage(t *testing.T) {
 	t.Log("Verify result")
 	dstPageId3Data := readPage(t, dstPath, 3, pageSize)
-	p := guts_cli.LoadPage(dstPageId3Data)
+	p := common.LoadPage(dstPageId3Data)
 	assert.Equal(t, uint16(0), p.Count())
 	assert.Equal(t, uint32(0), p.Overflow())
 }

@ -4,6 +4,8 @@ import (
"bytes" "bytes"
"fmt" "fmt"
"sort" "sort"
"go.etcd.io/bbolt/internal/common"
) )
// Cursor represents an iterator that can traverse over all key/value pairs in a bucket // Cursor represents an iterator that can traverse over all key/value pairs in a bucket
@ -30,9 +32,9 @@ func (c *Cursor) Bucket() *Bucket {
// If the bucket is empty then a nil key and value are returned. // If the bucket is empty then a nil key and value are returned.
// The returned key and value are only valid for the life of the transaction. // The returned key and value are only valid for the life of the transaction.
func (c *Cursor) First() (key []byte, value []byte) { func (c *Cursor) First() (key []byte, value []byte) {
_assert(c.bucket.tx.db != nil, "tx closed") common.Assert(c.bucket.tx.db != nil, "tx closed")
k, v, flags := c.first() k, v, flags := c.first()
if (flags & uint32(bucketLeafFlag)) != 0 { if (flags & uint32(common.BucketLeafFlag)) != 0 {
return k, nil return k, nil
} }
return k, v return k, v
@ -40,7 +42,7 @@ func (c *Cursor) First() (key []byte, value []byte) {
func (c *Cursor) first() (key []byte, value []byte, flags uint32) { func (c *Cursor) first() (key []byte, value []byte, flags uint32) {
c.stack = c.stack[:0] c.stack = c.stack[:0]
p, n := c.bucket.pageNode(c.bucket.root) p, n := c.bucket.pageNode(c.bucket.RootPage())
c.stack = append(c.stack, elemRef{page: p, node: n, index: 0}) c.stack = append(c.stack, elemRef{page: p, node: n, index: 0})
c.goToFirstElementOnTheStack() c.goToFirstElementOnTheStack()
@ -51,7 +53,7 @@ func (c *Cursor) first() (key []byte, value []byte, flags uint32) {
} }
k, v, flags := c.keyValue() k, v, flags := c.keyValue()
if (flags & uint32(bucketLeafFlag)) != 0 { if (flags & uint32(common.BucketLeafFlag)) != 0 {
return k, nil, flags return k, nil, flags
} }
return k, v, flags return k, v, flags
@ -61,9 +63,9 @@ func (c *Cursor) first() (key []byte, value []byte, flags uint32) {
// If the bucket is empty then a nil key and value are returned. // If the bucket is empty then a nil key and value are returned.
// The returned key and value are only valid for the life of the transaction. // The returned key and value are only valid for the life of the transaction.
func (c *Cursor) Last() (key []byte, value []byte) { func (c *Cursor) Last() (key []byte, value []byte) {
_assert(c.bucket.tx.db != nil, "tx closed") common.Assert(c.bucket.tx.db != nil, "tx closed")
c.stack = c.stack[:0] c.stack = c.stack[:0]
p, n := c.bucket.pageNode(c.bucket.root) p, n := c.bucket.pageNode(c.bucket.RootPage())
ref := elemRef{page: p, node: n} ref := elemRef{page: p, node: n}
ref.index = ref.count() - 1 ref.index = ref.count() - 1
c.stack = append(c.stack, ref) c.stack = append(c.stack, ref)
@ -80,7 +82,7 @@ func (c *Cursor) Last() (key []byte, value []byte) {
} }
k, v, flags := c.keyValue() k, v, flags := c.keyValue()
if (flags & uint32(bucketLeafFlag)) != 0 { if (flags & uint32(common.BucketLeafFlag)) != 0 {
return k, nil return k, nil
} }
return k, v return k, v
@ -90,9 +92,9 @@ func (c *Cursor) Last() (key []byte, value []byte) {
// If the cursor is at the end of the bucket then a nil key and value are returned. // If the cursor is at the end of the bucket then a nil key and value are returned.
// The returned key and value are only valid for the life of the transaction. // The returned key and value are only valid for the life of the transaction.
func (c *Cursor) Next() (key []byte, value []byte) { func (c *Cursor) Next() (key []byte, value []byte) {
_assert(c.bucket.tx.db != nil, "tx closed") common.Assert(c.bucket.tx.db != nil, "tx closed")
k, v, flags := c.next() k, v, flags := c.next()
if (flags & uint32(bucketLeafFlag)) != 0 { if (flags & uint32(common.BucketLeafFlag)) != 0 {
return k, nil return k, nil
} }
return k, v return k, v
@ -102,9 +104,9 @@ func (c *Cursor) Next() (key []byte, value []byte) {
// If the cursor is at the beginning of the bucket then a nil key and value are returned. // If the cursor is at the beginning of the bucket then a nil key and value are returned.
// The returned key and value are only valid for the life of the transaction. // The returned key and value are only valid for the life of the transaction.
func (c *Cursor) Prev() (key []byte, value []byte) { func (c *Cursor) Prev() (key []byte, value []byte) {
_assert(c.bucket.tx.db != nil, "tx closed") common.Assert(c.bucket.tx.db != nil, "tx closed")
k, v, flags := c.prev() k, v, flags := c.prev()
if (flags & uint32(bucketLeafFlag)) != 0 { if (flags & uint32(common.BucketLeafFlag)) != 0 {
return k, nil return k, nil
} }
return k, v return k, v
@ -115,7 +117,7 @@ func (c *Cursor) Prev() (key []byte, value []byte) {
// follow, a nil key is returned. // follow, a nil key is returned.
// The returned key and value are only valid for the life of the transaction. // The returned key and value are only valid for the life of the transaction.
func (c *Cursor) Seek(seek []byte) (key []byte, value []byte) { func (c *Cursor) Seek(seek []byte) (key []byte, value []byte) {
_assert(c.bucket.tx.db != nil, "tx closed") common.Assert(c.bucket.tx.db != nil, "tx closed")
k, v, flags := c.seek(seek) k, v, flags := c.seek(seek)
@ -126,7 +128,7 @@ func (c *Cursor) Seek(seek []byte) (key []byte, value []byte) {
if k == nil { if k == nil {
return nil, nil return nil, nil
} else if (flags & uint32(bucketLeafFlag)) != 0 { } else if (flags & uint32(common.BucketLeafFlag)) != 0 {
return k, nil return k, nil
} }
return k, v return k, v
@ -136,15 +138,15 @@ func (c *Cursor) Seek(seek []byte) (key []byte, value []byte) {
// Delete fails if current key/value is a bucket or if the transaction is not writable. // Delete fails if current key/value is a bucket or if the transaction is not writable.
func (c *Cursor) Delete() error { func (c *Cursor) Delete() error {
if c.bucket.tx.db == nil { if c.bucket.tx.db == nil {
return ErrTxClosed return common.ErrTxClosed
} else if !c.bucket.Writable() { } else if !c.bucket.Writable() {
return ErrTxNotWritable return common.ErrTxNotWritable
} }
key, _, flags := c.keyValue() key, _, flags := c.keyValue()
// Return an error if current value is a bucket. // Return an error if current value is a bucket.
if (flags & bucketLeafFlag) != 0 { if (flags & common.BucketLeafFlag) != 0 {
return ErrIncompatibleValue return common.ErrIncompatibleValue
} }
c.node().del(key) c.node().del(key)
@ -156,7 +158,7 @@ func (c *Cursor) Delete() error {
func (c *Cursor) seek(seek []byte) (key []byte, value []byte, flags uint32) { func (c *Cursor) seek(seek []byte) (key []byte, value []byte, flags uint32) {
// Start from root page/node and traverse to correct page. // Start from root page/node and traverse to correct page.
c.stack = c.stack[:0] c.stack = c.stack[:0]
c.search(seek, c.bucket.root) c.search(seek, c.bucket.RootPage())
// If this is a bucket then return a nil value. // If this is a bucket then return a nil value.
return c.keyValue() return c.keyValue()
@ -172,11 +174,11 @@ func (c *Cursor) goToFirstElementOnTheStack() {
} }
// Keep adding pages pointing to the first element to the stack. // Keep adding pages pointing to the first element to the stack.
var pgId pgid var pgId common.Pgid
if ref.node != nil { if ref.node != nil {
pgId = ref.node.inodes[ref.index].pgid pgId = ref.node.inodes[ref.index].Pgid()
} else { } else {
pgId = ref.page.branchPageElement(uint16(ref.index)).pgid pgId = ref.page.BranchPageElement(uint16(ref.index)).Pgid()
} }
p, n := c.bucket.pageNode(pgId) p, n := c.bucket.pageNode(pgId)
c.stack = append(c.stack, elemRef{page: p, node: n, index: 0}) c.stack = append(c.stack, elemRef{page: p, node: n, index: 0})
@ -193,11 +195,11 @@ func (c *Cursor) last() {
} }
// Keep adding pages pointing to the last element in the stack. // Keep adding pages pointing to the last element in the stack.
var pgId pgid var pgId common.Pgid
if ref.node != nil { if ref.node != nil {
pgId = ref.node.inodes[ref.index].pgid pgId = ref.node.inodes[ref.index].Pgid()
} else { } else {
pgId = ref.page.branchPageElement(uint16(ref.index)).pgid pgId = ref.page.BranchPageElement(uint16(ref.index)).Pgid()
} }
p, n := c.bucket.pageNode(pgId) p, n := c.bucket.pageNode(pgId)
@ -268,10 +270,10 @@ func (c *Cursor) prev() (key []byte, value []byte, flags uint32) {
} }
// search recursively performs a binary search against a given page/node until it finds a given key. // search recursively performs a binary search against a given page/node until it finds a given key.
func (c *Cursor) search(key []byte, pgId pgid) { func (c *Cursor) search(key []byte, pgId common.Pgid) {
p, n := c.bucket.pageNode(pgId) p, n := c.bucket.pageNode(pgId)
if p != nil && (p.flags&(branchPageFlag|leafPageFlag)) == 0 { if p != nil && (p.Flags()&(common.BranchPageFlag|common.LeafPageFlag)) == 0 {
panic(fmt.Sprintf("invalid page type: %d: %x", p.id, p.flags)) panic(fmt.Sprintf("invalid page type: %d: %x", p.Id(), p.Flags()))
} }
e := elemRef{page: p, node: n} e := elemRef{page: p, node: n}
c.stack = append(c.stack, e) c.stack = append(c.stack, e)
@ -294,7 +296,7 @@ func (c *Cursor) searchNode(key []byte, n *node) {
index := sort.Search(len(n.inodes), func(i int) bool { index := sort.Search(len(n.inodes), func(i int) bool {
// TODO(benbjohnson): Optimize this range search. It's a bit hacky right now. // TODO(benbjohnson): Optimize this range search. It's a bit hacky right now.
// sort.Search() finds the lowest index where f() != -1 but we need the highest index. // sort.Search() finds the lowest index where f() != -1 but we need the highest index.
ret := bytes.Compare(n.inodes[i].key, key) ret := bytes.Compare(n.inodes[i].Key(), key)
if ret == 0 { if ret == 0 {
exact = true exact = true
} }
@ -306,18 +308,18 @@ func (c *Cursor) searchNode(key []byte, n *node) {
c.stack[len(c.stack)-1].index = index c.stack[len(c.stack)-1].index = index
// Recursively search to the next page. // Recursively search to the next page.
c.search(key, n.inodes[index].pgid) c.search(key, n.inodes[index].Pgid())
} }
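
The TODO above refers to the standard-library pattern of using sort.Search to locate a separator while recording whether the match was exact. A standalone sketch of that pattern follows, with made-up keys and none of the node/inode types; it mirrors the "step back one slot on a miss" behaviour used for branch pages.

package main

import (
	"bytes"
	"fmt"
	"sort"
)

func main() {
	keys := [][]byte{[]byte("apple"), []byte("mango"), []byte("pear")}
	target := []byte("mango")

	var exact bool
	// sort.Search returns the smallest index where the predicate is true,
	// i.e. the first key >= target; exact records an exact hit.
	index := sort.Search(len(keys), func(i int) bool {
		ret := bytes.Compare(keys[i], target)
		if ret == 0 {
			exact = true
		}
		return ret != -1
	})
	if !exact && index > 0 {
		// A branch lookup wants the child whose separator key is <= target,
		// so it moves back one position when there was no exact match.
		index--
	}
	fmt.Println(index, exact) // 1 true
}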
func (c *Cursor) searchPage(key []byte, p *page) { func (c *Cursor) searchPage(key []byte, p *common.Page) {
// Binary search for the correct range. // Binary search for the correct range.
inodes := p.branchPageElements() inodes := p.BranchPageElements()
var exact bool var exact bool
index := sort.Search(int(p.count), func(i int) bool { index := sort.Search(int(p.Count()), func(i int) bool {
// TODO(benbjohnson): Optimize this range search. It's a bit hacky right now. // TODO(benbjohnson): Optimize this range search. It's a bit hacky right now.
// sort.Search() finds the lowest index where f() != -1 but we need the highest index. // sort.Search() finds the lowest index where f() != -1 but we need the highest index.
ret := bytes.Compare(inodes[i].key(), key) ret := bytes.Compare(inodes[i].Key(), key)
if ret == 0 { if ret == 0 {
exact = true exact = true
} }
@ -329,7 +331,7 @@ func (c *Cursor) searchPage(key []byte, p *page) {
c.stack[len(c.stack)-1].index = index c.stack[len(c.stack)-1].index = index
// Recursively search to the next page. // Recursively search to the next page.
c.search(key, inodes[index].pgid) c.search(key, inodes[index].Pgid())
} }
// nsearch searches the leaf node on the top of the stack for a key. // nsearch searches the leaf node on the top of the stack for a key.
@ -340,16 +342,16 @@ func (c *Cursor) nsearch(key []byte) {
// If we have a node then search its inodes. // If we have a node then search its inodes.
if n != nil { if n != nil {
index := sort.Search(len(n.inodes), func(i int) bool { index := sort.Search(len(n.inodes), func(i int) bool {
return bytes.Compare(n.inodes[i].key, key) != -1 return bytes.Compare(n.inodes[i].Key(), key) != -1
}) })
e.index = index e.index = index
return return
} }
// If we have a page then search its leaf elements. // If we have a page then search its leaf elements.
inodes := p.leafPageElements() inodes := p.LeafPageElements()
index := sort.Search(int(p.count), func(i int) bool { index := sort.Search(int(p.Count()), func(i int) bool {
return bytes.Compare(inodes[i].key(), key) != -1 return bytes.Compare(inodes[i].Key(), key) != -1
}) })
e.index = index e.index = index
} }
@ -366,17 +368,17 @@ func (c *Cursor) keyValue() ([]byte, []byte, uint32) {
// Retrieve value from node. // Retrieve value from node.
if ref.node != nil { if ref.node != nil {
inode := &ref.node.inodes[ref.index] inode := &ref.node.inodes[ref.index]
return inode.key, inode.value, inode.flags return inode.Key(), inode.Value(), inode.Flags()
} }
// Or retrieve value from page. // Or retrieve value from page.
elem := ref.page.leafPageElement(uint16(ref.index)) elem := ref.page.LeafPageElement(uint16(ref.index))
return elem.key(), elem.value(), elem.flags return elem.Key(), elem.Value(), elem.Flags()
} }
// node returns the node that the cursor is currently positioned on. // node returns the node that the cursor is currently positioned on.
func (c *Cursor) node() *node { func (c *Cursor) node() *node {
_assert(len(c.stack) > 0, "accessing a node with a zero-length cursor stack") common.Assert(len(c.stack) > 0, "accessing a node with a zero-length cursor stack")
// If the top of the stack is a leaf node then just return it. // If the top of the stack is a leaf node then just return it.
if ref := &c.stack[len(c.stack)-1]; ref.node != nil && ref.isLeaf() { if ref := &c.stack[len(c.stack)-1]; ref.node != nil && ref.isLeaf() {
@ -386,19 +388,19 @@ func (c *Cursor) node() *node {
// Start from root and traverse down the hierarchy. // Start from root and traverse down the hierarchy.
var n = c.stack[0].node var n = c.stack[0].node
if n == nil { if n == nil {
n = c.bucket.node(c.stack[0].page.id, nil) n = c.bucket.node(c.stack[0].page.Id(), nil)
} }
for _, ref := range c.stack[:len(c.stack)-1] { for _, ref := range c.stack[:len(c.stack)-1] {
_assert(!n.isLeaf, "expected branch node") common.Assert(!n.isLeaf, "expected branch node")
n = n.childAt(ref.index) n = n.childAt(ref.index)
} }
_assert(n.isLeaf, "expected leaf node") common.Assert(n.isLeaf, "expected leaf node")
return n return n
} }
// elemRef represents a reference to an element on a given page/node. // elemRef represents a reference to an element on a given page/node.
type elemRef struct { type elemRef struct {
page *page page *common.Page
node *node node *node
index int index int
} }
@ -408,7 +410,7 @@ func (r *elemRef) isLeaf() bool {
if r.node != nil { if r.node != nil {
return r.node.isLeaf return r.node.isLeaf
} }
return (r.page.flags & leafPageFlag) != 0 return (r.page.Flags() & common.LeafPageFlag) != 0
} }
// count returns the number of inodes or page elements. // count returns the number of inodes or page elements.
@ -416,5 +418,5 @@ func (r *elemRef) count() int {
if r.node != nil { if r.node != nil {
return len(r.node.inodes) return len(r.node.inodes)
} }
return int(r.page.count) return int(r.page.Count())
} }

View File

@ -13,6 +13,7 @@ import (
bolt "go.etcd.io/bbolt" bolt "go.etcd.io/bbolt"
"go.etcd.io/bbolt/internal/btesting" "go.etcd.io/bbolt/internal/btesting"
"go.etcd.io/bbolt/internal/common"
) )
// Ensure that a cursor can return a reference to the bucket that created it. // Ensure that a cursor can return a reference to the bucket that created it.
@ -139,7 +140,7 @@ func TestCursor_Delete(t *testing.T) {
} }
c.Seek([]byte("sub")) c.Seek([]byte("sub"))
if err := c.Delete(); err != bolt.ErrIncompatibleValue { if err := c.Delete(); err != common.ErrIncompatibleValue {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }

260
db.go
View File

@ -3,7 +3,6 @@ package bbolt
import ( import (
"errors" "errors"
"fmt" "fmt"
"hash/fnv"
"io" "io"
"os" "os"
"runtime" "runtime"
@ -11,48 +10,13 @@ import (
"sync" "sync"
"time" "time"
"unsafe" "unsafe"
"go.etcd.io/bbolt/internal/common"
) )
// The largest step that can be taken when remapping the mmap.
const maxMmapStep = 1 << 30 // 1GB
// The data file format version.
const version = 2
// Represents a marker value to indicate that a file is a Bolt DB.
const magic uint32 = 0xED0CDAED
const pgidNoFreelist pgid = 0xffffffffffffffff
// IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
// syncing changes to a file. This is required as some operating systems,
// such as OpenBSD, do not have a unified buffer cache (UBC) and writes
// must be synchronized using the msync(2) syscall.
const IgnoreNoSync = runtime.GOOS == "openbsd"
// Default values if not set in a DB instance.
const (
DefaultMaxBatchSize int = 1000
DefaultMaxBatchDelay = 10 * time.Millisecond
DefaultAllocSize = 16 * 1024 * 1024
)
// default page size for db is set to the OS page size.
var defaultPageSize = os.Getpagesize()
// The time elapsed between consecutive file locking attempts. // The time elapsed between consecutive file locking attempts.
const flockRetryTimeout = 50 * time.Millisecond const flockRetryTimeout = 50 * time.Millisecond
// FreelistType is the type of the freelist backend
type FreelistType string
const (
// FreelistArrayType indicates backend freelist type is array
FreelistArrayType = FreelistType("array")
// FreelistMapType indicates backend freelist type is hashmap
FreelistMapType = FreelistType("hashmap")
)
// DB represents a collection of buckets persisted to a file on disk. // DB represents a collection of buckets persisted to a file on disk.
// All data access is performed through transactions which can be obtained through the DB. // All data access is performed through transactions which can be obtained through the DB.
// All the functions on DB will return a ErrDatabaseNotOpen if accessed before Open() is called. // All the functions on DB will return a ErrDatabaseNotOpen if accessed before Open() is called.
@ -85,7 +49,7 @@ type DB struct {
// The alternative one is using hashmap, it is faster in almost all circumstances // The alternative one is using hashmap, it is faster in almost all circumstances
// but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe. // but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
// The default type is array // The default type is array
FreelistType FreelistType FreelistType common.FreelistType
// When true, skips the truncate call when growing the database. // When true, skips the truncate call when growing the database.
// Setting this to true is only safe on non-ext3/ext4 systems. // Setting this to true is only safe on non-ext3/ext4 systems.
@ -141,8 +105,8 @@ type DB struct {
data *[maxMapSize]byte data *[maxMapSize]byte
datasz int datasz int
filesz int // current on disk file size filesz int // current on disk file size
meta0 *meta meta0 *common.Meta
meta1 *meta meta1 *common.Meta
pageSize int pageSize int
opened bool opened bool
rwtx *Tx rwtx *Tx
@ -206,9 +170,9 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
db.Mlock = options.Mlock db.Mlock = options.Mlock
// Set default values for later DB operations. // Set default values for later DB operations.
db.MaxBatchSize = DefaultMaxBatchSize db.MaxBatchSize = common.DefaultMaxBatchSize
db.MaxBatchDelay = DefaultMaxBatchDelay db.MaxBatchDelay = common.DefaultMaxBatchDelay
db.AllocSize = DefaultAllocSize db.AllocSize = common.DefaultAllocSize
flag := os.O_RDWR flag := os.O_RDWR
if options.ReadOnly { if options.ReadOnly {
@ -249,7 +213,7 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
if db.pageSize = options.PageSize; db.pageSize == 0 { if db.pageSize = options.PageSize; db.pageSize == 0 {
// Set the default page size to the OS page size. // Set the default page size to the OS page size.
db.pageSize = defaultPageSize db.pageSize = common.DefaultPageSize
} }
// Initialize the database if it doesn't exist. // Initialize the database if it doesn't exist.
@ -269,7 +233,7 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
db.pageSize = pgSize db.pageSize = pgSize
} else { } else {
_ = db.close() _ = db.close()
return nil, ErrInvalid return nil, common.ErrInvalid
} }
} }
@ -347,7 +311,7 @@ func (db *DB) getPageSize() (int, error) {
return db.pageSize, nil return db.pageSize, nil
} }
return 0, ErrInvalid return 0, common.ErrInvalid
} }
// getPageSizeFromFirstMeta reads the pageSize from the first meta page // getPageSizeFromFirstMeta reads the pageSize from the first meta page
@ -356,11 +320,11 @@ func (db *DB) getPageSizeFromFirstMeta() (int, bool, error) {
var metaCanRead bool var metaCanRead bool
if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) { if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) {
metaCanRead = true metaCanRead = true
if m := db.pageInBuffer(buf[:], 0).meta(); m.validate() == nil { if m := db.pageInBuffer(buf[:], 0).Meta(); m.Validate() == nil {
return int(m.pageSize), metaCanRead, nil return int(m.PageSize()), metaCanRead, nil
} }
} }
return 0, metaCanRead, ErrInvalid return 0, metaCanRead, common.ErrInvalid
} }
// getPageSizeFromSecondMeta reads the pageSize from the second meta page // getPageSizeFromSecondMeta reads the pageSize from the second meta page
@ -392,13 +356,13 @@ func (db *DB) getPageSizeFromSecondMeta() (int, bool, error) {
bw, err := db.file.ReadAt(buf[:], pos) bw, err := db.file.ReadAt(buf[:], pos)
if (err == nil && bw == len(buf)) || (err == io.EOF && int64(bw) == (fileSize-pos)) { if (err == nil && bw == len(buf)) || (err == io.EOF && int64(bw) == (fileSize-pos)) {
metaCanRead = true metaCanRead = true
if m := db.pageInBuffer(buf[:], 0).meta(); m.validate() == nil { if m := db.pageInBuffer(buf[:], 0).Meta(); m.Validate() == nil {
return int(m.pageSize), metaCanRead, nil return int(m.PageSize()), metaCanRead, nil
} }
} }
} }
return 0, metaCanRead, ErrInvalid return 0, metaCanRead, common.ErrInvalid
} }
// loadFreelist reads the freelist if it is synced, or reconstructs it // loadFreelist reads the freelist if it is synced, or reconstructs it
@ -412,14 +376,14 @@ func (db *DB) loadFreelist() {
db.freelist.readIDs(db.freepages()) db.freelist.readIDs(db.freepages())
} else { } else {
// Read free list from freelist page. // Read free list from freelist page.
db.freelist.read(db.page(db.meta().freelist)) db.freelist.read(db.page(db.meta().Freelist()))
} }
db.stats.FreePageN = db.freelist.free_count() db.stats.FreePageN = db.freelist.free_count()
}) })
} }
func (db *DB) hasSyncedFreelist() bool { func (db *DB) hasSyncedFreelist() bool {
return db.meta().freelist != pgidNoFreelist return db.meta().Freelist() != common.PgidNoFreelist
} }
// mmap opens the underlying memory-mapped file and initializes the meta references. // mmap opens the underlying memory-mapped file and initializes the meta references.
@ -478,14 +442,14 @@ func (db *DB) mmap(minsz int) error {
} }
// Save references to the meta pages. // Save references to the meta pages.
db.meta0 = db.page(0).meta() db.meta0 = db.page(0).Meta()
db.meta1 = db.page(1).meta() db.meta1 = db.page(1).Meta()
// Validate the meta pages. We only return an error if both meta pages fail // Validate the meta pages. We only return an error if both meta pages fail
// validation, since meta0 failing validation means that it wasn't saved // validation, since meta0 failing validation means that it wasn't saved
// properly -- but we can recover using meta1. And vice-versa. // properly -- but we can recover using meta1. And vice-versa.
err0 := db.meta0.validate() err0 := db.meta0.Validate()
err1 := db.meta1.validate() err1 := db.meta1.Validate()
if err0 != nil && err1 != nil { if err0 != nil && err1 != nil {
return err0 return err0
} }
@ -533,8 +497,8 @@ func (db *DB) mmapSize(size int) (int, error) {
// If larger than 1GB then grow by 1GB at a time. // If larger than 1GB then grow by 1GB at a time.
sz := int64(size) sz := int64(size)
if remainder := sz % int64(maxMmapStep); remainder > 0 { if remainder := sz % int64(common.MaxMmapStep); remainder > 0 {
sz += int64(maxMmapStep) - remainder sz += int64(common.MaxMmapStep) - remainder
} }
// Ensure that the mmap size is a multiple of the page size. // Ensure that the mmap size is a multiple of the page size.
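
The two rules visible in this hunk, grow in 1GB steps once past 1GB and then round up to a page-size multiple, reduce to simple integer rounding. A small sketch under those assumptions (the constant and function name are made up; only the branch shown in the hunk, sizes already above 1GB, is modelled):

package main

import "fmt"

// roundMmapSize rounds a requested mmap size (already above 1GB) up to the
// next 1GB step, then up to a multiple of the page size.
func roundMmapSize(size, pageSize int64) int64 {
	const step = 1 << 30 // 1GB growth step
	if remainder := size % step; remainder > 0 {
		size += step - remainder
	}
	if remainder := size % pageSize; remainder > 0 {
		size += pageSize - remainder
	}
	return size
}

func main() {
	fmt.Println(roundMmapSize(1<<30+1, 4096)) // 2147483648 (2GB)
}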
@ -581,33 +545,33 @@ func (db *DB) init() error {
// Create two meta pages on a buffer. // Create two meta pages on a buffer.
buf := make([]byte, db.pageSize*4) buf := make([]byte, db.pageSize*4)
for i := 0; i < 2; i++ { for i := 0; i < 2; i++ {
p := db.pageInBuffer(buf, pgid(i)) p := db.pageInBuffer(buf, common.Pgid(i))
p.id = pgid(i) p.SetId(common.Pgid(i))
p.flags = metaPageFlag p.SetFlags(common.MetaPageFlag)
// Initialize the meta page. // Initialize the meta page.
m := p.meta() m := p.Meta()
m.magic = magic m.SetMagic(common.Magic)
m.version = version m.SetVersion(common.Version)
m.pageSize = uint32(db.pageSize) m.SetPageSize(uint32(db.pageSize))
m.freelist = 2 m.SetFreelist(2)
m.root = bucket{root: 3} m.SetRootBucket(common.NewInBucket(3, 0))
m.pgid = 4 m.SetPgid(4)
m.txid = txid(i) m.SetTxid(common.Txid(i))
m.checksum = m.sum64() m.SetChecksum(m.Sum64())
} }
// Write an empty freelist at page 3. // Write an empty freelist at page 3.
p := db.pageInBuffer(buf, pgid(2)) p := db.pageInBuffer(buf, common.Pgid(2))
p.id = pgid(2) p.SetId(2)
p.flags = freelistPageFlag p.SetFlags(common.FreelistPageFlag)
p.count = 0 p.SetCount(0)
// Write an empty leaf page at page 4. // Write an empty leaf page at page 4.
p = db.pageInBuffer(buf, pgid(3)) p = db.pageInBuffer(buf, common.Pgid(3))
p.id = pgid(3) p.SetId(3)
p.flags = leafPageFlag p.SetFlags(common.LeafPageFlag)
p.count = 0 p.SetCount(0)
// Write the buffer to our data file. // Write the buffer to our data file.
if _, err := db.ops.writeAt(buf, 0); err != nil { if _, err := db.ops.writeAt(buf, 0); err != nil {
@ -719,14 +683,14 @@ func (db *DB) beginTx() (*Tx, error) {
if !db.opened { if !db.opened {
db.mmaplock.RUnlock() db.mmaplock.RUnlock()
db.metalock.Unlock() db.metalock.Unlock()
return nil, ErrDatabaseNotOpen return nil, common.ErrDatabaseNotOpen
} }
// Exit if the database is not correctly mapped. // Exit if the database is not correctly mapped.
if db.data == nil { if db.data == nil {
db.mmaplock.RUnlock() db.mmaplock.RUnlock()
db.metalock.Unlock() db.metalock.Unlock()
return nil, ErrInvalidMapping return nil, common.ErrInvalidMapping
} }
// Create a transaction associated with the database. // Create a transaction associated with the database.
@ -752,7 +716,7 @@ func (db *DB) beginTx() (*Tx, error) {
func (db *DB) beginRWTx() (*Tx, error) { func (db *DB) beginRWTx() (*Tx, error) {
// If the database was opened with Options.ReadOnly, return an error. // If the database was opened with Options.ReadOnly, return an error.
if db.readOnly { if db.readOnly {
return nil, ErrDatabaseReadOnly return nil, common.ErrDatabaseReadOnly
} }
// Obtain writer lock. This is released by the transaction when it closes. // Obtain writer lock. This is released by the transaction when it closes.
@ -767,13 +731,13 @@ func (db *DB) beginRWTx() (*Tx, error) {
// Exit if the database is not open yet. // Exit if the database is not open yet.
if !db.opened { if !db.opened {
db.rwlock.Unlock() db.rwlock.Unlock()
return nil, ErrDatabaseNotOpen return nil, common.ErrDatabaseNotOpen
} }
// Exit if the database is not correctly mapped. // Exit if the database is not correctly mapped.
if db.data == nil { if db.data == nil {
db.rwlock.Unlock() db.rwlock.Unlock()
return nil, ErrInvalidMapping return nil, common.ErrInvalidMapping
} }
// Create a transaction associated with the database. // Create a transaction associated with the database.
@ -788,19 +752,19 @@ func (db *DB) beginRWTx() (*Tx, error) {
func (db *DB) freePages() { func (db *DB) freePages() {
// Free all pending pages prior to earliest open transaction. // Free all pending pages prior to earliest open transaction.
sort.Sort(txsById(db.txs)) sort.Sort(txsById(db.txs))
minid := txid(0xFFFFFFFFFFFFFFFF) minid := common.Txid(0xFFFFFFFFFFFFFFFF)
if len(db.txs) > 0 { if len(db.txs) > 0 {
minid = db.txs[0].meta.txid minid = db.txs[0].meta.Txid()
} }
if minid > 0 { if minid > 0 {
db.freelist.release(minid - 1) db.freelist.release(minid - 1)
} }
// Release unused txid extents. // Release unused txid extents.
for _, t := range db.txs { for _, t := range db.txs {
db.freelist.releaseRange(minid, t.meta.txid-1) db.freelist.releaseRange(minid, t.meta.Txid()-1)
minid = t.meta.txid + 1 minid = t.meta.Txid() + 1
} }
db.freelist.releaseRange(minid, txid(0xFFFFFFFFFFFFFFFF)) db.freelist.releaseRange(minid, common.Txid(0xFFFFFFFFFFFFFFFF))
// Any page both allocated and freed in an extent is safe to release. // Any page both allocated and freed in an extent is safe to release.
} }
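
freePages walks the open read transactions in txid order and releases pending pages in the gaps between them. The toy sketch below reproduces just that interval logic over plain uint64 txids; releasableRanges is an invented helper, not the library's API, and no real freelist is involved.

package main

import (
	"fmt"
	"sort"
)

// releasableRanges returns the [begin, end] txid extents whose pending
// pages are safe to release, given the txids of the open read transactions.
func releasableRanges(openTxids []uint64) [][2]uint64 {
	sort.Slice(openTxids, func(i, j int) bool { return openTxids[i] < openTxids[j] })

	const max = ^uint64(0)
	minid := max
	if len(openTxids) > 0 {
		minid = openTxids[0]
	}

	var ranges [][2]uint64
	if minid > 0 {
		// Everything older than the oldest open transaction.
		ranges = append(ranges, [2]uint64{0, minid - 1})
	}
	for _, id := range openTxids {
		if minid <= id-1 {
			ranges = append(ranges, [2]uint64{minid, id - 1})
		}
		minid = id + 1
	}
	// The extent beyond the newest open transaction.
	ranges = append(ranges, [2]uint64{minid, max})
	return ranges
}

func main() {
	fmt.Println(releasableRanges([]uint64{7, 12}))
	// [[0 6] [8 11] [13 18446744073709551615]]
}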
@ -808,7 +772,7 @@ type txsById []*Tx
func (t txsById) Len() int { return len(t) } func (t txsById) Len() int { return len(t) }
func (t txsById) Swap(i, j int) { t[i], t[j] = t[j], t[i] } func (t txsById) Swap(i, j int) { t[i], t[j] = t[j], t[i] }
func (t txsById) Less(i, j int) bool { return t[i].meta.txid < t[j].meta.txid } func (t txsById) Less(i, j int) bool { return t[i].meta.Txid() < t[j].meta.Txid() }
// removeTx removes a transaction from the database. // removeTx removes a transaction from the database.
func (db *DB) removeTx(tx *Tx) { func (db *DB) removeTx(tx *Tx) {
@ -1050,37 +1014,37 @@ func (db *DB) Stats() Stats {
// This is for internal access to the raw data bytes from the C cursor, use // This is for internal access to the raw data bytes from the C cursor, use
// carefully, or not at all. // carefully, or not at all.
func (db *DB) Info() *Info { func (db *DB) Info() *Info {
_assert(db.data != nil, "database file isn't correctly mapped") common.Assert(db.data != nil, "database file isn't correctly mapped")
return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize} return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize}
} }
// page retrieves a page reference from the mmap based on the current page size. // page retrieves a page reference from the mmap based on the current page size.
func (db *DB) page(id pgid) *page { func (db *DB) page(id common.Pgid) *common.Page {
pos := id * pgid(db.pageSize) pos := id * common.Pgid(db.pageSize)
return (*page)(unsafe.Pointer(&db.data[pos])) return (*common.Page)(unsafe.Pointer(&db.data[pos]))
} }
// pageInBuffer retrieves a page reference from a given byte array based on the current page size. // pageInBuffer retrieves a page reference from a given byte array based on the current page size.
func (db *DB) pageInBuffer(b []byte, id pgid) *page { func (db *DB) pageInBuffer(b []byte, id common.Pgid) *common.Page {
return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)])) return (*common.Page)(unsafe.Pointer(&b[id*common.Pgid(db.pageSize)]))
} }
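
page() and pageInBuffer() are plain pointer arithmetic: the page with a given id starts id*pageSize bytes into the mapped region. A tiny self-contained sketch over an ordinary byte slice, using a stand-in header struct rather than the real common.Page:

package main

import (
	"fmt"
	"unsafe"
)

// pageHeader is a stand-in header; only the offset arithmetic matters here.
type pageHeader struct {
	id    uint64
	flags uint16
}

func main() {
	const pageSize = 4096
	buf := make([]byte, 4*pageSize)

	// Write a header at page 3 and read it back with the same arithmetic.
	p := (*pageHeader)(unsafe.Pointer(&buf[3*pageSize]))
	p.id = 3
	p.flags = 0x02

	again := (*pageHeader)(unsafe.Pointer(&buf[3*pageSize]))
	fmt.Println(again.id, again.flags) // 3 2
}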
// meta retrieves the current meta page reference. // meta retrieves the current meta page reference.
func (db *DB) meta() *meta { func (db *DB) meta() *common.Meta {
// We have to return the meta with the highest txid which doesn't fail // We have to return the meta with the highest txid which doesn't fail
// validation. Otherwise, we can cause errors when in fact the database is // validation. Otherwise, we can cause errors when in fact the database is
// in a consistent state. metaA is the one with the higher txid. // in a consistent state. metaA is the one with the higher txid.
metaA := db.meta0 metaA := db.meta0
metaB := db.meta1 metaB := db.meta1
if db.meta1.txid > db.meta0.txid { if db.meta1.Txid() > db.meta0.Txid() {
metaA = db.meta1 metaA = db.meta1
metaB = db.meta0 metaB = db.meta0
} }
// Use higher meta page if valid. Otherwise, fallback to previous, if valid. // Use higher meta page if valid. Otherwise, fallback to previous, if valid.
if err := metaA.validate(); err == nil { if err := metaA.Validate(); err == nil {
return metaA return metaA
} else if err := metaB.validate(); err == nil { } else if err := metaB.Validate(); err == nil {
return metaB return metaB
} }
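
meta() prefers whichever meta page carries the higher txid, but falls back to the other copy when the preferred one fails validation. A small sketch of that selection rule over a stand-in meta type (fakeMeta and pick are invented for illustration):

package main

import (
	"errors"
	"fmt"
)

// fakeMeta carries just enough state to show the rule: prefer the higher
// txid, but only if that copy validates.
type fakeMeta struct {
	txid  uint64
	valid bool
}

func (m *fakeMeta) validate() error {
	if !m.valid {
		return errors.New("checksum error")
	}
	return nil
}

func pick(meta0, meta1 *fakeMeta) (*fakeMeta, error) {
	a, b := meta0, meta1
	if meta1.txid > meta0.txid {
		a, b = meta1, meta0
	}
	if err := a.validate(); err == nil {
		return a, nil
	} else if err := b.validate(); err == nil {
		return b, nil
	}
	return nil, errors.New("both meta pages invalid")
}

func main() {
	// The newer copy (txid 9) is corrupt, so the older valid copy wins.
	m, _ := pick(&fakeMeta{txid: 8, valid: true}, &fakeMeta{txid: 9, valid: false})
	fmt.Println(m.txid) // 8
}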
@ -1090,7 +1054,7 @@ func (db *DB) meta() *meta {
} }
// allocate returns a contiguous block of memory starting at a given page. // allocate returns a contiguous block of memory starting at a given page.
func (db *DB) allocate(txid txid, count int) (*page, error) { func (db *DB) allocate(txid common.Txid, count int) (*common.Page, error) {
// Allocate a temporary buffer for the page. // Allocate a temporary buffer for the page.
var buf []byte var buf []byte
if count == 1 { if count == 1 {
@ -1098,17 +1062,18 @@ func (db *DB) allocate(txid txid, count int) (*page, error) {
} else { } else {
buf = make([]byte, count*db.pageSize) buf = make([]byte, count*db.pageSize)
} }
p := (*page)(unsafe.Pointer(&buf[0])) p := (*common.Page)(unsafe.Pointer(&buf[0]))
p.overflow = uint32(count - 1) p.SetOverflow(uint32(count - 1))
// Use pages from the freelist if they are available. // Use pages from the freelist if they are available.
if p.id = db.freelist.allocate(txid, count); p.id != 0 { p.SetId(db.freelist.allocate(txid, count))
if p.Id() != 0 {
return p, nil return p, nil
} }
// Resize mmap() if we're at the end. // Resize mmap() if we're at the end.
p.id = db.rwtx.meta.pgid p.SetId(db.rwtx.meta.Pgid())
var minsz = int((p.id+pgid(count))+1) * db.pageSize var minsz = int((p.Id()+common.Pgid(count))+1) * db.pageSize
if minsz >= db.datasz { if minsz >= db.datasz {
if err := db.mmap(minsz); err != nil { if err := db.mmap(minsz); err != nil {
return nil, fmt.Errorf("mmap allocate error: %s", err) return nil, fmt.Errorf("mmap allocate error: %s", err)
@ -1116,7 +1081,8 @@ func (db *DB) allocate(txid txid, count int) (*page, error) {
} }
// Move the page id high water mark. // Move the page id high water mark.
db.rwtx.meta.pgid += pgid(count) curPgid := db.rwtx.meta.Pgid()
db.rwtx.meta.SetPgid(curPgid + common.Pgid(count))
return p, nil return p, nil
} }
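
allocate() asks the freelist first, and only when that returns 0 does it take pages from the end of the file and advance the meta high-water mark. A compact sketch of that decision order, with an invented allocSketch helper and a function value standing in for the freelist:

package main

import "fmt"

// allocSketch reuses a free run when the freelist offers one, otherwise
// appends at the current high-water mark and moves it forward.
func allocSketch(freelistAlloc func(n int) uint64, hwm *uint64, n int) uint64 {
	if id := freelistAlloc(n); id != 0 {
		return id // reused pages; the file does not grow
	}
	id := *hwm
	*hwm += uint64(n) // move the high-water mark past the new pages
	return id
}

func main() {
	hwm := uint64(10)
	empty := func(_ int) uint64 { return 0 }

	fmt.Println(allocSketch(empty, &hwm, 3), hwm)                            // 10 13
	fmt.Println(allocSketch(func(_ int) uint64 { return 5 }, &hwm, 2), hwm) // 5 13
}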
@ -1163,7 +1129,7 @@ func (db *DB) IsReadOnly() bool {
return db.readOnly return db.readOnly
} }
func (db *DB) freepages() []pgid { func (db *DB) freepages() []common.Pgid {
tx, err := db.beginTx() tx, err := db.beginTx()
defer func() { defer func() {
err = tx.Rollback() err = tx.Rollback()
@ -1175,8 +1141,8 @@ func (db *DB) freepages() []pgid {
panic("freepages: failed to open read only tx") panic("freepages: failed to open read only tx")
} }
reachable := make(map[pgid]*page) reachable := make(map[common.Pgid]*common.Page)
nofreed := make(map[pgid]bool) nofreed := make(map[common.Pgid]bool)
ech := make(chan error) ech := make(chan error)
go func() { go func() {
for e := range ech { for e := range ech {
@ -1188,8 +1154,8 @@ func (db *DB) freepages() []pgid {
// TODO: If check bucket reported any corruptions (ech) we shouldn't proceed to freeing the pages. // TODO: If check bucket reported any corruptions (ech) we shouldn't proceed to freeing the pages.
var fids []pgid var fids []common.Pgid
for i := pgid(2); i < db.meta().pgid; i++ { for i := common.Pgid(2); i < db.meta().Pgid(); i++ {
if _, ok := reachable[i]; !ok { if _, ok := reachable[i]; !ok {
fids = append(fids, i) fids = append(fids, i)
} }
@ -1221,7 +1187,7 @@ type Options struct {
// The alternative one is using hashmap, it is faster in almost all circumstances // The alternative one is using hashmap, it is faster in almost all circumstances
// but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe. // but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
// The default type is array // The default type is array
FreelistType FreelistType FreelistType common.FreelistType
// Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to // Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to
// grab a shared lock (UNIX). // grab a shared lock (UNIX).
@ -1263,7 +1229,7 @@ type Options struct {
var DefaultOptions = &Options{ var DefaultOptions = &Options{
Timeout: 0, Timeout: 0,
NoGrowSync: false, NoGrowSync: false,
FreelistType: FreelistArrayType, FreelistType: common.FreelistArrayType,
} }
// Stats represents statistics about the database. // Stats represents statistics about the database.
@ -1302,65 +1268,3 @@ type Info struct {
Data uintptr Data uintptr
PageSize int PageSize int
} }
type meta struct {
magic uint32
version uint32
pageSize uint32
flags uint32
root bucket
freelist pgid
pgid pgid
txid txid
checksum uint64
}
// validate checks the marker bytes and version of the meta page to ensure it matches this binary.
func (m *meta) validate() error {
if m.magic != magic {
return ErrInvalid
} else if m.version != version {
return ErrVersionMismatch
} else if m.checksum != m.sum64() {
return ErrChecksum
}
return nil
}
// copy copies one meta object to another.
func (m *meta) copy(dest *meta) {
*dest = *m
}
// write writes the meta onto a page.
func (m *meta) write(p *page) {
if m.root.root >= m.pgid {
panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid))
} else if m.freelist >= m.pgid && m.freelist != pgidNoFreelist {
// TODO: reject pgidNoFreeList if !NoFreelistSync
panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid))
}
// Page id is either going to be 0 or 1 which we can determine by the transaction ID.
p.id = pgid(m.txid % 2)
p.flags |= metaPageFlag
// Calculate the checksum.
m.checksum = m.sum64()
m.copy(p.meta())
}
// generates the checksum for the meta.
func (m *meta) sum64() uint64 {
var h = fnv.New64a()
_, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:])
return h.Sum64()
}
// _assert will panic with a given formatted message if the given condition is false.
func _assert(condition bool, msg string, v ...interface{}) {
if !condition {
panic(fmt.Sprintf("assertion failed: "+msg, v...))
}
}

View File

@ -21,6 +21,7 @@ import (
bolt "go.etcd.io/bbolt" bolt "go.etcd.io/bbolt"
"go.etcd.io/bbolt/internal/btesting" "go.etcd.io/bbolt/internal/btesting"
"go.etcd.io/bbolt/internal/common"
) )
// pageSize is the size of one page in the data file. // pageSize is the size of one page in the data file.
@ -136,7 +137,7 @@ func TestOpen_ErrInvalid(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
if _, err := bolt.Open(path, 0666, nil); err != bolt.ErrInvalid { if _, err := bolt.Open(path, 0666, nil); err != common.ErrInvalid {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
} }
@ -172,7 +173,7 @@ func TestOpen_ErrVersionMismatch(t *testing.T) {
} }
// Reopen data file. // Reopen data file.
if _, err := bolt.Open(path, 0666, nil); err != bolt.ErrVersionMismatch { if _, err := bolt.Open(path, 0666, nil); err != common.ErrVersionMismatch {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
} }
@ -208,7 +209,7 @@ func TestOpen_ErrChecksum(t *testing.T) {
} }
// Reopen data file. // Reopen data file.
if _, err := bolt.Open(path, 0666, nil); err != bolt.ErrChecksum { if _, err := bolt.Open(path, 0666, nil); err != common.ErrChecksum {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
} }
@ -552,7 +553,7 @@ func TestDB_Open_ReadOnly(t *testing.T) {
} }
// Can't launch read-write transaction. // Can't launch read-write transaction.
if _, err := readOnlyDB.Begin(true); err != bolt.ErrDatabaseReadOnly { if _, err := readOnlyDB.Begin(true); err != common.ErrDatabaseReadOnly {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
@ -641,7 +642,7 @@ func TestOpen_RecoverFreeList(t *testing.T) {
// Ensure that a database cannot open a transaction when it's not open. // Ensure that a database cannot open a transaction when it's not open.
func TestDB_Begin_ErrDatabaseNotOpen(t *testing.T) { func TestDB_Begin_ErrDatabaseNotOpen(t *testing.T) {
var db bolt.DB var db bolt.DB
if _, err := db.Begin(false); err != bolt.ErrDatabaseNotOpen { if _, err := db.Begin(false); err != common.ErrDatabaseNotOpen {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
} }
@ -727,7 +728,7 @@ func TestDB_Concurrent_WriteTo(t *testing.T) {
// Ensure that opening a transaction while the DB is closed returns an error. // Ensure that opening a transaction while the DB is closed returns an error.
func TestDB_BeginRW_Closed(t *testing.T) { func TestDB_BeginRW_Closed(t *testing.T) {
var db bolt.DB var db bolt.DB
if _, err := db.Begin(true); err != bolt.ErrDatabaseNotOpen { if _, err := db.Begin(true); err != common.ErrDatabaseNotOpen {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
} }
@ -828,7 +829,7 @@ func TestDB_Update_Closed(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
return nil return nil
}); err != bolt.ErrDatabaseNotOpen { }); err != common.ErrDatabaseNotOpen {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
} }

View File

@ -6,6 +6,8 @@ import (
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"go.etcd.io/bbolt/internal/common"
) )
func TestOpenWithPreLoadFreelist(t *testing.T) { func TestOpenWithPreLoadFreelist(t *testing.T) {
@ -76,7 +78,7 @@ func TestMethodPage(t *testing.T) {
name: "readonly mode without preloading free pages", name: "readonly mode without preloading free pages",
readonly: true, readonly: true,
preLoadFreePage: false, preLoadFreePage: false,
expectedError: ErrFreePagesNotLoaded, expectedError: common.ErrFreePagesNotLoaded,
}, },
} }

View File

@ -4,50 +4,52 @@ import (
"fmt" "fmt"
"sort" "sort"
"unsafe" "unsafe"
"go.etcd.io/bbolt/internal/common"
) )
// txPending holds a list of pgids and corresponding allocation txns // txPending holds a list of pgids and corresponding allocation txns
// that are pending to be freed. // that are pending to be freed.
type txPending struct { type txPending struct {
ids []pgid ids []common.Pgid
alloctx []txid // txids allocating the ids alloctx []common.Txid // txids allocating the ids
lastReleaseBegin txid // beginning txid of last matching releaseRange lastReleaseBegin common.Txid // beginning txid of last matching releaseRange
} }
// pidSet holds the set of starting pgids which have the same span size // pidSet holds the set of starting pgids which have the same span size
type pidSet map[pgid]struct{} type pidSet map[common.Pgid]struct{}
// freelist represents a list of all pages that are available for allocation. // freelist represents a list of all pages that are available for allocation.
// It also tracks pages that have been freed but are still in use by open transactions. // It also tracks pages that have been freed but are still in use by open transactions.
type freelist struct { type freelist struct {
freelistType FreelistType // freelist type freelistType common.FreelistType // freelist type
ids []pgid // all free and available free page ids. ids []common.Pgid // all free and available free page ids.
allocs map[pgid]txid // mapping of txid that allocated a pgid. allocs map[common.Pgid]common.Txid // mapping of Txid that allocated a pgid.
pending map[txid]*txPending // mapping of soon-to-be free page ids by tx. pending map[common.Txid]*txPending // mapping of soon-to-be free page ids by tx.
cache map[pgid]struct{} // fast lookup of all free and pending page ids. cache map[common.Pgid]struct{} // fast lookup of all free and pending page ids.
freemaps map[uint64]pidSet // key is the size of continuous pages(span), value is a set which contains the starting pgids of same size freemaps map[uint64]pidSet // key is the size of continuous pages(span), value is a set which contains the starting pgids of same size
forwardMap map[pgid]uint64 // key is start pgid, value is its span size forwardMap map[common.Pgid]uint64 // key is start pgid, value is its span size
backwardMap map[pgid]uint64 // key is end pgid, value is its span size backwardMap map[common.Pgid]uint64 // key is end pgid, value is its span size
allocate func(txid txid, n int) pgid // the freelist allocate func allocate func(txid common.Txid, n int) common.Pgid // the freelist allocate func
free_count func() int // the function which gives you free page number free_count func() int // the function which gives you free page number
mergeSpans func(ids pgids) // the mergeSpan func mergeSpans func(ids common.Pgids) // the mergeSpan func
getFreePageIDs func() []pgid // get free pgids func getFreePageIDs func() []common.Pgid // get free pgids func
readIDs func(pgids []pgid) // readIDs func reads list of pages and init the freelist readIDs func(pgids []common.Pgid) // readIDs func reads list of pages and init the freelist
} }
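
For the hashmap backend, each contiguous run of free pages is indexed three ways: by span size (freemaps), by starting pgid (forwardMap) and by ending pgid (backwardMap). The sketch below shows how a single span of 3 pages starting at pgid 10 would sit in those maps; the names mirror the fields above, but plain Go maps replace the real pgid/pidSet types.

package main

import "fmt"

func main() {
	// One free span of 3 pages starting at pgid 10: pages 10, 11, 12.
	freemaps := map[uint64]map[uint64]struct{}{} // span size -> set of start pgids
	forwardMap := map[uint64]uint64{}            // start pgid -> span size
	backwardMap := map[uint64]uint64{}           // end pgid   -> span size

	start, size := uint64(10), uint64(3)
	forwardMap[start] = size
	backwardMap[start+size-1] = size
	if _, ok := freemaps[size]; !ok {
		freemaps[size] = map[uint64]struct{}{}
	}
	freemaps[size][start] = struct{}{}

	fmt.Println(forwardMap, backwardMap) // map[10:3] map[12:3]
	fmt.Println(freemaps)                // map[3:map[10:{}]]
}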
// newFreelist returns an empty, initialized freelist. // newFreelist returns an empty, initialized freelist.
func newFreelist(freelistType FreelistType) *freelist { func newFreelist(freelistType common.FreelistType) *freelist {
f := &freelist{ f := &freelist{
freelistType: freelistType, freelistType: freelistType,
allocs: make(map[pgid]txid), allocs: make(map[common.Pgid]common.Txid),
pending: make(map[txid]*txPending), pending: make(map[common.Txid]*txPending),
cache: make(map[pgid]struct{}), cache: make(map[common.Pgid]struct{}),
freemaps: make(map[uint64]pidSet), freemaps: make(map[uint64]pidSet),
forwardMap: make(map[pgid]uint64), forwardMap: make(map[common.Pgid]uint64),
backwardMap: make(map[pgid]uint64), backwardMap: make(map[common.Pgid]uint64),
} }
if freelistType == FreelistMapType { if freelistType == common.FreelistMapType {
f.allocate = f.hashmapAllocate f.allocate = f.hashmapAllocate
f.free_count = f.hashmapFreeCount f.free_count = f.hashmapFreeCount
f.mergeSpans = f.hashmapMergeSpans f.mergeSpans = f.hashmapMergeSpans
@ -71,7 +73,7 @@ func (f *freelist) size() int {
// The first element will be used to store the count. See freelist.write. // The first element will be used to store the count. See freelist.write.
n++ n++
} }
return int(pageHeaderSize) + (int(unsafe.Sizeof(pgid(0))) * n) return int(common.PageHeaderSize) + (int(unsafe.Sizeof(common.Pgid(0))) * n)
} }
// count returns count of pages on the freelist // count returns count of pages on the freelist
@ -95,23 +97,23 @@ func (f *freelist) pending_count() int {
// copyall copies a list of all free ids and all pending ids in one sorted list. // copyall copies a list of all free ids and all pending ids in one sorted list.
// f.count returns the minimum length required for dst. // f.count returns the minimum length required for dst.
func (f *freelist) copyall(dst []pgid) { func (f *freelist) copyall(dst []common.Pgid) {
m := make(pgids, 0, f.pending_count()) m := make(common.Pgids, 0, f.pending_count())
for _, txp := range f.pending { for _, txp := range f.pending {
m = append(m, txp.ids...) m = append(m, txp.ids...)
} }
sort.Sort(m) sort.Sort(m)
mergepgids(dst, f.getFreePageIDs(), m) common.Mergepgids(dst, f.getFreePageIDs(), m)
} }
// arrayAllocate returns the starting page id of a contiguous list of pages of a given size. // arrayAllocate returns the starting page id of a contiguous list of pages of a given size.
// If a contiguous block cannot be found then 0 is returned. // If a contiguous block cannot be found then 0 is returned.
func (f *freelist) arrayAllocate(txid txid, n int) pgid { func (f *freelist) arrayAllocate(txid common.Txid, n int) common.Pgid {
if len(f.ids) == 0 { if len(f.ids) == 0 {
return 0 return 0
} }
var initial, previd pgid var initial, previd common.Pgid
for i, id := range f.ids { for i, id := range f.ids {
if id <= 1 { if id <= 1 {
panic(fmt.Sprintf("invalid page allocation: %d", id)) panic(fmt.Sprintf("invalid page allocation: %d", id))
@ -123,7 +125,7 @@ func (f *freelist) arrayAllocate(txid txid, n int) pgid {
} }
// If we found a contiguous block then remove it and return it. // If we found a contiguous block then remove it and return it.
if (id-initial)+1 == pgid(n) { if (id-initial)+1 == common.Pgid(n) {
// If we're allocating off the beginning then take the fast path // If we're allocating off the beginning then take the fast path
// and just adjust the existing slice. This will use extra memory // and just adjust the existing slice. This will use extra memory
// temporarily but the append() in free() will realloc the slice // temporarily but the append() in free() will realloc the slice
@ -136,7 +138,7 @@ func (f *freelist) arrayAllocate(txid txid, n int) pgid {
} }
// Remove from the free cache. // Remove from the free cache.
for i := pgid(0); i < pgid(n); i++ { for i := common.Pgid(0); i < common.Pgid(n); i++ {
delete(f.cache, initial+i) delete(f.cache, initial+i)
} }
f.allocs[initial] = txid f.allocs[initial] = txid
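
The contiguity test in arrayAllocate amounts to asking whether the current id extends an unbroken run of length n starting at initial. A standalone sketch over a plain sorted slice, with the slice surgery and cache maintenance left out (findRun is an invented name):

package main

import "fmt"

// findRun returns the first id in sorted ids that starts a run of n
// consecutive ids, or 0 if none exists.
func findRun(ids []uint64, n int) uint64 {
	var initial, previd uint64
	for _, id := range ids {
		// Reset the candidate start whenever the sequence breaks.
		if previd == 0 || id-previd != 1 {
			initial = id
		}
		if (id-initial)+1 == uint64(n) {
			return initial
		}
		previd = id
	}
	return 0
}

func main() {
	fmt.Println(findRun([]uint64{3, 4, 6, 7, 8, 11}, 3)) // 6
	fmt.Println(findRun([]uint64{3, 4, 6, 7, 8, 11}, 4)) // 0
}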
@ -150,9 +152,9 @@ func (f *freelist) arrayAllocate(txid txid, n int) pgid {
// free releases a page and its overflow for a given transaction id. // free releases a page and its overflow for a given transaction id.
// If the page is already free then a panic will occur. // If the page is already free then a panic will occur.
func (f *freelist) free(txid txid, p *page) { func (f *freelist) free(txid common.Txid, p *common.Page) {
if p.id <= 1 { if p.Id() <= 1 {
panic(fmt.Sprintf("cannot free page 0 or 1: %d", p.id)) panic(fmt.Sprintf("cannot free page 0 or 1: %d", p.Id()))
} }
// Free page and all its overflow pages. // Free page and all its overflow pages.
@ -161,15 +163,15 @@ func (f *freelist) free(txid txid, p *page) {
txp = &txPending{} txp = &txPending{}
f.pending[txid] = txp f.pending[txid] = txp
} }
allocTxid, ok := f.allocs[p.id] allocTxid, ok := f.allocs[p.Id()]
if ok { if ok {
delete(f.allocs, p.id) delete(f.allocs, p.Id())
} else if (p.flags & freelistPageFlag) != 0 { } else if (p.Flags() & common.FreelistPageFlag) != 0 {
// Freelist is always allocated by prior tx. // Freelist is always allocated by prior tx.
allocTxid = txid - 1 allocTxid = txid - 1
} }
for id := p.id; id <= p.id+pgid(p.overflow); id++ { for id := p.Id(); id <= p.Id()+common.Pgid(p.Overflow()); id++ {
// Verify that page is not already free. // Verify that page is not already free.
if _, ok := f.cache[id]; ok { if _, ok := f.cache[id]; ok {
panic(fmt.Sprintf("page %d already freed", id)) panic(fmt.Sprintf("page %d already freed", id))
@ -182,8 +184,8 @@ func (f *freelist) free(txid txid, p *page) {
} }
// release moves all page ids for a transaction id (or older) to the freelist. // release moves all page ids for a transaction id (or older) to the freelist.
func (f *freelist) release(txid txid) { func (f *freelist) release(txid common.Txid) {
m := make(pgids, 0) m := make(common.Pgids, 0)
for tid, txp := range f.pending { for tid, txp := range f.pending {
if tid <= txid { if tid <= txid {
// Move transaction's pending pages to the available freelist. // Move transaction's pending pages to the available freelist.
@ -196,11 +198,11 @@ func (f *freelist) release(txid txid) {
} }
// releaseRange moves pending pages allocated within an extent [begin,end] to the free list. // releaseRange moves pending pages allocated within an extent [begin,end] to the free list.
func (f *freelist) releaseRange(begin, end txid) { func (f *freelist) releaseRange(begin, end common.Txid) {
if begin > end { if begin > end {
return return
} }
var m pgids var m common.Pgids
for tid, txp := range f.pending { for tid, txp := range f.pending {
if tid < begin || tid > end { if tid < begin || tid > end {
continue continue
@ -229,13 +231,13 @@ func (f *freelist) releaseRange(begin, end txid) {
} }
// rollback removes the pages from a given pending tx. // rollback removes the pages from a given pending tx.
func (f *freelist) rollback(txid txid) { func (f *freelist) rollback(txid common.Txid) {
// Remove page ids from cache. // Remove page ids from cache.
txp := f.pending[txid] txp := f.pending[txid]
if txp == nil { if txp == nil {
return return
} }
var m pgids var m common.Pgids
for i, pgid := range txp.ids { for i, pgid := range txp.ids {
delete(f.cache, pgid) delete(f.cache, pgid)
tx := txp.alloctx[i] tx := txp.alloctx[i]
@ -256,82 +258,69 @@ func (f *freelist) rollback(txid txid) {
} }
// freed returns whether a given page is in the free list. // freed returns whether a given page is in the free list.
func (f *freelist) freed(pgId pgid) bool { func (f *freelist) freed(pgId common.Pgid) bool {
_, ok := f.cache[pgId] _, ok := f.cache[pgId]
return ok return ok
} }
// read initializes the freelist from a freelist page. // read initializes the freelist from a freelist page.
func (f *freelist) read(p *page) { func (f *freelist) read(p *common.Page) {
if (p.flags & freelistPageFlag) == 0 { if (p.Flags() & common.FreelistPageFlag) == 0 {
panic(fmt.Sprintf("invalid freelist page: %d, page type is %s", p.id, p.typ())) panic(fmt.Sprintf("invalid freelist page: %d, page type is %s", p.Id(), p.Typ()))
}
// If the page.count is at the max uint16 value (64k) then it's considered
// an overflow and the size of the freelist is stored as the first element.
var idx, count = 0, int(p.count)
if count == 0xFFFF {
idx = 1
c := *(*pgid)(unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)))
count = int(c)
if count < 0 {
panic(fmt.Sprintf("leading element count %d overflows int", c))
}
} }
ids := p.FreelistPageIds()
// Copy the list of page ids from the freelist. // Copy the list of page ids from the freelist.
if count == 0 { if len(ids) == 0 {
f.ids = nil f.ids = nil
} else { } else {
var ids []pgid
data := unsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p), unsafe.Sizeof(ids[0]), idx)
unsafeSlice(unsafe.Pointer(&ids), data, count)
// copy the ids, so we don't modify on the freelist page directly // copy the ids, so we don't modify on the freelist page directly
idsCopy := make([]pgid, count) idsCopy := make([]common.Pgid, len(ids))
copy(idsCopy, ids) copy(idsCopy, ids)
// Make sure they're sorted. // Make sure they're sorted.
sort.Sort(pgids(idsCopy)) sort.Sort(common.Pgids(idsCopy))
f.readIDs(idsCopy) f.readIDs(idsCopy)
} }
} }
// arrayReadIDs initializes the freelist from a given list of ids. // arrayReadIDs initializes the freelist from a given list of ids.
func (f *freelist) arrayReadIDs(ids []pgid) { func (f *freelist) arrayReadIDs(ids []common.Pgid) {
f.ids = ids f.ids = ids
f.reindex() f.reindex()
} }
func (f *freelist) arrayGetFreePageIDs() []pgid { func (f *freelist) arrayGetFreePageIDs() []common.Pgid {
return f.ids return f.ids
} }
// write writes the page ids onto a freelist page. All free and pending ids are // write writes the page ids onto a freelist page. All free and pending ids are
// saved to disk since in the event of a program crash, all pending ids will // saved to disk since in the event of a program crash, all pending ids will
// become free. // become free.
func (f *freelist) write(p *page) error { func (f *freelist) write(p *common.Page) error {
// Combine the old free pgids and pgids waiting on an open transaction. // Combine the old free pgids and pgids waiting on an open transaction.
// Update the header flag. // Update the header flag.
p.flags |= freelistPageFlag p.FlagsXOR(common.FreelistPageFlag)
// The page.count can only hold up to 64k elements so if we overflow that // The page.count can only hold up to 64k elements so if we overflow that
// number then we handle it by putting the size in the first element. // number then we handle it by putting the size in the first element.
l := f.count() l := f.count()
if l == 0 { if l == 0 {
p.count = uint16(l) p.SetCount(uint16(l))
} else if l < 0xFFFF { } else if l < 0xFFFF {
p.count = uint16(l) p.SetCount(uint16(l))
var ids []pgid var ids []common.Pgid
data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) data := common.UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))
unsafeSlice(unsafe.Pointer(&ids), data, l) common.UnsafeSlice(unsafe.Pointer(&ids), data, l)
f.copyall(ids) f.copyall(ids)
} else { } else {
p.count = 0xFFFF p.SetCount(0xFFFF)
var ids []pgid var ids []common.Pgid
data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)) data := common.UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))
unsafeSlice(unsafe.Pointer(&ids), data, l+1) common.UnsafeSlice(unsafe.Pointer(&ids), data, l+1)
ids[0] = pgid(l) ids[0] = common.Pgid(l)
f.copyall(ids[1:]) f.copyall(ids[1:])
} }
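
The count/overflow convention in write() is: if the freelist fits in a uint16, the ids follow the page header directly; otherwise count is set to the 0xFFFF sentinel and the real length is stored as the first element. A sketch of that encoding over a plain slice, without unsafe or a real page header (encodeFreelist is invented for illustration):

package main

import "fmt"

// encodeFreelist returns the uint16 count field and the element slice that
// would follow the page header, mirroring the overflow rule above.
func encodeFreelist(ids []uint64) (count uint16, elems []uint64) {
	l := len(ids)
	switch {
	case l == 0:
		return 0, nil
	case l < 0xFFFF:
		return uint16(l), ids
	default:
		// The real length goes in the first element; count holds the sentinel.
		return 0xFFFF, append([]uint64{uint64(l)}, ids...)
	}
}

func main() {
	count, elems := encodeFreelist([]uint64{4, 9, 10})
	fmt.Println(count, elems) // 3 [4 9 10]
}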
@ -339,11 +328,11 @@ func (f *freelist) write(p *page) error {
} }
// reload reads the freelist from a page and filters out pending items. // reload reads the freelist from a page and filters out pending items.
func (f *freelist) reload(p *page) { func (f *freelist) reload(p *common.Page) {
f.read(p) f.read(p)
// Build a cache of only pending pages. // Build a cache of only pending pages.
pcache := make(map[pgid]bool) pcache := make(map[common.Pgid]bool)
for _, txp := range f.pending { for _, txp := range f.pending {
for _, pendingID := range txp.ids { for _, pendingID := range txp.ids {
pcache[pendingID] = true pcache[pendingID] = true
@ -352,7 +341,7 @@ func (f *freelist) reload(p *page) {
// Check each page in the freelist and build a new available freelist // Check each page in the freelist and build a new available freelist
// with any pages not in the pending lists. // with any pages not in the pending lists.
var a []pgid var a []common.Pgid
for _, id := range f.getFreePageIDs() { for _, id := range f.getFreePageIDs() {
if !pcache[id] { if !pcache[id] {
a = append(a, id) a = append(a, id)
@ -362,10 +351,10 @@ func (f *freelist) reload(p *page) {
f.readIDs(a) f.readIDs(a)
} }
// noSyncReload reads the freelist from pgids and filters out pending items. // noSyncReload reads the freelist from Pgids and filters out pending items.
func (f *freelist) noSyncReload(pgids []pgid) { func (f *freelist) noSyncReload(Pgids []common.Pgid) {
// Build a cache of only pending pages. // Build a cache of only pending pages.
pcache := make(map[pgid]bool) pcache := make(map[common.Pgid]bool)
for _, txp := range f.pending { for _, txp := range f.pending {
for _, pendingID := range txp.ids { for _, pendingID := range txp.ids {
pcache[pendingID] = true pcache[pendingID] = true
@ -374,8 +363,8 @@ func (f *freelist) noSyncReload(pgids []pgid) {
// Check each page in the freelist and build a new available freelist // Check each page in the freelist and build a new available freelist
// with any pages not in the pending lists. // with any pages not in the pending lists.
var a []pgid var a []common.Pgid
for _, id := range pgids { for _, id := range Pgids {
if !pcache[id] { if !pcache[id] {
a = append(a, id) a = append(a, id)
} }
@ -387,7 +376,7 @@ func (f *freelist) noSyncReload(pgids []pgid) {
// reindex rebuilds the free cache based on available and pending free lists. // reindex rebuilds the free cache based on available and pending free lists.
func (f *freelist) reindex() { func (f *freelist) reindex() {
ids := f.getFreePageIDs() ids := f.getFreePageIDs()
f.cache = make(map[pgid]struct{}, len(ids)) f.cache = make(map[common.Pgid]struct{}, len(ids))
for _, id := range ids { for _, id := range ids {
f.cache[id] = struct{}{} f.cache[id] = struct{}{}
} }
@ -399,7 +388,7 @@ func (f *freelist) reindex() {
} }
// arrayMergeSpans try to merge list of pages(represented by pgids) with existing spans but using array // arrayMergeSpans try to merge list of pages(represented by pgids) with existing spans but using array
func (f *freelist) arrayMergeSpans(ids pgids) { func (f *freelist) arrayMergeSpans(ids common.Pgids) {
sort.Sort(ids) sort.Sort(ids)
f.ids = pgids(f.ids).merge(ids) f.ids = common.Pgids(f.ids).Merge(ids)
} }

View File

@ -1,6 +1,10 @@
package bbolt package bbolt
import "sort" import (
"sort"
"go.etcd.io/bbolt/internal/common"
)
// hashmapFreeCount returns count of free pages(hashmap version) // hashmapFreeCount returns count of free pages(hashmap version)
func (f *freelist) hashmapFreeCount() int { func (f *freelist) hashmapFreeCount() int {
@ -13,7 +17,7 @@ func (f *freelist) hashmapFreeCount() int {
} }
// hashmapAllocate serves the same purpose as arrayAllocate, but use hashmap as backend // hashmapAllocate serves the same purpose as arrayAllocate, but use hashmap as backend
func (f *freelist) hashmapAllocate(txid txid, n int) pgid { func (f *freelist) hashmapAllocate(txid common.Txid, n int) common.Pgid {
if n == 0 { if n == 0 {
return 0 return 0
} }
@ -26,7 +30,7 @@ func (f *freelist) hashmapAllocate(txid txid, n int) pgid {
f.allocs[pid] = txid f.allocs[pid] = txid
for i := pgid(0); i < pgid(n); i++ { for i := common.Pgid(0); i < common.Pgid(n); i++ {
delete(f.cache, pid+i) delete(f.cache, pid+i)
} }
return pid return pid
@ -48,9 +52,9 @@ func (f *freelist) hashmapAllocate(txid txid, n int) pgid {
remain := size - uint64(n) remain := size - uint64(n)
// add remain span // add remain span
f.addSpan(pid+pgid(n), remain) f.addSpan(pid+common.Pgid(n), remain)
for i := pgid(0); i < pgid(n); i++ { for i := common.Pgid(0); i < common.Pgid(n); i++ {
delete(f.cache, pid+i) delete(f.cache, pid+i)
} }
return pid return pid
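
When no span of exactly n pages exists, the hashmap allocator takes the first larger span, hands out its head and re-registers the tail as a smaller span. The simplified sketch below keeps only the freemaps-by-size index (forwardMap/backwardMap and the allocs/cache bookkeeping are omitted), so it is an illustration of the lookup order rather than the library's code.

package main

import "fmt"

// pick returns the start of a span of n pages: an exact-size span wins,
// otherwise the first larger span is split and its tail stays free.
func pick(freemaps map[uint64]map[uint64]struct{}, n uint64) (start uint64, ok bool) {
	// 1. Exact fit.
	if set, found := freemaps[n]; found {
		for pid := range set {
			delete(set, pid)
			return pid, true
		}
	}
	// 2. First larger span: allocate its head, keep the tail as a new span.
	for size, set := range freemaps {
		if size <= n {
			continue
		}
		for pid := range set {
			delete(set, pid)
			remain := size - n
			if _, found := freemaps[remain]; !found {
				freemaps[remain] = map[uint64]struct{}{}
			}
			freemaps[remain][pid+n] = struct{}{}
			return pid, true
		}
	}
	return 0, false
}

func main() {
	spans := map[uint64]map[uint64]struct{}{5: {20: {}}}
	start, _ := pick(spans, 2)
	fmt.Println(start, spans) // 20 map[3:map[22:{}] 5:map[]]
}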
@ -61,7 +65,7 @@ func (f *freelist) hashmapAllocate(txid txid, n int) pgid {
} }
// hashmapReadIDs reads pgids as input and initializes the freelist (hashmap version) // hashmapReadIDs reads pgids as input and initializes the freelist (hashmap version) func (f *freelist) hashmapReadIDs(pgids []pgid) { func (f *freelist) hashmapReadIDs(pgids []common.Pgid) {
func (f *freelist) hashmapReadIDs(pgids []pgid) { func (f *freelist) hashmapReadIDs(pgids []common.Pgid) {
f.init(pgids) f.init(pgids)
// Rebuild the page cache. // Rebuild the page cache.
@ -69,25 +73,25 @@ func (f *freelist) hashmapReadIDs(pgids []pgid) {
} }
// hashmapGetFreePageIDs returns the sorted free page ids // hashmapGetFreePageIDs returns the sorted free page ids
func (f *freelist) hashmapGetFreePageIDs() []pgid { func (f *freelist) hashmapGetFreePageIDs() []common.Pgid {
count := f.free_count() count := f.free_count()
if count == 0 { if count == 0 {
return nil return nil
} }
m := make([]pgid, 0, count) m := make([]common.Pgid, 0, count)
for start, size := range f.forwardMap { for start, size := range f.forwardMap {
for i := 0; i < int(size); i++ { for i := 0; i < int(size); i++ {
m = append(m, start+pgid(i)) m = append(m, start+common.Pgid(i))
} }
} }
sort.Sort(pgids(m)) sort.Sort(common.Pgids(m))
return m return m
} }
// hashmapMergeSpans try to merge list of pages(represented by pgids) with existing spans // hashmapMergeSpans try to merge list of pages(represented by pgids) with existing spans
func (f *freelist) hashmapMergeSpans(ids pgids) { func (f *freelist) hashmapMergeSpans(ids common.Pgids) {
for _, id := range ids { for _, id := range ids {
// try to see if we can merge and update // try to see if we can merge and update
f.mergeWithExistingSpan(id) f.mergeWithExistingSpan(id)
@ -95,7 +99,7 @@ func (f *freelist) hashmapMergeSpans(ids pgids) {
} }
// mergeWithExistingSpan merges pid to the existing free spans, try to merge it backward and forward // mergeWithExistingSpan merges pid to the existing free spans, try to merge it backward and forward
func (f *freelist) mergeWithExistingSpan(pid pgid) { func (f *freelist) mergeWithExistingSpan(pid common.Pgid) {
prev := pid - 1 prev := pid - 1
next := pid + 1 next := pid + 1
@ -106,10 +110,10 @@ func (f *freelist) mergeWithExistingSpan(pid pgid) {
if mergeWithPrev { if mergeWithPrev {
//merge with previous span //merge with previous span
start := prev + 1 - pgid(preSize) start := prev + 1 - common.Pgid(preSize)
f.delSpan(start, preSize) f.delSpan(start, preSize)
newStart -= pgid(preSize) newStart -= common.Pgid(preSize)
newSize += preSize newSize += preSize
} }
@ -122,19 +126,19 @@ func (f *freelist) mergeWithExistingSpan(pid pgid) {
f.addSpan(newStart, newSize) f.addSpan(newStart, newSize)
} }
func (f *freelist) addSpan(start pgid, size uint64) { func (f *freelist) addSpan(start common.Pgid, size uint64) {
f.backwardMap[start-1+pgid(size)] = size f.backwardMap[start-1+common.Pgid(size)] = size
f.forwardMap[start] = size f.forwardMap[start] = size
if _, ok := f.freemaps[size]; !ok { if _, ok := f.freemaps[size]; !ok {
f.freemaps[size] = make(map[pgid]struct{}) f.freemaps[size] = make(map[common.Pgid]struct{})
} }
f.freemaps[size][start] = struct{}{} f.freemaps[size][start] = struct{}{}
} }
func (f *freelist) delSpan(start pgid, size uint64) { func (f *freelist) delSpan(start common.Pgid, size uint64) {
delete(f.forwardMap, start) delete(f.forwardMap, start)
delete(f.backwardMap, start+pgid(size-1)) delete(f.backwardMap, start+common.Pgid(size-1))
delete(f.freemaps[size], start) delete(f.freemaps[size], start)
if len(f.freemaps[size]) == 0 { if len(f.freemaps[size]) == 0 {
delete(f.freemaps, size) delete(f.freemaps, size)
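
Merging a freed page works by checking backwardMap for a span that ends at pid-1 and forwardMap for one that starts at pid+1, then coalescing all three into one span. The sketch below runs that logic once with plain maps for a page freed exactly between two existing spans; the freemaps-by-size index is left out for brevity.

package main

import "fmt"

func main() {
	// Existing free spans: 9-11 (size 3) and 13-14 (size 2).
	forwardMap := map[uint64]uint64{9: 3, 13: 2}   // start -> size
	backwardMap := map[uint64]uint64{11: 3, 14: 2} // end   -> size

	pid := uint64(12) // newly freed page sits exactly between the two spans

	newStart, newSize := pid, uint64(1)
	if preSize, ok := backwardMap[pid-1]; ok { // merge with the span ending at pid-1
		newStart -= preSize
		newSize += preSize
		delete(backwardMap, pid-1)
		delete(forwardMap, pid-preSize)
	}
	if nextSize, ok := forwardMap[pid+1]; ok { // merge with the span starting at pid+1
		newSize += nextSize
		delete(forwardMap, pid+1)
		delete(backwardMap, pid+nextSize)
	}
	forwardMap[newStart] = newSize
	backwardMap[newStart+newSize-1] = newSize

	fmt.Println(forwardMap, backwardMap) // map[9:6] map[14:6]
}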
@ -143,7 +147,7 @@ func (f *freelist) delSpan(start pgid, size uint64) {
// init initializes the freelist from pgids; used by the hashmap version // init initializes the freelist from pgids; used by the hashmap version
// pgids must be sorted // pgids must be sorted
func (f *freelist) init(pgids []pgid) { func (f *freelist) init(pgids []common.Pgid) {
if len(pgids) == 0 { if len(pgids) == 0 {
return return
} }
@ -151,13 +155,13 @@ func (f *freelist) init(pgids []pgid) {
size := uint64(1) size := uint64(1)
start := pgids[0] start := pgids[0]
if !sort.SliceIsSorted([]pgid(pgids), func(i, j int) bool { return pgids[i] < pgids[j] }) { if !sort.SliceIsSorted([]common.Pgid(pgids), func(i, j int) bool { return pgids[i] < pgids[j] }) {
panic("pgids not sorted") panic("pgids not sorted")
} }
f.freemaps = make(map[uint64]pidSet) f.freemaps = make(map[uint64]pidSet)
f.forwardMap = make(map[pgid]uint64) f.forwardMap = make(map[common.Pgid]uint64)
f.backwardMap = make(map[pgid]uint64) f.backwardMap = make(map[common.Pgid]uint64)
for i := 1; i < len(pgids); i++ { for i := 1; i < len(pgids); i++ {
// continuous page // continuous page
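The hashmap freelist above tracks free pages as spans: forwardMap maps a span's start page to its size, backwardMap maps its end page to its size, and freemaps groups span starts by size. The following is a minimal standalone sketch of that bookkeeping (hypothetical `Pgid` and `spanIndex` types, not the unexported freelist itself) showing how freeing a single page coalesces neighbouring spans, the same dance mergeWithExistingSpan performs with delSpan/addSpan:

```go
package main

import "fmt"

// Pgid mirrors common.Pgid for this standalone sketch.
type Pgid uint64

// spanIndex keeps the same three views the hashmap freelist keeps.
type spanIndex struct {
	forward  map[Pgid]uint64              // span start -> span size
	backward map[Pgid]uint64              // span end   -> span size
	bySize   map[uint64]map[Pgid]struct{} // span size  -> set of span starts
}

func newSpanIndex() *spanIndex {
	return &spanIndex{
		forward:  map[Pgid]uint64{},
		backward: map[Pgid]uint64{},
		bySize:   map[uint64]map[Pgid]struct{}{},
	}
}

func (s *spanIndex) add(start Pgid, size uint64) {
	s.forward[start] = size
	s.backward[start-1+Pgid(size)] = size
	if s.bySize[size] == nil {
		s.bySize[size] = map[Pgid]struct{}{}
	}
	s.bySize[size][start] = struct{}{}
}

func (s *spanIndex) del(start Pgid, size uint64) {
	delete(s.forward, start)
	delete(s.backward, start-1+Pgid(size))
	delete(s.bySize[size], start)
	if len(s.bySize[size]) == 0 {
		delete(s.bySize, size)
	}
}

// merge frees pid and coalesces it with adjacent spans on either side,
// the same idea as mergeWithExistingSpan above.
func (s *spanIndex) merge(pid Pgid) {
	newStart, newSize := pid, uint64(1)
	if prevSize, ok := s.backward[pid-1]; ok { // a span ends right before pid
		s.del(pid-Pgid(prevSize), prevSize)
		newStart, newSize = pid-Pgid(prevSize), newSize+prevSize
	}
	if nextSize, ok := s.forward[pid+1]; ok { // a span starts right after pid
		s.del(pid+1, nextSize)
		newSize += nextSize
	}
	s.add(newStart, newSize)
}

func main() {
	s := newSpanIndex()
	s.add(3, 2)            // pages 3-4 are free
	s.add(7, 1)            // page 7 is free
	s.merge(5)             // freeing 5 coalesces it with 3-4
	fmt.Println(s.forward) // map[3:3 7:1]
}
```

Freeing page 5 next to the span 3-4 deletes the old span and re-adds a single span 3-5, which is exactly what the delSpan/addSpan pair above does.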



@ -7,6 +7,8 @@ import (
"sort" "sort"
"testing" "testing"
"unsafe" "unsafe"
"go.etcd.io/bbolt/internal/common"
) )
// TestFreelistType is used as a env variable for test to indicate the backend type // TestFreelistType is used as a env variable for test to indicate the backend type
@ -15,17 +17,17 @@ const TestFreelistType = "TEST_FREELIST_TYPE"
// Ensure that a page is added to a transaction's freelist. // Ensure that a page is added to a transaction's freelist.
func TestFreelist_free(t *testing.T) { func TestFreelist_free(t *testing.T) {
f := newTestFreelist() f := newTestFreelist()
f.free(100, &page{id: 12}) f.free(100, common.NewPage(12, 0, 0, 0))
if !reflect.DeepEqual([]pgid{12}, f.pending[100].ids) { if !reflect.DeepEqual([]common.Pgid{12}, f.pending[100].ids) {
t.Fatalf("exp=%v; got=%v", []pgid{12}, f.pending[100].ids) t.Fatalf("exp=%v; got=%v", []common.Pgid{12}, f.pending[100].ids)
} }
} }
// Ensure that a page and its overflow is added to a transaction's freelist. // Ensure that a page and its overflow is added to a transaction's freelist.
func TestFreelist_free_overflow(t *testing.T) { func TestFreelist_free_overflow(t *testing.T) {
f := newTestFreelist() f := newTestFreelist()
f.free(100, &page{id: 12, overflow: 3}) f.free(100, common.NewPage(12, 0, 0, 3))
if exp := []pgid{12, 13, 14, 15}; !reflect.DeepEqual(exp, f.pending[100].ids) { if exp := []common.Pgid{12, 13, 14, 15}; !reflect.DeepEqual(exp, f.pending[100].ids) {
t.Fatalf("exp=%v; got=%v", exp, f.pending[100].ids) t.Fatalf("exp=%v; got=%v", exp, f.pending[100].ids)
} }
} }
@ -33,17 +35,17 @@ func TestFreelist_free_overflow(t *testing.T) {
// Ensure that a transaction's free pages can be released. // Ensure that a transaction's free pages can be released.
func TestFreelist_release(t *testing.T) { func TestFreelist_release(t *testing.T) {
f := newTestFreelist() f := newTestFreelist()
f.free(100, &page{id: 12, overflow: 1}) f.free(100, common.NewPage(12, 0, 0, 1))
f.free(100, &page{id: 9}) f.free(100, common.NewPage(9, 0, 0, 0))
f.free(102, &page{id: 39}) f.free(102, common.NewPage(39, 0, 0, 0))
f.release(100) f.release(100)
f.release(101) f.release(101)
if exp := []pgid{9, 12, 13}; !reflect.DeepEqual(exp, f.getFreePageIDs()) { if exp := []common.Pgid{9, 12, 13}; !reflect.DeepEqual(exp, f.getFreePageIDs()) {
t.Fatalf("exp=%v; got=%v", exp, f.getFreePageIDs()) t.Fatalf("exp=%v; got=%v", exp, f.getFreePageIDs())
} }
f.release(102) f.release(102)
if exp := []pgid{9, 12, 13, 39}; !reflect.DeepEqual(exp, f.getFreePageIDs()) { if exp := []common.Pgid{9, 12, 13, 39}; !reflect.DeepEqual(exp, f.getFreePageIDs()) {
t.Fatalf("exp=%v; got=%v", exp, f.getFreePageIDs()) t.Fatalf("exp=%v; got=%v", exp, f.getFreePageIDs())
} }
} }
@ -51,33 +53,33 @@ func TestFreelist_release(t *testing.T) {
// Ensure that releaseRange handles boundary conditions correctly // Ensure that releaseRange handles boundary conditions correctly
func TestFreelist_releaseRange(t *testing.T) { func TestFreelist_releaseRange(t *testing.T) {
type testRange struct { type testRange struct {
begin, end txid begin, end common.Txid
} }
type testPage struct { type testPage struct {
id pgid id common.Pgid
n int n int
allocTxn txid allocTxn common.Txid
freeTxn txid freeTxn common.Txid
} }
var releaseRangeTests = []struct { var releaseRangeTests = []struct {
title string title string
pagesIn []testPage pagesIn []testPage
releaseRanges []testRange releaseRanges []testRange
wantFree []pgid wantFree []common.Pgid
}{ }{
{ {
title: "Single pending in range", title: "Single pending in range",
pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}},
releaseRanges: []testRange{{1, 300}}, releaseRanges: []testRange{{1, 300}},
wantFree: []pgid{3}, wantFree: []common.Pgid{3},
}, },
{ {
title: "Single pending with minimum end range", title: "Single pending with minimum end range",
pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}},
releaseRanges: []testRange{{1, 200}}, releaseRanges: []testRange{{1, 200}},
wantFree: []pgid{3}, wantFree: []common.Pgid{3},
}, },
{ {
title: "Single pending outsize minimum end range", title: "Single pending outsize minimum end range",
@ -89,7 +91,7 @@ func TestFreelist_releaseRange(t *testing.T) {
title: "Single pending with minimum begin range", title: "Single pending with minimum begin range",
pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}}, pagesIn: []testPage{{id: 3, n: 1, allocTxn: 100, freeTxn: 200}},
releaseRanges: []testRange{{100, 300}}, releaseRanges: []testRange{{100, 300}},
wantFree: []pgid{3}, wantFree: []common.Pgid{3},
}, },
{ {
title: "Single pending outside minimum begin range", title: "Single pending outside minimum begin range",
@ -101,7 +103,7 @@ func TestFreelist_releaseRange(t *testing.T) {
title: "Single pending in minimum range", title: "Single pending in minimum range",
pagesIn: []testPage{{id: 3, n: 1, allocTxn: 199, freeTxn: 200}}, pagesIn: []testPage{{id: 3, n: 1, allocTxn: 199, freeTxn: 200}},
releaseRanges: []testRange{{199, 200}}, releaseRanges: []testRange{{199, 200}},
wantFree: []pgid{3}, wantFree: []common.Pgid{3},
}, },
{ {
title: "Single pending and read transaction at 199", title: "Single pending and read transaction at 199",
@ -146,16 +148,16 @@ func TestFreelist_releaseRange(t *testing.T) {
{id: 9, n: 2, allocTxn: 175, freeTxn: 200}, {id: 9, n: 2, allocTxn: 175, freeTxn: 200},
}, },
releaseRanges: []testRange{{50, 149}, {151, 300}}, releaseRanges: []testRange{{50, 149}, {151, 300}},
wantFree: []pgid{4, 9, 10}, wantFree: []common.Pgid{4, 9, 10},
}, },
} }
for _, c := range releaseRangeTests { for _, c := range releaseRangeTests {
f := newTestFreelist() f := newTestFreelist()
var ids []pgid var ids []common.Pgid
for _, p := range c.pagesIn { for _, p := range c.pagesIn {
for i := uint64(0); i < uint64(p.n); i++ { for i := uint64(0); i < uint64(p.n); i++ {
ids = append(ids, pgid(uint64(p.id)+i)) ids = append(ids, common.Pgid(uint64(p.id)+i))
} }
} }
f.readIDs(ids) f.readIDs(ids)
@ -164,7 +166,7 @@ func TestFreelist_releaseRange(t *testing.T) {
} }
for _, p := range c.pagesIn { for _, p := range c.pagesIn {
f.free(p.freeTxn, &page{id: p.id, overflow: uint32(p.n - 1)}) f.free(p.freeTxn, common.NewPage(p.id, 0, 0, uint32(p.n-1)))
} }
for _, r := range c.releaseRanges { for _, r := range c.releaseRanges {
@ -179,11 +181,11 @@ func TestFreelist_releaseRange(t *testing.T) {
func TestFreelistHashmap_allocate(t *testing.T) { func TestFreelistHashmap_allocate(t *testing.T) {
f := newTestFreelist() f := newTestFreelist()
if f.freelistType != FreelistMapType { if f.freelistType != common.FreelistMapType {
t.Skip() t.Skip()
} }
ids := []pgid{3, 4, 5, 6, 7, 9, 12, 13, 18} ids := []common.Pgid{3, 4, 5, 6, 7, 9, 12, 13, 18}
f.readIDs(ids) f.readIDs(ids)
f.allocate(1, 3) f.allocate(1, 3)
@ -209,10 +211,10 @@ func TestFreelistHashmap_allocate(t *testing.T) {
// Ensure that a freelist can find contiguous blocks of pages. // Ensure that a freelist can find contiguous blocks of pages.
func TestFreelistArray_allocate(t *testing.T) { func TestFreelistArray_allocate(t *testing.T) {
f := newTestFreelist() f := newTestFreelist()
if f.freelistType != FreelistArrayType { if f.freelistType != common.FreelistArrayType {
t.Skip() t.Skip()
} }
ids := []pgid{3, 4, 5, 6, 7, 9, 12, 13, 18} ids := []common.Pgid{3, 4, 5, 6, 7, 9, 12, 13, 18}
f.readIDs(ids) f.readIDs(ids)
if id := int(f.allocate(1, 3)); id != 3 { if id := int(f.allocate(1, 3)); id != 3 {
t.Fatalf("exp=3; got=%v", id) t.Fatalf("exp=3; got=%v", id)
@ -235,7 +237,7 @@ func TestFreelistArray_allocate(t *testing.T) {
if id := int(f.allocate(1, 0)); id != 0 { if id := int(f.allocate(1, 0)); id != 0 {
t.Fatalf("exp=0; got=%v", id) t.Fatalf("exp=0; got=%v", id)
} }
if exp := []pgid{9, 18}; !reflect.DeepEqual(exp, f.getFreePageIDs()) { if exp := []common.Pgid{9, 18}; !reflect.DeepEqual(exp, f.getFreePageIDs()) {
t.Fatalf("exp=%v; got=%v", exp, f.getFreePageIDs()) t.Fatalf("exp=%v; got=%v", exp, f.getFreePageIDs())
} }
@ -248,7 +250,7 @@ func TestFreelistArray_allocate(t *testing.T) {
if id := int(f.allocate(1, 1)); id != 0 { if id := int(f.allocate(1, 1)); id != 0 {
t.Fatalf("exp=0; got=%v", id) t.Fatalf("exp=0; got=%v", id)
} }
if exp := []pgid{}; !reflect.DeepEqual(exp, f.getFreePageIDs()) { if exp := []common.Pgid{}; !reflect.DeepEqual(exp, f.getFreePageIDs()) {
t.Fatalf("exp=%v; got=%v", exp, f.getFreePageIDs()) t.Fatalf("exp=%v; got=%v", exp, f.getFreePageIDs())
} }
} }
@ -257,12 +259,12 @@ func TestFreelistArray_allocate(t *testing.T) {
func TestFreelist_read(t *testing.T) { func TestFreelist_read(t *testing.T) {
// Create a page. // Create a page.
var buf [4096]byte var buf [4096]byte
page := (*page)(unsafe.Pointer(&buf[0])) page := (*common.Page)(unsafe.Pointer(&buf[0]))
page.flags = freelistPageFlag page.SetFlags(common.FreelistPageFlag)
page.count = 2 page.SetCount(2)
// Insert 2 page ids. // Insert 2 page ids.
ids := (*[3]pgid)(unsafe.Pointer(uintptr(unsafe.Pointer(page)) + unsafe.Sizeof(*page))) ids := (*[3]common.Pgid)(unsafe.Pointer(uintptr(unsafe.Pointer(page)) + unsafe.Sizeof(*page)))
ids[0] = 23 ids[0] = 23
ids[1] = 50 ids[1] = 50
@ -271,7 +273,7 @@ func TestFreelist_read(t *testing.T) {
f.read(page) f.read(page)
// Ensure that there are two page ids in the freelist. // Ensure that there are two page ids in the freelist.
if exp := []pgid{23, 50}; !reflect.DeepEqual(exp, f.getFreePageIDs()) { if exp := []common.Pgid{23, 50}; !reflect.DeepEqual(exp, f.getFreePageIDs()) {
t.Fatalf("exp=%v; got=%v", exp, f.getFreePageIDs()) t.Fatalf("exp=%v; got=%v", exp, f.getFreePageIDs())
} }
} }
@ -282,10 +284,10 @@ func TestFreelist_write(t *testing.T) {
var buf [4096]byte var buf [4096]byte
f := newTestFreelist() f := newTestFreelist()
f.readIDs([]pgid{12, 39}) f.readIDs([]common.Pgid{12, 39})
f.pending[100] = &txPending{ids: []pgid{28, 11}} f.pending[100] = &txPending{ids: []common.Pgid{28, 11}}
f.pending[101] = &txPending{ids: []pgid{3}} f.pending[101] = &txPending{ids: []common.Pgid{3}}
p := (*page)(unsafe.Pointer(&buf[0])) p := (*common.Page)(unsafe.Pointer(&buf[0]))
if err := f.write(p); err != nil { if err := f.write(p); err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -296,7 +298,7 @@ func TestFreelist_write(t *testing.T) {
// Ensure that the freelist is correct. // Ensure that the freelist is correct.
// All pages should be present and in reverse order. // All pages should be present and in reverse order.
if exp := []pgid{3, 11, 12, 28, 39}; !reflect.DeepEqual(exp, f2.getFreePageIDs()) { if exp := []common.Pgid{3, 11, 12, 28, 39}; !reflect.DeepEqual(exp, f2.getFreePageIDs()) {
t.Fatalf("exp=%v; got=%v", exp, f2.getFreePageIDs()) t.Fatalf("exp=%v; got=%v", exp, f2.getFreePageIDs())
} }
} }
@ -313,17 +315,17 @@ func benchmark_FreelistRelease(b *testing.B, size int) {
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
txp := &txPending{ids: pending} txp := &txPending{ids: pending}
f := newTestFreelist() f := newTestFreelist()
f.pending = map[txid]*txPending{1: txp} f.pending = map[common.Txid]*txPending{1: txp}
f.readIDs(ids) f.readIDs(ids)
f.release(1) f.release(1)
} }
} }
func randomPgids(n int) []pgid { func randomPgids(n int) []common.Pgid {
rand.Seed(42) rand.Seed(42)
pgids := make(pgids, n) pgids := make(common.Pgids, n)
for i := range pgids { for i := range pgids {
pgids[i] = pgid(rand.Int63()) pgids[i] = common.Pgid(rand.Int63())
} }
sort.Sort(pgids) sort.Sort(pgids)
return pgids return pgids
@ -331,7 +333,7 @@ func randomPgids(n int) []pgid {
func Test_freelist_ReadIDs_and_getFreePageIDs(t *testing.T) { func Test_freelist_ReadIDs_and_getFreePageIDs(t *testing.T) {
f := newTestFreelist() f := newTestFreelist()
exp := []pgid{3, 4, 5, 6, 7, 9, 12, 13, 18} exp := []common.Pgid{3, 4, 5, 6, 7, 9, 12, 13, 18}
f.readIDs(exp) f.readIDs(exp)
@ -340,7 +342,7 @@ func Test_freelist_ReadIDs_and_getFreePageIDs(t *testing.T) {
} }
f2 := newTestFreelist() f2 := newTestFreelist()
var exp2 []pgid var exp2 []common.Pgid
f2.readIDs(exp2) f2.readIDs(exp2)
if got2 := f2.getFreePageIDs(); !reflect.DeepEqual(got2, exp2) { if got2 := f2.getFreePageIDs(); !reflect.DeepEqual(got2, exp2) {
@ -355,53 +357,53 @@ func Test_freelist_mergeWithExist(t *testing.T) {
bm2 := pidSet{5: struct{}{}} bm2 := pidSet{5: struct{}{}}
tests := []struct { tests := []struct {
name string name string
ids []pgid ids []common.Pgid
pgid pgid pgid common.Pgid
want []pgid want []common.Pgid
wantForwardmap map[pgid]uint64 wantForwardmap map[common.Pgid]uint64
wantBackwardmap map[pgid]uint64 wantBackwardmap map[common.Pgid]uint64
wantfreemap map[uint64]pidSet wantfreemap map[uint64]pidSet
}{ }{
{ {
name: "test1", name: "test1",
ids: []pgid{1, 2, 4, 5, 6}, ids: []common.Pgid{1, 2, 4, 5, 6},
pgid: 3, pgid: 3,
want: []pgid{1, 2, 3, 4, 5, 6}, want: []common.Pgid{1, 2, 3, 4, 5, 6},
wantForwardmap: map[pgid]uint64{1: 6}, wantForwardmap: map[common.Pgid]uint64{1: 6},
wantBackwardmap: map[pgid]uint64{6: 6}, wantBackwardmap: map[common.Pgid]uint64{6: 6},
wantfreemap: map[uint64]pidSet{6: bm1}, wantfreemap: map[uint64]pidSet{6: bm1},
}, },
{ {
name: "test2", name: "test2",
ids: []pgid{1, 2, 5, 6}, ids: []common.Pgid{1, 2, 5, 6},
pgid: 3, pgid: 3,
want: []pgid{1, 2, 3, 5, 6}, want: []common.Pgid{1, 2, 3, 5, 6},
wantForwardmap: map[pgid]uint64{1: 3, 5: 2}, wantForwardmap: map[common.Pgid]uint64{1: 3, 5: 2},
wantBackwardmap: map[pgid]uint64{6: 2, 3: 3}, wantBackwardmap: map[common.Pgid]uint64{6: 2, 3: 3},
wantfreemap: map[uint64]pidSet{3: bm1, 2: bm2}, wantfreemap: map[uint64]pidSet{3: bm1, 2: bm2},
}, },
{ {
name: "test3", name: "test3",
ids: []pgid{1, 2}, ids: []common.Pgid{1, 2},
pgid: 3, pgid: 3,
want: []pgid{1, 2, 3}, want: []common.Pgid{1, 2, 3},
wantForwardmap: map[pgid]uint64{1: 3}, wantForwardmap: map[common.Pgid]uint64{1: 3},
wantBackwardmap: map[pgid]uint64{3: 3}, wantBackwardmap: map[common.Pgid]uint64{3: 3},
wantfreemap: map[uint64]pidSet{3: bm1}, wantfreemap: map[uint64]pidSet{3: bm1},
}, },
{ {
name: "test4", name: "test4",
ids: []pgid{2, 3}, ids: []common.Pgid{2, 3},
pgid: 1, pgid: 1,
want: []pgid{1, 2, 3}, want: []common.Pgid{1, 2, 3},
wantForwardmap: map[pgid]uint64{1: 3}, wantForwardmap: map[common.Pgid]uint64{1: 3},
wantBackwardmap: map[pgid]uint64{3: 3}, wantBackwardmap: map[common.Pgid]uint64{3: 3},
wantfreemap: map[uint64]pidSet{3: bm1}, wantfreemap: map[uint64]pidSet{3: bm1},
}, },
} }
for _, tt := range tests { for _, tt := range tests {
f := newTestFreelist() f := newTestFreelist()
if f.freelistType == FreelistArrayType { if f.freelistType == common.FreelistArrayType {
t.Skip() t.Skip()
} }
f.readIDs(tt.ids) f.readIDs(tt.ids)
@ -425,9 +427,9 @@ func Test_freelist_mergeWithExist(t *testing.T) {
// newTestFreelist get the freelist type from env and initial the freelist // newTestFreelist get the freelist type from env and initial the freelist
func newTestFreelist() *freelist { func newTestFreelist() *freelist {
freelistType := FreelistArrayType freelistType := common.FreelistArrayType
if env := os.Getenv(TestFreelistType); env == string(FreelistMapType) { if env := os.Getenv(TestFreelistType); env == string(common.FreelistMapType) {
freelistType = FreelistMapType freelistType = common.FreelistMapType
} }
return newFreelist(freelistType) return newFreelist(freelistType)
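Taken together, the tests above describe the freelist lifecycle: pages freed under a transaction id stay pending until release makes them reusable. A condensed sketch of that flow, using the unexported helpers shown in these tests (so it would only compile inside the bbolt package; the function name is illustrative):

```go
// A condensed free/release flow next to the tests above.
func freelistLifecycleSketch() []common.Pgid {
	f := newTestFreelist()
	f.free(100, common.NewPage(12, 0, 0, 1)) // tx 100 frees pages 12-13
	f.free(102, common.NewPage(39, 0, 0, 0)) // tx 102 frees page 39
	f.release(101)                           // releases pages freed by txids <= 101
	return f.getFreePageIDs()                // -> [12 13]; 39 is still pending
}
```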


@ -12,6 +12,7 @@ import (
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
bolt "go.etcd.io/bbolt" bolt "go.etcd.io/bbolt"
"go.etcd.io/bbolt/internal/common"
) )
var statsFlag = flag.Bool("stats", false, "show performance stats") var statsFlag = flag.Bool("stats", false, "show performance stats")
@ -44,9 +45,9 @@ func MustOpenDBWithOption(t testing.TB, f string, o *bolt.Options) *DB {
o = bolt.DefaultOptions o = bolt.DefaultOptions
} }
freelistType := bolt.FreelistArrayType freelistType := common.FreelistArrayType
if env := os.Getenv(TestFreelistType); env == string(bolt.FreelistMapType) { if env := os.Getenv(TestFreelistType); env == string(common.FreelistMapType) {
freelistType = bolt.FreelistMapType freelistType = common.FreelistMapType
} }
o.FreelistType = freelistType o.FreelistType = freelistType
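The same selection can be made through the public API when opening a database. A hedged sketch, assuming the root bbolt package continues to export the FreelistArrayType/FreelistMapType constants and the Options.FreelistType field as it did before this refactor (file name and error handling are illustrative):

```go
package main

import (
	"log"
	"os"

	bolt "go.etcd.io/bbolt"
)

func main() {
	o := *bolt.DefaultOptions
	// Mirror the helper above: use the hashmap freelist when the env var asks for it.
	o.FreelistType = bolt.FreelistArrayType
	if os.Getenv("TEST_FREELIST_TYPE") == "hashmap" {
		o.FreelistType = bolt.FreelistMapType
	}

	db, err := bolt.Open("example.db", 0600, &o) // example path
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
}
```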

internal/common/bucket.go

@ -0,0 +1,54 @@
package common
import (
"fmt"
"unsafe"
)
const BucketHeaderSize = int(unsafe.Sizeof(InBucket{}))
// InBucket represents the on-file representation of a bucket.
// This is stored as the "value" of a bucket key. If the bucket is small enough,
// then its root page can be stored inline in the "value", after the bucket
// header. In the case of inline buckets, the "root" will be 0.
type InBucket struct {
root Pgid // page id of the bucket's root-level page
sequence uint64 // monotonically incrementing, used by NextSequence()
}
func NewInBucket(root Pgid, seq uint64) InBucket {
return InBucket{
root: root,
sequence: seq,
}
}
func (b *InBucket) RootPage() Pgid {
return b.root
}
func (b *InBucket) SetRootPage(id Pgid) {
b.root = id
}
// InSequence returns the sequence. It is not named `Sequence` in order to
// avoid a name clash with the existing `(*Bucket) Sequence()` method.
func (b *InBucket) InSequence() uint64 {
return b.sequence
}
func (b *InBucket) SetInSequence(v uint64) {
b.sequence = v
}
func (b *InBucket) IncSequence() {
b.sequence++
}
func (b *InBucket) InlinePage(v []byte) *Page {
return (*Page)(unsafe.Pointer(&v[BucketHeaderSize]))
}
func (b *InBucket) String() string {
return fmt.Sprintf("<pgid=%d,seq=%d>", b.root, b.sequence)
}
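To make the inline-bucket layout concrete: the serialized value of a small bucket is a BucketHeaderSize-byte InBucket header followed immediately by its root page, which InlinePage then resolves. A small sketch as it might appear in a test inside the common package (the helper name is illustrative; it relies on LoadBucket from utils.go in the same package and imports fmt):

```go
// Sketch of the inline-bucket value layout.
func inlineBucketSketch() {
	// Header + an empty inline page.
	value := make([]byte, BucketHeaderSize+int(PageHeaderSize))

	hdr := LoadBucket(value)   // interpret the front of the value as an InBucket
	*hdr = NewInBucket(0, 0)   // inline buckets carry root page 0
	p := hdr.InlinePage(value) // the page starts right after the header
	p.SetFlags(LeafPageFlag)   // an empty leaf page
	p.SetCount(0)

	fmt.Println(hdr.String(), p.Typ()) // <pgid=0,seq=0> leaf
}
```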


@ -1,4 +1,4 @@
package bbolt package common
import "errors" import "errors"

internal/common/inode.go

@ -0,0 +1,45 @@
package common
// Inode represents an internal node inside a node.
// It can be used to point to elements in a page or point
// to an element which hasn't been added to a page yet.
type Inode struct {
flags uint32
pgid Pgid
key []byte
value []byte
}
type Inodes []Inode
func (in *Inode) Flags() uint32 {
return in.flags
}
func (in *Inode) SetFlags(flags uint32) {
in.flags = flags
}
func (in *Inode) Pgid() Pgid {
return in.pgid
}
func (in *Inode) SetPgid(id Pgid) {
in.pgid = id
}
func (in *Inode) Key() []byte {
return in.key
}
func (in *Inode) SetKey(key []byte) {
in.key = key
}
func (in *Inode) Value() []byte {
return in.value
}
func (in *Inode) SetValue(value []byte) {
in.value = value
}
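An Inode is the in-memory form of one page element. A short sketch of how a leaf element is turned into an Inode, mirroring what node.read does in the node.go diff further down (it would live inside the common package, since leafPageElement is unexported; the helper name is illustrative):

```go
// Mirrors what node.read does for a leaf element.
func leafInodeSketch(elem *leafPageElement) Inode {
	var in Inode
	in.SetFlags(elem.Flags())
	in.SetKey(elem.Key())
	in.SetValue(elem.Value())
	return in
}
```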

internal/common/meta.go

@ -0,0 +1,147 @@
package common
import (
"fmt"
"hash/fnv"
"io"
"unsafe"
)
type Meta struct {
magic uint32
version uint32
pageSize uint32
flags uint32
root InBucket
freelist Pgid
pgid Pgid
txid Txid
checksum uint64
}
// Validate checks the marker bytes and version of the meta page to ensure it matches this binary.
func (m *Meta) Validate() error {
if m.magic != Magic {
return ErrInvalid
} else if m.version != Version {
return ErrVersionMismatch
} else if m.checksum != m.Sum64() {
return ErrChecksum
}
return nil
}
// Copy copies one meta object to another.
func (m *Meta) Copy(dest *Meta) {
*dest = *m
}
// Write writes the meta onto a page.
func (m *Meta) Write(p *Page) {
if m.root.root >= m.pgid {
panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid))
} else if m.freelist >= m.pgid && m.freelist != PgidNoFreelist {
// TODO: reject pgidNoFreeList if !NoFreelistSync
panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid))
}
// Page id is either going to be 0 or 1 which we can determine by the transaction ID.
p.id = Pgid(m.txid % 2)
p.flags |= MetaPageFlag
// Calculate the checksum.
m.checksum = m.Sum64()
m.Copy(p.Meta())
}
// Sum64 generates the checksum for the meta.
func (m *Meta) Sum64() uint64 {
var h = fnv.New64a()
_, _ = h.Write((*[unsafe.Offsetof(Meta{}.checksum)]byte)(unsafe.Pointer(m))[:])
return h.Sum64()
}
func (m *Meta) Magic() uint32 {
return m.magic
}
func (m *Meta) SetMagic(v uint32) {
m.magic = v
}
func (m *Meta) SetVersion(v uint32) {
m.version = v
}
func (m *Meta) PageSize() uint32 {
return m.pageSize
}
func (m *Meta) SetPageSize(v uint32) {
m.pageSize = v
}
func (m *Meta) Flags() uint32 {
return m.flags
}
func (m *Meta) SetFlags(v uint32) {
m.flags = v
}
func (m *Meta) SetRootBucket(b InBucket) {
m.root = b
}
func (m *Meta) RootBucket() *InBucket {
return &m.root
}
func (m *Meta) Freelist() Pgid {
return m.freelist
}
func (m *Meta) SetFreelist(v Pgid) {
m.freelist = v
}
func (m *Meta) Pgid() Pgid {
return m.pgid
}
func (m *Meta) SetPgid(id Pgid) {
m.pgid = id
}
func (m *Meta) Txid() Txid {
return m.txid
}
func (m *Meta) SetTxid(id Txid) {
m.txid = id
}
func (m *Meta) IncTxid() {
m.txid += 1
}
func (m *Meta) DecTxid() {
m.txid -= 1
}
func (m *Meta) SetChecksum(v uint64) {
m.checksum = v
}
func (m *Meta) Print(w io.Writer) {
fmt.Fprintf(w, "Version: %d\n", m.version)
fmt.Fprintf(w, "Page Size: %d bytes\n", m.pageSize)
fmt.Fprintf(w, "Flags: %08x\n", m.flags)
fmt.Fprintf(w, "Root: <pgid=%d>\n", m.root.root)
fmt.Fprintf(w, "Freelist: <pgid=%d>\n", m.freelist)
fmt.Fprintf(w, "HWM: <pgid=%d>\n", m.pgid)
fmt.Fprintf(w, "Txn ID: %d\n", m.txid)
fmt.Fprintf(w, "Checksum: %016x\n", m.checksum)
fmt.Fprintf(w, "\n")
}
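A round-trip sketch for the Meta accessors above: populate a Meta, write it into a page buffer, and validate it again. The concrete page ids and sizes are arbitrary, and the helper is assumed to sit somewhere that can use this package:

```go
// Round-trip sketch for Meta: write into a page buffer, then validate.
func metaRoundTripSketch() error {
	buf := make([]byte, 4096)
	p := LoadPage(buf)

	var m Meta
	m.SetMagic(Magic)
	m.SetVersion(Version)
	m.SetPageSize(4096)
	m.SetRootBucket(NewInBucket(3, 0))
	m.SetFreelist(2)
	m.SetPgid(4) // high water mark; must stay above the root and freelist pages
	m.SetTxid(1)

	m.Write(p)                 // stamps the page id/flags and the checksum
	return p.Meta().Validate() // nil when magic, version and checksum all match
}
```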

internal/common/page.go

@ -0,0 +1,374 @@
package common
import (
"fmt"
"os"
"sort"
"unsafe"
)
const PageHeaderSize = unsafe.Sizeof(Page{})
const MinKeysPerPage = 2
const BranchPageElementSize = unsafe.Sizeof(branchPageElement{})
const LeafPageElementSize = unsafe.Sizeof(leafPageElement{})
const (
BranchPageFlag = 0x01
LeafPageFlag = 0x02
MetaPageFlag = 0x04
FreelistPageFlag = 0x10
)
const (
BucketLeafFlag = 0x01
)
type Pgid uint64
type Page struct {
id Pgid
flags uint16
count uint16
overflow uint32
}
func NewPage(id Pgid, flags, count uint16, overflow uint32) *Page {
return &Page{
id: id,
flags: flags,
count: count,
overflow: overflow,
}
}
// Typ returns a human-readable page type string used for debugging.
func (p *Page) Typ() string {
if (p.flags & BranchPageFlag) != 0 {
return "branch"
} else if (p.flags & LeafPageFlag) != 0 {
return "leaf"
} else if (p.flags & MetaPageFlag) != 0 {
return "meta"
} else if (p.flags & FreelistPageFlag) != 0 {
return "freelist"
}
return fmt.Sprintf("unknown<%02x>", p.flags)
}
// Meta returns a pointer to the metadata section of the page.
func (p *Page) Meta() *Meta {
return (*Meta)(UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)))
}
func (p *Page) FastCheck(id Pgid) {
Assert(p.id == id, "Page expected to be: %v, but self identifies as %v", id, p.id)
// Only one flag of page-type can be set.
Assert(p.flags == BranchPageFlag ||
p.flags == LeafPageFlag ||
p.flags == MetaPageFlag ||
p.flags == FreelistPageFlag,
"page %v: has unexpected type/flags: %x", p.id, p.flags)
}
// LeafPageElement retrieves the leaf node by index
func (p *Page) LeafPageElement(index uint16) *leafPageElement {
return (*leafPageElement)(UnsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p),
LeafPageElementSize, int(index)))
}
// LeafPageElements retrieves a list of leaf nodes.
func (p *Page) LeafPageElements() []leafPageElement {
if p.count == 0 {
return nil
}
var elems []leafPageElement
data := UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))
UnsafeSlice(unsafe.Pointer(&elems), data, int(p.count))
return elems
}
// BranchPageElement retrieves the branch node by index
func (p *Page) BranchPageElement(index uint16) *branchPageElement {
return (*branchPageElement)(UnsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p),
unsafe.Sizeof(branchPageElement{}), int(index)))
}
// BranchPageElements retrieves a list of branch nodes.
func (p *Page) BranchPageElements() []branchPageElement {
if p.count == 0 {
return nil
}
var elems []branchPageElement
data := UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))
UnsafeSlice(unsafe.Pointer(&elems), data, int(p.count))
return elems
}
func (p *Page) FreelistPageCount() (int, int) {
Assert(p.flags == FreelistPageFlag, fmt.Sprintf("can't get freelist page count from a non-freelist page: %2x", p.flags))
// If the page.count is at the max uint16 value (64k) then it's considered
// an overflow and the size of the freelist is stored as the first element.
var idx, count = 0, int(p.count)
if count == 0xFFFF {
idx = 1
c := *(*Pgid)(UnsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)))
count = int(c)
if count < 0 {
panic(fmt.Sprintf("leading element count %d overflows int", c))
}
}
return idx, count
}
func (p *Page) FreelistPageIds() []Pgid {
Assert(p.flags == FreelistPageFlag, fmt.Sprintf("can't get freelist page IDs from a non-freelist page: %2x", p.flags))
idx, count := p.FreelistPageCount()
if count == 0 {
return nil
}
var ids []Pgid
data := UnsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p), unsafe.Sizeof(ids[0]), idx)
UnsafeSlice(unsafe.Pointer(&ids), data, count)
return ids
}
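The freelist page layout decoded by FreelistPageCount and FreelistPageIds is simply the page header followed by pgids; only when the count hits 0xFFFF is the real count stored as the first element. A sketch of the common, non-overflow case, written as it might appear in a test in this package (it needs the unsafe import; the helper name is illustrative):

```go
// Sketch of the non-overflow freelist page layout.
func freelistPageSketch() []Pgid {
	buf := make([]byte, 4096)
	p := LoadPage(buf)
	p.SetFlags(FreelistPageFlag)
	p.SetCount(2) // fewer than 0xFFFF ids, so the count fits in the header

	// The pgids start immediately after the page header.
	ids := (*[2]Pgid)(UnsafeAdd(unsafe.Pointer(p), PageHeaderSize))
	ids[0], ids[1] = 23, 50

	return p.FreelistPageIds() // -> [23 50]
}
```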
// hexdump writes n bytes of the page to STDERR as hex output.
func (p *Page) hexdump(n int) {
buf := UnsafeByteSlice(unsafe.Pointer(p), 0, 0, n)
fmt.Fprintf(os.Stderr, "%x\n", buf)
}
func (p *Page) Id() Pgid {
return p.id
}
func (p *Page) SetId(target Pgid) {
p.id = target
}
func (p *Page) Flags() uint16 {
return p.flags
}
func (p *Page) SetFlags(v uint16) {
p.flags = v
}
func (p *Page) FlagsXOR(v uint16) {
p.flags |= v
}
func (p *Page) Count() uint16 {
return p.count
}
func (p *Page) SetCount(target uint16) {
p.count = target
}
func (p *Page) Overflow() uint32 {
return p.overflow
}
func (p *Page) SetOverflow(target uint32) {
p.overflow = target
}
func (p *Page) String() string {
return fmt.Sprintf("ID: %d, Type: %s, count: %d, overflow: %d", p.id, p.Typ(), p.count, p.overflow)
}
type Pages []*Page
func (s Pages) Len() int { return len(s) }
func (s Pages) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s Pages) Less(i, j int) bool { return s[i].id < s[j].id }
// branchPageElement represents a node on a branch page.
type branchPageElement struct {
pos uint32
ksize uint32
pgid Pgid
}
func (n *branchPageElement) Pos() uint32 {
return n.pos
}
func (n *branchPageElement) SetPos(v uint32) {
n.pos = v
}
func (n *branchPageElement) Ksize() uint32 {
return n.ksize
}
func (n *branchPageElement) SetKsize(v uint32) {
n.ksize = v
}
func (n *branchPageElement) Pgid() Pgid {
return n.pgid
}
func (n *branchPageElement) SetPgid(v Pgid) {
n.pgid = v
}
// Key returns a byte slice of the node key.
func (n *branchPageElement) Key() []byte {
return UnsafeByteSlice(unsafe.Pointer(n), 0, int(n.pos), int(n.pos)+int(n.ksize))
}
// leafPageElement represents a node on a leaf page.
type leafPageElement struct {
flags uint32
pos uint32
ksize uint32
vsize uint32
}
func NewLeafPageElement(flags, pos, ksize, vsize uint32) *leafPageElement {
return &leafPageElement{
flags: flags,
pos: pos,
ksize: ksize,
vsize: vsize,
}
}
func (n *leafPageElement) Flags() uint32 {
return n.flags
}
func (n *leafPageElement) SetFlags(v uint32) {
n.flags = v
}
func (n *leafPageElement) Pos() uint32 {
return n.pos
}
func (n *leafPageElement) SetPos(v uint32) {
n.pos = v
}
func (n *leafPageElement) Ksize() uint32 {
return n.ksize
}
func (n *leafPageElement) SetKsize(v uint32) {
n.ksize = v
}
func (n *leafPageElement) Vsize() uint32 {
return n.vsize
}
func (n *leafPageElement) SetVsize(v uint32) {
n.vsize = v
}
// Key returns a byte slice of the node key.
func (n *leafPageElement) Key() []byte {
i := int(n.pos)
j := i + int(n.ksize)
return UnsafeByteSlice(unsafe.Pointer(n), 0, i, j)
}
// Value returns a byte slice of the node value.
func (n *leafPageElement) Value() []byte {
i := int(n.pos) + int(n.ksize)
j := i + int(n.vsize)
return UnsafeByteSlice(unsafe.Pointer(n), 0, i, j)
}
func (n *leafPageElement) IsBucketEntry() bool {
return n.flags&uint32(BucketLeafFlag) != 0
}
func (n *leafPageElement) Bucket() *InBucket {
if n.IsBucketEntry() {
return LoadBucket(n.Value())
} else {
return nil
}
}
// PageInfo represents human readable information about a page.
type PageInfo struct {
ID int
Type string
Count int
OverflowCount int
}
type Pgids []Pgid
func (s Pgids) Len() int { return len(s) }
func (s Pgids) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s Pgids) Less(i, j int) bool { return s[i] < s[j] }
// Merge returns the sorted union of a and b.
func (a Pgids) Merge(b Pgids) Pgids {
// Return the opposite slice if one is nil.
if len(a) == 0 {
return b
}
if len(b) == 0 {
return a
}
merged := make(Pgids, len(a)+len(b))
Mergepgids(merged, a, b)
return merged
}
// Mergepgids copies the sorted union of a and b into dst.
// If dst is too small, it panics.
func Mergepgids(dst, a, b Pgids) {
if len(dst) < len(a)+len(b) {
panic(fmt.Errorf("mergepgids bad len %d < %d + %d", len(dst), len(a), len(b)))
}
// Copy in the opposite slice if one is nil.
if len(a) == 0 {
copy(dst, b)
return
}
if len(b) == 0 {
copy(dst, a)
return
}
// Merged will hold all elements from both lists.
merged := dst[:0]
// Assign lead to the slice with a lower starting value, follow to the higher value.
lead, follow := a, b
if b[0] < a[0] {
lead, follow = b, a
}
// Continue while there are elements in the lead.
for len(lead) > 0 {
// Merge largest prefix of lead that is ahead of follow[0].
n := sort.Search(len(lead), func(i int) bool { return lead[i] > follow[0] })
merged = append(merged, lead[:n]...)
if n >= len(lead) {
break
}
// Swap lead and follow.
lead, follow = follow, lead[n:]
}
// Append what's left in follow.
_ = append(merged, follow...)
}
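Usage sketch for Pgids.Merge: both inputs must already be sorted, and the result is their sorted merge. The helper name is illustrative; any code inside the bbolt module that can import internal/common could do this:

```go
// Both inputs must already be sorted; the result is their sorted merge.
func mergeSketch() common.Pgids {
	a := common.Pgids{4, 5, 6, 10, 11}
	b := common.Pgids{1, 3, 8, 9}
	return a.Merge(b) // -> [1 3 4 5 6 8 9 10 11]
}
```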


@ -1,4 +1,4 @@
package bbolt package common
import ( import (
"reflect" "reflect"
@ -9,52 +9,52 @@ import (
// Ensure that the page type can be returned in human readable format. // Ensure that the page type can be returned in human readable format.
func TestPage_typ(t *testing.T) { func TestPage_typ(t *testing.T) {
if typ := (&page{flags: branchPageFlag}).typ(); typ != "branch" { if typ := (&Page{flags: BranchPageFlag}).Typ(); typ != "branch" {
t.Fatalf("exp=branch; got=%v", typ) t.Fatalf("exp=branch; got=%v", typ)
} }
if typ := (&page{flags: leafPageFlag}).typ(); typ != "leaf" { if typ := (&Page{flags: LeafPageFlag}).Typ(); typ != "leaf" {
t.Fatalf("exp=leaf; got=%v", typ) t.Fatalf("exp=leaf; got=%v", typ)
} }
if typ := (&page{flags: metaPageFlag}).typ(); typ != "meta" { if typ := (&Page{flags: MetaPageFlag}).Typ(); typ != "meta" {
t.Fatalf("exp=meta; got=%v", typ) t.Fatalf("exp=meta; got=%v", typ)
} }
if typ := (&page{flags: freelistPageFlag}).typ(); typ != "freelist" { if typ := (&Page{flags: FreelistPageFlag}).Typ(); typ != "freelist" {
t.Fatalf("exp=freelist; got=%v", typ) t.Fatalf("exp=freelist; got=%v", typ)
} }
if typ := (&page{flags: 20000}).typ(); typ != "unknown<4e20>" { if typ := (&Page{flags: 20000}).Typ(); typ != "unknown<4e20>" {
t.Fatalf("exp=unknown<4e20>; got=%v", typ) t.Fatalf("exp=unknown<4e20>; got=%v", typ)
} }
} }
// Ensure that the hexdump debugging function doesn't blow up. // Ensure that the hexdump debugging function doesn't blow up.
func TestPage_dump(t *testing.T) { func TestPage_dump(t *testing.T) {
(&page{id: 256}).hexdump(16) (&Page{id: 256}).hexdump(16)
} }
func TestPgids_merge(t *testing.T) { func TestPgids_merge(t *testing.T) {
a := pgids{4, 5, 6, 10, 11, 12, 13, 27} a := Pgids{4, 5, 6, 10, 11, 12, 13, 27}
b := pgids{1, 3, 8, 9, 25, 30} b := Pgids{1, 3, 8, 9, 25, 30}
c := a.merge(b) c := a.Merge(b)
if !reflect.DeepEqual(c, pgids{1, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 25, 27, 30}) { if !reflect.DeepEqual(c, Pgids{1, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 25, 27, 30}) {
t.Errorf("mismatch: %v", c) t.Errorf("mismatch: %v", c)
} }
a = pgids{4, 5, 6, 10, 11, 12, 13, 27, 35, 36} a = Pgids{4, 5, 6, 10, 11, 12, 13, 27, 35, 36}
b = pgids{8, 9, 25, 30} b = Pgids{8, 9, 25, 30}
c = a.merge(b) c = a.Merge(b)
if !reflect.DeepEqual(c, pgids{4, 5, 6, 8, 9, 10, 11, 12, 13, 25, 27, 30, 35, 36}) { if !reflect.DeepEqual(c, Pgids{4, 5, 6, 8, 9, 10, 11, 12, 13, 25, 27, 30, 35, 36}) {
t.Errorf("mismatch: %v", c) t.Errorf("mismatch: %v", c)
} }
} }
func TestPgids_merge_quick(t *testing.T) { func TestPgids_merge_quick(t *testing.T) {
if err := quick.Check(func(a, b pgids) bool { if err := quick.Check(func(a, b Pgids) bool {
// Sort incoming lists. // Sort incoming lists.
sort.Sort(a) sort.Sort(a)
sort.Sort(b) sort.Sort(b)
// Merge the two lists together. // Merge the two lists together.
got := a.merge(b) got := a.Merge(b)
// The expected value should be the two lists combined and sorted. // The expected value should be the two lists combined and sorted.
exp := append(a, b...) exp := append(a, b...)

internal/common/types.go

@ -0,0 +1,50 @@
package common
import (
"os"
"runtime"
"time"
)
// MaxMmapStep is the largest step that can be taken when remapping the mmap.
const MaxMmapStep = 1 << 30 // 1GB
// Version represents the data file format version.
const Version = 2
// Magic represents a marker value to indicate that a file is a Bolt DB.
const Magic uint32 = 0xED0CDAED
const PgidNoFreelist Pgid = 0xffffffffffffffff
// DO NOT EDIT. Copied from the "bolt" package.
const pageMaxAllocSize = 0xFFFFFFF
// IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
// syncing changes to a file. This is required as some operating systems,
// such as OpenBSD, do not have a unified buffer cache (UBC) and writes
// must be synchronized using the msync(2) syscall.
const IgnoreNoSync = runtime.GOOS == "openbsd"
// Default values if not set in a DB instance.
const (
DefaultMaxBatchSize int = 1000
DefaultMaxBatchDelay = 10 * time.Millisecond
DefaultAllocSize = 16 * 1024 * 1024
)
// DefaultPageSize is the default page size for the DB, set to the OS page size.
var DefaultPageSize = os.Getpagesize()
// FreelistType is the type of the freelist backend
type FreelistType string
const (
// FreelistArrayType indicates backend freelist type is array
FreelistArrayType = FreelistType("array")
// FreelistMapType indicates backend freelist type is hashmap
FreelistMapType = FreelistType("hashmap")
)
// Txid represents the internal transaction identifier.
type Txid uint64


@ -1,19 +1,19 @@
package bbolt package common
import ( import (
"reflect" "reflect"
"unsafe" "unsafe"
) )
func unsafeAdd(base unsafe.Pointer, offset uintptr) unsafe.Pointer { func UnsafeAdd(base unsafe.Pointer, offset uintptr) unsafe.Pointer {
return unsafe.Pointer(uintptr(base) + offset) return unsafe.Pointer(uintptr(base) + offset)
} }
func unsafeIndex(base unsafe.Pointer, offset uintptr, elemsz uintptr, n int) unsafe.Pointer { func UnsafeIndex(base unsafe.Pointer, offset uintptr, elemsz uintptr, n int) unsafe.Pointer {
return unsafe.Pointer(uintptr(base) + offset + uintptr(n)*elemsz) return unsafe.Pointer(uintptr(base) + offset + uintptr(n)*elemsz)
} }
func unsafeByteSlice(base unsafe.Pointer, offset uintptr, i, j int) []byte { func UnsafeByteSlice(base unsafe.Pointer, offset uintptr, i, j int) []byte {
// See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices // See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
// //
// This memory is not allocated from C, but it is unmanaged by Go's // This memory is not allocated from C, but it is unmanaged by Go's
@ -24,14 +24,14 @@ func unsafeByteSlice(base unsafe.Pointer, offset uintptr, i, j int) []byte {
// index 0. However, the wiki never says that the address must be to // index 0. However, the wiki never says that the address must be to
// the beginning of a C allocation (or even that malloc was used at // the beginning of a C allocation (or even that malloc was used at
// all), so this is believed to be correct. // all), so this is believed to be correct.
return (*[maxAllocSize]byte)(unsafeAdd(base, offset))[i:j:j] return (*[pageMaxAllocSize]byte)(UnsafeAdd(base, offset))[i:j:j]
} }
// unsafeSlice modifies the data, len, and cap of a slice variable pointed to by // UnsafeSlice modifies the data, len, and cap of a slice variable pointed to by
// the slice parameter. This helper should be used over other direct // the slice parameter. This helper should be used over other direct
// manipulation of reflect.SliceHeader to prevent misuse, namely, converting // manipulation of reflect.SliceHeader to prevent misuse, namely, converting
// from reflect.SliceHeader to a Go slice type. // from reflect.SliceHeader to a Go slice type.
func unsafeSlice(slice, data unsafe.Pointer, len int) { func UnsafeSlice(slice, data unsafe.Pointer, len int) {
s := (*reflect.SliceHeader)(slice) s := (*reflect.SliceHeader)(slice)
s.Data = uintptr(data) s.Data = uintptr(data)
s.Cap = len s.Cap = len

internal/common/utils.go

@ -0,0 +1,25 @@
package common
import (
"fmt"
"unsafe"
)
// Assert will panic with a given formatted message if the given condition is false.
func Assert(condition bool, msg string, v ...interface{}) {
if !condition {
panic(fmt.Sprintf("assertion failed: "+msg, v...))
}
}
func LoadBucket(buf []byte) *InBucket {
return (*InBucket)(unsafe.Pointer(&buf[0]))
}
func LoadPage(buf []byte) *Page {
return (*Page)(unsafe.Pointer(&buf[0]))
}
func LoadPageMeta(buf []byte) *Meta {
return (*Meta)(unsafe.Pointer(&buf[PageHeaderSize]))
}
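The Load* helpers give raw access to an on-disk page. A hedged sketch that reads page 0 of a database file, checks that it is a meta page, and validates it (path, helper name, and the minimal error handling are examples only; it assumes os, io, fmt, and the common package are imported):

```go
// Reads page 0 of a database file and validates its meta.
func inspectFirstMeta(path string) error {
	f, err := os.Open(path)
	if err != nil {
		return err
	}
	defer f.Close()

	buf := make([]byte, common.DefaultPageSize)
	if _, err := io.ReadFull(f, buf); err != nil {
		return err
	}

	p := common.LoadPage(buf)
	if p.Typ() != "meta" {
		return fmt.Errorf("page 0 is %q, expected meta", p.Typ())
	}
	m := common.LoadPageMeta(buf) // the Meta sits right after the page header
	return m.Validate()           // checks magic, version and checksum
}
```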


@ -2,14 +2,13 @@ package guts_cli
// Low level access to pages / data-structures of the bbolt file. // Low level access to pages / data-structures of the bbolt file.
// TODO(ptab): Merge with bbolt/page file that should get ported to internal.
import ( import (
"errors" "errors"
"fmt" "fmt"
"io" "io"
"os" "os"
"unsafe"
"go.etcd.io/bbolt/internal/common"
) )
var ( var (
@ -17,231 +16,9 @@ var (
ErrCorrupt = errors.New("invalid value") ErrCorrupt = errors.New("invalid value")
) )
// PageHeaderSize represents the size of the bolt.Page header.
const PageHeaderSize = 16
// Represents a marker value to indicate that a file (Meta Page) is a Bolt DB.
const magic uint32 = 0xED0CDAED
// DO NOT EDIT. Copied from the "bolt" package.
const maxAllocSize = 0xFFFFFFF
// DO NOT EDIT. Copied from the "bolt" package.
const (
branchPageFlag = 0x01
leafPageFlag = 0x02
metaPageFlag = 0x04
freelistPageFlag = 0x10
)
// DO NOT EDIT. Copied from the "bolt" package.
const bucketLeafFlag = 0x01
// DO NOT EDIT. Copied from the "bolt" package.
type Pgid uint64
// DO NOT EDIT. Copied from the "bolt" package.
type txid uint64
// DO NOT EDIT. Copied from the "bolt" package.
type Meta struct {
magic uint32
version uint32
pageSize uint32
flags uint32
root Bucket
freelist Pgid
pgid Pgid // High Water Mark (id of next added Page if the file growths)
txid txid
checksum uint64
}
func LoadPageMeta(buf []byte) *Meta {
return (*Meta)(unsafe.Pointer(&buf[PageHeaderSize]))
}
func (m *Meta) RootBucket() *Bucket {
return &m.root
}
func (m *Meta) Txid() uint64 {
return uint64(m.txid)
}
func (m *Meta) Print(w io.Writer) {
fmt.Fprintf(w, "Version: %d\n", m.version)
fmt.Fprintf(w, "Page Size: %d bytes\n", m.pageSize)
fmt.Fprintf(w, "Flags: %08x\n", m.flags)
fmt.Fprintf(w, "Root: <pgid=%d>\n", m.root.root)
fmt.Fprintf(w, "Freelist: <pgid=%d>\n", m.freelist)
fmt.Fprintf(w, "HWM: <pgid=%d>\n", m.pgid)
fmt.Fprintf(w, "Txn ID: %d\n", m.txid)
fmt.Fprintf(w, "Checksum: %016x\n", m.checksum)
fmt.Fprintf(w, "\n")
}
// DO NOT EDIT. Copied from the "bolt" package.
type Bucket struct {
root Pgid
sequence uint64
}
const bucketHeaderSize = int(unsafe.Sizeof(Bucket{}))
func LoadBucket(buf []byte) *Bucket {
return (*Bucket)(unsafe.Pointer(&buf[0]))
}
func (b *Bucket) String() string {
return fmt.Sprintf("<pgid=%d,seq=%d>", b.root, b.sequence)
}
func (b *Bucket) RootPage() Pgid {
return b.root
}
func (b *Bucket) InlinePage(v []byte) *Page {
return (*Page)(unsafe.Pointer(&v[bucketHeaderSize]))
}
// DO NOT EDIT. Copied from the "bolt" package.
type Page struct {
id Pgid
flags uint16
count uint16
overflow uint32
ptr uintptr
}
func LoadPage(buf []byte) *Page {
return (*Page)(unsafe.Pointer(&buf[0]))
}
func (p *Page) FreelistPageCount() int {
// Check for overflow and, if present, adjust actual element count.
if p.count == 0xFFFF {
return int(((*[maxAllocSize]Pgid)(unsafe.Pointer(&p.ptr)))[0])
} else {
return int(p.count)
}
}
func (p *Page) FreelistPagePages() []Pgid {
// Check for overflow and, if present, adjust starting index.
idx := 0
if p.count == 0xFFFF {
idx = 1
}
return (*[maxAllocSize]Pgid)(unsafe.Pointer(&p.ptr))[idx:p.FreelistPageCount()]
}
func (p *Page) Overflow() uint32 {
return p.overflow
}
func (p *Page) String() string {
return fmt.Sprintf("ID: %d, Type: %s, count: %d, overflow: %d", p.id, p.Type(), p.count, p.overflow)
}
// DO NOT EDIT. Copied from the "bolt" package.
// TODO(ptabor): Make the page-types an enum.
func (p *Page) Type() string {
if (p.flags & branchPageFlag) != 0 {
return "branch"
} else if (p.flags & leafPageFlag) != 0 {
return "leaf"
} else if (p.flags & metaPageFlag) != 0 {
return "meta"
} else if (p.flags & freelistPageFlag) != 0 {
return "freelist"
}
return fmt.Sprintf("unknown<%02x>", p.flags)
}
func (p *Page) Count() uint16 {
return p.count
}
func (p *Page) Id() Pgid {
return p.id
}
// DO NOT EDIT. Copied from the "bolt" package.
func (p *Page) LeafPageElement(index uint16) *LeafPageElement {
n := &((*[0x7FFFFFF]LeafPageElement)(unsafe.Pointer(&p.ptr)))[index]
return n
}
// DO NOT EDIT. Copied from the "bolt" package.
func (p *Page) BranchPageElement(index uint16) *BranchPageElement {
return &((*[0x7FFFFFF]BranchPageElement)(unsafe.Pointer(&p.ptr)))[index]
}
func (p *Page) SetId(target Pgid) {
p.id = target
}
func (p *Page) SetCount(target uint16) {
p.count = target
}
func (p *Page) SetOverflow(target uint32) {
p.overflow = target
}
// DO NOT EDIT. Copied from the "bolt" package.
type BranchPageElement struct {
pos uint32
ksize uint32
pgid Pgid
}
// DO NOT EDIT. Copied from the "bolt" package.
func (n *BranchPageElement) Key() []byte {
buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
return buf[n.pos : n.pos+n.ksize]
}
func (n *BranchPageElement) PgId() Pgid {
return n.pgid
}
// DO NOT EDIT. Copied from the "bolt" package.
type LeafPageElement struct {
flags uint32
pos uint32
ksize uint32
vsize uint32
}
// DO NOT EDIT. Copied from the "bolt" package.
func (n *LeafPageElement) Key() []byte {
buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
return buf[n.pos : n.pos+n.ksize]
}
// DO NOT EDIT. Copied from the "bolt" package.
func (n *LeafPageElement) Value() []byte {
buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
return buf[n.pos+n.ksize : n.pos+n.ksize+n.vsize]
}
func (n *LeafPageElement) IsBucketEntry() bool {
return n.flags&uint32(bucketLeafFlag) != 0
}
func (n *LeafPageElement) Bucket() *Bucket {
if n.IsBucketEntry() {
return LoadBucket(n.Value())
} else {
return nil
}
}
// ReadPage reads Page info & full Page data from a path. // ReadPage reads Page info & full Page data from a path.
// This is not transactionally safe. // This is not transactionally safe.
func ReadPage(path string, pageID uint64) (*Page, []byte, error) { func ReadPage(path string, pageID uint64) (*common.Page, []byte, error) {
// Find Page size. // Find Page size.
pageSize, hwm, err := ReadPageAndHWMSize(path) pageSize, hwm, err := ReadPageAndHWMSize(path)
if err != nil { if err != nil {
@ -264,11 +41,11 @@ func ReadPage(path string, pageID uint64) (*Page, []byte, error) {
} }
// Determine total number of blocks. // Determine total number of blocks.
p := LoadPage(buf) p := common.LoadPage(buf)
if p.id != Pgid(pageID) { if p.Id() != common.Pgid(pageID) {
return nil, nil, fmt.Errorf("error: %w due to unexpected Page id: %d != %d", ErrCorrupt, p.id, pageID) return nil, nil, fmt.Errorf("error: %w due to unexpected Page id: %d != %d", ErrCorrupt, p.Id(), pageID)
} }
overflowN := p.overflow overflowN := p.Overflow()
if overflowN >= uint32(hwm)-3 { // we exclude 2 Meta pages and the current Page. if overflowN >= uint32(hwm)-3 { // we exclude 2 Meta pages and the current Page.
return nil, nil, fmt.Errorf("error: %w, Page claims to have %d overflow pages (>=hwm=%d). Interrupting to avoid risky OOM", ErrCorrupt, overflowN, hwm) return nil, nil, fmt.Errorf("error: %w, Page claims to have %d overflow pages (>=hwm=%d). Interrupting to avoid risky OOM", ErrCorrupt, overflowN, hwm)
} }
@ -280,16 +57,16 @@ func ReadPage(path string, pageID uint64) (*Page, []byte, error) {
} else if n != len(buf) { } else if n != len(buf) {
return nil, nil, io.ErrUnexpectedEOF return nil, nil, io.ErrUnexpectedEOF
} }
p = LoadPage(buf) p = common.LoadPage(buf)
if p.id != Pgid(pageID) { if p.Id() != common.Pgid(pageID) {
return nil, nil, fmt.Errorf("error: %w due to unexpected Page id: %d != %d", ErrCorrupt, p.id, pageID) return nil, nil, fmt.Errorf("error: %w due to unexpected Page id: %d != %d", ErrCorrupt, p.Id(), pageID)
} }
return p, buf, nil return p, buf, nil
} }
func WritePage(path string, pageBuf []byte) error { func WritePage(path string, pageBuf []byte) error {
page := LoadPage(pageBuf) page := common.LoadPage(pageBuf)
pageSize, _, err := ReadPageAndHWMSize(path) pageSize, _, err := ReadPageAndHWMSize(path)
if err != nil { if err != nil {
return err return err
@ -309,7 +86,7 @@ func WritePage(path string, pageBuf []byte) error {
// ReadPageAndHWMSize reads Page size and HWM (id of the last+1 Page). // ReadPageAndHWMSize reads Page size and HWM (id of the last+1 Page).
// This is not transactionally safe. // This is not transactionally safe.
func ReadPageAndHWMSize(path string) (uint64, Pgid, error) { func ReadPageAndHWMSize(path string) (uint64, common.Pgid, error) {
// Open database file. // Open database file.
f, err := os.Open(path) f, err := os.Open(path)
if err != nil { if err != nil {
@ -324,28 +101,28 @@ func ReadPageAndHWMSize(path string) (uint64, Pgid, error) {
} }
// Read Page size from metadata. // Read Page size from metadata.
m := LoadPageMeta(buf) m := common.LoadPageMeta(buf)
if m.magic != magic { if m.Magic() != common.Magic {
return 0, 0, fmt.Errorf("the Meta Page has wrong (unexpected) magic") return 0, 0, fmt.Errorf("the Meta Page has wrong (unexpected) magic")
} }
return uint64(m.pageSize), Pgid(m.pgid), nil return uint64(m.PageSize()), common.Pgid(m.Pgid()), nil
} }
// GetRootPage returns the root-page (according to the most recent transaction). // GetRootPage returns the root-page (according to the most recent transaction).
func GetRootPage(path string) (root Pgid, activeMeta Pgid, err error) { func GetRootPage(path string) (root common.Pgid, activeMeta common.Pgid, err error) {
_, buf0, err0 := ReadPage(path, 0) _, buf0, err0 := ReadPage(path, 0)
if err0 != nil { if err0 != nil {
return 0, 0, err0 return 0, 0, err0
} }
m0 := LoadPageMeta(buf0) m0 := common.LoadPageMeta(buf0)
_, buf1, err1 := ReadPage(path, 1) _, buf1, err1 := ReadPage(path, 1)
if err1 != nil { if err1 != nil {
return 0, 1, err1 return 0, 1, err1
} }
m1 := LoadPageMeta(buf1) m1 := common.LoadPageMeta(buf1)
if m0.txid < m1.txid { if m0.Txid() < m1.Txid() {
return m1.root.root, 1, nil return m1.RootBucket().RootPage(), 1, nil
} else { } else {
return m0.root.root, 0, nil return m0.RootBucket().RootPage(), 0, nil
} }
} }
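A usage sketch for the updated guts_cli API: resolve the active root page of a file and read it. Path and helper name are illustrative, and this only works from inside the bbolt module because the package is internal:

```go
// Resolves the active root page and prints a summary.
func dumpRoot(path string) error {
	root, activeMeta, err := guts_cli.GetRootPage(path)
	if err != nil {
		return err
	}
	p, _, err := guts_cli.ReadPage(path, uint64(root))
	if err != nil {
		return err
	}
	fmt.Printf("meta page %d points at root page %d (%s, %d elements)\n",
		activeMeta, p.Id(), p.Typ(), p.Count())
	return nil
}
```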


@ -2,10 +2,11 @@ package surgeon
import ( import (
"fmt" "fmt"
"go.etcd.io/bbolt/internal/common"
"go.etcd.io/bbolt/internal/guts_cli" "go.etcd.io/bbolt/internal/guts_cli"
) )
func CopyPage(path string, srcPage guts_cli.Pgid, target guts_cli.Pgid) error { func CopyPage(path string, srcPage common.Pgid, target common.Pgid) error {
p1, d1, err1 := guts_cli.ReadPage(path, uint64(srcPage)) p1, d1, err1 := guts_cli.ReadPage(path, uint64(srcPage))
if err1 != nil { if err1 != nil {
return err1 return err1
@ -14,7 +15,7 @@ func CopyPage(path string, srcPage guts_cli.Pgid, target guts_cli.Pgid) error {
return guts_cli.WritePage(path, d1) return guts_cli.WritePage(path, d1)
} }
func ClearPage(path string, pgId guts_cli.Pgid) error { func ClearPage(path string, pgId common.Pgid) error {
// Read the page // Read the page
p, buf, err := guts_cli.ReadPage(path, uint64(pgId)) p, buf, err := guts_cli.ReadPage(path, uint64(pgId))
if err != nil { if err != nil {


@ -9,6 +9,7 @@ import (
"bytes" "bytes"
"fmt" "fmt"
"go.etcd.io/bbolt/internal/common"
"go.etcd.io/bbolt/internal/guts_cli" "go.etcd.io/bbolt/internal/guts_cli"
) )
@ -20,7 +21,7 @@ func NewXRay(path string) XRay {
return XRay{path} return XRay{path}
} }
func (n XRay) traverse(stack []guts_cli.Pgid, callback func(page *guts_cli.Page, stack []guts_cli.Pgid) error) error { func (n XRay) traverse(stack []common.Pgid, callback func(page *common.Page, stack []common.Pgid) error) error {
p, data, err := guts_cli.ReadPage(n.path, uint64(stack[len(stack)-1])) p, data, err := guts_cli.ReadPage(n.path, uint64(stack[len(stack)-1]))
if err != nil { if err != nil {
return fmt.Errorf("failed reading page (stack %v): %w", stack, err) return fmt.Errorf("failed reading page (stack %v): %w", stack, err)
@ -29,10 +30,10 @@ func (n XRay) traverse(stack []guts_cli.Pgid, callback func(page *guts_cli.Page,
if err != nil { if err != nil {
return fmt.Errorf("failed callback for page (stack %v): %w", stack, err) return fmt.Errorf("failed callback for page (stack %v): %w", stack, err)
} }
switch p.Type() { switch p.Typ() {
case "meta": case "meta":
{ {
m := guts_cli.LoadPageMeta(data) m := common.LoadPageMeta(data)
r := m.RootBucket().RootPage() r := m.RootBucket().RootPage()
return n.traverse(append(stack, r), callback) return n.traverse(append(stack, r), callback)
} }
@ -40,7 +41,7 @@ func (n XRay) traverse(stack []guts_cli.Pgid, callback func(page *guts_cli.Page,
{ {
for i := uint16(0); i < p.Count(); i++ { for i := uint16(0); i < p.Count(); i++ {
bpe := p.BranchPageElement(i) bpe := p.BranchPageElement(i)
if err := n.traverse(append(stack, bpe.PgId()), callback); err != nil { if err := n.traverse(append(stack, bpe.Pgid()), callback); err != nil {
return err return err
} }
} }
@ -73,19 +74,19 @@ func (n XRay) traverse(stack []guts_cli.Pgid, callback func(page *guts_cli.Page,
// As it traverses multiple buckets, so in theory there might be multiple keys with the given name. // As it traverses multiple buckets, so in theory there might be multiple keys with the given name.
// Note: For simplicity it's currently implemented as traversing of the whole reachable tree. // Note: For simplicity it's currently implemented as traversing of the whole reachable tree.
// If key is a bucket name, a page-path referencing the key will be returned as well. // If key is a bucket name, a page-path referencing the key will be returned as well.
func (n XRay) FindPathsToKey(key []byte) ([][]guts_cli.Pgid, error) { func (n XRay) FindPathsToKey(key []byte) ([][]common.Pgid, error) {
var found [][]guts_cli.Pgid var found [][]common.Pgid
rootPage, _, err := guts_cli.GetRootPage(n.path) rootPage, _, err := guts_cli.GetRootPage(n.path)
if err != nil { if err != nil {
return nil, err return nil, err
} }
err = n.traverse([]guts_cli.Pgid{rootPage}, err = n.traverse([]common.Pgid{rootPage},
func(page *guts_cli.Page, stack []guts_cli.Pgid) error { func(page *common.Page, stack []common.Pgid) error {
if page.Type() == "leaf" { if page.Typ() == "leaf" {
for i := uint16(0); i < page.Count(); i++ { for i := uint16(0); i < page.Count(); i++ {
if bytes.Equal(page.LeafPageElement(i).Key(), key) { if bytes.Equal(page.LeafPageElement(i).Key(), key) {
var copyPath []guts_cli.Pgid var copyPath []common.Pgid
copyPath = append(copyPath, stack...) copyPath = append(copyPath, stack...)
found = append(found, copyPath) found = append(found, copyPath)
} }
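A usage sketch for FindPathsToKey, written as it might appear alongside these helpers in the same package; it prints every page path from the root down to a leaf holding the key (helper name and output are illustrative):

```go
// Prints every page path leading to the given key.
func findKey(path string, key []byte) error {
	x := NewXRay(path)
	paths, err := x.FindPathsToKey(key)
	if err != nil {
		return err
	}
	for _, pgids := range paths {
		fmt.Println(pgids) // e.g. [4 7 13]: root -> branch -> leaf
	}
	return nil
}
```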

node.go

@ -5,6 +5,8 @@ import (
"fmt" "fmt"
"sort" "sort"
"unsafe" "unsafe"
"go.etcd.io/bbolt/internal/common"
) )
// node represents an in-memory, deserialized page. // node represents an in-memory, deserialized page.
@ -14,10 +16,10 @@ type node struct {
unbalanced bool unbalanced bool
spilled bool spilled bool
key []byte key []byte
pgid pgid pgid common.Pgid
parent *node parent *node
children nodes children nodes
inodes inodes inodes common.Inodes
} }
// root returns the top-level node this node is attached to. // root returns the top-level node this node is attached to.
@ -38,10 +40,10 @@ func (n *node) minKeys() int {
// size returns the size of the node after serialization. // size returns the size of the node after serialization.
func (n *node) size() int { func (n *node) size() int {
sz, elsz := pageHeaderSize, n.pageElementSize() sz, elsz := common.PageHeaderSize, n.pageElementSize()
for i := 0; i < len(n.inodes); i++ { for i := 0; i < len(n.inodes); i++ {
item := &n.inodes[i] item := &n.inodes[i]
sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value)) sz += elsz + uintptr(len(item.Key())) + uintptr(len(item.Value()))
} }
return int(sz) return int(sz)
} }
@ -50,10 +52,10 @@ func (n *node) size() int {
// This is an optimization to avoid calculating a large node when we only need // This is an optimization to avoid calculating a large node when we only need
// to know if it fits inside a certain page size. // to know if it fits inside a certain page size.
func (n *node) sizeLessThan(v uintptr) bool { func (n *node) sizeLessThan(v uintptr) bool {
sz, elsz := pageHeaderSize, n.pageElementSize() sz, elsz := common.PageHeaderSize, n.pageElementSize()
for i := 0; i < len(n.inodes); i++ { for i := 0; i < len(n.inodes); i++ {
item := &n.inodes[i] item := &n.inodes[i]
sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value)) sz += elsz + uintptr(len(item.Key())) + uintptr(len(item.Value()))
if sz >= v { if sz >= v {
return false return false
} }
@ -64,9 +66,9 @@ func (n *node) sizeLessThan(v uintptr) bool {
// pageElementSize returns the size of each page element based on the type of node. // pageElementSize returns the size of each page element based on the type of node.
func (n *node) pageElementSize() uintptr { func (n *node) pageElementSize() uintptr {
if n.isLeaf { if n.isLeaf {
return leafPageElementSize return common.LeafPageElementSize
} }
return branchPageElementSize return common.BranchPageElementSize
} }
// childAt returns the child node at a given index. // childAt returns the child node at a given index.
@ -74,12 +76,12 @@ func (n *node) childAt(index int) *node {
if n.isLeaf { if n.isLeaf {
panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index)) panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
} }
return n.bucket.node(n.inodes[index].pgid, n) return n.bucket.node(n.inodes[index].Pgid(), n)
} }
// childIndex returns the index of a given child node. // childIndex returns the index of a given child node.
func (n *node) childIndex(child *node) int { func (n *node) childIndex(child *node) int {
index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 }) index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), child.key) != -1 })
return index return index
} }
@ -113,9 +115,9 @@ func (n *node) prevSibling() *node {
} }
// put inserts a key/value. // put inserts a key/value.
func (n *node) put(oldKey, newKey, value []byte, pgId pgid, flags uint32) { func (n *node) put(oldKey, newKey, value []byte, pgId common.Pgid, flags uint32) {
if pgId >= n.bucket.tx.meta.pgid { if pgId >= n.bucket.tx.meta.Pgid() {
panic(fmt.Sprintf("pgId (%d) above high water mark (%d)", pgId, n.bucket.tx.meta.pgid)) panic(fmt.Sprintf("pgId (%d) above high water mark (%d)", pgId, n.bucket.tx.meta.Pgid()))
} else if len(oldKey) <= 0 { } else if len(oldKey) <= 0 {
panic("put: zero-length old key") panic("put: zero-length old key")
} else if len(newKey) <= 0 { } else if len(newKey) <= 0 {
@ -123,30 +125,30 @@ func (n *node) put(oldKey, newKey, value []byte, pgId pgid, flags uint32) {
} }
// Find insertion index. // Find insertion index.
index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 }) index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), oldKey) != -1 })
// Add capacity and shift nodes if we don't have an exact match and need to insert. // Add capacity and shift nodes if we don't have an exact match and need to insert.
exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey)) exact := len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].Key(), oldKey)
if !exact { if !exact {
n.inodes = append(n.inodes, inode{}) n.inodes = append(n.inodes, common.Inode{})
copy(n.inodes[index+1:], n.inodes[index:]) copy(n.inodes[index+1:], n.inodes[index:])
} }
inode := &n.inodes[index] inode := &n.inodes[index]
inode.flags = flags inode.SetFlags(flags)
inode.key = newKey inode.SetKey(newKey)
inode.value = value inode.SetValue(value)
inode.pgid = pgId inode.SetPgid(pgId)
_assert(len(inode.key) > 0, "put: zero-length inode key") common.Assert(len(inode.Key()) > 0, "put: zero-length inode key")
} }
// del removes a key from the node. // del removes a key from the node.
func (n *node) del(key []byte) { func (n *node) del(key []byte) {
// Find index of key. // Find index of key.
index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 }) index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), key) != -1 })
// Exit if the key isn't found. // Exit if the key isn't found.
if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) { if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].Key(), key) {
return return
} }
@ -158,30 +160,30 @@ func (n *node) del(key []byte) {
} }
// read initializes the node from a page. // read initializes the node from a page.
func (n *node) read(p *page) { func (n *node) read(p *common.Page) {
n.pgid = p.id n.pgid = p.Id()
n.isLeaf = ((p.flags & leafPageFlag) != 0) n.isLeaf = (p.Flags() & common.LeafPageFlag) != 0
n.inodes = make(inodes, int(p.count)) n.inodes = make(common.Inodes, int(p.Count()))
for i := 0; i < int(p.count); i++ { for i := 0; i < int(p.Count()); i++ {
inode := &n.inodes[i] inode := &n.inodes[i]
if n.isLeaf { if n.isLeaf {
elem := p.leafPageElement(uint16(i)) elem := p.LeafPageElement(uint16(i))
inode.flags = elem.flags inode.SetFlags(elem.Flags())
inode.key = elem.key() inode.SetKey(elem.Key())
inode.value = elem.value() inode.SetValue(elem.Value())
} else { } else {
elem := p.branchPageElement(uint16(i)) elem := p.BranchPageElement(uint16(i))
inode.pgid = elem.pgid inode.SetPgid(elem.Pgid())
inode.key = elem.key() inode.SetKey(elem.Key())
} }
_assert(len(inode.key) > 0, "read: zero-length inode key") common.Assert(len(inode.Key()) > 0, "read: zero-length inode key")
} }
// Save first key so we can find the node in the parent when we spill. // Save first key, so we can find the node in the parent when we spill.
if len(n.inodes) > 0 { if len(n.inodes) > 0 {
n.key = n.inodes[0].key n.key = n.inodes[0].Key()
_assert(len(n.key) > 0, "read: zero-length node key") common.Assert(len(n.key) > 0, "read: zero-length node key")
} else { } else {
n.key = nil n.key = nil
} }
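read decides how to decode each element from the page flags: leaf elements carry flags/key/value, branch elements carry pgid/key. A tiny sketch of that decision using only the accessors and flag constants shown in this diff (the helper name is ours):

// elementKind reports which element layout read() would decode from a page.
func elementKind(p *common.Page) string {
	switch {
	case p.Flags()&common.LeafPageFlag != 0:
		return "leaf" // decoded via p.LeafPageElement(i): flags, key, value
	case p.Flags()&common.BranchPageFlag != 0:
		return "branch" // decoded via p.BranchPageElement(i): pgid, key
	default:
		return "other" // meta/freelist pages are never materialized as nodes
	}
}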
@ -190,23 +192,23 @@ func (n *node) read(p *page) {
// write writes the items onto one or more pages. // write writes the items onto one or more pages.
// The page should have p.id (might be 0 for meta or bucket-inline page) and p.overflow set // The page should have p.id (might be 0 for meta or bucket-inline page) and p.overflow set
// and the rest should be zeroed. // and the rest should be zeroed.
func (n *node) write(p *page) { func (n *node) write(p *common.Page) {
_assert(p.count == 0 && p.flags == 0, "node cannot be written into a not empty page") common.Assert(p.Count() == 0 && p.Flags() == 0, "node cannot be written into a not empty page")
// Initialize page. // Initialize page.
if n.isLeaf { if n.isLeaf {
p.flags = leafPageFlag p.SetFlags(common.LeafPageFlag)
} else { } else {
p.flags = branchPageFlag p.SetFlags(common.BranchPageFlag)
} }
if len(n.inodes) >= 0xFFFF { if len(n.inodes) >= 0xFFFF {
panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id)) panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.Id()))
} }
p.count = uint16(len(n.inodes)) p.SetCount(uint16(len(n.inodes)))
// Stop here if there are no items to write. // Stop here if there are no items to write.
if p.count == 0 { if p.Count() == 0 {
return return
} }
@ -214,32 +216,32 @@ func (n *node) write(p *page) {
// off tracks the offset into p of the start of the next data. // off tracks the offset into p of the start of the next data.
off := unsafe.Sizeof(*p) + n.pageElementSize()*uintptr(len(n.inodes)) off := unsafe.Sizeof(*p) + n.pageElementSize()*uintptr(len(n.inodes))
for i, item := range n.inodes { for i, item := range n.inodes {
_assert(len(item.key) > 0, "write: zero-length inode key") common.Assert(len(item.Key()) > 0, "write: zero-length inode key")
// Create a slice to write into of needed size and advance // Create a slice to write into of needed size and advance
// byte pointer for next iteration. // byte pointer for next iteration.
sz := len(item.key) + len(item.value) sz := len(item.Key()) + len(item.Value())
b := unsafeByteSlice(unsafe.Pointer(p), off, 0, sz) b := common.UnsafeByteSlice(unsafe.Pointer(p), off, 0, sz)
off += uintptr(sz) off += uintptr(sz)
// Write the page element. // Write the page element.
if n.isLeaf { if n.isLeaf {
elem := p.leafPageElement(uint16(i)) elem := p.LeafPageElement(uint16(i))
elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) elem.SetPos(uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))))
elem.flags = item.flags elem.SetFlags(item.Flags())
elem.ksize = uint32(len(item.key)) elem.SetKsize(uint32(len(item.Key())))
elem.vsize = uint32(len(item.value)) elem.SetVsize(uint32(len(item.Value())))
} else { } else {
elem := p.branchPageElement(uint16(i)) elem := p.BranchPageElement(uint16(i))
elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) elem.SetPos(uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))))
elem.ksize = uint32(len(item.key)) elem.SetKsize(uint32(len(item.Key())))
elem.pgid = item.pgid elem.SetPgid(item.Pgid())
_assert(elem.pgid != p.id, "write: circular dependency occurred") common.Assert(elem.Pgid() != p.Id(), "write: circular dependency occurred")
} }
// Write data for the element to the end of the page. // Write data for the element to the end of the page.
l := copy(b, item.key) l := copy(b, item.Key())
copy(b[l:], item.value) copy(b[l:], item.Value())
} }
// DEBUG ONLY: n.dump() // DEBUG ONLY: n.dump()
@ -273,7 +275,7 @@ func (n *node) split(pageSize uintptr) []*node {
func (n *node) splitTwo(pageSize uintptr) (*node, *node) { func (n *node) splitTwo(pageSize uintptr) (*node, *node) {
// Ignore the split if the page doesn't have at least enough nodes for // Ignore the split if the page doesn't have at least enough nodes for
// two pages or if the nodes can fit in a single page. // two pages or if the nodes can fit in a single page.
if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) { if len(n.inodes) <= (common.MinKeysPerPage*2) || n.sizeLessThan(pageSize) {
return n, nil return n, nil
} }
@ -313,17 +315,17 @@ func (n *node) splitTwo(pageSize uintptr) (*node, *node) {
// It returns the index as well as the size of the first page. // It returns the index as well as the size of the first page.
// This is only called from split(). // This is only called from split().
func (n *node) splitIndex(threshold int) (index, sz uintptr) { func (n *node) splitIndex(threshold int) (index, sz uintptr) {
sz = pageHeaderSize sz = common.PageHeaderSize
// Loop until we only have the minimum number of keys required for the second page. // Loop until we only have the minimum number of keys required for the second page.
for i := 0; i < len(n.inodes)-minKeysPerPage; i++ { for i := 0; i < len(n.inodes)-common.MinKeysPerPage; i++ {
index = uintptr(i) index = uintptr(i)
inode := n.inodes[i] inode := n.inodes[i]
elsize := n.pageElementSize() + uintptr(len(inode.key)) + uintptr(len(inode.value)) elsize := n.pageElementSize() + uintptr(len(inode.Key())) + uintptr(len(inode.Value()))
// If we have at least the minimum number of keys and adding another // If we have at least the minimum number of keys and adding another
// node would put us over the threshold then exit and return. // node would put us over the threshold then exit and return.
if index >= minKeysPerPage && sz+elsize > uintptr(threshold) { if index >= common.MinKeysPerPage && sz+elsize > uintptr(threshold) {
break break
} }
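splitIndex charges each inode for one element header plus its key/value bytes on top of the page header, and cuts the page once the fill threshold is crossed. The same accounting gives the space a leaf node needs on a page; a hedged sketch built from the constants referenced above (the helper name is ours):

// leafSpaceNeeded estimates the bytes a leaf node occupies on a page:
// page header, one leaf element header per inode, then the raw key/value bytes.
func leafSpaceNeeded(in common.Inodes) uintptr {
	sz := common.PageHeaderSize + common.LeafPageElementSize*uintptr(len(in))
	for _, item := range in {
		sz += uintptr(len(item.Key()) + len(item.Value()))
	}
	return sz
}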
@ -360,7 +362,7 @@ func (n *node) spill() error {
for _, node := range nodes { for _, node := range nodes {
// Add node's page to the freelist if it's not new. // Add node's page to the freelist if it's not new.
if node.pgid > 0 { if node.pgid > 0 {
tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid)) tx.db.freelist.free(tx.meta.Txid(), tx.page(node.pgid))
node.pgid = 0 node.pgid = 0
} }
@ -371,10 +373,10 @@ func (n *node) spill() error {
} }
// Write the node. // Write the node.
if p.id >= tx.meta.pgid { if p.Id() >= tx.meta.Pgid() {
panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid)) panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.Id(), tx.meta.Pgid()))
} }
node.pgid = p.id node.pgid = p.Id()
node.write(p) node.write(p)
node.spilled = true node.spilled = true
@ -382,12 +384,12 @@ func (n *node) spill() error {
if node.parent != nil { if node.parent != nil {
var key = node.key var key = node.key
if key == nil { if key == nil {
key = node.inodes[0].key key = node.inodes[0].Key()
} }
node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0) node.parent.put(key, node.inodes[0].Key(), nil, node.pgid, 0)
node.key = node.inodes[0].key node.key = node.inodes[0].Key()
_assert(len(node.key) > 0, "spill: zero-length node key") common.Assert(len(node.key) > 0, "spill: zero-length node key")
} }
// Update the statistics. // Update the statistics.
@ -426,14 +428,14 @@ func (n *node) rebalance() {
// If root node is a branch and only has one node then collapse it. // If root node is a branch and only has one node then collapse it.
if !n.isLeaf && len(n.inodes) == 1 { if !n.isLeaf && len(n.inodes) == 1 {
// Move root's child up. // Move root's child up.
child := n.bucket.node(n.inodes[0].pgid, n) child := n.bucket.node(n.inodes[0].Pgid(), n)
n.isLeaf = child.isLeaf n.isLeaf = child.isLeaf
n.inodes = child.inodes[:] n.inodes = child.inodes[:]
n.children = child.children n.children = child.children
// Reparent all child nodes being moved. // Reparent all child nodes being moved.
for _, inode := range n.inodes { for _, inode := range n.inodes {
if child, ok := n.bucket.nodes[inode.pgid]; ok { if child, ok := n.bucket.nodes[inode.Pgid()]; ok {
child.parent = n child.parent = n
} }
} }
@ -457,11 +459,11 @@ func (n *node) rebalance() {
return return
} }
_assert(n.parent.numChildren() > 1, "parent must have at least 2 children") common.Assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
// Destination node is right sibling if idx == 0, otherwise left sibling. // Destination node is right sibling if idx == 0, otherwise left sibling.
var target *node var target *node
var useNextSibling = (n.parent.childIndex(n) == 0) var useNextSibling = n.parent.childIndex(n) == 0
if useNextSibling { if useNextSibling {
target = n.nextSibling() target = n.nextSibling()
} else { } else {
@ -472,7 +474,7 @@ func (n *node) rebalance() {
if useNextSibling { if useNextSibling {
// Reparent all child nodes being moved. // Reparent all child nodes being moved.
for _, inode := range target.inodes { for _, inode := range target.inodes {
if child, ok := n.bucket.nodes[inode.pgid]; ok { if child, ok := n.bucket.nodes[inode.Pgid()]; ok {
child.parent.removeChild(child) child.parent.removeChild(child)
child.parent = n child.parent = n
child.parent.children = append(child.parent.children, child) child.parent.children = append(child.parent.children, child)
@ -488,7 +490,7 @@ func (n *node) rebalance() {
} else { } else {
// Reparent all child nodes being moved. // Reparent all child nodes being moved.
for _, inode := range n.inodes { for _, inode := range n.inodes {
if child, ok := n.bucket.nodes[inode.pgid]; ok { if child, ok := n.bucket.nodes[inode.Pgid()]; ok {
child.parent.removeChild(child) child.parent.removeChild(child)
child.parent = target child.parent = target
child.parent.children = append(child.parent.children, child) child.parent.children = append(child.parent.children, child)
@ -525,20 +527,20 @@ func (n *node) dereference() {
key := make([]byte, len(n.key)) key := make([]byte, len(n.key))
copy(key, n.key) copy(key, n.key)
n.key = key n.key = key
_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node") common.Assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
} }
for i := range n.inodes { for i := range n.inodes {
inode := &n.inodes[i] inode := &n.inodes[i]
key := make([]byte, len(inode.key)) key := make([]byte, len(inode.Key()))
copy(key, inode.key) copy(key, inode.Key())
inode.key = key inode.SetKey(key)
_assert(len(inode.key) > 0, "dereference: zero-length inode key") common.Assert(len(inode.Key()) > 0, "dereference: zero-length inode key")
value := make([]byte, len(inode.value)) value := make([]byte, len(inode.Value()))
copy(value, inode.value) copy(value, inode.Value())
inode.value = value inode.SetValue(value)
} }
// Recursively dereference children. // Recursively dereference children.
@ -553,7 +555,7 @@ func (n *node) dereference() {
// free adds the node's underlying page to the freelist. // free adds the node's underlying page to the freelist.
func (n *node) free() { func (n *node) free() {
if n.pgid != 0 { if n.pgid != 0 {
n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid)) n.bucket.tx.db.freelist.free(n.bucket.tx.meta.Txid(), n.bucket.tx.page(n.pgid))
n.pgid = 0 n.pgid = 0
} }
} }
@ -594,17 +596,5 @@ type nodes []*node
func (s nodes) Len() int { return len(s) } func (s nodes) Len() int { return len(s) }
func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] } func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s nodes) Less(i, j int) bool { func (s nodes) Less(i, j int) bool {
return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 return bytes.Compare(s[i].inodes[0].Key(), s[j].inodes[0].Key()) == -1
} }
// inode represents an internal node inside of a node.
// It can be used to point to elements in a page or point
// to an element which hasn't been added to a page yet.
type inode struct {
flags uint32
pgid pgid
key []byte
value []byte
}
type inodes []inode
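This removed struct is the heart of the refactor: it presumably now lives in internal/common as common.Inode, reached through the getters and setters used throughout the hunks above. A sketch of an equivalent definition, inferred from the removed fields (the exact layout inside internal/common is an assumption):

// Inode points to an element in a page, or holds an element not yet written to a page.
type Inode struct {
	flags uint32
	pgid  Pgid
	key   []byte
	value []byte
}

func (in *Inode) Flags() uint32       { return in.flags }
func (in *Inode) SetFlags(f uint32)   { in.flags = f }
func (in *Inode) Pgid() Pgid          { return in.pgid }
func (in *Inode) SetPgid(id Pgid)     { in.pgid = id }
func (in *Inode) Key() []byte         { return in.key }
func (in *Inode) SetKey(k []byte)     { in.key = k }
func (in *Inode) Value() []byte       { return in.value }
func (in *Inode) SetValue(v []byte)   { in.value = v }

type Inodes []Inode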

View File

@ -3,30 +3,34 @@ package bbolt
import ( import (
"testing" "testing"
"unsafe" "unsafe"
"go.etcd.io/bbolt/internal/common"
) )
// Ensure that a node can insert a key/value. // Ensure that a node can insert a key/value.
func TestNode_put(t *testing.T) { func TestNode_put(t *testing.T) {
n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{meta: &meta{pgid: 1}}}} m := &common.Meta{}
m.SetPgid(1)
n := &node{inodes: make(common.Inodes, 0), bucket: &Bucket{tx: &Tx{meta: m}}}
n.put([]byte("baz"), []byte("baz"), []byte("2"), 0, 0) n.put([]byte("baz"), []byte("baz"), []byte("2"), 0, 0)
n.put([]byte("foo"), []byte("foo"), []byte("0"), 0, 0) n.put([]byte("foo"), []byte("foo"), []byte("0"), 0, 0)
n.put([]byte("bar"), []byte("bar"), []byte("1"), 0, 0) n.put([]byte("bar"), []byte("bar"), []byte("1"), 0, 0)
n.put([]byte("foo"), []byte("foo"), []byte("3"), 0, leafPageFlag) n.put([]byte("foo"), []byte("foo"), []byte("3"), 0, common.LeafPageFlag)
if len(n.inodes) != 3 { if len(n.inodes) != 3 {
t.Fatalf("exp=3; got=%d", len(n.inodes)) t.Fatalf("exp=3; got=%d", len(n.inodes))
} }
if k, v := n.inodes[0].key, n.inodes[0].value; string(k) != "bar" || string(v) != "1" { if k, v := n.inodes[0].Key(), n.inodes[0].Value(); string(k) != "bar" || string(v) != "1" {
t.Fatalf("exp=<bar,1>; got=<%s,%s>", k, v) t.Fatalf("exp=<bar,1>; got=<%s,%s>", k, v)
} }
if k, v := n.inodes[1].key, n.inodes[1].value; string(k) != "baz" || string(v) != "2" { if k, v := n.inodes[1].Key(), n.inodes[1].Value(); string(k) != "baz" || string(v) != "2" {
t.Fatalf("exp=<baz,2>; got=<%s,%s>", k, v) t.Fatalf("exp=<baz,2>; got=<%s,%s>", k, v)
} }
if k, v := n.inodes[2].key, n.inodes[2].value; string(k) != "foo" || string(v) != "3" { if k, v := n.inodes[2].Key(), n.inodes[2].Value(); string(k) != "foo" || string(v) != "3" {
t.Fatalf("exp=<foo,3>; got=<%s,%s>", k, v) t.Fatalf("exp=<foo,3>; got=<%s,%s>", k, v)
} }
if n.inodes[2].flags != uint32(leafPageFlag) { if n.inodes[2].Flags() != uint32(common.LeafPageFlag) {
t.Fatalf("not a leaf: %d", n.inodes[2].flags) t.Fatalf("not a leaf: %d", n.inodes[2].Flags())
} }
} }
@ -34,18 +38,19 @@ func TestNode_put(t *testing.T) {
func TestNode_read_LeafPage(t *testing.T) { func TestNode_read_LeafPage(t *testing.T) {
// Create a page. // Create a page.
var buf [4096]byte var buf [4096]byte
page := (*page)(unsafe.Pointer(&buf[0])) page := (*common.Page)(unsafe.Pointer(&buf[0]))
page.flags = leafPageFlag page.SetFlags(common.LeafPageFlag)
page.count = 2 page.SetCount(2)
// Insert 2 elements at the beginning. sizeof(leafPageElement) == 16 // Insert 2 elements at the beginning. sizeof(leafPageElement) == 16
nodes := (*[3]leafPageElement)(unsafe.Pointer(uintptr(unsafe.Pointer(page)) + unsafe.Sizeof(*page))) nodes := page.LeafPageElements()
nodes[0] = leafPageElement{flags: 0, pos: 32, ksize: 3, vsize: 4} // pos = sizeof(leafPageElement) * 2 //nodes := (*[3]leafPageElement)(unsafe.Pointer(uintptr(unsafe.Pointer(page)) + unsafe.Sizeof(*page)))
nodes[1] = leafPageElement{flags: 0, pos: 23, ksize: 10, vsize: 3} // pos = sizeof(leafPageElement) + 3 + 4 nodes[0] = *common.NewLeafPageElement(0, 32, 3, 4) // pos = sizeof(leafPageElement) * 2
nodes[1] = *common.NewLeafPageElement(0, 23, 10, 3) // pos = sizeof(leafPageElement) + 3 + 4
// Write data for the nodes at the end. // Write data for the nodes at the end.
const s = "barfoozhelloworldbye" const s = "barfoozhelloworldbye"
data := unsafeByteSlice(unsafe.Pointer(&nodes[2]), 0, 0, len(s)) data := common.UnsafeByteSlice(unsafe.Pointer(uintptr(unsafe.Pointer(page))+unsafe.Sizeof(*page)+common.LeafPageElementSize*2), 0, 0, len(s))
copy(data, s) copy(data, s)
// Deserialize page into a leaf. // Deserialize page into a leaf.
@ -59,10 +64,10 @@ func TestNode_read_LeafPage(t *testing.T) {
if len(n.inodes) != 2 { if len(n.inodes) != 2 {
t.Fatalf("exp=2; got=%d", len(n.inodes)) t.Fatalf("exp=2; got=%d", len(n.inodes))
} }
if k, v := n.inodes[0].key, n.inodes[0].value; string(k) != "bar" || string(v) != "fooz" { if k, v := n.inodes[0].Key(), n.inodes[0].Value(); string(k) != "bar" || string(v) != "fooz" {
t.Fatalf("exp=<bar,fooz>; got=<%s,%s>", k, v) t.Fatalf("exp=<bar,fooz>; got=<%s,%s>", k, v)
} }
if k, v := n.inodes[1].key, n.inodes[1].value; string(k) != "helloworld" || string(v) != "bye" { if k, v := n.inodes[1].Key(), n.inodes[1].Value(); string(k) != "helloworld" || string(v) != "bye" {
t.Fatalf("exp=<helloworld,bye>; got=<%s,%s>", k, v) t.Fatalf("exp=<helloworld,bye>; got=<%s,%s>", k, v)
} }
} }
@ -70,14 +75,16 @@ func TestNode_read_LeafPage(t *testing.T) {
// Ensure that a node can serialize into a leaf page. // Ensure that a node can serialize into a leaf page.
func TestNode_write_LeafPage(t *testing.T) { func TestNode_write_LeafPage(t *testing.T) {
// Create a node. // Create a node.
n := &node{isLeaf: true, inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}} m := &common.Meta{}
m.SetPgid(1)
n := &node{isLeaf: true, inodes: make(common.Inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: m}}}
n.put([]byte("susy"), []byte("susy"), []byte("que"), 0, 0) n.put([]byte("susy"), []byte("susy"), []byte("que"), 0, 0)
n.put([]byte("ricki"), []byte("ricki"), []byte("lake"), 0, 0) n.put([]byte("ricki"), []byte("ricki"), []byte("lake"), 0, 0)
n.put([]byte("john"), []byte("john"), []byte("johnson"), 0, 0) n.put([]byte("john"), []byte("john"), []byte("johnson"), 0, 0)
// Write it to a page. // Write it to a page.
var buf [4096]byte var buf [4096]byte
p := (*page)(unsafe.Pointer(&buf[0])) p := (*common.Page)(unsafe.Pointer(&buf[0]))
n.write(p) n.write(p)
// Read the page back in. // Read the page back in.
@ -88,13 +95,13 @@ func TestNode_write_LeafPage(t *testing.T) {
if len(n2.inodes) != 3 { if len(n2.inodes) != 3 {
t.Fatalf("exp=3; got=%d", len(n2.inodes)) t.Fatalf("exp=3; got=%d", len(n2.inodes))
} }
if k, v := n2.inodes[0].key, n2.inodes[0].value; string(k) != "john" || string(v) != "johnson" { if k, v := n2.inodes[0].Key(), n2.inodes[0].Value(); string(k) != "john" || string(v) != "johnson" {
t.Fatalf("exp=<john,johnson>; got=<%s,%s>", k, v) t.Fatalf("exp=<john,johnson>; got=<%s,%s>", k, v)
} }
if k, v := n2.inodes[1].key, n2.inodes[1].value; string(k) != "ricki" || string(v) != "lake" { if k, v := n2.inodes[1].Key(), n2.inodes[1].Value(); string(k) != "ricki" || string(v) != "lake" {
t.Fatalf("exp=<ricki,lake>; got=<%s,%s>", k, v) t.Fatalf("exp=<ricki,lake>; got=<%s,%s>", k, v)
} }
if k, v := n2.inodes[2].key, n2.inodes[2].value; string(k) != "susy" || string(v) != "que" { if k, v := n2.inodes[2].Key(), n2.inodes[2].Value(); string(k) != "susy" || string(v) != "que" {
t.Fatalf("exp=<susy,que>; got=<%s,%s>", k, v) t.Fatalf("exp=<susy,que>; got=<%s,%s>", k, v)
} }
} }
@ -102,7 +109,9 @@ func TestNode_write_LeafPage(t *testing.T) {
// Ensure that a node can split into appropriate subgroups. // Ensure that a node can split into appropriate subgroups.
func TestNode_split(t *testing.T) { func TestNode_split(t *testing.T) {
// Create a node. // Create a node.
n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}} m := &common.Meta{}
m.SetPgid(1)
n := &node{inodes: make(common.Inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: m}}}
n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0) n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0)
n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0) n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0)
n.put([]byte("00000003"), []byte("00000003"), []byte("0123456701234567"), 0, 0) n.put([]byte("00000003"), []byte("00000003"), []byte("0123456701234567"), 0, 0)
@ -127,7 +136,9 @@ func TestNode_split(t *testing.T) {
// Ensure that a page with the minimum number of inodes just returns a single node. // Ensure that a page with the minimum number of inodes just returns a single node.
func TestNode_split_MinKeys(t *testing.T) { func TestNode_split_MinKeys(t *testing.T) {
// Create a node. // Create a node.
n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}} m := &common.Meta{}
m.SetPgid(1)
n := &node{inodes: make(common.Inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: m}}}
n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0) n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0)
n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0) n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0)
@ -141,7 +152,9 @@ func TestNode_split_MinKeys(t *testing.T) {
// Ensure that a node that has keys that all fit on a page just returns one leaf. // Ensure that a node that has keys that all fit on a page just returns one leaf.
func TestNode_split_SinglePage(t *testing.T) { func TestNode_split_SinglePage(t *testing.T) {
// Create a node. // Create a node.
n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}} m := &common.Meta{}
m.SetPgid(1)
n := &node{inodes: make(common.Inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: m}}}
n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0) n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0)
n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0) n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0)
n.put([]byte("00000003"), []byte("00000003"), []byte("0123456701234567"), 0, 0) n.put([]byte("00000003"), []byte("00000003"), []byte("0123456701234567"), 0, 0)

214
page.go
View File

@ -1,214 +0,0 @@
package bbolt
import (
"fmt"
"os"
"sort"
"unsafe"
)
const pageHeaderSize = unsafe.Sizeof(page{})
const minKeysPerPage = 2
const branchPageElementSize = unsafe.Sizeof(branchPageElement{})
const leafPageElementSize = unsafe.Sizeof(leafPageElement{})
const (
branchPageFlag = 0x01
leafPageFlag = 0x02
metaPageFlag = 0x04
freelistPageFlag = 0x10
)
const (
bucketLeafFlag = 0x01
)
type pgid uint64
type page struct {
id pgid
flags uint16
count uint16
overflow uint32
}
// typ returns a human readable page type string used for debugging.
func (p *page) typ() string {
if (p.flags & branchPageFlag) != 0 {
return "branch"
} else if (p.flags & leafPageFlag) != 0 {
return "leaf"
} else if (p.flags & metaPageFlag) != 0 {
return "meta"
} else if (p.flags & freelistPageFlag) != 0 {
return "freelist"
}
return fmt.Sprintf("unknown<%02x>", p.flags)
}
// meta returns a pointer to the metadata section of the page.
func (p *page) meta() *meta {
return (*meta)(unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)))
}
func (p *page) fastCheck(id pgid) {
_assert(p.id == id, "Page expected to be: %v, but self identifies as %v", id, p.id)
// Only one flag of page-type can be set.
_assert(p.flags == branchPageFlag ||
p.flags == leafPageFlag ||
p.flags == metaPageFlag ||
p.flags == freelistPageFlag,
"page %v: has unexpected type/flags: %x", p.id, p.flags)
}
// leafPageElement retrieves the leaf node by index
func (p *page) leafPageElement(index uint16) *leafPageElement {
return (*leafPageElement)(unsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p),
leafPageElementSize, int(index)))
}
// leafPageElements retrieves a list of leaf nodes.
func (p *page) leafPageElements() []leafPageElement {
if p.count == 0 {
return nil
}
var elems []leafPageElement
data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))
unsafeSlice(unsafe.Pointer(&elems), data, int(p.count))
return elems
}
// branchPageElement retrieves the branch node by index
func (p *page) branchPageElement(index uint16) *branchPageElement {
return (*branchPageElement)(unsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p),
unsafe.Sizeof(branchPageElement{}), int(index)))
}
// branchPageElements retrieves a list of branch nodes.
func (p *page) branchPageElements() []branchPageElement {
if p.count == 0 {
return nil
}
var elems []branchPageElement
data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))
unsafeSlice(unsafe.Pointer(&elems), data, int(p.count))
return elems
}
// dump writes n bytes of the page to STDERR as hex output.
func (p *page) hexdump(n int) {
buf := unsafeByteSlice(unsafe.Pointer(p), 0, 0, n)
fmt.Fprintf(os.Stderr, "%x\n", buf)
}
type pages []*page
func (s pages) Len() int { return len(s) }
func (s pages) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s pages) Less(i, j int) bool { return s[i].id < s[j].id }
// branchPageElement represents a node on a branch page.
type branchPageElement struct {
pos uint32
ksize uint32
pgid pgid
}
// key returns a byte slice of the node key.
func (n *branchPageElement) key() []byte {
return unsafeByteSlice(unsafe.Pointer(n), 0, int(n.pos), int(n.pos)+int(n.ksize))
}
// leafPageElement represents a node on a leaf page.
type leafPageElement struct {
flags uint32
pos uint32
ksize uint32
vsize uint32
}
// key returns a byte slice of the node key.
func (n *leafPageElement) key() []byte {
i := int(n.pos)
j := i + int(n.ksize)
return unsafeByteSlice(unsafe.Pointer(n), 0, i, j)
}
// value returns a byte slice of the node value.
func (n *leafPageElement) value() []byte {
i := int(n.pos) + int(n.ksize)
j := i + int(n.vsize)
return unsafeByteSlice(unsafe.Pointer(n), 0, i, j)
}
// PageInfo represents human readable information about a page.
type PageInfo struct {
ID int
Type string
Count int
OverflowCount int
}
type pgids []pgid
func (s pgids) Len() int { return len(s) }
func (s pgids) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s pgids) Less(i, j int) bool { return s[i] < s[j] }
// merge returns the sorted union of a and b.
func (a pgids) merge(b pgids) pgids {
// Return the opposite slice if one is nil.
if len(a) == 0 {
return b
}
if len(b) == 0 {
return a
}
merged := make(pgids, len(a)+len(b))
mergepgids(merged, a, b)
return merged
}
// mergepgids copies the sorted union of a and b into dst.
// If dst is too small, it panics.
func mergepgids(dst, a, b pgids) {
if len(dst) < len(a)+len(b) {
panic(fmt.Errorf("mergepgids bad len %d < %d + %d", len(dst), len(a), len(b)))
}
// Copy in the opposite slice if one is nil.
if len(a) == 0 {
copy(dst, b)
return
}
if len(b) == 0 {
copy(dst, a)
return
}
// Merged will hold all elements from both lists.
merged := dst[:0]
// Assign lead to the slice with a lower starting value, follow to the higher value.
lead, follow := a, b
if b[0] < a[0] {
lead, follow = b, a
}
// Continue while there are elements in the lead.
for len(lead) > 0 {
// Merge largest prefix of lead that is ahead of follow[0].
n := sort.Search(len(lead), func(i int) bool { return lead[i] > follow[0] })
merged = append(merged, lead[:n]...)
if n >= len(lead) {
break
}
// Swap lead and follow.
lead, follow = follow, lead[n:]
}
// Append what's left in follow.
_ = append(merged, follow...)
}
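mergepgids is the sorted-union routine this deleted file contributed; rather than merging element by element, it appends the longest run of the lead slice that stays ahead of the other slice's head. A standalone sketch of the same strategy over plain uint64 IDs (the function name is ours and it assumes the sort import):

// mergeSorted returns the sorted union of two already-sorted id slices.
func mergeSorted(a, b []uint64) []uint64 {
	if len(a) == 0 {
		return b
	}
	if len(b) == 0 {
		return a
	}
	merged := make([]uint64, 0, len(a)+len(b))
	lead, follow := a, b
	if b[0] < a[0] {
		lead, follow = b, a
	}
	for len(lead) > 0 {
		// Append the prefix of lead that sorts at or before follow[0], then swap roles.
		n := sort.Search(len(lead), func(i int) bool { return lead[i] > follow[0] })
		merged = append(merged, lead[:n]...)
		if n >= len(lead) {
			break
		}
		lead, follow = follow, lead[n:]
	}
	return append(merged, follow...)
}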

141
tx.go
View File

@ -9,10 +9,9 @@ import (
"sync/atomic" "sync/atomic"
"time" "time"
"unsafe" "unsafe"
)
// txid represents the internal transaction identifier. "go.etcd.io/bbolt/internal/common"
type txid uint64 )
// Tx represents a read-only or read/write transaction on the database. // Tx represents a read-only or read/write transaction on the database.
// Read-only transactions can be used for retrieving values for keys and creating cursors. // Read-only transactions can be used for retrieving values for keys and creating cursors.
@ -26,9 +25,9 @@ type Tx struct {
writable bool writable bool
managed bool managed bool
db *DB db *DB
meta *meta meta *common.Meta
root Bucket root Bucket
pages map[pgid]*page pages map[common.Pgid]*common.Page
stats TxStats stats TxStats
commitHandlers []func() commitHandlers []func()
@ -47,24 +46,24 @@ func (tx *Tx) init(db *DB) {
tx.pages = nil tx.pages = nil
// Copy the meta page since it can be changed by the writer. // Copy the meta page since it can be changed by the writer.
tx.meta = &meta{} tx.meta = &common.Meta{}
db.meta().copy(tx.meta) db.meta().Copy(tx.meta)
// Copy over the root bucket. // Copy over the root bucket.
tx.root = newBucket(tx) tx.root = newBucket(tx)
tx.root.bucket = &bucket{} tx.root.InBucket = &common.InBucket{}
*tx.root.bucket = tx.meta.root *tx.root.InBucket = *(tx.meta.RootBucket())
// Increment the transaction id and add a page cache for writable transactions. // Increment the transaction id and add a page cache for writable transactions.
if tx.writable { if tx.writable {
tx.pages = make(map[pgid]*page) tx.pages = make(map[common.Pgid]*common.Page)
tx.meta.txid += txid(1) tx.meta.IncTxid()
} }
} }
// ID returns the transaction id. // ID returns the transaction id.
func (tx *Tx) ID() int { func (tx *Tx) ID() int {
return int(tx.meta.txid) return int(tx.meta.Txid())
} }
// DB returns a reference to the database that created the transaction. // DB returns a reference to the database that created the transaction.
@ -74,7 +73,7 @@ func (tx *Tx) DB() *DB {
// Size returns current database size in bytes as seen by this transaction. // Size returns current database size in bytes as seen by this transaction.
func (tx *Tx) Size() int64 { func (tx *Tx) Size() int64 {
return int64(tx.meta.pgid) * int64(tx.db.pageSize) return int64(tx.meta.Pgid()) * int64(tx.db.pageSize)
} }
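As a worked example: with a high-water mark of 1,000 pages and a 4 KiB page size (the usual default), Size() reports 1,000 * 4,096 = 4,096,000 bytes; free pages below the high-water mark are included in that figure.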
// Writable returns whether the transaction can perform write operations. // Writable returns whether the transaction can perform write operations.
@ -140,11 +139,11 @@ func (tx *Tx) OnCommit(fn func()) {
// Returns an error if a disk write error occurs, or if Commit is // Returns an error if a disk write error occurs, or if Commit is
// called on a read-only transaction. // called on a read-only transaction.
func (tx *Tx) Commit() error { func (tx *Tx) Commit() error {
_assert(!tx.managed, "managed tx commit not allowed") common.Assert(!tx.managed, "managed tx commit not allowed")
if tx.db == nil { if tx.db == nil {
return ErrTxClosed return common.ErrTxClosed
} else if !tx.writable { } else if !tx.writable {
return ErrTxNotWritable return common.ErrTxNotWritable
} }
// TODO(benbjohnson): Use vectorized I/O to write out dirty pages. // TODO(benbjohnson): Use vectorized I/O to write out dirty pages.
@ -156,7 +155,7 @@ func (tx *Tx) Commit() error {
tx.stats.IncRebalanceTime(time.Since(startTime)) tx.stats.IncRebalanceTime(time.Since(startTime))
} }
opgid := tx.meta.pgid opgid := tx.meta.Pgid()
// spill data onto dirty pages. // spill data onto dirty pages.
startTime = time.Now() startTime = time.Now()
@ -167,11 +166,11 @@ func (tx *Tx) Commit() error {
tx.stats.IncSpillTime(time.Since(startTime)) tx.stats.IncSpillTime(time.Since(startTime))
// Free the old root bucket. // Free the old root bucket.
tx.meta.root.root = tx.root.root tx.meta.RootBucket().SetRootPage(tx.root.RootPage())
// Free the old freelist because commit writes out a fresh freelist. // Free the old freelist because commit writes out a fresh freelist.
if tx.meta.freelist != pgidNoFreelist { if tx.meta.Freelist() != common.PgidNoFreelist {
tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist)) tx.db.freelist.free(tx.meta.Txid(), tx.db.page(tx.meta.Freelist()))
} }
if !tx.db.NoFreelistSync { if !tx.db.NoFreelistSync {
@ -180,12 +179,12 @@ func (tx *Tx) Commit() error {
return err return err
} }
} else { } else {
tx.meta.freelist = pgidNoFreelist tx.meta.SetFreelist(common.PgidNoFreelist)
} }
// If the high water mark has moved up then attempt to grow the database. // If the high water mark has moved up then attempt to grow the database.
if tx.meta.pgid > opgid { if tx.meta.Pgid() > opgid {
if err := tx.db.grow(int(tx.meta.pgid+1) * tx.db.pageSize); err != nil { if err := tx.db.grow(int(tx.meta.Pgid()+1) * tx.db.pageSize); err != nil {
tx.rollback() tx.rollback()
return err return err
} }
@ -244,7 +243,7 @@ func (tx *Tx) commitFreelist() error {
tx.rollback() tx.rollback()
return err return err
} }
tx.meta.freelist = p.id tx.meta.SetFreelist(p.Id())
return nil return nil
} }
@ -252,9 +251,9 @@ func (tx *Tx) commitFreelist() error {
// Rollback closes the transaction and ignores all previous updates. Read-only // Rollback closes the transaction and ignores all previous updates. Read-only
// transactions must be rolled back and not committed. // transactions must be rolled back and not committed.
func (tx *Tx) Rollback() error { func (tx *Tx) Rollback() error {
_assert(!tx.managed, "managed tx rollback not allowed") common.Assert(!tx.managed, "managed tx rollback not allowed")
if tx.db == nil { if tx.db == nil {
return ErrTxClosed return common.ErrTxClosed
} }
tx.nonPhysicalRollback() tx.nonPhysicalRollback()
return nil return nil
@ -266,7 +265,7 @@ func (tx *Tx) nonPhysicalRollback() {
return return
} }
if tx.writable { if tx.writable {
tx.db.freelist.rollback(tx.meta.txid) tx.db.freelist.rollback(tx.meta.Txid())
} }
tx.close() tx.close()
} }
@ -277,7 +276,7 @@ func (tx *Tx) rollback() {
return return
} }
if tx.writable { if tx.writable {
tx.db.freelist.rollback(tx.meta.txid) tx.db.freelist.rollback(tx.meta.Txid())
// When mmap fails, the `data`, `dataref` and `datasz` may be reset to // When mmap fails, the `data`, `dataref` and `datasz` may be reset to
// zero values, and there is no way to reload free page IDs in this case. // zero values, and there is no way to reload free page IDs in this case.
if tx.db.data != nil { if tx.db.data != nil {
@ -287,7 +286,7 @@ func (tx *Tx) rollback() {
tx.db.freelist.noSyncReload(tx.db.freepages()) tx.db.freelist.noSyncReload(tx.db.freepages())
} else { } else {
// Read free page list from freelist page. // Read free page list from freelist page.
tx.db.freelist.reload(tx.db.page(tx.db.meta().freelist)) tx.db.freelist.reload(tx.db.page(tx.db.meta().Freelist()))
} }
} }
} }
@ -352,13 +351,13 @@ func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) {
// Generate a meta page. We use the same page data for both meta pages. // Generate a meta page. We use the same page data for both meta pages.
buf := make([]byte, tx.db.pageSize) buf := make([]byte, tx.db.pageSize)
page := (*page)(unsafe.Pointer(&buf[0])) page := (*common.Page)(unsafe.Pointer(&buf[0]))
page.flags = metaPageFlag page.SetFlags(common.MetaPageFlag)
*page.meta() = *tx.meta *page.Meta() = *tx.meta
// Write meta 0. // Write meta 0.
page.id = 0 page.SetId(0)
page.meta().checksum = page.meta().sum64() page.Meta().SetChecksum(page.Meta().Sum64())
nn, err := w.Write(buf) nn, err := w.Write(buf)
n += int64(nn) n += int64(nn)
if err != nil { if err != nil {
@ -366,9 +365,9 @@ func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) {
} }
// Write meta 1 with a lower transaction id. // Write meta 1 with a lower transaction id.
page.id = 1 page.SetId(1)
page.meta().txid -= 1 page.Meta().DecTxid()
page.meta().checksum = page.meta().sum64() page.Meta().SetChecksum(page.Meta().Sum64())
nn, err = w.Write(buf) nn, err = w.Write(buf)
n += int64(nn) n += int64(nn)
if err != nil { if err != nil {
@ -408,14 +407,14 @@ func (tx *Tx) CopyFile(path string, mode os.FileMode) error {
} }
// allocate returns a contiguous block of memory starting at a given page. // allocate returns a contiguous block of memory starting at a given page.
func (tx *Tx) allocate(count int) (*page, error) { func (tx *Tx) allocate(count int) (*common.Page, error) {
p, err := tx.db.allocate(tx.meta.txid, count) p, err := tx.db.allocate(tx.meta.Txid(), count)
if err != nil { if err != nil {
return nil, err return nil, err
} }
// Save to our page cache. // Save to our page cache.
tx.pages[p.id] = p tx.pages[p.Id()] = p
// Update statistics. // Update statistics.
tx.stats.IncPageCount(int64(count)) tx.stats.IncPageCount(int64(count))
@ -427,18 +426,18 @@ func (tx *Tx) allocate(count int) (*page, error) {
// write writes any dirty pages to disk. // write writes any dirty pages to disk.
func (tx *Tx) write() error { func (tx *Tx) write() error {
// Sort pages by id. // Sort pages by id.
pages := make(pages, 0, len(tx.pages)) pages := make(common.Pages, 0, len(tx.pages))
for _, p := range tx.pages { for _, p := range tx.pages {
pages = append(pages, p) pages = append(pages, p)
} }
// Clear out page cache early. // Clear out page cache early.
tx.pages = make(map[pgid]*page) tx.pages = make(map[common.Pgid]*common.Page)
sort.Sort(pages) sort.Sort(pages)
// Write pages to disk in order. // Write pages to disk in order.
for _, p := range pages { for _, p := range pages {
rem := (uint64(p.overflow) + 1) * uint64(tx.db.pageSize) rem := (uint64(p.Overflow()) + 1) * uint64(tx.db.pageSize)
offset := int64(p.id) * int64(tx.db.pageSize) offset := int64(p.Id()) * int64(tx.db.pageSize)
var written uintptr var written uintptr
// Write out page in "max allocation" sized chunks. // Write out page in "max allocation" sized chunks.
@ -447,7 +446,7 @@ func (tx *Tx) write() error {
if sz > maxAllocSize-1 { if sz > maxAllocSize-1 {
sz = maxAllocSize - 1 sz = maxAllocSize - 1
} }
buf := unsafeByteSlice(unsafe.Pointer(p), written, 0, int(sz)) buf := common.UnsafeByteSlice(unsafe.Pointer(p), written, 0, int(sz))
if _, err := tx.db.ops.writeAt(buf, offset); err != nil { if _, err := tx.db.ops.writeAt(buf, offset); err != nil {
return err return err
@ -469,7 +468,7 @@ func (tx *Tx) write() error {
} }
// Ignore file sync if flag is set on DB. // Ignore file sync if flag is set on DB.
if !tx.db.NoSync || IgnoreNoSync { if !tx.db.NoSync || common.IgnoreNoSync {
if err := fdatasync(tx.db); err != nil { if err := fdatasync(tx.db); err != nil {
return err return err
} }
@ -479,11 +478,11 @@ func (tx *Tx) write() error {
for _, p := range pages { for _, p := range pages {
// Ignore page sizes over 1 page. // Ignore page sizes over 1 page.
// These are allocated using make() instead of the page pool. // These are allocated using make() instead of the page pool.
if int(p.overflow) != 0 { if int(p.Overflow()) != 0 {
continue continue
} }
buf := unsafeByteSlice(unsafe.Pointer(p), 0, 0, tx.db.pageSize) buf := common.UnsafeByteSlice(unsafe.Pointer(p), 0, 0, tx.db.pageSize)
// See https://go.googlesource.com/go/+/f03c9202c43e0abb130669852082117ca50aa9b1 // See https://go.googlesource.com/go/+/f03c9202c43e0abb130669852082117ca50aa9b1
for i := range buf { for i := range buf {
@ -500,13 +499,13 @@ func (tx *Tx) writeMeta() error {
// Create a temporary buffer for the meta page. // Create a temporary buffer for the meta page.
buf := make([]byte, tx.db.pageSize) buf := make([]byte, tx.db.pageSize)
p := tx.db.pageInBuffer(buf, 0) p := tx.db.pageInBuffer(buf, 0)
tx.meta.write(p) tx.meta.Write(p)
// Write the meta page to file. // Write the meta page to file.
if _, err := tx.db.ops.writeAt(buf, int64(p.id)*int64(tx.db.pageSize)); err != nil { if _, err := tx.db.ops.writeAt(buf, int64(p.Id())*int64(tx.db.pageSize)); err != nil {
return err return err
} }
if !tx.db.NoSync || IgnoreNoSync { if !tx.db.NoSync || common.IgnoreNoSync {
if err := fdatasync(tx.db); err != nil { if err := fdatasync(tx.db); err != nil {
return err return err
} }
@ -520,69 +519,69 @@ func (tx *Tx) writeMeta() error {
// page returns a reference to the page with a given id. // page returns a reference to the page with a given id.
// If page has been written to then a temporary buffered page is returned. // If page has been written to then a temporary buffered page is returned.
func (tx *Tx) page(id pgid) *page { func (tx *Tx) page(id common.Pgid) *common.Page {
// Check the dirty pages first. // Check the dirty pages first.
if tx.pages != nil { if tx.pages != nil {
if p, ok := tx.pages[id]; ok { if p, ok := tx.pages[id]; ok {
p.fastCheck(id) p.FastCheck(id)
return p return p
} }
} }
// Otherwise return directly from the mmap. // Otherwise return directly from the mmap.
p := tx.db.page(id) p := tx.db.page(id)
p.fastCheck(id) p.FastCheck(id)
return p return p
} }
// forEachPage iterates over every page within a given page and executes a function. // forEachPage iterates over every page within a given page and executes a function.
func (tx *Tx) forEachPage(pgidnum pgid, fn func(*page, int, []pgid)) { func (tx *Tx) forEachPage(pgidnum common.Pgid, fn func(*common.Page, int, []common.Pgid)) {
stack := make([]pgid, 10) stack := make([]common.Pgid, 10)
stack[0] = pgidnum stack[0] = pgidnum
tx.forEachPageInternal(stack[:1], fn) tx.forEachPageInternal(stack[:1], fn)
} }
func (tx *Tx) forEachPageInternal(pgidstack []pgid, fn func(*page, int, []pgid)) { func (tx *Tx) forEachPageInternal(pgidstack []common.Pgid, fn func(*common.Page, int, []common.Pgid)) {
p := tx.page(pgidstack[len(pgidstack)-1]) p := tx.page(pgidstack[len(pgidstack)-1])
// Execute function. // Execute function.
fn(p, len(pgidstack)-1, pgidstack) fn(p, len(pgidstack)-1, pgidstack)
// Recursively loop over children. // Recursively loop over children.
if (p.flags & branchPageFlag) != 0 { if (p.Flags() & common.BranchPageFlag) != 0 {
for i := 0; i < int(p.count); i++ { for i := 0; i < int(p.Count()); i++ {
elem := p.branchPageElement(uint16(i)) elem := p.BranchPageElement(uint16(i))
tx.forEachPageInternal(append(pgidstack, elem.pgid), fn) tx.forEachPageInternal(append(pgidstack, elem.Pgid()), fn)
} }
} }
} }
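forEachPage walks a subtree depth-first, handing the callback each page together with the stack of ancestor page ids. A hedged internal-usage sketch, given some *Tx tx and *Bucket b (these identifiers are unexported, so this only illustrates the callback shape the checker below relies on):

// Count the pages, including overflow pages, reachable from a bucket's root.
var reachable int
tx.forEachPage(b.RootPage(), func(p *common.Page, depth int, stack []common.Pgid) {
	reachable += 1 + int(p.Overflow())
})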
// Page returns page information for a given page number. // Page returns page information for a given page number.
// This is only safe for concurrent use when used by a writable transaction. // This is only safe for concurrent use when used by a writable transaction.
func (tx *Tx) Page(id int) (*PageInfo, error) { func (tx *Tx) Page(id int) (*common.PageInfo, error) {
if tx.db == nil { if tx.db == nil {
return nil, ErrTxClosed return nil, common.ErrTxClosed
} else if pgid(id) >= tx.meta.pgid { } else if common.Pgid(id) >= tx.meta.Pgid() {
return nil, nil return nil, nil
} }
if tx.db.freelist == nil { if tx.db.freelist == nil {
return nil, ErrFreePagesNotLoaded return nil, common.ErrFreePagesNotLoaded
} }
// Build the page info. // Build the page info.
p := tx.db.page(pgid(id)) p := tx.db.page(common.Pgid(id))
info := &PageInfo{ info := &common.PageInfo{
ID: id, ID: id,
Count: int(p.count), Count: int(p.Count()),
OverflowCount: int(p.overflow), OverflowCount: int(p.Overflow()),
} }
// Determine the type (or if it's free). // Determine the type (or if it's free).
if tx.db.freelist.freed(pgid(id)) { if tx.db.freelist.freed(common.Pgid(id)) {
info.Type = "free" info.Type = "free"
} else { } else {
info.Type = p.typ() info.Type = p.Typ()
} }
return info, nil return info, nil

View File

@ -3,6 +3,8 @@ package bbolt
import ( import (
"encoding/hex" "encoding/hex"
"fmt" "fmt"
"go.etcd.io/bbolt/internal/common"
) )
// Check performs several consistency checks on the database for this transaction. // Check performs several consistency checks on the database for this transaction.
@ -37,8 +39,8 @@ func (tx *Tx) check(kvStringer KVStringer, ch chan error) {
tx.db.loadFreelist() tx.db.loadFreelist()
// Check if any pages are double freed. // Check if any pages are double freed.
freed := make(map[pgid]bool) freed := make(map[common.Pgid]bool)
all := make([]pgid, tx.db.freelist.count()) all := make([]common.Pgid, tx.db.freelist.count())
tx.db.freelist.copyall(all) tx.db.freelist.copyall(all)
for _, id := range all { for _, id := range all {
if freed[id] { if freed[id] {
@ -48,12 +50,12 @@ func (tx *Tx) check(kvStringer KVStringer, ch chan error) {
} }
// Track every reachable page. // Track every reachable page.
reachable := make(map[pgid]*page) reachable := make(map[common.Pgid]*common.Page)
reachable[0] = tx.page(0) // meta0 reachable[0] = tx.page(0) // meta0
reachable[1] = tx.page(1) // meta1 reachable[1] = tx.page(1) // meta1
if tx.meta.freelist != pgidNoFreelist { if tx.meta.Freelist() != common.PgidNoFreelist {
for i := uint32(0); i <= tx.page(tx.meta.freelist).overflow; i++ { for i := uint32(0); i <= tx.page(tx.meta.Freelist()).Overflow(); i++ {
reachable[tx.meta.freelist+pgid(i)] = tx.page(tx.meta.freelist) reachable[tx.meta.Freelist()+common.Pgid(i)] = tx.page(tx.meta.Freelist())
} }
} }
@ -61,7 +63,7 @@ func (tx *Tx) check(kvStringer KVStringer, ch chan error) {
tx.checkBucket(&tx.root, reachable, freed, kvStringer, ch) tx.checkBucket(&tx.root, reachable, freed, kvStringer, ch)
// Ensure all pages below high water mark are either reachable or freed. // Ensure all pages below high water mark are either reachable or freed.
for i := pgid(0); i < tx.meta.pgid; i++ { for i := common.Pgid(0); i < tx.meta.Pgid(); i++ {
_, isReachable := reachable[i] _, isReachable := reachable[i]
if !isReachable && !freed[i] { if !isReachable && !freed[i] {
ch <- fmt.Errorf("page %d: unreachable unfreed", int(i)) ch <- fmt.Errorf("page %d: unreachable unfreed", int(i))
@ -72,22 +74,22 @@ func (tx *Tx) check(kvStringer KVStringer, ch chan error) {
close(ch) close(ch)
} }
func (tx *Tx) checkBucket(b *Bucket, reachable map[pgid]*page, freed map[pgid]bool, func (tx *Tx) checkBucket(b *Bucket, reachable map[common.Pgid]*common.Page, freed map[common.Pgid]bool,
kvStringer KVStringer, ch chan error) { kvStringer KVStringer, ch chan error) {
// Ignore inline buckets. // Ignore inline buckets.
if b.root == 0 { if b.RootPage() == 0 {
return return
} }
// Check every page used by this bucket. // Check every page used by this bucket.
b.tx.forEachPage(b.root, func(p *page, _ int, stack []pgid) { b.tx.forEachPage(b.RootPage(), func(p *common.Page, _ int, stack []common.Pgid) {
if p.id > tx.meta.pgid { if p.Id() > tx.meta.Pgid() {
ch <- fmt.Errorf("page %d: out of bounds: %d (stack: %v)", int(p.id), int(b.tx.meta.pgid), stack) ch <- fmt.Errorf("page %d: out of bounds: %d (stack: %v)", int(p.Id()), int(b.tx.meta.Pgid()), stack)
} }
// Ensure each page is only referenced once. // Ensure each page is only referenced once.
for i := pgid(0); i <= pgid(p.overflow); i++ { for i := common.Pgid(0); i <= common.Pgid(p.Overflow()); i++ {
var id = p.id + i var id = p.Id() + i
if _, ok := reachable[id]; ok { if _, ok := reachable[id]; ok {
ch <- fmt.Errorf("page %d: multiple references (stack: %v)", int(id), stack) ch <- fmt.Errorf("page %d: multiple references (stack: %v)", int(id), stack)
} }
@ -95,14 +97,14 @@ func (tx *Tx) checkBucket(b *Bucket, reachable map[pgid]*page, freed map[pgid]bo
} }
// We should only encounter un-freed leaf and branch pages. // We should only encounter un-freed leaf and branch pages.
if freed[p.id] { if freed[p.Id()] {
ch <- fmt.Errorf("page %d: reachable freed", int(p.id)) ch <- fmt.Errorf("page %d: reachable freed", int(p.Id()))
} else if (p.flags&branchPageFlag) == 0 && (p.flags&leafPageFlag) == 0 { } else if (p.Flags()&common.BranchPageFlag) == 0 && (p.Flags()&common.LeafPageFlag) == 0 {
ch <- fmt.Errorf("page %d: invalid type: %s (stack: %v)", int(p.id), p.typ(), stack) ch <- fmt.Errorf("page %d: invalid type: %s (stack: %v)", int(p.Id()), p.Typ(), stack)
} }
}) })
tx.recursivelyCheckPages(b.root, kvStringer.KeyToString, ch) tx.recursivelyCheckPages(b.RootPage(), kvStringer.KeyToString, ch)
// Check each bucket within this bucket. // Check each bucket within this bucket.
_ = b.ForEachBucket(func(k []byte) error { _ = b.ForEachBucket(func(k []byte) error {
@ -117,7 +119,7 @@ func (tx *Tx) checkBucket(b *Bucket, reachable map[pgid]*page, freed map[pgid]bo
// key order constraints: // key order constraints:
// - keys on pages must be sorted // - keys on pages must be sorted
// - keys on children pages are between 2 consecutive keys on the parent's branch page. // - keys on children pages are between 2 consecutive keys on the parent's branch page.
func (tx *Tx) recursivelyCheckPages(pgId pgid, keyToString func([]byte) string, ch chan error) { func (tx *Tx) recursivelyCheckPages(pgId common.Pgid, keyToString func([]byte) string, ch chan error) {
tx.recursivelyCheckPagesInternal(pgId, nil, nil, nil, keyToString, ch) tx.recursivelyCheckPagesInternal(pgId, nil, nil, nil, keyToString, ch)
} }
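The checker walks each subtree with a half-open key window [minKeyClosed, maxKeyOpen) inherited from the parent branch element; every key it meets must fall inside that window and sort after its predecessor. A simplified sketch of the window test under those constraints (the helper name is ours; nil bounds mean unbounded, and bytes.Compare stands in for compareKeys):

// inWindow reports whether key may legally appear in a subtree bounded by
// minKeyClosed (inclusive) and maxKeyOpen (exclusive).
func inWindow(key, minKeyClosed, maxKeyOpen []byte) bool {
	if minKeyClosed != nil && bytes.Compare(key, minKeyClosed) < 0 {
		return false
	}
	if maxKeyOpen != nil && bytes.Compare(key, maxKeyOpen) >= 0 {
		return false
	}
	return true
}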
@ -127,36 +129,36 @@ func (tx *Tx) recursivelyCheckPages(pgId pgid, keyToString func([]byte) string,
// - Are in the right ordering relationship to their parents. // - Are in the right ordering relationship to their parents.
// `pagesStack` is expected to contain IDs of pages from the tree root to `pgid` for the clean debugging message. // `pagesStack` is expected to contain IDs of pages from the tree root to `pgid` for the clean debugging message.
func (tx *Tx) recursivelyCheckPagesInternal( func (tx *Tx) recursivelyCheckPagesInternal(
pgId pgid, minKeyClosed, maxKeyOpen []byte, pagesStack []pgid, pgId common.Pgid, minKeyClosed, maxKeyOpen []byte, pagesStack []common.Pgid,
keyToString func([]byte) string, ch chan error) (maxKeyInSubtree []byte) { keyToString func([]byte) string, ch chan error) (maxKeyInSubtree []byte) {
p := tx.page(pgId) p := tx.page(pgId)
pagesStack = append(pagesStack, pgId) pagesStack = append(pagesStack, pgId)
switch { switch {
case p.flags&branchPageFlag != 0: case p.Flags()&common.BranchPageFlag != 0:
// For branch page we navigate ranges of all subpages. // For branch page we navigate ranges of all subpages.
runningMin := minKeyClosed runningMin := minKeyClosed
for i := range p.branchPageElements() { for i := range p.BranchPageElements() {
elem := p.branchPageElement(uint16(i)) elem := p.BranchPageElement(uint16(i))
verifyKeyOrder(elem.pgid, "branch", i, elem.key(), runningMin, maxKeyOpen, ch, keyToString, pagesStack) verifyKeyOrder(elem.Pgid(), "branch", i, elem.Key(), runningMin, maxKeyOpen, ch, keyToString, pagesStack)
maxKey := maxKeyOpen maxKey := maxKeyOpen
if i < len(p.branchPageElements())-1 { if i < len(p.BranchPageElements())-1 {
maxKey = p.branchPageElement(uint16(i + 1)).key() maxKey = p.BranchPageElement(uint16(i + 1)).Key()
} }
maxKeyInSubtree = tx.recursivelyCheckPagesInternal(elem.pgid, elem.key(), maxKey, pagesStack, keyToString, ch) maxKeyInSubtree = tx.recursivelyCheckPagesInternal(elem.Pgid(), elem.Key(), maxKey, pagesStack, keyToString, ch)
runningMin = maxKeyInSubtree runningMin = maxKeyInSubtree
} }
return maxKeyInSubtree return maxKeyInSubtree
case p.flags&leafPageFlag != 0: case p.Flags()&common.LeafPageFlag != 0:
runningMin := minKeyClosed runningMin := minKeyClosed
for i := range p.leafPageElements() { for i := range p.LeafPageElements() {
elem := p.leafPageElement(uint16(i)) elem := p.LeafPageElement(uint16(i))
verifyKeyOrder(pgId, "leaf", i, elem.key(), runningMin, maxKeyOpen, ch, keyToString, pagesStack) verifyKeyOrder(pgId, "leaf", i, elem.Key(), runningMin, maxKeyOpen, ch, keyToString, pagesStack)
runningMin = elem.key() runningMin = elem.Key()
} }
if p.count > 0 { if p.Count() > 0 {
return p.leafPageElement(p.count - 1).key() return p.LeafPageElement(p.Count() - 1).Key()
} }
default: default:
ch <- fmt.Errorf("unexpected page type for pgId:%d", pgId) ch <- fmt.Errorf("unexpected page type for pgId:%d", pgId)
@ -168,7 +170,7 @@ func (tx *Tx) recursivelyCheckPagesInternal(
* verifyKeyOrder checks whether an entry with given #index on pgId (pageType: "branch|leaf") that has given "key", * verifyKeyOrder checks whether an entry with given #index on pgId (pageType: "branch|leaf") that has given "key",
* is within range determined by (previousKey..maxKeyOpen) and reports found violations to the channel (ch). * is within range determined by (previousKey..maxKeyOpen) and reports found violations to the channel (ch).
*/ */
func verifyKeyOrder(pgId pgid, pageType string, index int, key []byte, previousKey []byte, maxKeyOpen []byte, ch chan error, keyToString func([]byte) string, pagesStack []pgid) { func verifyKeyOrder(pgId common.Pgid, pageType string, index int, key []byte, previousKey []byte, maxKeyOpen []byte, ch chan error, keyToString func([]byte) string, pagesStack []common.Pgid) {
if index == 0 && previousKey != nil && compareKeys(previousKey, key) > 0 { if index == 0 && previousKey != nil && compareKeys(previousKey, key) > 0 {
ch <- fmt.Errorf("the first key[%d]=(hex)%s on %s page(%d) needs to be >= the key in the ancestor (%s). Stack: %v", ch <- fmt.Errorf("the first key[%d]=(hex)%s on %s page(%d) needs to be >= the key in the ancestor (%s). Stack: %v",
index, keyToString(key), pageType, pgId, keyToString(previousKey), pagesStack) index, keyToString(key), pageType, pgId, keyToString(previousKey), pagesStack)

View File

@ -15,6 +15,7 @@ import (
bolt "go.etcd.io/bbolt" bolt "go.etcd.io/bbolt"
"go.etcd.io/bbolt/internal/btesting" "go.etcd.io/bbolt/internal/btesting"
"go.etcd.io/bbolt/internal/common"
) )
// TestTx_Check_ReadOnly tests consistency checking on a ReadOnly database. // TestTx_Check_ReadOnly tests consistency checking on a ReadOnly database.
@ -84,7 +85,7 @@ func TestTx_Commit_ErrTxClosed(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
if err := tx.Commit(); err != bolt.ErrTxClosed { if err := tx.Commit(); err != common.ErrTxClosed {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
} }
@ -101,7 +102,7 @@ func TestTx_Rollback_ErrTxClosed(t *testing.T) {
if err := tx.Rollback(); err != nil { if err := tx.Rollback(); err != nil {
t.Fatal(err) t.Fatal(err)
} }
if err := tx.Rollback(); err != bolt.ErrTxClosed { if err := tx.Rollback(); err != common.ErrTxClosed {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
} }
@ -113,7 +114,7 @@ func TestTx_Commit_ErrTxNotWritable(t *testing.T) {
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
if err := tx.Commit(); err != bolt.ErrTxNotWritable { if err := tx.Commit(); err != common.ErrTxNotWritable {
t.Fatal(err) t.Fatal(err)
} }
// Close the view transaction // Close the view transaction
@ -165,7 +166,7 @@ func TestTx_CreateBucket_ErrTxNotWritable(t *testing.T) {
db := btesting.MustCreateDB(t) db := btesting.MustCreateDB(t)
if err := db.View(func(tx *bolt.Tx) error { if err := db.View(func(tx *bolt.Tx) error {
_, err := tx.CreateBucket([]byte("foo")) _, err := tx.CreateBucket([]byte("foo"))
if err != bolt.ErrTxNotWritable { if err != common.ErrTxNotWritable {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
return nil return nil
@ -185,7 +186,7 @@ func TestTx_CreateBucket_ErrTxClosed(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
if _, err := tx.CreateBucket([]byte("foo")); err != bolt.ErrTxClosed { if _, err := tx.CreateBucket([]byte("foo")); err != common.ErrTxClosed {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
} }
@ -293,11 +294,11 @@ func TestTx_CreateBucketIfNotExists(t *testing.T) {
func TestTx_CreateBucketIfNotExists_ErrBucketNameRequired(t *testing.T) { func TestTx_CreateBucketIfNotExists_ErrBucketNameRequired(t *testing.T) {
db := btesting.MustCreateDB(t) db := btesting.MustCreateDB(t)
if err := db.Update(func(tx *bolt.Tx) error { if err := db.Update(func(tx *bolt.Tx) error {
if _, err := tx.CreateBucketIfNotExists([]byte{}); err != bolt.ErrBucketNameRequired { if _, err := tx.CreateBucketIfNotExists([]byte{}); err != common.ErrBucketNameRequired {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
if _, err := tx.CreateBucketIfNotExists(nil); err != bolt.ErrBucketNameRequired { if _, err := tx.CreateBucketIfNotExists(nil); err != common.ErrBucketNameRequired {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
@ -323,7 +324,7 @@ func TestTx_CreateBucket_ErrBucketExists(t *testing.T) {
// Create the same bucket again. // Create the same bucket again.
if err := db.Update(func(tx *bolt.Tx) error { if err := db.Update(func(tx *bolt.Tx) error {
if _, err := tx.CreateBucket([]byte("widgets")); err != bolt.ErrBucketExists { if _, err := tx.CreateBucket([]byte("widgets")); err != common.ErrBucketExists {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
return nil return nil
@ -336,7 +337,7 @@ func TestTx_CreateBucket_ErrBucketExists(t *testing.T) {
func TestTx_CreateBucket_ErrBucketNameRequired(t *testing.T) { func TestTx_CreateBucket_ErrBucketNameRequired(t *testing.T) {
db := btesting.MustCreateDB(t) db := btesting.MustCreateDB(t)
if err := db.Update(func(tx *bolt.Tx) error { if err := db.Update(func(tx *bolt.Tx) error {
if _, err := tx.CreateBucket(nil); err != bolt.ErrBucketNameRequired { if _, err := tx.CreateBucket(nil); err != common.ErrBucketNameRequired {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
return nil return nil
@ -401,7 +402,7 @@ func TestTx_DeleteBucket_ErrTxClosed(t *testing.T) {
if err := tx.Commit(); err != nil { if err := tx.Commit(); err != nil {
t.Fatal(err) t.Fatal(err)
} }
if err := tx.DeleteBucket([]byte("foo")); err != bolt.ErrTxClosed { if err := tx.DeleteBucket([]byte("foo")); err != common.ErrTxClosed {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
} }
@ -410,7 +411,7 @@ func TestTx_DeleteBucket_ErrTxClosed(t *testing.T) {
func TestTx_DeleteBucket_ReadOnly(t *testing.T) { func TestTx_DeleteBucket_ReadOnly(t *testing.T) {
db := btesting.MustCreateDB(t) db := btesting.MustCreateDB(t)
if err := db.View(func(tx *bolt.Tx) error { if err := db.View(func(tx *bolt.Tx) error {
if err := tx.DeleteBucket([]byte("foo")); err != bolt.ErrTxNotWritable { if err := tx.DeleteBucket([]byte("foo")); err != common.ErrTxNotWritable {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
return nil return nil
@ -423,7 +424,7 @@ func TestTx_DeleteBucket_ReadOnly(t *testing.T) {
func TestTx_DeleteBucket_NotFound(t *testing.T) { func TestTx_DeleteBucket_NotFound(t *testing.T) {
db := btesting.MustCreateDB(t) db := btesting.MustCreateDB(t)
if err := db.Update(func(tx *bolt.Tx) error { if err := db.Update(func(tx *bolt.Tx) error {
if err := tx.DeleteBucket([]byte("widgets")); err != bolt.ErrBucketNotFound { if err := tx.DeleteBucket([]byte("widgets")); err != common.ErrBucketNotFound {
t.Fatalf("unexpected error: %s", err) t.Fatalf("unexpected error: %s", err)
} }
return nil return nil