pull/34/head
Ben Johnson 2014-01-24 16:32:18 -07:00
parent 20b26eac78
commit 73ab1d420d
22 changed files with 219 additions and 3694 deletions

View File

@ -191,5 +191,8 @@ In leaf pages, these nodes store the actual key/value data.
The following is a list of items to do on the Bolt project:
1. Calculate freelist on db.Open(). (Traverse branches, set bitmap index, load free pages into a list -- lazy load in the future).
2. Resize map. (Make sure there are no reader txns before resizing)
3. DB.Copy()
4. Merge pages.
5. Rebalance (after deletion).

23
bnode.go Normal file
View File

@ -0,0 +1,23 @@
package bolt
import (
"unsafe"
)
// bnode represents a node on a branch page.
type bnode struct {
	flags   uint16  // node flag bits
	keySize uint16  // length in bytes of the key stored inline after the header
	pgid    pgid    // page id this branch entry references (presumably the child page — confirm against cursor code)
	data    uintptr // Pointer to the beginning of the data.
}
// key returns a byte slice that of the key data.
// key returns the key bytes stored inline immediately after the node header.
func (n *bnode) key() []byte {
	buf := (*[MaxKeySize]byte)(unsafe.Pointer(&n.data))
	return buf[0:n.keySize]
}
// bnodeSize returns the number of bytes required to store a key as a branch node.
func bnodeSize(key []byte) int {
return int(unsafe.Offsetof((*bnode)(nil)).data) + len(key)
}

View File

@ -1,16 +0,0 @@
package bolt
import (
"unsafe"
)
const (
bigNode = 0x01
subNode = 0x02
dupNode = 0x04
)
// key returns a byte slice that of the key data.
func (n *branchNode) key() []byte {
return (*[MaxKeySize]byte)(unsafe.Pointer(&n.data))[:n.keySize]
}

View File

@ -1,31 +1,17 @@
package bolt
const (
MDB_DUPSORT = 0x04
)
// TODO: #define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */
// TODO: #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID))
// TODO: #define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE)
// TODO: #define FREE_DBI 0
type bucketid uint32
// Bucket is the public, per-transaction handle for a named bucket.
// NOTE(fix): the field `name` was declared twice (a diff-fusion
// artifact), which is a compile error; the duplicate is removed.
type Bucket struct {
	*bucket
	transaction *Transaction // owning transaction
	name        string       // bucket name
	isNew       bool
	dirty       bool
	valid       bool
}
type bucket struct {
id uint32
pad uint32
flags uint16
depth uint16
branches pgno
leafs pgno
overflows pgno
entries uint64
root pgno
id bucketid
flags uint32
root pgid
branches pgid
leafs pgid
entries uint64
}

View File

@ -3,11 +3,6 @@ package bolt
// Version is the file format version of the database.
const Version = 1

// Size limits for keys and values.
// NOTE(fix): MaxKeySize was declared twice (511 and 0x8000) — a compile
// error from diff fusion; the newer value 0x8000 is kept, matching the
// [MaxKeySize]byte casts in the node accessors.
const (
	MaxKeySize  = 0x8000
	MaxDataSize = 0xffffffff
)

// Defaults applied when opening a database.
const (
	DefaultMapSize     = 1048576
	DefaultReaderCount = 126
)

2682
cursor.go

File diff suppressed because it is too large Load Diff

39
db.go
View File

@ -13,7 +13,7 @@ const (
)
var (
DatabaseNotOpenError = &Error{"db is not open", nil}
DatabaseNotOpenError = &Error{"db is not open", nil}
DatabaseAlreadyOpenedError = &Error{"db already open", nil}
TransactionInProgressError = &Error{"writable transaction is already in progress", nil}
)
@ -22,26 +22,26 @@ type DB struct {
sync.Mutex
opened bool
os _os
syscall _syscall
path string
file file
metafile file
data []byte
buf []byte
meta0 *meta
meta1 *meta
pageSize int
rwtransaction *RWTransaction
transactions []*Transaction
maxPageNumber int /**< me_mapsize / me_psize */
freePages []int /** IDL of pages that became unused in a write txn */
dirtyPages []int /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */
os _os
syscall _syscall
path string
file file
metafile file
data []byte
buf []byte
meta0 *meta
meta1 *meta
pageSize int
rwtransaction *RWTransaction
transactions []*Transaction
maxPageNumber int /**< me_mapsize / me_psize */
freePages []int /** IDL of pages that became unused in a write txn */
dirtyPages []int /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */
// TODO: scratch []*page // list of temp pages for writing.
readers []*reader
maxFreeOnePage int /** Max number of freelist items that can fit in a single overflow page */
maxFreeOnePage int /** Max number of freelist items that can fit in a single overflow page */
maxPageDataSize int
maxNodeSize int /** Max size of a node on a page */
maxKeySize int /**< max size of a key */
@ -281,8 +281,3 @@ func (db *DB) Stat() *stat {
// TODO: Calculate size, depth, page count (by type), entry count, readers, etc.
return nil
}
func (db *DB) minTxnID() txnid {
// TODO: Find the oldest transaction id.
return 0
}

View File

@ -1,26 +1,5 @@
package bolt
var (
KeyExistError = &Error{"key/data pair already exists", nil}
NotFoundError = &Error{"no matching key/data pair found", nil}
PageNotFoundError = &Error{"requested page not found", nil}
CorruptedError = &Error{"located page was wrong type", nil}
PanicError = &Error{"update of meta page failed", nil}
VersionMismatchError = &Error{"database environment version mismatch", nil}
InvalidError = &Error{"file is not a bolt file", nil}
MapFullError = &Error{"environment mapsize limit reached", nil}
BucketFullError = &Error{"environment maxdbs limit reached", nil}
ReadersFullError = &Error{"environment maxreaders limit reached", nil}
TransactionFullError = &Error{"transaction has too many dirty pages - transaction too big", nil}
CursorFullError = &Error{"internal error - cursor stack limit reached", nil}
PageFullError = &Error{"internal error - page has no more space", nil}
MapResizedError = &Error{"database contents grew beyond environment mapsize", nil}
IncompatibleError = &Error{"operation and db incompatible, or db flags changed", nil}
BadReaderSlotError = &Error{"invalid reuse of reader locktable slot", nil}
BadTransactionError = &Error{"transaction cannot recover - it must be aborted", nil}
BadValueSizeError = &Error{"too big key/data or key is empty", nil}
)
type Error struct {
message string
cause error

12
file.go
View File

@ -1,12 +0,0 @@
package bolt
import (
"os"
)
type file interface {
Fd() uintptr
ReadAt(b []byte, off int64) (n int, err error)
Stat() (fi os.FileInfo, err error)
WriteAt(b []byte, off int64) (n int, err error)
}

View File

@ -1,65 +0,0 @@
package bolt
import (
"os"
"time"
"github.com/stretchr/testify/mock"
)
type mockfile struct {
mock.Mock
fd uintptr
}
func (m *mockfile) Fd() uintptr {
return m.fd
}
func (m *mockfile) ReadAt(b []byte, off int64) (n int, err error) {
args := m.Called(b, off)
return args.Int(0), args.Error(1)
}
func (m *mockfile) Stat() (os.FileInfo, error) {
args := m.Called()
return args.Get(0).(os.FileInfo), args.Error(1)
}
func (m *mockfile) WriteAt(b []byte, off int64) (n int, err error) {
args := m.Called(b, off)
return args.Int(0), args.Error(1)
}
type mockfileinfo struct {
name string
size int64
mode os.FileMode
modTime time.Time
isDir bool
sys interface{}
}
func (m *mockfileinfo) Name() string {
return m.name
}
func (m *mockfileinfo) Size() int64 {
return m.size
}
func (m *mockfileinfo) Mode() os.FileMode {
return m.mode
}
func (m *mockfileinfo) ModTime() time.Time {
return m.modTime
}
func (m *mockfileinfo) IsDir() bool {
return m.isDir
}
func (m *mockfileinfo) Sys() interface{} {
return m.sys
}

10
info.go
View File

@ -1,10 +0,0 @@
package bolt
// Info contains information about the database.
type Info struct {
MapSize int
LastPageID int
LastTransactionID int
MaxReaders int
ReaderCount int
}

30
lnode.go Normal file
View File

@ -0,0 +1,30 @@
package bolt
import (
"unsafe"
)
type nodeid uint16
// lnode represents a node on a leaf page.
type lnode struct {
flags uint16
keySize uint16
dataSize uint32
data uintptr // Pointer to the beginning of the data.
}
// key returns a byte slice that of the node key.
func (n *lnode) key() []byte {
return (*[MaxKeySize]byte)(unsafe.Pointer(&n.data))[:n.keySize]
}
// data returns a byte slice that of the node data.
func (n *lnode) data() []byte {
return (*[MaxKeySize]byte)(unsafe.Pointer(&n.data))[n.keySize : n.keySize+n.dataSize]
}
// lnodeSize returns the number of bytes required to store a key+data as a leaf node.
func lnodeSize(key []byte, data []byte) int {
return int(unsafe.Offsetof((*lnode)(nil)).data) + len(key) + len(data)
}

48
meta.go
View File

@ -4,37 +4,16 @@ var (
InvalidMetaPageError = &Error{"Invalid meta page", nil}
)
// TODO: #define mm_psize mm_dbs[0].md_pad
// TODO: #define mm_flags mm_dbs[0].md_flags
// TODO:
// typedef union MDB_metabuf {
// MDB_page mb_page;
// struct {
// char mm_pad[PAGEHDRSZ];
// MDB_meta mm_meta;
// } mb_metabuf;
// } MDB_metabuf;
// TODO:
// typedef struct MDB_dbx {
// MDB_val md_name; /**< name of the database */
// MDB_cmp_func *md_cmp; /**< function for comparing keys */
// MDB_cmp_func *md_dcmp; /**< function for comparing data items */
// MDB_rel_func *md_rel; /**< user relocate function */
// void *md_relctx; /**< user-provided context for md_rel */
// } MDB_dbx;
const magic uint32 = 0xC0DEC0DE
const version uint32 = 1
// meta is the contents of a database meta page.
// NOTE(review): the scraped diff fused old and new field sets,
// duplicating magic/version/buckets/txnid (a compile error). Duplicates
// are removed; both old-only (free, pgno) and new-only (pageSize, pgid)
// fields are retained because each is still referenced elsewhere in the
// rendered code — confirm the intended final layout against the repo.
type meta struct {
	magic    uint32
	version  uint32
	free     bucket
	buckets  bucket
	pageSize uint32
	pgno     int
	pgid     pgid
	txnid    txnid
}
// validate checks the marker bytes and version of the meta page to ensure it matches this binary.
@ -46,18 +25,3 @@ func (m *meta) validate() error {
}
return nil
}
// Read the environment parameters of a DB environment before
// mapping it into memory.
// @param[in] env the environment handle
// @param[out] meta address of where to store the meta information
// @return 0 on success, non-zero on failure.
func (m *meta) read(p *page) error {
/*
if (off == 0 || m->mm_txnid > meta->mm_txnid)
*meta = *m;
}
return 0;
*/
return nil
}

40
node.go
View File

@ -1,40 +0,0 @@
package bolt
import (
"unsafe"
)
// node represents a node on a page.
type node struct {
flags uint16
keySize uint16
}
// leafNode represents a node on a leaf page.
type leafNode struct {
node
dataSize uint32
data uintptr // Pointer to the beginning of the data.
}
// branchNode represents a node on a branch page.
type branchNode struct {
node
pgno uint32
data uintptr // Pointer to the beginning of the data.
}
// key returns a byte slice that of the key data.
func (n *leafNode) key() []byte {
return (*[MaxKeySize]byte)(unsafe.Pointer(&n.data))[:n.keySize]
}
func leafNodeSize(key []byte, data []byte) int {
// TODO: Return even(sizeof(node) + len(key) + len(data))
return 0
}
func branchNodeSize(key []byte) int {
// TODO: Return even(sizeof(node) + len(key))
return 0
}

7
os.go
View File

@ -10,6 +10,13 @@ type _os interface {
Getpagesize() int
}
// file abstracts the subset of *os.File operations the DB uses so that
// tests can substitute a mock implementation.
type file interface {
	Fd() uintptr
	ReadAt(b []byte, off int64) (n int, err error)
	Stat() (fi os.FileInfo, err error)
	WriteAt(b []byte, off int64) (n int, err error)
}
type sysos struct{}
func (o *sysos) OpenFile(name string, flag int, perm os.FileMode) (file file, err error) {

View File

@ -24,3 +24,60 @@ func (m *mockos) Getpagesize() int {
args := m.Called()
return args.Int(0)
}
// mockfile is a testify-backed mock implementation of the file interface.
type mockfile struct {
	mock.Mock
	fd uintptr // value returned by Fd()
}

// Fd returns the configured file descriptor.
func (m *mockfile) Fd() uintptr {
	return m.fd
}

// ReadAt replays the recorded mock expectation for (b, off).
func (m *mockfile) ReadAt(b []byte, off int64) (n int, err error) {
	args := m.Called(b, off)
	return args.Int(0), args.Error(1)
}

// Stat replays the recorded mock expectation.
func (m *mockfile) Stat() (os.FileInfo, error) {
	args := m.Called()
	return args.Get(0).(os.FileInfo), args.Error(1)
}

// WriteAt replays the recorded mock expectation for (b, off).
func (m *mockfile) WriteAt(b []byte, off int64) (n int, err error) {
	args := m.Called(b, off)
	return args.Int(0), args.Error(1)
}
// mockfileinfo is a canned os.FileInfo implementation for tests; each
// accessor returns the corresponding struct field verbatim.
type mockfileinfo struct {
	name    string
	size    int64
	mode    os.FileMode
	modTime time.Time
	isDir   bool
	sys     interface{}
}

// Name returns the configured file name.
func (m *mockfileinfo) Name() string {
	return m.name
}

// Size returns the configured file size.
func (m *mockfileinfo) Size() int64 {
	return m.size
}

// Mode returns the configured file mode.
func (m *mockfileinfo) Mode() os.FileMode {
	return m.mode
}

// ModTime returns the configured modification time.
func (m *mockfileinfo) ModTime() time.Time {
	return m.modTime
}

// IsDir reports the configured directory flag.
func (m *mockfileinfo) IsDir() bool {
	return m.isDir
}

// Sys returns the configured underlying data source.
func (m *mockfileinfo) Sys() interface{} {
	return m.sys
}

122
page.go
View File

@ -8,55 +8,26 @@ import (
const maxPageSize = 0x8000
const minKeyCount = 2
var _page page
const pageHeaderSize = int(unsafe.Offsetof(_page.ptr))
const pageHeaderSize = int(unsafe.Offsetof(((*page)(nil)).data))
const minPageKeys = 2
const fillThreshold = 250 // 25%
const (
p_branch = 0x01
p_leaf = 0x02
p_overflow = 0x04
p_meta = 0x08
p_dirty = 0x10 /**< dirty page, also set for #P_SUBP pages */
p_invalid = ^pgno(0)
p_branch = 0x01
p_leaf = 0x02
p_meta = 0x04
)
// maxCommitPages is the maximum number of pages to commit in one writev() call.
const maxCommitPages = 64
/* max bytes to write in one call */
const maxWriteByteCount uint = 0x80000000 // TODO: #define MAX_WRITE 0x80000000U >> (sizeof(ssize_t) == 4))
// TODO:
// #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES
// #undef MDB_COMMIT_PAGES
// #define MDB_COMMIT_PAGES IOV_MAX
// #endif
const (
MDB_PS_MODIFY = 1
MDB_PS_ROOTONLY = 2
MDB_PS_FIRST = 4
MDB_PS_LAST = 8
)
// TODO: #define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */
type pgno uint64
type txnid uint64
type indx uint16
type pgid uint64
type page struct {
id pgno
flags uint16
lower indx
upper indx
overflow uint32
ptr uintptr
id pgid
flags uint32
lower uint16
upper uint16
count uint32
data uintptr
}
// meta returns a pointer to the metadata section of the page.
@ -80,72 +51,7 @@ func (p *page) init(pageSize int) {
m := (*meta)(unsafe.Pointer(&p.ptr))
m.magic = magic
m.version = version
m.free.pad = uint32(pageSize)
m.pgno = 1
m.free.root = p_invalid
m.buckets.root = p_invalid
}
// branchNode retrieves the branch node at the given index within the page.
func (p *page) branchNode(index indx) *branchNode {
b := (*[maxPageSize]byte)(unsafe.Pointer(&p.ptr))
return (*branchNode)(unsafe.Pointer(&b[index*indx(unsafe.Sizeof(index))]))
}
// leafNode retrieves the leaf node at the given index within the page.
func (p *page) leafNode(index indx) *leafNode {
b := (*[maxPageSize]byte)(unsafe.Pointer(&p.ptr))
return (*leafNode)(unsafe.Pointer(&b[index*indx(unsafe.Sizeof(index))]))
}
// numkeys returns the number of nodes in the page.
func (p *page) numkeys() int {
return int((p.lower - indx(pageHeaderSize)) >> 1)
}
// remainingSize returns the number of bytes left in the page.
func (p *page) remainingSize() int {
return int(p.upper - p.lower)
}
// find returns the node with the smallest entry larger or equal to the key.
// This function also returns a boolean stating if an exact match was made.
func (p *page) find(key []byte, pageSize int) (*node, int, bool) {
// TODO: MDB_page *mp = mc->mc_pg[mc->mc_top];
var node *node
nkeys := p.numkeys()
low, high := 1, nkeys-1
if (p.flags & p_leaf) != 0 {
low = 0
}
// Perform a binary search to find the correct node.
var i, rc int
for low <= high {
i = (low + high) / 2
node = p.node(indx(i))
rc = bytes.Compare(key, node.key())
if rc == 0 {
break
} else if rc > 0 {
low = i + 1
} else {
high = i - 1
}
}
// Found entry is less than key so grab the next one.
if rc > 0 {
i++
}
// If index is beyond key range then return nil.
if i >= nkeys {
node = nil
}
exact := (rc == 0 && nkeys > 0)
return node, i, exact
m.pageSize = uint32(pageSize)
m.pgid = 1
m.buckets.root = 0
}

View File

@ -1,5 +0,0 @@
package bolt
type reader struct {
txnid int
}

View File

@ -1,18 +1,9 @@
package bolt
/*
type RWCursor interface {
Put([]byte, []byte) (error)
Delete([]byte) (error)
}
*/
// RWCursor represents a cursor that can read and write data for a bucket.
type RWCursor struct {
	Cursor
	transaction *RWTransaction // owning writable transaction
	reclaimed   []pgno         /**< Reclaimed freeDB pages, or NULL before use (was me_pghead) */
	last        txnid          /**< ID of last used record, or 0 if len(reclaimed) == 0 */
}
func (c *RWCursor) Put(key []byte, value []byte) error {
@ -71,337 +62,14 @@ func (c *RWCursor) Put(key []byte, value []byte) error {
}
/*
return nil
}
insert = rc;
if (insert) {
// The key does not exist
DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top]));
if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
LEAFSIZE(key, data) > env->me_nodemax)
{
// Too big for a node, insert in sub-DB
fp_flags = P_LEAF|P_DIRTY;
fp = env->me_pbuf;
fp->mp_pad = data->mv_size; // used if MDB_DUPFIXED
fp->mp_lower = fp->mp_upper = olddata.mv_size = PAGEHDRSZ;
goto prep_subDB;
}
} else {
more:
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
olddata.mv_size = NODEDSZ(leaf);
olddata.mv_data = NODEDATA(leaf);
// DB has dups?
if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
// Prepare (sub-)page/sub-DB to accept the new item,
// if needed. fp: old sub-page or a header faking
// it. mp: new (sub-)page. offset: growth in page
// size. xdata: node data with new page or DB.
ssize_t i, offset = 0;
mp = fp = xdata.mv_data = env->me_pbuf;
mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
// Was a single item before, must convert now
if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
// Just overwrite the current item
if (flags == MDB_CURRENT)
goto current;
#if UINT_MAX < SIZE_MAX
if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t))
#ifdef MISALIGNED_OK
mc->mc_dbx->md_dcmp = mdb_cmp_long;
#else
mc->mc_dbx->md_dcmp = mdb_cmp_cint;
#endif
#endif
// if data matches, skip it
if (!mc->mc_dbx->md_dcmp(data, &olddata)) {
if (flags & MDB_NODUPDATA)
rc = MDB_KEYEXIST;
else if (flags & MDB_MULTIPLE)
goto next_mult;
else
rc = MDB_SUCCESS;
return rc;
}
// Back up original data item
dkey.mv_size = olddata.mv_size;
dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size);
// Make sub-page header for the dup items, with dummy body
fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
fp->mp_lower = PAGEHDRSZ;
xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
fp->mp_flags |= P_LEAF2;
fp->mp_pad = data->mv_size;
xdata.mv_size += 2 * data->mv_size; // leave space for 2 more
} else {
xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
(dkey.mv_size & 1) + (data->mv_size & 1);
}
fp->mp_upper = xdata.mv_size;
olddata.mv_size = fp->mp_upper; // pretend olddata is fp
} else if (leaf->mn_flags & F_SUBDATA) {
// Data is on sub-DB, just store it
flags |= F_DUPDATA|F_SUBDATA;
goto put_sub;
} else {
// Data is on sub-page
fp = olddata.mv_data;
switch (flags) {
default:
i = -(ssize_t)SIZELEFT(fp);
if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
offset = i += (ssize_t) EVEN(
sizeof(indx_t) + NODESIZE + data->mv_size);
} else {
i += offset = fp->mp_pad;
offset *= 4; // space for 4 more
}
if (i > 0)
break;
// FALLTHRU: Sub-page is big enough
case MDB_CURRENT:
fp->mp_flags |= P_DIRTY;
COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
flags |= F_DUPDATA;
goto put_sub;
}
xdata.mv_size = olddata.mv_size + offset;
}
fp_flags = fp->mp_flags;
if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) {
// Too big for a sub-page, convert to sub-DB
fp_flags &= ~P_SUBP;
prep_subDB:
dummy.md_pad = 0;
dummy.md_flags = 0;
dummy.md_depth = 1;
dummy.md_branch_pages = 0;
dummy.md_leaf_pages = 1;
dummy.md_overflow_pages = 0;
dummy.md_entries = NUMKEYS(fp);
xdata.mv_size = sizeof(MDB_db);
xdata.mv_data = &dummy;
if ((rc = mdb_page_alloc(mc, 1, &mp)))
return rc;
offset = env->me_psize - olddata.mv_size;
flags |= F_DUPDATA|F_SUBDATA;
dummy.md_root = mp->mp_pgno;
}
if (mp != fp) {
mp->mp_flags = fp_flags | P_DIRTY;
mp->mp_pad = fp->mp_pad;
mp->mp_lower = fp->mp_lower;
mp->mp_upper = fp->mp_upper + offset;
if (fp_flags & P_LEAF2) {
memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
} else {
memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper,
olddata.mv_size - fp->mp_upper);
for (i = NUMKEYS(fp); --i >= 0; )
mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
}
}
rdata = &xdata;
flags |= F_DUPDATA;
do_sub = 1;
if (!insert)
mdb_node_del(mc, 0);
goto new_sub;
}
current:
// overflow page overwrites need special handling
if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
MDB_page *omp;
pgno_t pg;
int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);
memcpy(&pg, olddata.mv_data, sizeof(pg));
if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0)
return rc2;
ovpages = omp->mp_pages;
// Is the ov page large enough?
if (ovpages >= dpages) {
if (!(omp->mp_flags & P_DIRTY) &&
(level || (env->me_flags & MDB_WRITEMAP)))
{
rc = mdb_page_unspill(mc->mc_txn, omp, &omp);
if (rc)
return rc;
level = 0; // dirty in this txn or clean
}
// Is it dirty?
if (omp->mp_flags & P_DIRTY) {
// yes, overwrite it. Note in this case we don't
// bother to try shrinking the page if the new data
// is smaller than the overflow threshold.
if (level > 1) {
// It is writable only in a parent txn
size_t sz = (size_t) env->me_psize * ovpages, off;
MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages);
MDB_ID2 id2;
if (!np)
return ENOMEM;
id2.mid = pg;
id2.mptr = np;
rc = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2);
mdb_cassert(mc, rc == 0);
if (!(flags & MDB_RESERVE)) {
// Copy end of page, adjusting alignment so
// compiler may copy words instead of bytes.
off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t);
memcpy((size_t *)((char *)np + off),
(size_t *)((char *)omp + off), sz - off);
sz = PAGEHDRSZ;
}
memcpy(np, omp, sz); // Copy beginning of page
omp = np;
}
SETDSZ(leaf, data->mv_size);
if (F_ISSET(flags, MDB_RESERVE))
data->mv_data = METADATA(omp);
else
memcpy(METADATA(omp), data->mv_data, data->mv_size);
goto done;
}
}
if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS)
return rc2;
} else if (data->mv_size == olddata.mv_size) {
// same size, just replace it. Note that we could
// also reuse this node if the new data is smaller,
// but instead we opt to shrink the node in that case.
if (F_ISSET(flags, MDB_RESERVE))
data->mv_data = olddata.mv_data;
else if (data->mv_size)
memcpy(olddata.mv_data, data->mv_data, data->mv_size);
else
memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
goto done;
}
mdb_node_del(mc, 0);
mc->mc_db->md_entries--;
}
rdata = data;
new_sub:
nflags = flags & NODE_ADD_FLAGS;
nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata);
if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
nflags &= ~MDB_APPEND;
if (!insert)
nflags |= MDB_SPLIT_REPLACE;
rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags);
} else {
// There is room already in this leaf page.
rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
if (rc == 0 && !do_sub && insert) {
// Adjust other cursors pointing to mp
MDB_cursor *m2, *m3;
MDB_dbi dbi = mc->mc_dbi;
unsigned i = mc->mc_top;
MDB_page *mp = mc->mc_pg[i];
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
if (mc->mc_flags & C_SUB)
m3 = &m2->mc_xcursor->mx_cursor;
else
m3 = m2;
if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) {
m3->mc_ki[i]++;
}
}
}
}
if (rc != MDB_SUCCESS)
mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
else {
// Now store the actual data in the child DB. Note that we're
// storing the user data in the keys field, so there are strict
// size limits on dupdata. The actual data fields of the child
// DB are all zero size.
if (do_sub) {
int xflags;
put_sub:
xdata.mv_size = 0;
xdata.mv_data = "";
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
if (flags & MDB_CURRENT) {
xflags = MDB_CURRENT|MDB_NOSPILL;
} else {
mdb_xcursor_init1(mc, leaf);
xflags = (flags & MDB_NODUPDATA) ?
MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL;
}
// converted, write the original data first
if (dkey.mv_size) {
rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
if (rc)
return rc;
{
// Adjust other cursors pointing to mp
MDB_cursor *m2;
unsigned i = mc->mc_top;
MDB_page *mp = mc->mc_pg[i];
for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
if (!(m2->mc_flags & C_INITIALIZED)) continue;
if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) {
mdb_xcursor_init1(m2, leaf);
}
}
}
// we've done our job
dkey.mv_size = 0;
}
if (flags & MDB_APPENDDUP)
xflags |= MDB_APPEND;
rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
if (flags & F_SUBDATA) {
void *db = NODEDATA(leaf);
memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
}
}
// sub-writes might have failed so check rc again.
// Don't increment count if we just replaced an existing item.
if (!rc && !(flags & MDB_CURRENT))
mc->mc_db->md_entries++;
if (flags & MDB_MULTIPLE) {
if (!rc) {
next_mult:
mcount++;
// let caller know how many succeeded, if any
data[1].mv_size = mcount;
if (mcount < dcount) {
data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
goto more;
}
}
}
}
done:
// If we succeeded and the key didn't exist before, make sure
// the cursor is marked valid.
if (!rc && insert)
mc->mc_flags |= C_INITIALIZED;
return rc;
*/
// Delete removes the entry for key from the bucket. Currently a stub:
// it always returns nil without modifying anything (see TODOs below).
func (c *Cursor) Delete(key []byte) error {
	// TODO: Traverse to the correct node.
	// TODO: If missing, exit.
	// TODO: Remove node from page.
	// TODO: If page is empty then add it to the freelist.
	return nil
}
@ -417,7 +85,6 @@ func (c *RWCursor) newLeafPage() (*page, error) {
p.flags = p_leaf | p_dirty
p.lower = pageHeaderSize
p.upper = c.transaction.db.pageSize
c.leafs += 1
return p, nil
}
@ -434,175 +101,6 @@ func (b *RWCursor) newBranchPage() (*page, error) {
p.flags = p_branch | p_dirty
p.lower = pageHeaderSize
p.upper = c.transaction.db.pageSize
c.bucket.branches += 1
return p, nil
}
// newOverflowPage allocates and initializes new overflow pages.
// NOTE(fix): the receiver was named `b` while the body used `c`
// throughout (a compile error); the receiver is renamed to `c`. The
// overflow counter increment also converts count to pgno to match the
// field's type.
func (c *RWCursor) newOverflowPage(count int) (*page, error) {
	// Allocate page.
	p, err := c.allocatePage(count)
	if err != nil {
		return nil, err
	}
	// Set flags and bounds.
	p.flags = p_overflow | p_dirty
	p.lower = pageHeaderSize
	p.upper = c.transaction.db.pageSize
	c.bucket.overflows += pgno(count)
	return p, nil
}
// Allocate page numbers and memory for writing. Maintain me_pglast,
// me_pghead and mt_next_pgno.
//
// If there are free pages available from older transactions, they
// are re-used first. Otherwise allocate a new page at mt_next_pgno.
// Do not modify the freedB, just merge freeDB records into me_pghead[]
// and move me_pglast to say which records were consumed. Only this
// function can create me_pghead and move me_pglast/mt_next_pgno.
// @param[in] mc cursor A cursor handle identifying the transaction and
// database for which we are allocating.
// @param[in] num the number of pages to allocate.
// @param[out] mp Address of the allocated page(s). Requests for multiple pages
// will always be satisfied by a single contiguous chunk of memory.
// @return 0 on success, non-zero on failure.
// allocatePage allocates a new page.
func (c *RWCursor) allocatePage(count int) (*page, error) {
head := env.pagestate.head
// TODO?
// If our dirty list is already full, we can't do anything
// if (txn->mt_dirty_room == 0) {
// rc = MDB_TXN_FULL;
// goto fail;
// }
/*
int rc, retry = INT_MAX;
MDB_txn *txn = mc->mc_txn;
MDB_env *env = txn->mt_env;
pgno_t pgno, *mop = env->me_pghead;
unsigned i, j, k, mop_len = mop ? mop[0] : 0, n2 = num-1;
MDB_page *np;
txnid_t oldest = 0, last;
MDB_cursor_op op;
MDB_cursor m2;
*mp = NULL;
for (op = MDB_FIRST;; op = MDB_NEXT) {
MDB_val key, data;
MDB_node *leaf;
pgno_t *idl, old_id, new_id;
// Seek a big enough contiguous page range. Prefer
// pages at the tail, just truncating the list.
if (mop_len > n2) {
i = mop_len;
do {
pgno = mop[i];
if (mop[i-n2] == pgno+n2)
goto search_done;
} while (--i > n2);
if (Max_retries < INT_MAX && --retry < 0)
break;
}
if (op == MDB_FIRST) { // 1st iteration
// Prepare to fetch more and coalesce
oldest = mdb_find_oldest(txn);
last = env->me_pglast;
mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
if (last) {
op = MDB_SET_RANGE;
key.mv_data = &last; // will look up last+1
key.mv_size = sizeof(last);
}
}
last++;
// Do not fetch more if the record will be too recent
if (oldest <= last)
break;
rc = mdb_cursor_get(&m2, &key, NULL, op);
if (rc) {
if (rc == MDB_NOTFOUND)
break;
goto fail;
}
last = *(txnid_t*)key.mv_data;
if (oldest <= last)
break;
np = m2.mc_pg[m2.mc_top];
leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]);
if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS)
return rc;
idl = (MDB_ID *) data.mv_data;
i = idl[0];
if (!mop) {
if (!(env->me_pghead = mop = mdb_midl_alloc(i))) {
rc = ENOMEM;
goto fail;
}
} else {
if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0)
goto fail;
mop = env->me_pghead;
}
env->me_pglast = last;
// Merge in descending sorted order
j = mop_len;
k = mop_len += i;
mop[0] = (pgno_t)-1;
old_id = mop[j];
while (i) {
new_id = idl[i--];
for (; old_id < new_id; old_id = mop[--j])
mop[k--] = old_id;
mop[k--] = new_id;
}
mop[0] = mop_len;
}
// Use new pages from the map when nothing suitable in the freeDB
i = 0;
pgno = txn->mt_next_pgno;
if (pgno + num >= env->me_maxpg) {
DPUTS("DB size maxed out");
rc = MDB_MAP_FULL;
goto fail;
}
search_done:
if (!(np = mdb_page_malloc(txn, num))) {
rc = ENOMEM;
goto fail;
}
if (i) {
mop[0] = mop_len -= num;
// Move any stragglers down
for (j = i-num; j < mop_len; )
mop[++j] = mop[++i];
} else {
txn->mt_next_pgno = pgno + num;
}
np->mp_pgno = pgno;
mdb_page_dirty(txn, np);
*mp = np;
return MDB_SUCCESS;
fail:
txn->mt_flags |= MDB_TXN_ERROR;
return rc;
*/
return nil
}

View File

@ -6,7 +6,7 @@ type RWTransaction struct {
Transaction
dirtyPages map[int]*page
freePages map[int]*page
freelist []pgno
}
// TODO: Allocate scratch meta page.
@ -95,7 +95,6 @@ func (t *Transaction) Put(name string, key []byte, value []byte) error {
return c.Put(key, value)
}
// page returns a reference to the page with a given id.
// If page has been written to then a temporary bufferred page is returned.
func (t *Transaction) page(id int) *page {
@ -127,3 +126,10 @@ func (t *RWTransaction) DeleteBucket(name string) error {
return nil
}
// allocate returns a contiguous block of memory starting at a given page.
// Currently a stub that always returns (nil, nil); see TODOs below.
func (t *RWTransaction) allocate(count int) (*page, error) {
	// TODO: Find a continuous block of free pages.
	// TODO: If no free pages are available, resize the mmap to allocate more.
	return nil, nil
}

View File

@ -1,6 +1,6 @@
package bolt
type stat struct {
type Stat struct {
PageSize int
Depth int
BranchPageCount int

View File

@ -6,7 +6,7 @@ import (
)
var (
InvalidTransactionError = &Error{"txn is invalid", nil}
InvalidTransactionError = &Error{"txn is invalid", nil}
BucketAlreadyExistsError = &Error{"bucket already exists", nil}
)
@ -17,6 +17,8 @@ const (
ps_last = 8
)
type txnid uint64
type Transaction struct {
id int
db *DB
@ -158,166 +160,6 @@ func (t *Transaction) Stat(name string) *stat {
// //
// //
// Save the freelist as of this transaction to the freeDB.
// This changes the freelist. Keep trying until it stabilizes.
func (t *Transaction) saveFreelist() error {
/*
// env->me_pghead[] can grow and shrink during this call.
// env->me_pglast and txn->mt_free_pgs[] can only grow.
// Page numbers cannot disappear from txn->mt_free_pgs[].
MDB_cursor mc;
MDB_env *env = txn->mt_env;
int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
txnid_t pglast = 0, head_id = 0;
pgno_t freecnt = 0, *free_pgs, *mop;
ssize_t head_room = 0, total_room = 0, mop_len, clean_limit;
mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
if (env->me_pghead) {
// Make sure first page of freeDB is touched and on freelist
rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY);
if (rc && rc != MDB_NOTFOUND)
return rc;
}
// MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP)
clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
? SSIZE_MAX : maxfree_1pg;
for (;;) {
// Come back here after each Put() in case freelist changed
MDB_val key, data;
pgno_t *pgs;
ssize_t j;
// If using records from freeDB which we have not yet
// deleted, delete them and any we reserved for me_pghead.
while (pglast < env->me_pglast) {
rc = mdb_cursor_first(&mc, &key, NULL);
if (rc)
return rc;
pglast = head_id = *(txnid_t *)key.mv_data;
total_room = head_room = 0;
mdb_tassert(txn, pglast <= env->me_pglast);
rc = mdb_cursor_del(&mc, 0);
if (rc)
return rc;
}
// Save the IDL of pages freed by this txn, to a single record
if (freecnt < txn->mt_free_pgs[0]) {
if (!freecnt) {
// Make sure last page of freeDB is touched and on freelist
rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY);
if (rc && rc != MDB_NOTFOUND)
return rc;
}
free_pgs = txn->mt_free_pgs;
// Write to last page of freeDB
key.mv_size = sizeof(txn->mt_txnid);
key.mv_data = &txn->mt_txnid;
do {
freecnt = free_pgs[0];
data.mv_size = MDB_IDL_SIZEOF(free_pgs);
rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
if (rc)
return rc;
// Retry if mt_free_pgs[] grew during the Put()
free_pgs = txn->mt_free_pgs;
} while (freecnt < free_pgs[0]);
mdb_midl_sort(free_pgs);
memcpy(data.mv_data, free_pgs, data.mv_size);
#if (MDB_DEBUG) > 1
{
unsigned int i = free_pgs[0];
DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u",
txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i));
for (; i; i--)
DPRINTF(("IDL %"Z"u", free_pgs[i]));
}
#endif
continue;
}
mop = env->me_pghead;
mop_len = mop ? mop[0] : 0;
// Reserve records for me_pghead[]. Split it if multi-page,
// to avoid searching freeDB for a page range. Use keys in
// range [1,me_pglast]: Smaller than txnid of oldest reader.
if (total_room >= mop_len) {
if (total_room == mop_len || --more < 0)
break;
} else if (head_room >= maxfree_1pg && head_id > 1) {
// Keep current record (overflow page), add a new one
head_id--;
head_room = 0;
}
// (Re)write {key = head_id, IDL length = head_room}
total_room -= head_room;
head_room = mop_len - total_room;
if (head_room > maxfree_1pg && head_id > 1) {
// Overflow multi-page for part of me_pghead
head_room /= head_id; // amortize page sizes
head_room += maxfree_1pg - head_room % (maxfree_1pg + 1);
} else if (head_room < 0) {
// Rare case, not bothering to delete this record
head_room = 0;
}
key.mv_size = sizeof(head_id);
key.mv_data = &head_id;
data.mv_size = (head_room + 1) * sizeof(pgno_t);
rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
if (rc)
return rc;
// IDL is initially empty, zero out at least the length
pgs = (pgno_t *)data.mv_data;
j = head_room > clean_limit ? head_room : 0;
do {
pgs[j] = 0;
} while (--j >= 0);
total_room += head_room;
}
// Fill in the reserved me_pghead records
rc = MDB_SUCCESS;
if (mop_len) {
MDB_val key, data;
mop += mop_len;
rc = mdb_cursor_first(&mc, &key, &data);
for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) {
unsigned flags = MDB_CURRENT;
txnid_t id = *(txnid_t *)key.mv_data;
ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1;
MDB_ID save;
mdb_tassert(txn, len >= 0 && id <= env->me_pglast);
key.mv_data = &id;
if (len > mop_len) {
len = mop_len;
data.mv_size = (len + 1) * sizeof(MDB_ID);
flags = 0;
}
data.mv_data = mop -= len;
save = mop[0];
mop[0] = len;
rc = mdb_cursor_put(&mc, &key, &data, flags);
mop[0] = save;
if (rc || !(mop_len -= len))
break;
}
}
return rc;
*/
return nil
}
// Return the data associated with a given node.
// @param[in] txn The transaction for this operation.
// @param[in] leaf The node being read.