diff --git a/README.md b/README.md index 4c96076..53798f6 100644 --- a/README.md +++ b/README.md @@ -191,5 +191,8 @@ In leaf pages, these nodes store the actual key/value data. The following is a list of items to do on the Bolt project: -1. Resize map. (Make sure there are no reader txns before resizing) -2. DB.Copy() +1. Calculate freelist on db.Open(). (Traverse branches, set bitmap index, load free pages into a list -- lazy load in the future). +2. Resize map. (Make sure there are no reader txns before resizing) +3. DB.Copy() +4. Merge pages. +5. Rebalance (after deletion). diff --git a/bnode.go b/bnode.go new file mode 100644 index 0000000..d272e93 --- /dev/null +++ b/bnode.go @@ -0,0 +1,23 @@ +package bolt + +import ( + "unsafe" +) + +// bnode represents a node on a branch page. +type bnode struct { + flags uint16 + keySize uint16 + pgid pgid + data uintptr // Pointer to the beginning of the data. +} + +// key returns a byte slice that of the key data. +func (n *bnode) key() []byte { + return (*[MaxKeySize]byte)(unsafe.Pointer(&n.data))[:n.keySize] +} + +// bnodeSize returns the number of bytes required to store a key as a branch node. +func bnodeSize(key []byte) int { + return int(unsafe.Offsetof((*bnode)(nil)).data) + len(key) +} diff --git a/branch_node.go b/branch_node.go deleted file mode 100644 index a5bd071..0000000 --- a/branch_node.go +++ /dev/null @@ -1,16 +0,0 @@ -package bolt - -import ( - "unsafe" -) - -const ( - bigNode = 0x01 - subNode = 0x02 - dupNode = 0x04 -) - -// key returns a byte slice that of the key data. -func (n *branchNode) key() []byte { - return (*[MaxKeySize]byte)(unsafe.Pointer(&n.data))[:n.keySize] -} diff --git a/bucket.go b/bucket.go index ece92b5..f9c3566 100644 --- a/bucket.go +++ b/bucket.go @@ -1,31 +1,17 @@ package bolt -const ( - MDB_DUPSORT = 0x04 -) - -// TODO: #define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ -// TODO: #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) -// TODO: #define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) -// TODO: #define FREE_DBI 0 +type bucketid uint32 type Bucket struct { *bucket - transaction *Transaction - name string - isNew bool - dirty bool - valid bool + name string } type bucket struct { - id uint32 - pad uint32 - flags uint16 - depth uint16 - branches pgno - leafs pgno - overflows pgno - entries uint64 - root pgno + id bucketid + flags uint32 + root pgid + branches pgid + leafs pgid + entries uint64 } diff --git a/const.go b/const.go index 9e7a993..58640ee 100644 --- a/const.go +++ b/const.go @@ -3,11 +3,6 @@ package bolt const Version = 1 const ( - MaxKeySize = 511 + MaxKeySize = 0x8000 MaxDataSize = 0xffffffff ) - -const ( - DefaultMapSize = 1048576 - DefaultReaderCount = 126 -) diff --git a/cursor.go b/cursor.go index d61ce80..d1990dc 100644 --- a/cursor.go +++ b/cursor.go @@ -1,1593 +1,14 @@ package bolt -// TODO: #define CURSOR_STACK 32 - -const ( - c_initialized = 0x01 /**< cursor has been initialized and is valid */ - c_eof = 0x02 /**< No more data */ - c_del = 0x08 /**< last op was a cursor_del */ - c_splitting = 0x20 /**< Cursor is in page_split */ - c_untrack = 0x40 /**< Un-track cursor when closing */ -) - type Cursor struct { - flags int - next *Cursor - backup *Cursor transaction *Transaction bucket *Bucket - top int - pages []*page - indices []int /* the index of the node for the page at the same level */ + stack []stackelem } -// , data []byte, op int -func (c *Cursor) Get(key []byte) ([]byte, error) { - /* - int rc; - int exact = 0; - int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); - - switch (op) { - case MDB_GET_CURRENT: - if (!(mc->mc_flags & C_INITIALIZED)) { - rc = EINVAL; - } else { - MDB_page *mp = mc->mc_pg[mc->mc_top]; - int nkeys = NUMKEYS(mp); - if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { - mc->mc_ki[mc->mc_top] = nkeys; - rc = MDB_NOTFOUND; - break; - } - rc = MDB_SUCCESS; - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - } else { - MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - MDB_GET_KEY(leaf, key); - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (mc->mc_flags & C_DEL) - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); - } else { - rc = mdb_node_read(mc->mc_txn, leaf, data); - } - } - } - } - break; - case MDB_GET_BOTH: - case MDB_GET_BOTH_RANGE: - if (data == NULL) { - rc = EINVAL; - break; - } - if (mc->mc_xcursor == NULL) { - rc = MDB_INCOMPATIBLE; - break; - } - // FALLTHRU - case MDB_SET: - case MDB_SET_KEY: - case MDB_SET_RANGE: - if (key == NULL) { - rc = EINVAL; - } else { - rc = mdb_cursor_set(mc, key, data, op, - op == MDB_SET_RANGE ? NULL : &exact); - } - break; - case MDB_GET_MULTIPLE: - if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { - rc = EINVAL; - break; - } - if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { - rc = MDB_INCOMPATIBLE; - break; - } - rc = MDB_SUCCESS; - if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || - (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) - break; - goto fetchm; - case MDB_NEXT_MULTIPLE: - if (data == NULL) { - rc = EINVAL; - break; - } - if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { - rc = MDB_INCOMPATIBLE; - break; - } - if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdb_cursor_first(mc, key, data); - else - rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); - if (rc == MDB_SUCCESS) { - if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - MDB_cursor *mx; - fetchm: - mx = &mc->mc_xcursor->mx_cursor; - data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * - mx->mc_db->md_pad; - data->mv_data = METADATA(mx->mc_pg[mx->mc_top]); - mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; - } else { - rc = MDB_NOTFOUND; - } - } - break; - case MDB_NEXT: - case MDB_NEXT_DUP: - case MDB_NEXT_NODUP: - if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdb_cursor_first(mc, key, data); - else - rc = mdb_cursor_next(mc, key, data, op); - break; - case MDB_PREV: - case MDB_PREV_DUP: - case MDB_PREV_NODUP: - if (!(mc->mc_flags & C_INITIALIZED)) { - rc = mdb_cursor_last(mc, key, data); - if (rc) - break; - mc->mc_flags |= C_INITIALIZED; - mc->mc_ki[mc->mc_top]++; - } - rc = mdb_cursor_prev(mc, key, data, op); - break; - case MDB_FIRST: - rc = mdb_cursor_first(mc, key, data); - break; - case MDB_FIRST_DUP: - mfunc = mdb_cursor_first; - mmove: - if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { - rc = EINVAL; - break; - } - if (mc->mc_xcursor == NULL) { - rc = MDB_INCOMPATIBLE; - break; - } - if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { - rc = EINVAL; - break; - } - rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); - break; - case MDB_LAST: - rc = mdb_cursor_last(mc, key, data); - break; - case MDB_LAST_DUP: - mfunc = mdb_cursor_last; - goto mmove; - default: - DPRINTF(("unhandled/unimplemented cursor operation %u", op)); - rc = EINVAL; - break; - } - - if (mc->mc_flags & C_DEL) - mc->mc_flags ^= C_DEL; - - return rc; - */ - return nil, nil -} - -// page retrieves a page with a given key. -func (c *Cursor) page(key []byte, flags int) (*page, error) { - p := c.pages[c.top] - - for { - // Find the page index. - var index indx - if (flags & ps_first) != 0 { - index = 0 - } else if (flags & ps_last) != 0 { - index = indx(p.numkeys()) - 1 - } else { - node, i, exact := p.find(key, c.transaction.db.pageSize) - if exact { - c.indices[c.top] = i - } - if node == nil { - index = indx(p.numkeys()) - 1 - } else { - index = indx(c.indices[c.top]) - if !exact { - index -= 1 - } - } - } - - // Find the node index. - node := p.node(index) - - // Traverse to next page. - p = c.transaction.db.page(c.transaction.db.data, int(node.pgno())) - c.indices[c.top] = int(index) - c.push(p) - - // TODO: - // if (flags & MDB_PS_MODIFY) { - // if ((rc = mdb_page_touch(mc)) != 0) - // return rc; - // mp = mc->mc_pg[mc->mc_top]; - // } - } - - // If we ended up with a non-leaf page by the end then something is wrong. - if p.flags&p_leaf == 0 { - return nil, CorruptedError - } - - // TODO: mc->mc_flags |= C_INITIALIZED; - // TODO: mc->mc_flags &= ~C_EOF; - - return p, nil -} - -// pop moves the last page off the cursor's page stack. -func (c *Cursor) pop() { - top := len(c.pages) - 1 - c.pages = c.pages[0:c.top] - c.indices = c.indices[0:c.top] -} - -// push moves a page onto the top of the cursor's page stack. -func (c *Cursor) push(p *page) { - c.pages = append(c.pages, p) - c.indices = append(c.indices, 0) - c.top = len(c.pages) - 1 -} - -// currentPage retrieves the last page on the page stack. -func (c *Cursor) currentPage() *page { - top := len(c.pages) - if top > 0 { - return c.pages[top] - } - return nil -} - -// branchNode retrieves the branch node pointed to on the current page. -func (c *Cursor) currentBranchNode() *page { - top := len(c.pages) - if top > 0 { - return c.pages[top].branchNode(c.indices[top]) - } - return nil -} - -// lnode retrieves the leaf node pointed to on the current page. -func (c *Cursor) currentLeafNode() *node { - top := len(c.pages) - if top > 0 { - return c.pages[top].leaf(c.indices[top]) - } - return nil -} - -// // -// // -// // -// // -// // -// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ CONVERTED ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ // -// // -// // -// // -// // -// // - -// Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. -// @param[in] mc A cursor handle for the current operation. -// @param[in] pflags Flags of the pages to update: -// P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. -// @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). -// @return 0 on success, non-zero on failure. -func (c *Cursor) xkeep(pflags int, all int) error { - /* - enum { Mask = P_SUBP|P_DIRTY|P_KEEP }; - MDB_txn *txn = mc->mc_txn; - MDB_cursor *m3; - MDB_xcursor *mx; - MDB_page *dp, *mp; - MDB_node *leaf; - unsigned i, j; - int rc = MDB_SUCCESS, level; - - // Mark pages seen by cursors - if (mc->mc_flags & C_UNTRACK) - mc = NULL; // will find mc in mt_cursors - for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { - for (; mc; mc=mc->mc_next) { - if (!(mc->mc_flags & C_INITIALIZED)) - continue; - for (m3 = mc;; m3 = &mx->mx_cursor) { - mp = NULL; - for (j=0; jmc_snum; j++) { - mp = m3->mc_pg[j]; - if ((mp->mp_flags & Mask) == pflags) - mp->mp_flags ^= P_KEEP; - } - mx = m3->mc_xcursor; - // Proceed to mx if it is at a sub-database - if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) - break; - if (! (mp && (mp->mp_flags & P_LEAF))) - break; - leaf = NODEPTR(mp, m3->mc_ki[j-1]); - if (!(leaf->mn_flags & F_SUBDATA)) - break; - } - } - if (i == 0) - break; - } - - if (all) { - // Mark dirty root pages - for (i=0; imt_numdbs; i++) { - if (txn->mt_dbflags[i] & DB_DIRTY) { - pgno_t pgno = txn->mt_dbs[i].md_root; - if (pgno == P_INVALID) - continue; - if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) - break; - if ((dp->mp_flags & Mask) == pflags && level <= 1) - dp->mp_flags ^= P_KEEP; - } - } - } - - return rc; - */ - return nil -} - -// Spill pages from the dirty list back to disk. -// This is intended to prevent running into #MDB_TXN_FULL situations, -// but note that they may still occur in a few cases: -// 1) our estimate of the txn size could be too small. Currently this -// seems unlikely, except with a large number of #MDB_MULTIPLE items. -// 2) child txns may run out of space if their parents dirtied a -// lot of pages and never spilled them. TODO: we probably should do -// a preemptive spill during #mdb_txn_begin() of a child txn, if -// the parent's dirty_room is below a given threshold. -// -// Otherwise, if not using nested txns, it is expected that apps will -// not run into #MDB_TXN_FULL any more. The pages are flushed to disk -// the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. -// If the txn never references them again, they can be left alone. -// If the txn only reads them, they can be used without any fuss. -// If the txn writes them again, they can be dirtied immediately without -// going thru all of the work of #mdb_page_touch(). Such references are -// handled by #mdb_page_unspill(). -// -// Also note, we never spill DB root pages, nor pages of active cursors, -// because we'll need these back again soon anyway. And in nested txns, -// we can't spill a page in a child txn if it was already spilled in a -// parent txn. That would alter the parent txns' data even though -// the child hasn't committed yet, and we'd have no way to undo it if -// the child aborted. -// -// @param[in] m0 cursor A cursor handle identifying the transaction and -// database for which we are checking space. -// @param[in] key For a put operation, the key being stored. -// @param[in] data For a put operation, the data being stored. -// @return 0 on success, non-zero on failure. -func (c *Cursor) spill(key []byte, data []byte) error { - /* - MDB_txn *txn = m0->mc_txn; - MDB_page *dp; - MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned int i, j, need; - int rc; - - if (m0->mc_flags & C_SUB) - return MDB_SUCCESS; - - // Estimate how much space this op will take - i = m0->mc_db->md_depth; - // Named DBs also dirty the main DB - if (m0->mc_dbi > MAIN_DBI) - i += txn->mt_dbs[MAIN_DBI].md_depth; - // For puts, roughly factor in the key+data size - if (key) - i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; - i += i; // double it for good measure - need = i; - - if (txn->mt_dirty_room > i) - return MDB_SUCCESS; - - if (!txn->mt_spill_pgs) { - txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); - if (!txn->mt_spill_pgs) - return ENOMEM; - } else { - // purge deleted slots - MDB_IDL sl = txn->mt_spill_pgs; - unsigned int num = sl[0]; - j=0; - for (i=1; i<=num; i++) { - if (!(sl[i] & 1)) - sl[++j] = sl[i]; - } - sl[0] = j; - } - - // Preserve pages which may soon be dirtied again - if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS) - goto done; - - // Less aggressive spill - we originally spilled the entire dirty list, - // with a few exceptions for cursor pages and DB root pages. But this - // turns out to be a lot of wasted effort because in a large txn many - // of those pages will need to be used again. So now we spill only 1/8th - // of the dirty pages. Testing revealed this to be a good tradeoff, - // better than 1/2, 1/4, or 1/10. - if (need < MDB_IDL_UM_MAX / 8) - need = MDB_IDL_UM_MAX / 8; - - // Save the page IDs of all the pages we're flushing - // flush from the tail forward, this saves a lot of shifting later on. - for (i=dl[0].mid; i && need; i--) { - MDB_ID pn = dl[i].mid << 1; - dp = dl[i].mptr; - if (dp->mp_flags & P_KEEP) - continue; - // Can't spill twice, make sure it's not already in a parent's - // spill list. - if (txn->mt_parent) { - MDB_txn *tx2; - for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { - if (tx2->mt_spill_pgs) { - j = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { - dp->mp_flags |= P_KEEP; - break; - } - } - } - if (tx2) - continue; - } - if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn))) - goto done; - need--; - } - mdb_midl_sort(txn->mt_spill_pgs); - - // Flush the spilled part of dirty list - if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS) - goto done; - - // Reset any dirty pages we kept that page_flush didn't see - rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); - - done: - txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; - return rc; - return 0 - } - */ - return nil -} - -// Copy the used portions of a non-overflow page. -// @param[in] dst page to copy into -// @param[in] src page to copy from -// @param[in] psize size of a page -func (p *page) copyTo(dst *page, size int) { - /* - enum { Align = sizeof(pgno_t) }; - indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; - - // If page isn't full, just copy the used portion. Adjust - // alignment so memcpy may copy words instead of bytes. - if ((unused &= -Align) && !IS_LEAF2(src)) { - upper &= -Align; - memcpy(dst, src, (lower + (Align-1)) & -Align); - memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), - psize - upper); - } else { - memcpy(dst, src, psize - unused); - } - */ -} - -// Touch a page: make it dirty and re-insert into tree with updated pgno. -// @param[in] mc cursor pointing to the page to be touched -// @return 0 on success, non-zero on failure. -func (c *Cursor) page_touch() int { - /* - MDB_page *mp = mc->mc_pg[mc->mc_top], *np; - MDB_txn *txn = mc->mc_txn; - MDB_cursor *m2, *m3; - pgno_t pgno; - int rc; - - if (!F_ISSET(mp->mp_flags, P_DIRTY)) { - if (txn->mt_flags & MDB_TXN_SPILLS) { - np = NULL; - rc = mdb_page_unspill(txn, mp, &np); - if (rc) - goto fail; - if (np) - goto done; - } - if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || - (rc = mdb_page_alloc(mc, 1, &np))) - goto fail; - pgno = np->mp_pgno; - DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc), - mp->mp_pgno, pgno)); - mdb_cassert(mc, mp->mp_pgno != pgno); - mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); - // Update the parent page, if any, to point to the new page - if (mc->mc_top) { - MDB_page *parent = mc->mc_pg[mc->mc_top-1]; - MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]); - SETPGNO(node, pgno); - } else { - mc->mc_db->md_root = pgno; - } - } else if (txn->mt_parent && !IS_SUBP(mp)) { - MDB_ID2 mid, *dl = txn->mt_u.dirty_list; - pgno = mp->mp_pgno; - // If txn has a parent, make sure the page is in our - // dirty list. - if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - if (mp != dl[x].mptr) { // bad cursor? - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CORRUPTED; - } - return 0; - } - } - mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); - // No - copy it - np = mdb_page_malloc(txn, 1); - if (!np) - return ENOMEM; - mid.mid = pgno; - mid.mptr = np; - rc = mdb_mid2l_insert(dl, &mid); - mdb_cassert(mc, rc == 0); - } else { - return 0; - } - - mdb_page_copy(np, mp, txn->mt_env->me_psize); - np->mp_pgno = pgno; - np->mp_flags |= P_DIRTY; - - done: - // Adjust cursors pointing to mp - mc->mc_pg[mc->mc_top] = np; - m2 = txn->mt_cursors[mc->mc_dbi]; - if (mc->mc_flags & C_SUB) { - for (; m2; m2=m2->mc_next) { - m3 = &m2->mc_xcursor->mx_cursor; - if (m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[mc->mc_top] == mp) - m3->mc_pg[mc->mc_top] = np; - } - } else { - for (; m2; m2=m2->mc_next) { - if (m2->mc_snum < mc->mc_snum) continue; - if (m2->mc_pg[mc->mc_top] == mp) { - m2->mc_pg[mc->mc_top] = np; - if ((mc->mc_db->md_flags & MDB_DUPSORT) && - m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) - { - MDB_node *leaf = NODEPTR(np, mc->mc_ki[mc->mc_top]); - if (!(leaf->mn_flags & F_SUBDATA)) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } - } - } - } - return 0; - - fail: - txn->mt_flags |= MDB_TXN_ERROR; - return rc; - */ - - return 0 -} - -// Search for the lowest key under the current branch page. -// This just bypasses a NUMKEYS check in the current page -// before calling mdb_page_search_root(), because the callers -// are all in situations where the current page is known to -// be underfilled. -func (c *Cursor) searchLowest() error { - /* - MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_node *node = NODEPTR(mp, 0); - int rc; - - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) - return rc; - - mc->mc_ki[mc->mc_top] = 0; - if ((rc = mdb_cursor_push(mc, mp))) - return rc; - return mdb_page_search_root(mc, NULL, MDB_PS_FIRST); - */ - return nil -} - -func (c *Cursor) freeOverflowPage(p *page) error { - /* - MDB_txn *txn = mc->mc_txn; - pgno_t pg = mp->mp_pgno; - unsigned x = 0, ovpages = mp->mp_pages; - MDB_env *env = txn->mt_env; - MDB_IDL sl = txn->mt_spill_pgs; - MDB_ID pn = pg << 1; - int rc; - - DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages)); - // If the page is dirty or on the spill list we just acquired it, - // so we should give it back to our current free list, if any. - // Otherwise put it onto the list of pages we freed in this txn. - // - // Won't create me_pghead: me_pglast must be inited along with it. - // Unsupported in nested txns: They would need to hide the page - // range in ancestor txns' dirty and spilled lists. - if (env->me_pghead && - !txn->mt_parent && - ((mp->mp_flags & P_DIRTY) || - (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) - { - unsigned i, j; - pgno_t *mop; - MDB_ID2 *dl, ix, iy; - rc = mdb_midl_need(&env->me_pghead, ovpages); - if (rc) - return rc; - if (!(mp->mp_flags & P_DIRTY)) { - // This page is no longer spilled - if (x == sl[0]) - sl[0]--; - else - sl[x] |= 1; - goto release; - } - // Remove from dirty list - dl = txn->mt_u.dirty_list; - x = dl[0].mid--; - for (ix = dl[x]; ix.mptr != mp; ix = iy) { - if (x > 1) { - x--; - iy = dl[x]; - dl[x] = ix; - } else { - mdb_cassert(mc, x > 1); - j = ++(dl[0].mid); - dl[j] = ix; // Unsorted. OK when MDB_TXN_ERROR. - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CORRUPTED; - } - } - if (!(env->me_flags & MDB_WRITEMAP)) - mdb_dpage_free(env, mp); - release: - // Insert in me_pghead - mop = env->me_pghead; - j = mop[0] + ovpages; - for (i = mop[0]; i && mop[i] < pg; i--) - mop[j--] = mop[i]; - while (j>i) - mop[j--] = pg++; - mop[0] += ovpages; - } else { - rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages); - if (rc) - return rc; - } - mc->mc_db->md_overflow_pages -= ovpages; - return 0; - */ - return nil -} - -// Find a sibling for a page. -// Replaces the page at the top of the cursor's stack with the -// specified sibling, if one exists. -// @param[in] mc The cursor for this operation. -// @param[in] move_right Non-zero if the right sibling is requested, -// otherwise the left sibling. -// @return 0 on success, non-zero on failure. -func (c *Cursor) sibling(moveRight bool) error { - /* - int rc; - MDB_node *indx; - MDB_page *mp; - - if (mc->mc_snum < 2) { - return MDB_NOTFOUND; // root has no siblings - } - - mdb_cursor_pop(mc); - DPRINTF(("parent page is page %"Z"u, index %u", - mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); - - if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) - : (mc->mc_ki[mc->mc_top] == 0)) { - DPRINTF(("no more keys left, moving to %s sibling", - move_right ? "right" : "left")); - if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) { - // undo cursor_pop before returning - mc->mc_top++; - mc->mc_snum++; - return rc; - } - } else { - if (move_right) - mc->mc_ki[mc->mc_top]++; - else - mc->mc_ki[mc->mc_top]--; - DPRINTF(("just moving to %s index key %u", - move_right ? "right" : "left", mc->mc_ki[mc->mc_top])); - } - mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); - - indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) { - // mc will be inconsistent if caller does mc_snum++ as above - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - return rc; - } - - mdb_cursor_push(mc, mp); - if (!move_right) - mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1; - - return MDB_SUCCESS; - */ - return nil -} - -// Move the cursor to the next data item. -func (c *Cursor) Next(key []byte, data []byte, op int) error { - /* - MDB_page *mp; - MDB_node *leaf; - int rc; - - if (mc->mc_flags & C_EOF) { - return MDB_NOTFOUND; - } - - mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); - - mp = mc->mc_pg[mc->mc_top]; - - if (mc->mc_db->md_flags & MDB_DUPSORT) { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_NEXT || op == MDB_NEXT_DUP) { - rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); - if (op != MDB_NEXT || rc != MDB_NOTFOUND) { - if (rc == MDB_SUCCESS) - MDB_GET_KEY(leaf, key); - return rc; - } - } - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (op == MDB_NEXT_DUP) - return MDB_NOTFOUND; - } - } - - DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", - mdb_dbg_pgno(mp), (void *) mc)); - if (mc->mc_flags & C_DEL) - goto skip; - - if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { - DPUTS("=====> move to next sibling page"); - if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { - mc->mc_flags |= C_EOF; - return rc; - } - mp = mc->mc_pg[mc->mc_top]; - DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); - } else - mc->mc_ki[mc->mc_top]++; - - skip: - DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", - mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); - - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } - - mdb_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - } - if (data) { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) - return rc; - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc != MDB_SUCCESS) - return rc; - } - } - - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; - */ - return nil -} - -// Move the cursor to the previous data item. -func (c *Cursor) prev(key []byte, data []byte, op int) error { - /* - MDB_page *mp; - MDB_node *leaf; - int rc; - - mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); - - mp = mc->mc_pg[mc->mc_top]; - - if (mc->mc_db->md_flags & MDB_DUPSORT) { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_PREV || op == MDB_PREV_DUP) { - rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); - if (op != MDB_PREV || rc != MDB_NOTFOUND) { - if (rc == MDB_SUCCESS) - MDB_GET_KEY(leaf, key); - return rc; - } - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (op == MDB_PREV_DUP) - return MDB_NOTFOUND; - } - } - } - - DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", - mdb_dbg_pgno(mp), (void *) mc)); - - if (mc->mc_ki[mc->mc_top] == 0) { - DPUTS("=====> move to prev sibling page"); - if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { - return rc; - } - mp = mc->mc_pg[mc->mc_top]; - mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; - DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); - } else - mc->mc_ki[mc->mc_top]--; - - mc->mc_flags &= ~C_EOF; - - DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", - mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); - - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } - - mdb_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - } - if (data) { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) - return rc; - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc != MDB_SUCCESS) - return rc; - } - } - - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; - */ - return nil -} - -// Set the cursor on a specific data item. -// (bool return is whether it is exact). -func (c *Cursor) set(key []byte, data []byte, op int) (error, bool) { - /* - int rc; - MDB_page *mp; - MDB_node *leaf = NULL; - DKBUF; - - if (key->mv_size == 0) - return MDB_BAD_VALSIZE; - - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - - // See if we're already on the right page - if (mc->mc_flags & C_INITIALIZED) { - MDB_val nodekey; - - mp = mc->mc_pg[mc->mc_top]; - if (!NUMKEYS(mp)) { - mc->mc_ki[mc->mc_top] = 0; - return MDB_NOTFOUND; - } - if (mp->mp_flags & P_LEAF2) { - nodekey.mv_size = mc->mc_db->md_pad; - nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); - } else { - leaf = NODEPTR(mp, 0); - MDB_GET_KEY2(leaf, nodekey); - } - rc = mc->mc_dbx->md_cmp(key, &nodekey); - if (rc == 0) { - // Probably happens rarely, but first node on the page - // was the one we wanted. - mc->mc_ki[mc->mc_top] = 0; - if (exactp) - *exactp = 1; - goto set1; - } - if (rc > 0) { - unsigned int i; - unsigned int nkeys = NUMKEYS(mp); - if (nkeys > 1) { - if (mp->mp_flags & P_LEAF2) { - nodekey.mv_data = LEAF2KEY(mp, - nkeys-1, nodekey.mv_size); - } else { - leaf = NODEPTR(mp, nkeys-1); - MDB_GET_KEY2(leaf, nodekey); - } - rc = mc->mc_dbx->md_cmp(key, &nodekey); - if (rc == 0) { - // last node was the one we wanted - mc->mc_ki[mc->mc_top] = nkeys-1; - if (exactp) - *exactp = 1; - goto set1; - } - if (rc < 0) { - if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { - // This is definitely the right page, skip search_page - if (mp->mp_flags & P_LEAF2) { - nodekey.mv_data = LEAF2KEY(mp, - mc->mc_ki[mc->mc_top], nodekey.mv_size); - } else { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - MDB_GET_KEY2(leaf, nodekey); - } - rc = mc->mc_dbx->md_cmp(key, &nodekey); - if (rc == 0) { - // current node was the one we wanted - if (exactp) - *exactp = 1; - goto set1; - } - } - rc = 0; - goto set2; - } - } - // If any parents have right-sibs, search. - // Otherwise, there's nothing further. - for (i=0; imc_top; i++) - if (mc->mc_ki[i] < - NUMKEYS(mc->mc_pg[i])-1) - break; - if (i == mc->mc_top) { - // There are no other pages - mc->mc_ki[mc->mc_top] = nkeys; - return MDB_NOTFOUND; - } - } - if (!mc->mc_top) { - // There are no other pages - mc->mc_ki[mc->mc_top] = 0; - if (op == MDB_SET_RANGE) { - rc = 0; - goto set1; - } else - return MDB_NOTFOUND; - } - } - - rc = mdb_page_search(mc, key, 0); - if (rc != MDB_SUCCESS) - return rc; - - mp = mc->mc_pg[mc->mc_top]; - mdb_cassert(mc, IS_LEAF(mp)); - - set2: - leaf = mdb_node_search(mc, key, exactp); - if (exactp != NULL && !*exactp) { - // MDB_SET specified and not an exact match. - return MDB_NOTFOUND; - } - - if (leaf == NULL) { - DPUTS("===> inexact leaf not found, goto sibling"); - if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) - return rc; // no entries matched - mp = mc->mc_pg[mc->mc_top]; - mdb_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, 0); - } - - set1: - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - } - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - } else { - int ex2, *ex2p; - if (op == MDB_GET_BOTH) { - ex2p = &ex2; - ex2 = 0; - } else { - ex2p = NULL; - } - rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); - if (rc != MDB_SUCCESS) - return rc; - } - } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { - MDB_val d2; - if ((rc = mdb_node_read(mc->mc_txn, leaf, &d2)) != MDB_SUCCESS) - return rc; - rc = mc->mc_dbx->md_dcmp(data, &d2); - if (rc) { - if (op == MDB_GET_BOTH || rc > 0) - return MDB_NOTFOUND; - rc = 0; - *data = d2; - } - - } else { - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) - return rc; - } - } - - // The key already matches in all other cases - if (op == MDB_SET_RANGE || op == MDB_SET_KEY) - MDB_GET_KEY(leaf, key); - DPRINTF(("==> cursor placed on key [%s]", DKEY(key))); - - return rc; - */ - - return nil, false -} - -// Move the cursor to the first item in the database. -func (c *Cursor) first(key []byte, data []byte) error { - /* - int rc; - MDB_node *leaf; - - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); - if (rc != MDB_SUCCESS) - return rc; - } - mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - - leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - - mc->mc_ki[mc->mc_top] = 0; - - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); - return MDB_SUCCESS; - } - - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc) - return rc; - } else { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) - return rc; - } - } - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; - */ - return nil -} - -// Move the cursor to the last item in the database. -func (c *Cursor) last() ([]byte, []byte) { - /* - int rc; - MDB_node *leaf; - - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - - if (!(mc->mc_flags & C_EOF)) { - - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdb_page_search(mc, NULL, MDB_PS_LAST); - if (rc != MDB_SUCCESS) - return rc; - } - mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - - } - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; - mc->mc_flags |= C_INITIALIZED|C_EOF; - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } - - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc) - return rc; - } else { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) - return rc; - } - } - - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; - */ - return nil, nil -} - -// Touch all the pages in the cursor stack. Set mc_top. -// Makes sure all the pages are writable, before attempting a write operation. -// @param[in] mc The cursor to operate on. -func (c *Cursor) touch() error { - /* - int rc = MDB_SUCCESS; - - if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) { - MDB_cursor mc2; - MDB_xcursor mcx; - mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); - rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); - if (rc) - return rc; - *mc->mc_dbflag |= DB_DIRTY; - } - mc->mc_top = 0; - if (mc->mc_snum) { - do { - rc = mdb_page_touch(mc); - } while (!rc && ++(mc->mc_top) < mc->mc_snum); - mc->mc_top = mc->mc_snum-1; - } - return rc; - } - */ - return nil -} - -func (c *Cursor) Del(flags int) error { - /* - MDB_node *leaf; - MDB_page *mp; - int rc; - - if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) - return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - - if (!(mc->mc_flags & C_INITIALIZED)) - return EINVAL; - - if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) - return MDB_NOTFOUND; - - if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL))) - return rc; - - rc = mdb_cursor_touch(mc); - if (rc) - return rc; - - mp = mc->mc_pg[mc->mc_top]; - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - - if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (!(flags & MDB_NODUPDATA)) { - if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { - mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } - rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); - // If sub-DB still has entries, we're done - if (mc->mc_xcursor->mx_db.md_entries) { - if (leaf->mn_flags & F_SUBDATA) { - // update subDB info - void *db = NODEDATA(leaf); - memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); - } else { - MDB_cursor *m2; - // shrink fake page - mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - // fix other sub-DB cursors pointed at this fake page - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; - if (m2->mc_pg[mc->mc_top] == mp && - m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } - } - mc->mc_db->md_entries--; - mc->mc_flags |= C_DEL; - return rc; - } - // otherwise fall thru and delete the sub-DB - } - - if (leaf->mn_flags & F_SUBDATA) { - // add all the child DB's pages to the free list - rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); - if (rc == MDB_SUCCESS) { - mc->mc_db->md_entries -= - mc->mc_xcursor->mx_db.md_entries; - } - } - } - - return mdb_cursor_del0(mc, leaf); - */ - return nil -} - -// Add a node to the page pointed to by the cursor. -// @param[in] mc The cursor for this operation. -// @param[in] indx The index on the page where the new node should be added. -// @param[in] key The key for the new node. -// @param[in] data The data for the new node, if any. -// @param[in] pgno The page number, if adding a branch node. -// @param[in] flags Flags for the node. -// @return 0 on success, non-zero on failure. Possible errors are: -//
    -//
  • ENOMEM - failed to allocate overflow pages for the node. -//
  • MDB_PAGE_FULL - there is insufficient room in the page. This error -// should never happen since all callers already calculate the -// page's free space before calling this function. -//
-func (c *Cursor) addNode(index int, key []byte, data []byte, pgno int, flags int) error { - /* - unsigned int i; - size_t node_size = NODESIZE; - ssize_t room; - indx_t ofs; - MDB_node *node; - MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_page *ofp = NULL; // overflow page - DKBUF; - - mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); - - DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]", - IS_LEAF(mp) ? "leaf" : "branch", - IS_SUBP(mp) ? "sub-" : "", - mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, - key ? key->mv_size : 0, key ? DKEY(key) : "null")); - - if (IS_LEAF2(mp)) { - // Move higher keys up one slot. - int ksize = mc->mc_db->md_pad, dif; - char *ptr = LEAF2KEY(mp, indx, ksize); - dif = NUMKEYS(mp) - indx; - if (dif > 0) - memmove(ptr+ksize, ptr, dif*ksize); - // insert new key - memcpy(ptr, key->mv_data, ksize); - - // Just using these for counting - mp->mp_lower += sizeof(indx_t); - mp->mp_upper -= ksize - sizeof(indx_t); - return MDB_SUCCESS; - } - - room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); - if (key != NULL) - node_size += key->mv_size; - if (IS_LEAF(mp)) { - mdb_cassert(mc, data); - if (F_ISSET(flags, F_BIGDATA)) { - // Data already on overflow page. - node_size += sizeof(pgno_t); - } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) { - int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); - int rc; - // Put data on overflow page. - DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page", - data->mv_size, node_size+data->mv_size)); - node_size = EVEN(node_size + sizeof(pgno_t)); - if ((ssize_t)node_size > room) - goto full; - if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) - return rc; - DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno)); - flags |= F_BIGDATA; - goto update; - } else { - node_size += data->mv_size; - } - } - node_size = EVEN(node_size); - if ((ssize_t)node_size > room) - goto full; - - update: - // Move higher pointers up one slot. - for (i = NUMKEYS(mp); i > indx; i--) - mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; - - // Adjust free space offsets. - ofs = mp->mp_upper - node_size; - mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); - mp->mp_ptrs[indx] = ofs; - mp->mp_upper = ofs; - mp->mp_lower += sizeof(indx_t); - - // Write the node data. - node = NODEPTR(mp, indx); - node->mn_ksize = (key == NULL) ? 0 : key->mv_size; - node->mn_flags = flags; - if (IS_LEAF(mp)) - SETDSZ(node,data->mv_size); - else - SETPGNO(node,pgno); - - if (key) - memcpy(NODEKEY(node), key->mv_data, key->mv_size); - - if (IS_LEAF(mp)) { - mdb_cassert(mc, key); - if (ofp == NULL) { - if (F_ISSET(flags, F_BIGDATA)) - memcpy(node->mn_data + key->mv_size, data->mv_data, - sizeof(pgno_t)); - else if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = node->mn_data + key->mv_size; - else - memcpy(node->mn_data + key->mv_size, data->mv_data, - data->mv_size); - } else { - memcpy(node->mn_data + key->mv_size, &ofp->mp_pgno, - sizeof(pgno_t)); - if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = METADATA(ofp); - else - memcpy(METADATA(ofp), data->mv_data, data->mv_size); - } - } - - return MDB_SUCCESS; - - full: - DPRINTF(("not enough room in page %"Z"u, got %u ptrs", - mdb_dbg_pgno(mp), NUMKEYS(mp))); - DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room)); - DPRINTF(("node size = %"Z"u", node_size)); - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PAGE_FULL; - */ - - return nil -} - -// Delete the specified node from a page. -// @param[in] mp The page to operate on. -// @param[in] indx The index of the node to delete. -// @param[in] ksize The size of a node. Only used if the page is -// part of a #MDB_DUPFIXED database. -func (c *Cursor) deleteNode(ksize int) { - /* - MDB_page *mp = mc->mc_pg[mc->mc_top]; - indx_t indx = mc->mc_ki[mc->mc_top]; - unsigned int sz; - indx_t i, j, numkeys, ptr; - MDB_node *node; - char *base; - - DPRINTF(("delete node %u on %s page %"Z"u", indx, - IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp))); - numkeys = NUMKEYS(mp); - mdb_cassert(mc, indx < numkeys); - - if (IS_LEAF2(mp)) { - int x = numkeys - 1 - indx; - base = LEAF2KEY(mp, indx, ksize); - if (x) - memmove(base, base + ksize, x * ksize); - mp->mp_lower -= sizeof(indx_t); - mp->mp_upper += ksize - sizeof(indx_t); - return; - } - - node = NODEPTR(mp, indx); - sz = NODESIZE + node->mn_ksize; - if (IS_LEAF(mp)) { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - sz += sizeof(pgno_t); - else - sz += NODEDSZ(node); - } - sz = EVEN(sz); - - ptr = mp->mp_ptrs[indx]; - for (i = j = 0; i < numkeys; i++) { - if (i != indx) { - mp->mp_ptrs[j] = mp->mp_ptrs[i]; - if (mp->mp_ptrs[i] < ptr) - mp->mp_ptrs[j] += sz; - j++; - } - } - - base = (char *)mp + mp->mp_upper; - memmove(base + sz, base, ptr - mp->mp_upper); - - mp->mp_lower -= sizeof(indx_t); - mp->mp_upper += sz; - */ -} - -// Final setup of a sorted-dups cursor. -// Sets up the fields that depend on the data from the main cursor. -// @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. -// @param[in] node The data containing the #MDB_db record for the -// sorted-dup database. -func (c *Cursor) xcursor_init1(n *node) { - /* - MDB_xcursor *mx = mc->mc_xcursor; - - if (node->mn_flags & F_SUBDATA) { - memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); - mx->mx_cursor.mc_pg[0] = 0; - mx->mx_cursor.mc_snum = 0; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB; - } else { - MDB_page *fp = NODEDATA(node); - mx->mx_db.md_pad = mc->mc_pg[mc->mc_top]->mp_pad; - mx->mx_db.md_flags = 0; - mx->mx_db.md_depth = 1; - mx->mx_db.md_branch_pages = 0; - mx->mx_db.md_leaf_pages = 1; - mx->mx_db.md_overflow_pages = 0; - mx->mx_db.md_entries = NUMKEYS(fp); - COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); - mx->mx_cursor.mc_snum = 1; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; - mx->mx_cursor.mc_pg[0] = fp; - mx->mx_cursor.mc_ki[0] = 0; - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - mx->mx_db.md_flags = MDB_DUPFIXED; - mx->mx_db.md_pad = fp->mp_pad; - if (mc->mc_db->md_flags & MDB_INTEGERDUP) - mx->mx_db.md_flags |= MDB_INTEGERKEY; - } - } - DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, - mx->mx_db.md_root)); - mx->mx_dbflag = DB_VALID|DB_DIRTY; // DB_DIRTY guides mdb_cursor_touch - #if UINT_MAX < SIZE_MAX - if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) - #ifdef MISALIGNED_OK - mx->mx_dbx.md_cmp = mdb_cmp_long; - #else - mx->mx_dbx.md_cmp = mdb_cmp_cint; - #endif - #endif - */ -} - -// Return the count of duplicate data items for the current key. -func (c *Cursor) count() (int, error) { - /* - MDB_node *leaf; - - if (mc == NULL || countp == NULL) - return EINVAL; - - if (mc->mc_xcursor == NULL) - return MDB_INCOMPATIBLE; - - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - *countp = 1; - } else { - if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) - return EINVAL; - - *countp = mc->mc_xcursor->mx_db.md_entries; - } - return MDB_SUCCESS; - */ - return 0, nil -} - -func (c *Cursor) Close() { - /* - if (mc && !mc->mc_backup) { - // remove from txn, if tracked - if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { - MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; - while (*prev && *prev != mc) prev = &(*prev)->mc_next; - if (*prev == mc) - *prev = mc->mc_next; - } - free(mc); - } - */ +type stackelem struct { + page *page + index int } func (c *Cursor) Transaction() *Transaction { @@ -1598,1078 +19,39 @@ func (c *Cursor) Bucket() *Bucket { return c.bucket } -// Replace the key for a branch node with a new key. -// @param[in] mc Cursor pointing to the node to operate on. -// @param[in] key The new key to use. -// @return 0 on success, non-zero on failure. -func (c *Cursor) updateKey(key []byte) error { - /* - MDB_page *mp; - MDB_node *node; - char *base; - size_t len; - int delta, ksize, oksize; - indx_t ptr, i, numkeys, indx; - DKBUF; +func (c *Cursor) Get(key []byte) ([]byte, error) { + // TODO: Move to key + // TODO: If it doesn't exist, return nil, nil + // TODO: Otherwise return node key+data. + return nil, nil +} - indx = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - node = NODEPTR(mp, indx); - ptr = mp->mp_ptrs[indx]; - #if MDB_DEBUG - { - MDB_val k2; - char kbuf2[DKBUF_MAXKEYSIZE*2+1]; - k2.mv_data = NODEKEY(node); - k2.mv_size = node->mn_ksize; - DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u", - indx, ptr, - mdb_dkey(&k2, kbuf2), - DKEY(key), - mp->mp_pgno)); - } - #endif +// Move the cursor to the next key/value. +func (c *Cursor) Next() ([]byte, []byte, error) { + return nil, nil, nil +} - // Sizes must be 2-byte aligned. - ksize = EVEN(key->mv_size); - oksize = EVEN(node->mn_ksize); - delta = ksize - oksize; +// First moves the cursor to the first item in the bucket and returns its key and data. +func (c *Cursor) First() ([]byte, []byte, error) { + // TODO: Traverse to the first key. + return nil, nil, nil +} - // Shift node contents if EVEN(key length) changed. - if (delta) { - if (delta > 0 && SIZELEFT(mp) < delta) { - pgno_t pgno; - // not enough space left, do a delete and split - DPRINTF(("Not enough room, delta = %d, splitting...", delta)); - pgno = NODEPGNO(node); - mdb_node_del(mc, 0); - return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); - } +// Set the cursor on a specific data item. +// (bool return is whether it is exact). +func (c *Cursor) set(key []byte, data []byte, op int) (error, bool) { + // TODO(benbjohnson): Optimize for specific use cases. - numkeys = NUMKEYS(mp); - for (i = 0; i < numkeys; i++) { - if (mp->mp_ptrs[i] <= ptr) - mp->mp_ptrs[i] -= delta; - } + // TODO: Check if len(key) > 0. + // TODO: Start from root page and traverse to correct page. - base = (char *)mp + mp->mp_upper; - len = ptr - mp->mp_upper + NODESIZE; - memmove(base - delta, base, len); - mp->mp_upper -= delta; + return nil, false +} - node = NODEPTR(mp, indx); - } - - // But even if no shift was needed, update ksize - if (node->mn_ksize != key->mv_size) - node->mn_ksize = key->mv_size; - - if (key->mv_size) - memcpy(NODEKEY(node), key->mv_data, key->mv_size); - - return MDB_SUCCESS; - */ - return nil -} - -// Move a node from csrc to cdst. -func (c *Cursor) moveNodeTo(dst *Cursor) error { - /* - MDB_node *srcnode; - MDB_val key, data; - pgno_t srcpg; - MDB_cursor mn; - int rc; - unsigned short flags; - - DKBUF; - - // Mark src and dst as dirty. - if ((rc = mdb_page_touch(csrc)) || - (rc = mdb_page_touch(cdst))) - return rc; - - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_pad; - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); - data.mv_size = 0; - data.mv_data = NULL; - srcpg = 0; - flags = 0; - } else { - srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); - mdb_cassert(csrc, !((size_t)srcnode & 1)); - srcpg = NODEPGNO(srcnode); - flags = srcnode->mn_flags; - if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { - unsigned int snum = csrc->mc_snum; - MDB_node *s2; - // must find the lowest key below src - mdb_page_search_lowest(csrc); - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_pad; - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); - } else { - s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); - key.mv_size = NODEKSZ(s2); - key.mv_data = NODEKEY(s2); - } - csrc->mc_snum = snum--; - csrc->mc_top = snum; - } else { - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); - } - data.mv_size = NODEDSZ(srcnode); - data.mv_data = NODEDATA(srcnode); - } - if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { - unsigned int snum = cdst->mc_snum; - MDB_node *s2; - MDB_val bkey; - // must find the lowest key below dst - mdb_page_search_lowest(cdst); - if (IS_LEAF2(cdst->mc_pg[cdst->mc_top])) { - bkey.mv_size = cdst->mc_db->md_pad; - bkey.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, bkey.mv_size); - } else { - s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); - bkey.mv_size = NODEKSZ(s2); - bkey.mv_data = NODEKEY(s2); - } - cdst->mc_snum = snum--; - cdst->mc_top = snum; - mdb_cursor_copy(cdst, &mn); - mn.mc_ki[snum] = 0; - rc = mdb_update_key(&mn, &bkey); - if (rc) - return rc; - } - - DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u", - IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", - csrc->mc_ki[csrc->mc_top], - DKEY(&key), - csrc->mc_pg[csrc->mc_top]->mp_pgno, - cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno)); - - // Add the node to the destination page. - rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); - if (rc != MDB_SUCCESS) - return rc; - - // Delete the node from the source page. - mdb_node_del(csrc, key.mv_size); - - { - // Adjust other cursors pointing to mp - MDB_cursor *m2, *m3; - MDB_dbi dbi = csrc->mc_dbi; - MDB_page *mp = csrc->mc_pg[csrc->mc_top]; - - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == csrc) continue; - if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] == - csrc->mc_ki[csrc->mc_top]) { - m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; - m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - } - } - } - - // Update the parent separators. - if (csrc->mc_ki[csrc->mc_top] == 0) { - if (csrc->mc_ki[csrc->mc_top-1] != 0) { - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); - } else { - srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); - } - DPRINTF(("update separator for source page %"Z"u to [%s]", - csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key))); - mdb_cursor_copy(csrc, &mn); - mn.mc_snum--; - mn.mc_top--; - if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) - return rc; - } - if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { - MDB_val nullkey; - indx_t ix = csrc->mc_ki[csrc->mc_top]; - nullkey.mv_size = 0; - csrc->mc_ki[csrc->mc_top] = 0; - rc = mdb_update_key(csrc, &nullkey); - csrc->mc_ki[csrc->mc_top] = ix; - mdb_cassert(csrc, rc == MDB_SUCCESS); - } - } - - if (cdst->mc_ki[cdst->mc_top] == 0) { - if (cdst->mc_ki[cdst->mc_top-1] != 0) { - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); - } else { - srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); - } - DPRINTF(("update separator for destination page %"Z"u to [%s]", - cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key))); - mdb_cursor_copy(cdst, &mn); - mn.mc_snum--; - mn.mc_top--; - if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) - return rc; - } - if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { - MDB_val nullkey; - indx_t ix = cdst->mc_ki[cdst->mc_top]; - nullkey.mv_size = 0; - cdst->mc_ki[cdst->mc_top] = 0; - rc = mdb_update_key(cdst, &nullkey); - cdst->mc_ki[cdst->mc_top] = ix; - mdb_cassert(csrc, rc == MDB_SUCCESS); - } - } - - return MDB_SUCCESS; - */ - - return nil -} - -// Merge one page into another. -// The nodes from the page pointed to by \b csrc will -// be copied to the page pointed to by \b cdst and then -// the \b csrc page will be freed. -// @param[in] csrc Cursor pointing to the source page. -// @param[in] cdst Cursor pointing to the destination page. -func (c *Cursor) mergePage(dst *Cursor) error { - /* - int rc; - indx_t i, j; - MDB_node *srcnode; - MDB_val key, data; - unsigned nkeys; - - DPRINTF(("merging page %"Z"u into %"Z"u", csrc->mc_pg[csrc->mc_top]->mp_pgno, - cdst->mc_pg[cdst->mc_top]->mp_pgno)); - - mdb_cassert(csrc, csrc->mc_snum > 1); // can't merge root page - mdb_cassert(csrc, cdst->mc_snum > 1); - - // Mark dst as dirty. - if ((rc = mdb_page_touch(cdst))) - return rc; - - // Move all nodes from src to dst. - j = nkeys = NUMKEYS(cdst->mc_pg[cdst->mc_top]); - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_pad; - key.mv_data = METADATA(csrc->mc_pg[csrc->mc_top]); - for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) { - rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); - if (rc != MDB_SUCCESS) - return rc; - key.mv_data = (char *)key.mv_data + key.mv_size; - } - } else { - for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) { - srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], i); - if (i == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { - unsigned int snum = csrc->mc_snum; - MDB_node *s2; - // must find the lowest key below src - mdb_page_search_lowest(csrc); - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_pad; - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); - } else { - s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); - key.mv_size = NODEKSZ(s2); - key.mv_data = NODEKEY(s2); - } - csrc->mc_snum = snum--; - csrc->mc_top = snum; - } else { - key.mv_size = srcnode->mn_ksize; - key.mv_data = NODEKEY(srcnode); - } - - data.mv_size = NODEDSZ(srcnode); - data.mv_data = NODEDATA(srcnode); - rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); - if (rc != MDB_SUCCESS) - return rc; - } - } - - DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)", - cdst->mc_pg[cdst->mc_top]->mp_pgno, NUMKEYS(cdst->mc_pg[cdst->mc_top]), - (float)PAGEFILL(cdst->mc_txn->mt_env, cdst->mc_pg[cdst->mc_top]) / 10)); - - // Unlink the src page from parent and add to free list. - csrc->mc_top--; - mdb_node_del(csrc, 0); - if (csrc->mc_ki[csrc->mc_top] == 0) { - key.mv_size = 0; - rc = mdb_update_key(csrc, &key); - if (rc) { - csrc->mc_top++; - return rc; - } - } - csrc->mc_top++; - - rc = mdb_midl_append(&csrc->mc_txn->mt_free_pgs, - csrc->mc_pg[csrc->mc_top]->mp_pgno); - if (rc) - return rc; - if (IS_LEAF(csrc->mc_pg[csrc->mc_top])) - csrc->mc_db->md_leaf_pages--; - else - csrc->mc_db->md_branch_pages--; - { - // Adjust other cursors pointing to mp - MDB_cursor *m2, *m3; - MDB_dbi dbi = csrc->mc_dbi; - MDB_page *mp = cdst->mc_pg[cdst->mc_top]; - - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == csrc) continue; - if (m3->mc_snum < csrc->mc_snum) continue; - if (m3->mc_pg[csrc->mc_top] == csrc->mc_pg[csrc->mc_top]) { - m3->mc_pg[csrc->mc_top] = mp; - m3->mc_ki[csrc->mc_top] += nkeys; - } - } - } - mdb_cursor_pop(csrc); - - return mdb_rebalance(csrc); - */ - - return nil -} - -// Copy the contents of a cursor. -// @param[in] csrc The cursor to copy from. -// @param[out] cdst The cursor to copy to. -func (c *Cursor) copyTo(dst *Cursor) { - /* - unsigned int i; - - cdst->mc_txn = csrc->mc_txn; - cdst->mc_dbi = csrc->mc_dbi; - cdst->mc_db = csrc->mc_db; - cdst->mc_dbx = csrc->mc_dbx; - cdst->mc_snum = csrc->mc_snum; - cdst->mc_top = csrc->mc_top; - cdst->mc_flags = csrc->mc_flags; - - for (i=0; imc_snum; i++) { - cdst->mc_pg[i] = csrc->mc_pg[i]; - cdst->mc_ki[i] = csrc->mc_ki[i]; - } - */ -} - -// Rebalance the tree after a delete operation. -// @param[in] mc Cursor pointing to the page where rebalancing -// should begin. -// @return 0 on success, non-zero on failure. -func (c *Cursor) rebalance() error { - /* - MDB_node *node; - int rc; - unsigned int ptop, minkeys; - MDB_cursor mn; - - minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top])); - DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", - IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", - mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), - (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); - - if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD && - NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { - DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", - mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); - return MDB_SUCCESS; - } - - if (mc->mc_snum < 2) { - MDB_page *mp = mc->mc_pg[0]; - if (IS_SUBP(mp)) { - DPUTS("Can't rebalance a subpage, ignoring"); - return MDB_SUCCESS; - } - if (NUMKEYS(mp) == 0) { - DPUTS("tree is completely empty"); - mc->mc_db->md_root = P_INVALID; - mc->mc_db->md_depth = 0; - mc->mc_db->md_leaf_pages = 0; - rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); - if (rc) - return rc; - // Adjust cursors pointing to mp - mc->mc_snum = 0; - mc->mc_top = 0; - mc->mc_flags &= ~C_INITIALIZED; - { - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[0] == mp) { - m3->mc_snum = 0; - m3->mc_top = 0; - m3->mc_flags &= ~C_INITIALIZED; - } - } - } - } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { - DPUTS("collapsing root page!"); - rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); - if (rc) - return rc; - mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); - rc = mdb_page_get(mc->mc_txn,mc->mc_db->md_root,&mc->mc_pg[0],NULL); - if (rc) - return rc; - mc->mc_db->md_depth--; - mc->mc_db->md_branch_pages--; - mc->mc_ki[0] = mc->mc_ki[1]; - { - // Adjust other cursors pointing to mp - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[0] == mp) { - int i; - m3->mc_snum--; - m3->mc_top--; - for (i=0; imc_snum; i++) { - m3->mc_pg[i] = m3->mc_pg[i+1]; - m3->mc_ki[i] = m3->mc_ki[i+1]; - } - } - } - } - } else - DPUTS("root page doesn't need rebalancing"); - return MDB_SUCCESS; - } - - // The parent (branch page) must have at least 2 pointers, - // otherwise the tree is invalid. - ptop = mc->mc_top-1; - mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); - - // Leaf page fill factor is below the threshold. - // Try to move keys from left or right neighbor, or - // merge with a neighbor page. - - // Find neighbors. - mdb_cursor_copy(mc, &mn); - mn.mc_xcursor = NULL; - - if (mc->mc_ki[ptop] == 0) { - // We're the leftmost leaf in our parent. - DPUTS("reading right neighbor"); - mn.mc_ki[ptop]++; - node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); - if (rc) - return rc; - mn.mc_ki[mn.mc_top] = 0; - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); - } else { - // There is at least one neighbor to the left. - DPUTS("reading left neighbor"); - mn.mc_ki[ptop]--; - node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); - if (rc) - return rc; - mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; - mc->mc_ki[mc->mc_top] = 0; - } - - DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", - mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), - (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); - - // If the neighbor page is above threshold and has enough keys, - // move one key from it. Otherwise we should try to merge them. - // (A branch page must never have less than 2 keys.) - minkeys = 1 + (IS_BRANCH(mn.mc_pg[mn.mc_top])); - if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) - return mdb_node_move(&mn, mc); - else { - if (mc->mc_ki[ptop] == 0) - rc = mdb_page_merge(&mn, mc); - else { - mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; - rc = mdb_page_merge(mc, &mn); - mdb_cursor_copy(&mn, mc); - } - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - } - return rc; - */ - return nil -} - -// Complete a delete operation started by #mdb_cursor_del(). -func (c *Cursor) del0(leaf *node) error { - /* - int rc; - MDB_page *mp; - indx_t ki; - unsigned int nkeys; - - mp = mc->mc_pg[mc->mc_top]; - ki = mc->mc_ki[mc->mc_top]; - - // add overflow pages to free list - if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_BIGDATA)) { - MDB_page *omp; - pgno_t pg; - - memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) || - (rc = mdb_ovpage_free(mc, omp))) - return rc; - } - mdb_node_del(mc, mc->mc_db->md_pad); - mc->mc_db->md_entries--; - rc = mdb_rebalance(mc); - if (rc != MDB_SUCCESS) - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - else { - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - - mp = mc->mc_pg[mc->mc_top]; - nkeys = NUMKEYS(mp); - - // if mc points past last node in page, find next sibling - if (mc->mc_ki[mc->mc_top] >= nkeys) - mdb_cursor_sibling(mc, 1); - - // Adjust other cursors pointing to mp - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3 == mc || m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] >= ki) { - m3->mc_flags |= C_DEL; - if (m3->mc_ki[mc->mc_top] > ki) - m3->mc_ki[mc->mc_top]--; - } - if (m3->mc_ki[mc->mc_top] >= nkeys) - mdb_cursor_sibling(m3, 1); - } - } - mc->mc_flags |= C_DEL; - } - - return rc; - */ - return nil -} - -// Split a page and insert a new node. -// @param[in,out] mc Cursor pointing to the page and desired insertion index. -// The cursor will be updated to point to the actual page and index where -// the node got inserted after the split. -// @param[in] newkey The key for the newly inserted node. -// @param[in] newdata The data for the newly inserted node. -// @param[in] newpgno The page number, if the new node is a branch node. -// @param[in] nflags The #NODE_ADD_FLAGS for the new node. -// @return 0 on success, non-zero on failure. -func (c *Cursor) splitPage(newKey []byte, newData []byte, newpgno int, nflags int) error { - /* - unsigned int flags; - int rc = MDB_SUCCESS, new_root = 0, did_split = 0; - indx_t newindx; - pgno_t pgno = 0; - int i, j, split_indx, nkeys, pmax; - MDB_env *env = mc->mc_txn->mt_env; - MDB_node *node; - MDB_val sepkey, rkey, xdata, *rdata = &xdata; - MDB_page *copy = NULL; - MDB_page *mp, *rp, *pp; - int ptop; - MDB_cursor mn; - DKBUF; - - mp = mc->mc_pg[mc->mc_top]; - newindx = mc->mc_ki[mc->mc_top]; - nkeys = NUMKEYS(mp); - - DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", - IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, - DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); - - // Create a right sibling. - if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) - return rc; - DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); - - if (mc->mc_snum < 2) { - if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) - return rc; - // shift current top to make room for new parent - mc->mc_pg[1] = mc->mc_pg[0]; - mc->mc_ki[1] = mc->mc_ki[0]; - mc->mc_pg[0] = pp; - mc->mc_ki[0] = 0; - mc->mc_db->md_root = pp->mp_pgno; - DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno)); - mc->mc_db->md_depth++; - new_root = 1; - - // Add left (implicit) pointer. - if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { - // undo the pre-push - mc->mc_pg[0] = mc->mc_pg[1]; - mc->mc_ki[0] = mc->mc_ki[1]; - mc->mc_db->md_root = mp->mp_pgno; - mc->mc_db->md_depth--; - return rc; - } - mc->mc_snum = 2; - mc->mc_top = 1; - ptop = 0; - } else { - ptop = mc->mc_top-1; - DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); - } - - mc->mc_flags |= C_SPLITTING; - mdb_cursor_copy(mc, &mn); - mn.mc_pg[mn.mc_top] = rp; - mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; - - if (nflags & MDB_APPEND) { - mn.mc_ki[mn.mc_top] = 0; - sepkey = *newkey; - split_indx = newindx; - nkeys = 0; - } else { - - split_indx = (nkeys+1) / 2; - - if (IS_LEAF2(rp)) { - char *split, *ins; - int x; - unsigned int lsize, rsize, ksize; - // Move half of the keys to the right sibling - copy = NULL; - x = mc->mc_ki[mc->mc_top] - split_indx; - ksize = mc->mc_db->md_pad; - split = LEAF2KEY(mp, split_indx, ksize); - rsize = (nkeys - split_indx) * ksize; - lsize = (nkeys - split_indx) * sizeof(indx_t); - mp->mp_lower -= lsize; - rp->mp_lower += lsize; - mp->mp_upper += rsize - lsize; - rp->mp_upper -= rsize - lsize; - sepkey.mv_size = ksize; - if (newindx == split_indx) { - sepkey.mv_data = newkey->mv_data; - } else { - sepkey.mv_data = split; - } - if (x<0) { - ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); - memcpy(rp->mp_ptrs, split, rsize); - sepkey.mv_data = rp->mp_ptrs; - memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); - memcpy(ins, newkey->mv_data, ksize); - mp->mp_lower += sizeof(indx_t); - mp->mp_upper -= ksize - sizeof(indx_t); - } else { - if (x) - memcpy(rp->mp_ptrs, split, x * ksize); - ins = LEAF2KEY(rp, x, ksize); - memcpy(ins, newkey->mv_data, ksize); - memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); - rp->mp_lower += sizeof(indx_t); - rp->mp_upper -= ksize - sizeof(indx_t); - mc->mc_ki[mc->mc_top] = x; - mc->mc_pg[mc->mc_top] = rp; - } - } else { - int psize, nsize, k; - // Maximum free space in an empty page - pmax = env->me_psize - PAGEHDRSZ; - if (IS_LEAF(mp)) - nsize = mdb_leaf_size(env, newkey, newdata); - else - nsize = mdb_branch_size(env, newkey); - nsize = EVEN(nsize); - - // grab a page to hold a temporary copy - copy = mdb_page_malloc(mc->mc_txn, 1); - if (copy == NULL) - return ENOMEM; - copy->mp_pgno = mp->mp_pgno; - copy->mp_flags = mp->mp_flags; - copy->mp_lower = PAGEHDRSZ; - copy->mp_upper = env->me_psize; - - // prepare to insert - for (i=0, j=0; imp_ptrs[j++] = 0; - } - copy->mp_ptrs[j++] = mp->mp_ptrs[i]; - } - - // When items are relatively large the split point needs - // to be checked, because being off-by-one will make the - // difference between success or failure in mdb_node_add. - // - // It's also relevant if a page happens to be laid out - // such that one half of its nodes are all "small" and - // the other half of its nodes are "large." If the new - // item is also "large" and falls on the half with - // "large" nodes, it also may not fit. - // - // As a final tweak, if the new item goes on the last - // spot on the page (and thus, onto the new page), bias - // the split so the new page is emptier than the old page. - // This yields better packing during sequential inserts. - if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) { - // Find split point - psize = 0; - if (newindx <= split_indx || newindx >= nkeys) { - i = 0; j = 1; - k = newindx >= nkeys ? nkeys : split_indx+2; - } else { - i = nkeys; j = -1; - k = split_indx-1; - } - for (; i!=k; i+=j) { - if (i == newindx) { - psize += nsize; - node = NULL; - } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]); - psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); - if (IS_LEAF(mp)) { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - psize += sizeof(pgno_t); - else - psize += NODEDSZ(node); - } - psize = EVEN(psize); - } - if (psize > pmax || i == k-j) { - split_indx = i + (j<0); - break; - } - } - } - if (split_indx == newindx) { - sepkey.mv_size = newkey->mv_size; - sepkey.mv_data = newkey->mv_data; - } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx]); - sepkey.mv_size = node->mn_ksize; - sepkey.mv_data = NODEKEY(node); - } - } - } - - DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey))); - - // Copy separator key to the parent. - if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { - mn.mc_snum--; - mn.mc_top--; - did_split = 1; - rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0); - - // root split? - if (mn.mc_snum == mc->mc_snum) { - mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top]; - mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top]; - mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop]; - mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop]; - mc->mc_snum++; - mc->mc_top++; - ptop++; - } - // Right page might now have changed parent. - // Check if left page also changed parent. - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { - for (i=0; imc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; - } - mc->mc_pg[ptop] = mn.mc_pg[ptop]; - mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; - } - } else { - mn.mc_top--; - rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); - mn.mc_top++; - } - mc->mc_flags ^= C_SPLITTING; - if (rc != MDB_SUCCESS) { - return rc; - } - if (nflags & MDB_APPEND) { - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[mc->mc_top] = 0; - rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); - if (rc) - return rc; - for (i=0; imc_top; i++) - mc->mc_ki[i] = mn.mc_ki[i]; - } else if (!IS_LEAF2(mp)) { - // Move nodes - mc->mc_pg[mc->mc_top] = rp; - i = split_indx; - j = 0; - do { - if (i == newindx) { - rkey.mv_data = newkey->mv_data; - rkey.mv_size = newkey->mv_size; - if (IS_LEAF(mp)) { - rdata = newdata; - } else - pgno = newpgno; - flags = nflags; - // Update index for the new key. - mc->mc_ki[mc->mc_top] = j; - } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]); - rkey.mv_data = NODEKEY(node); - rkey.mv_size = node->mn_ksize; - if (IS_LEAF(mp)) { - xdata.mv_data = NODEDATA(node); - xdata.mv_size = NODEDSZ(node); - rdata = &xdata; - } else - pgno = NODEPGNO(node); - flags = node->mn_flags; - } - - if (!IS_LEAF(mp) && j == 0) { - // First branch index doesn't need key data. - rkey.mv_size = 0; - } - - rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); - if (rc) { - // return tmp page to freelist - mdb_page_free(env, copy); - return rc; - } - if (i == nkeys) { - i = 0; - j = 0; - mc->mc_pg[mc->mc_top] = copy; - } else { - i++; - j++; - } - } while (i != split_indx); - - nkeys = NUMKEYS(copy); - for (i=0; imp_ptrs[i] = copy->mp_ptrs[i]; - mp->mp_lower = copy->mp_lower; - mp->mp_upper = copy->mp_upper; - memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), - env->me_psize - copy->mp_upper); - - // reset back to original page - if (newindx < split_indx) { - mc->mc_pg[mc->mc_top] = mp; - if (nflags & MDB_RESERVE) { - node = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (!(node->mn_flags & F_BIGDATA)) - newdata->mv_data = NODEDATA(node); - } - } else { - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[ptop]++; - // Make sure mc_ki is still valid. - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { - for (i=0; imc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; - } - mc->mc_pg[ptop] = mn.mc_pg[ptop]; - mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; - } - } - // return tmp page to freelist - mdb_page_free(env, copy); - } - - { - // Adjust other cursors pointing to mp - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - int fixup = NUMKEYS(mp); - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == mc) - continue; - if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_flags & C_SPLITTING) - continue; - if (new_root) { - int k; - // root split - for (k=m3->mc_top; k>=0; k--) { - m3->mc_ki[k+1] = m3->mc_ki[k]; - m3->mc_pg[k+1] = m3->mc_pg[k]; - } - if (m3->mc_ki[0] >= split_indx) { - m3->mc_ki[0] = 1; - } else { - m3->mc_ki[0] = 0; - } - m3->mc_pg[0] = mc->mc_pg[0]; - m3->mc_snum++; - m3->mc_top++; - } - if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) - m3->mc_ki[mc->mc_top]++; - if (m3->mc_ki[mc->mc_top] >= fixup) { - m3->mc_pg[mc->mc_top] = rp; - m3->mc_ki[mc->mc_top] -= fixup; - m3->mc_ki[ptop] = mn.mc_ki[ptop]; - } - } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && - m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { - m3->mc_ki[ptop]++; - } - } - } - DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); - return rc; - */ - return nil -} - -// Add all the DB's pages to the free list. -// @param[in] mc Cursor on the DB to free. -// @param[in] subs non-Zero to check for sub-DBs in this DB. -// @return 0 on success, non-zero on failure. -func (c *Cursor) drop0(subs int) error { - /* - int rc; - - rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); - if (rc == MDB_SUCCESS) { - MDB_txn *txn = mc->mc_txn; - MDB_node *ni; - MDB_cursor mx; - unsigned int i; - - // LEAF2 pages have no nodes, cannot have sub-DBs - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) - mdb_cursor_pop(mc); - - mdb_cursor_copy(mc, &mx); - while (mc->mc_snum > 0) { - MDB_page *mp = mc->mc_pg[mc->mc_top]; - unsigned n = NUMKEYS(mp); - if (IS_LEAF(mp)) { - for (i=0; imn_flags & F_BIGDATA) { - MDB_page *omp; - pgno_t pg; - memcpy(&pg, NODEDATA(ni), sizeof(pg)); - rc = mdb_page_get(txn, pg, &omp, NULL); - if (rc != 0) - return rc; - mdb_cassert(mc, IS_OVERFLOW(omp)); - rc = mdb_midl_append_range(&txn->mt_free_pgs, - pg, omp->mp_pages); - if (rc) - return rc; - } else if (subs && (ni->mn_flags & F_SUBDATA)) { - mdb_xcursor_init1(mc, ni); - rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); - if (rc) - return rc; - } - } - } else { - if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) - return rc; - for (i=0; imt_free_pgs, pg); - } - } - if (!mc->mc_top) - break; - mc->mc_ki[mc->mc_top] = i; - rc = mdb_cursor_sibling(mc, 1); - if (rc) { - // no more siblings, go back to beginning - // of previous level. - mdb_cursor_pop(mc); - mc->mc_ki[0] = 0; - for (i=1; imc_snum; i++) { - mc->mc_ki[i] = 0; - mc->mc_pg[i] = mx.mc_pg[i]; - } - } - } - // free it - rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); - } else if (rc == MDB_NOTFOUND) { - rc = MDB_SUCCESS; - } - return rc; - */ +func (c *Cursor) insert(key []byte, data []byte) error { + // TODO: If there is not enough space on page for key+data then split. + // TODO: Move remaining data on page forward. + // TODO: Write leaf node to current location. + // TODO: Adjust available page size. return nil } diff --git a/db.go b/db.go index 4b2859d..2a90061 100644 --- a/db.go +++ b/db.go @@ -13,7 +13,7 @@ const ( ) var ( - DatabaseNotOpenError = &Error{"db is not open", nil} + DatabaseNotOpenError = &Error{"db is not open", nil} DatabaseAlreadyOpenedError = &Error{"db already open", nil} TransactionInProgressError = &Error{"writable transaction is already in progress", nil} ) @@ -22,26 +22,26 @@ type DB struct { sync.Mutex opened bool - os _os - syscall _syscall - path string - file file - metafile file - data []byte - buf []byte - meta0 *meta - meta1 *meta - pageSize int - rwtransaction *RWTransaction - transactions []*Transaction - maxPageNumber int /**< me_mapsize / me_psize */ - freePages []int /** IDL of pages that became unused in a write txn */ - dirtyPages []int /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ + os _os + syscall _syscall + path string + file file + metafile file + data []byte + buf []byte + meta0 *meta + meta1 *meta + pageSize int + rwtransaction *RWTransaction + transactions []*Transaction + maxPageNumber int /**< me_mapsize / me_psize */ + freePages []int /** IDL of pages that became unused in a write txn */ + dirtyPages []int /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ // TODO: scratch []*page // list of temp pages for writing. readers []*reader - maxFreeOnePage int /** Max number of freelist items that can fit in a single overflow page */ + maxFreeOnePage int /** Max number of freelist items that can fit in a single overflow page */ maxPageDataSize int maxNodeSize int /** Max size of a node on a page */ maxKeySize int /**< max size of a key */ @@ -281,8 +281,3 @@ func (db *DB) Stat() *stat { // TODO: Calculate size, depth, page count (by type), entry count, readers, etc. return nil } - -func (db *DB) minTxnID() txnid { - // TODO: Find the oldest transaction id. - return 0 -} diff --git a/error.go b/error.go index 0f17acb..40877dc 100644 --- a/error.go +++ b/error.go @@ -1,26 +1,5 @@ package bolt -var ( - KeyExistError = &Error{"key/data pair already exists", nil} - NotFoundError = &Error{"no matching key/data pair found", nil} - PageNotFoundError = &Error{"requested page not found", nil} - CorruptedError = &Error{"located page was wrong type", nil} - PanicError = &Error{"update of meta page failed", nil} - VersionMismatchError = &Error{"database environment version mismatch", nil} - InvalidError = &Error{"file is not a bolt file", nil} - MapFullError = &Error{"environment mapsize limit reached", nil} - BucketFullError = &Error{"environment maxdbs limit reached", nil} - ReadersFullError = &Error{"environment maxreaders limit reached", nil} - TransactionFullError = &Error{"transaction has too many dirty pages - transaction too big", nil} - CursorFullError = &Error{"internal error - cursor stack limit reached", nil} - PageFullError = &Error{"internal error - page has no more space", nil} - MapResizedError = &Error{"database contents grew beyond environment mapsize", nil} - IncompatibleError = &Error{"operation and db incompatible, or db flags changed", nil} - BadReaderSlotError = &Error{"invalid reuse of reader locktable slot", nil} - BadTransactionError = &Error{"transaction cannot recover - it must be aborted", nil} - BadValueSizeError = &Error{"too big key/data or key is empty", nil} -) - type Error struct { message string cause error diff --git a/file.go b/file.go deleted file mode 100644 index 9395d89..0000000 --- a/file.go +++ /dev/null @@ -1,12 +0,0 @@ -package bolt - -import ( - "os" -) - -type file interface { - Fd() uintptr - ReadAt(b []byte, off int64) (n int, err error) - Stat() (fi os.FileInfo, err error) - WriteAt(b []byte, off int64) (n int, err error) -} diff --git a/file_test.go b/file_test.go deleted file mode 100644 index f001160..0000000 --- a/file_test.go +++ /dev/null @@ -1,65 +0,0 @@ -package bolt - -import ( - "os" - "time" - - "github.com/stretchr/testify/mock" -) - -type mockfile struct { - mock.Mock - fd uintptr -} - -func (m *mockfile) Fd() uintptr { - return m.fd -} - -func (m *mockfile) ReadAt(b []byte, off int64) (n int, err error) { - args := m.Called(b, off) - return args.Int(0), args.Error(1) -} - -func (m *mockfile) Stat() (os.FileInfo, error) { - args := m.Called() - return args.Get(0).(os.FileInfo), args.Error(1) -} - -func (m *mockfile) WriteAt(b []byte, off int64) (n int, err error) { - args := m.Called(b, off) - return args.Int(0), args.Error(1) -} - -type mockfileinfo struct { - name string - size int64 - mode os.FileMode - modTime time.Time - isDir bool - sys interface{} -} - -func (m *mockfileinfo) Name() string { - return m.name -} - -func (m *mockfileinfo) Size() int64 { - return m.size -} - -func (m *mockfileinfo) Mode() os.FileMode { - return m.mode -} - -func (m *mockfileinfo) ModTime() time.Time { - return m.modTime -} - -func (m *mockfileinfo) IsDir() bool { - return m.isDir -} - -func (m *mockfileinfo) Sys() interface{} { - return m.sys -} diff --git a/info.go b/info.go deleted file mode 100644 index e32587e..0000000 --- a/info.go +++ /dev/null @@ -1,10 +0,0 @@ -package bolt - -// Info contains information about the database. -type Info struct { - MapSize int - LastPageID int - LastTransactionID int - MaxReaders int - ReaderCount int -} diff --git a/lnode.go b/lnode.go new file mode 100644 index 0000000..9e457b6 --- /dev/null +++ b/lnode.go @@ -0,0 +1,30 @@ +package bolt + +import ( + "unsafe" +) + +type nodeid uint16 + +// lnode represents a node on a leaf page. +type lnode struct { + flags uint16 + keySize uint16 + dataSize uint32 + data uintptr // Pointer to the beginning of the data. +} + +// key returns a byte slice that of the node key. +func (n *lnode) key() []byte { + return (*[MaxKeySize]byte)(unsafe.Pointer(&n.data))[:n.keySize] +} + +// data returns a byte slice that of the node data. +func (n *lnode) data() []byte { + return (*[MaxKeySize]byte)(unsafe.Pointer(&n.data))[n.keySize : n.keySize+n.dataSize] +} + +// lnodeSize returns the number of bytes required to store a key+data as a leaf node. +func lnodeSize(key []byte, data []byte) int { + return int(unsafe.Offsetof((*lnode)(nil)).data) + len(key) + len(data) +} diff --git a/meta.go b/meta.go index 6fa2df1..881417a 100644 --- a/meta.go +++ b/meta.go @@ -4,37 +4,16 @@ var ( InvalidMetaPageError = &Error{"Invalid meta page", nil} ) -// TODO: #define mm_psize mm_dbs[0].md_pad -// TODO: #define mm_flags mm_dbs[0].md_flags - -// TODO: -// typedef union MDB_metabuf { -// MDB_page mb_page; -// struct { -// char mm_pad[PAGEHDRSZ]; -// MDB_meta mm_meta; -// } mb_metabuf; -// } MDB_metabuf; - -// TODO: -// typedef struct MDB_dbx { -// MDB_val md_name; /**< name of the database */ -// MDB_cmp_func *md_cmp; /**< function for comparing keys */ -// MDB_cmp_func *md_dcmp; /**< function for comparing data items */ -// MDB_rel_func *md_rel; /**< user relocate function */ -// void *md_relctx; /**< user-provided context for md_rel */ -// } MDB_dbx; - const magic uint32 = 0xC0DEC0DE const version uint32 = 1 type meta struct { - magic uint32 - version uint32 - free bucket - buckets bucket - pgno int - txnid int + magic uint32 + version uint32 + buckets bucket + pageSize uint32 + pgid pgid + txnid txnid } // validate checks the marker bytes and version of the meta page to ensure it matches this binary. @@ -46,18 +25,3 @@ func (m *meta) validate() error { } return nil } - -// Read the environment parameters of a DB environment before -// mapping it into memory. -// @param[in] env the environment handle -// @param[out] meta address of where to store the meta information -// @return 0 on success, non-zero on failure. -func (m *meta) read(p *page) error { - /* - if (off == 0 || m->mm_txnid > meta->mm_txnid) - *meta = *m; - } - return 0; - */ - return nil -} diff --git a/node.go b/node.go deleted file mode 100644 index 6ca2d72..0000000 --- a/node.go +++ /dev/null @@ -1,40 +0,0 @@ -package bolt - -import ( - "unsafe" -) - -// node represents a node on a page. -type node struct { - flags uint16 - keySize uint16 -} - -// leafNode represents a node on a leaf page. -type leafNode struct { - node - dataSize uint32 - data uintptr // Pointer to the beginning of the data. -} - -// branchNode represents a node on a branch page. -type branchNode struct { - node - pgno uint32 - data uintptr // Pointer to the beginning of the data. -} - -// key returns a byte slice that of the key data. -func (n *leafNode) key() []byte { - return (*[MaxKeySize]byte)(unsafe.Pointer(&n.data))[:n.keySize] -} - -func leafNodeSize(key []byte, data []byte) int { - // TODO: Return even(sizeof(node) + len(key) + len(data)) - return 0 -} - -func branchNodeSize(key []byte) int { - // TODO: Return even(sizeof(node) + len(key)) - return 0 -} diff --git a/os.go b/os.go index f8945bf..606a207 100644 --- a/os.go +++ b/os.go @@ -10,6 +10,13 @@ type _os interface { Getpagesize() int } +type file interface { + Fd() uintptr + ReadAt(b []byte, off int64) (n int, err error) + Stat() (fi os.FileInfo, err error) + WriteAt(b []byte, off int64) (n int, err error) +} + type sysos struct{} func (o *sysos) OpenFile(name string, flag int, perm os.FileMode) (file file, err error) { diff --git a/os_test.go b/os_test.go index 8a9808e..b2c7557 100644 --- a/os_test.go +++ b/os_test.go @@ -24,3 +24,60 @@ func (m *mockos) Getpagesize() int { args := m.Called() return args.Int(0) } + +type mockfile struct { + mock.Mock + fd uintptr +} + +func (m *mockfile) Fd() uintptr { + return m.fd +} + +func (m *mockfile) ReadAt(b []byte, off int64) (n int, err error) { + args := m.Called(b, off) + return args.Int(0), args.Error(1) +} + +func (m *mockfile) Stat() (os.FileInfo, error) { + args := m.Called() + return args.Get(0).(os.FileInfo), args.Error(1) +} + +func (m *mockfile) WriteAt(b []byte, off int64) (n int, err error) { + args := m.Called(b, off) + return args.Int(0), args.Error(1) +} + +type mockfileinfo struct { + name string + size int64 + mode os.FileMode + modTime time.Time + isDir bool + sys interface{} +} + +func (m *mockfileinfo) Name() string { + return m.name +} + +func (m *mockfileinfo) Size() int64 { + return m.size +} + +func (m *mockfileinfo) Mode() os.FileMode { + return m.mode +} + +func (m *mockfileinfo) ModTime() time.Time { + return m.modTime +} + +func (m *mockfileinfo) IsDir() bool { + return m.isDir +} + +func (m *mockfileinfo) Sys() interface{} { + return m.sys +} diff --git a/page.go b/page.go index 736aa91..ba39dc5 100644 --- a/page.go +++ b/page.go @@ -8,55 +8,26 @@ import ( const maxPageSize = 0x8000 const minKeyCount = 2 -var _page page - -const pageHeaderSize = int(unsafe.Offsetof(_page.ptr)) +const pageHeaderSize = int(unsafe.Offsetof(((*page)(nil)).data)) const minPageKeys = 2 const fillThreshold = 250 // 25% const ( - p_branch = 0x01 - p_leaf = 0x02 - p_overflow = 0x04 - p_meta = 0x08 - p_dirty = 0x10 /**< dirty page, also set for #P_SUBP pages */ - - p_invalid = ^pgno(0) + p_branch = 0x01 + p_leaf = 0x02 + p_meta = 0x04 ) -// maxCommitPages is the maximum number of pages to commit in one writev() call. -const maxCommitPages = 64 - -/* max bytes to write in one call */ -const maxWriteByteCount uint = 0x80000000 // TODO: #define MAX_WRITE 0x80000000U >> (sizeof(ssize_t) == 4)) - -// TODO: -// #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES -// #undef MDB_COMMIT_PAGES -// #define MDB_COMMIT_PAGES IOV_MAX -// #endif - -const ( - MDB_PS_MODIFY = 1 - MDB_PS_ROOTONLY = 2 - MDB_PS_FIRST = 4 - MDB_PS_LAST = 8 -) - -// TODO: #define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ - -type pgno uint64 -type txnid uint64 -type indx uint16 +type pgid uint64 type page struct { - id pgno - flags uint16 - lower indx - upper indx - overflow uint32 - ptr uintptr + id pgid + flags uint32 + lower uint16 + upper uint16 + count uint32 + data uintptr } // meta returns a pointer to the metadata section of the page. @@ -80,72 +51,7 @@ func (p *page) init(pageSize int) { m := (*meta)(unsafe.Pointer(&p.ptr)) m.magic = magic m.version = version - m.free.pad = uint32(pageSize) - m.pgno = 1 - m.free.root = p_invalid - m.buckets.root = p_invalid -} - -// branchNode retrieves the branch node at the given index within the page. -func (p *page) branchNode(index indx) *branchNode { - b := (*[maxPageSize]byte)(unsafe.Pointer(&p.ptr)) - return (*branchNode)(unsafe.Pointer(&b[index*indx(unsafe.Sizeof(index))])) -} - -// leafNode retrieves the leaf node at the given index within the page. -func (p *page) leafNode(index indx) *leafNode { - b := (*[maxPageSize]byte)(unsafe.Pointer(&p.ptr)) - return (*leafNode)(unsafe.Pointer(&b[index*indx(unsafe.Sizeof(index))])) -} - -// numkeys returns the number of nodes in the page. -func (p *page) numkeys() int { - return int((p.lower - indx(pageHeaderSize)) >> 1) -} - -// remainingSize returns the number of bytes left in the page. -func (p *page) remainingSize() int { - return int(p.upper - p.lower) -} - -// find returns the node with the smallest entry larger or equal to the key. -// This function also returns a boolean stating if an exact match was made. -func (p *page) find(key []byte, pageSize int) (*node, int, bool) { - // TODO: MDB_page *mp = mc->mc_pg[mc->mc_top]; - - var node *node - nkeys := p.numkeys() - low, high := 1, nkeys-1 - if (p.flags & p_leaf) != 0 { - low = 0 - } - - // Perform a binary search to find the correct node. - var i, rc int - for low <= high { - i = (low + high) / 2 - - node = p.node(indx(i)) - rc = bytes.Compare(key, node.key()) - if rc == 0 { - break - } else if rc > 0 { - low = i + 1 - } else { - high = i - 1 - } - } - - // Found entry is less than key so grab the next one. - if rc > 0 { - i++ - } - - // If index is beyond key range then return nil. - if i >= nkeys { - node = nil - } - - exact := (rc == 0 && nkeys > 0) - return node, i, exact + m.pageSize = uint32(pageSize) + m.pgid = 1 + m.buckets.root = 0 } diff --git a/reader.go b/reader.go deleted file mode 100644 index 73dbc40..0000000 --- a/reader.go +++ /dev/null @@ -1,5 +0,0 @@ -package bolt - -type reader struct { - txnid int -} diff --git a/rwcursor.go b/rwcursor.go index 300ec90..70fb43a 100644 --- a/rwcursor.go +++ b/rwcursor.go @@ -1,18 +1,9 @@ package bolt -/* -type RWCursor interface { - Put([]byte, []byte) (error) - Delete([]byte) (error) -} -*/ - // RWCursor represents a cursor that can read and write data for a bucket. type RWCursor struct { Cursor transaction *RWTransaction - reclaimed []pgno /**< Reclaimed freeDB pages, or NULL before use (was me_pghead) */ - last txnid /**< ID of last used record, or 0 if len(reclaimed) == 0 */ } func (c *RWCursor) Put(key []byte, value []byte) error { @@ -71,337 +62,14 @@ func (c *RWCursor) Put(key []byte, value []byte) error { } - /* + return nil +} - insert = rc; - if (insert) { - // The key does not exist - DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top])); - if ((mc->mc_db->md_flags & MDB_DUPSORT) && - LEAFSIZE(key, data) > env->me_nodemax) - { - // Too big for a node, insert in sub-DB - fp_flags = P_LEAF|P_DIRTY; - fp = env->me_pbuf; - fp->mp_pad = data->mv_size; // used if MDB_DUPFIXED - fp->mp_lower = fp->mp_upper = olddata.mv_size = PAGEHDRSZ; - goto prep_subDB; - } - } else { - - more: - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - olddata.mv_size = NODEDSZ(leaf); - olddata.mv_data = NODEDATA(leaf); - - // DB has dups? - if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { - // Prepare (sub-)page/sub-DB to accept the new item, - // if needed. fp: old sub-page or a header faking - // it. mp: new (sub-)page. offset: growth in page - // size. xdata: node data with new page or DB. - ssize_t i, offset = 0; - mp = fp = xdata.mv_data = env->me_pbuf; - mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; - - // Was a single item before, must convert now - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - // Just overwrite the current item - if (flags == MDB_CURRENT) - goto current; - - #if UINT_MAX < SIZE_MAX - if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) - #ifdef MISALIGNED_OK - mc->mc_dbx->md_dcmp = mdb_cmp_long; - #else - mc->mc_dbx->md_dcmp = mdb_cmp_cint; - #endif - #endif - // if data matches, skip it - if (!mc->mc_dbx->md_dcmp(data, &olddata)) { - if (flags & MDB_NODUPDATA) - rc = MDB_KEYEXIST; - else if (flags & MDB_MULTIPLE) - goto next_mult; - else - rc = MDB_SUCCESS; - return rc; - } - - // Back up original data item - dkey.mv_size = olddata.mv_size; - dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); - - // Make sub-page header for the dup items, with dummy body - fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP; - fp->mp_lower = PAGEHDRSZ; - xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - fp->mp_flags |= P_LEAF2; - fp->mp_pad = data->mv_size; - xdata.mv_size += 2 * data->mv_size; // leave space for 2 more - } else { - xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + - (dkey.mv_size & 1) + (data->mv_size & 1); - } - fp->mp_upper = xdata.mv_size; - olddata.mv_size = fp->mp_upper; // pretend olddata is fp - } else if (leaf->mn_flags & F_SUBDATA) { - // Data is on sub-DB, just store it - flags |= F_DUPDATA|F_SUBDATA; - goto put_sub; - } else { - // Data is on sub-page - fp = olddata.mv_data; - switch (flags) { - default: - i = -(ssize_t)SIZELEFT(fp); - if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { - offset = i += (ssize_t) EVEN( - sizeof(indx_t) + NODESIZE + data->mv_size); - } else { - i += offset = fp->mp_pad; - offset *= 4; // space for 4 more - } - if (i > 0) - break; - // FALLTHRU: Sub-page is big enough - case MDB_CURRENT: - fp->mp_flags |= P_DIRTY; - COPY_PGNO(fp->mp_pgno, mp->mp_pgno); - mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; - flags |= F_DUPDATA; - goto put_sub; - } - xdata.mv_size = olddata.mv_size + offset; - } - - fp_flags = fp->mp_flags; - if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { - // Too big for a sub-page, convert to sub-DB - fp_flags &= ~P_SUBP; - prep_subDB: - dummy.md_pad = 0; - dummy.md_flags = 0; - dummy.md_depth = 1; - dummy.md_branch_pages = 0; - dummy.md_leaf_pages = 1; - dummy.md_overflow_pages = 0; - dummy.md_entries = NUMKEYS(fp); - xdata.mv_size = sizeof(MDB_db); - xdata.mv_data = &dummy; - if ((rc = mdb_page_alloc(mc, 1, &mp))) - return rc; - offset = env->me_psize - olddata.mv_size; - flags |= F_DUPDATA|F_SUBDATA; - dummy.md_root = mp->mp_pgno; - } - if (mp != fp) { - mp->mp_flags = fp_flags | P_DIRTY; - mp->mp_pad = fp->mp_pad; - mp->mp_lower = fp->mp_lower; - mp->mp_upper = fp->mp_upper + offset; - if (fp_flags & P_LEAF2) { - memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad); - } else { - memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, - olddata.mv_size - fp->mp_upper); - for (i = NUMKEYS(fp); --i >= 0; ) - mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; - } - } - - rdata = &xdata; - flags |= F_DUPDATA; - do_sub = 1; - if (!insert) - mdb_node_del(mc, 0); - goto new_sub; - } - current: - // overflow page overwrites need special handling - if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { - MDB_page *omp; - pgno_t pg; - int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); - - memcpy(&pg, olddata.mv_data, sizeof(pg)); - if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0) - return rc2; - ovpages = omp->mp_pages; - - // Is the ov page large enough? - if (ovpages >= dpages) { - if (!(omp->mp_flags & P_DIRTY) && - (level || (env->me_flags & MDB_WRITEMAP))) - { - rc = mdb_page_unspill(mc->mc_txn, omp, &omp); - if (rc) - return rc; - level = 0; // dirty in this txn or clean - } - // Is it dirty? - if (omp->mp_flags & P_DIRTY) { - // yes, overwrite it. Note in this case we don't - // bother to try shrinking the page if the new data - // is smaller than the overflow threshold. - if (level > 1) { - // It is writable only in a parent txn - size_t sz = (size_t) env->me_psize * ovpages, off; - MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); - MDB_ID2 id2; - if (!np) - return ENOMEM; - id2.mid = pg; - id2.mptr = np; - rc = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); - mdb_cassert(mc, rc == 0); - if (!(flags & MDB_RESERVE)) { - // Copy end of page, adjusting alignment so - // compiler may copy words instead of bytes. - off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); - memcpy((size_t *)((char *)np + off), - (size_t *)((char *)omp + off), sz - off); - sz = PAGEHDRSZ; - } - memcpy(np, omp, sz); // Copy beginning of page - omp = np; - } - SETDSZ(leaf, data->mv_size); - if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = METADATA(omp); - else - memcpy(METADATA(omp), data->mv_data, data->mv_size); - goto done; - } - } - if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) - return rc2; - } else if (data->mv_size == olddata.mv_size) { - // same size, just replace it. Note that we could - // also reuse this node if the new data is smaller, - // but instead we opt to shrink the node in that case. - if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = olddata.mv_data; - else if (data->mv_size) - memcpy(olddata.mv_data, data->mv_data, data->mv_size); - else - memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); - goto done; - } - mdb_node_del(mc, 0); - mc->mc_db->md_entries--; - } - - rdata = data; - - new_sub: - nflags = flags & NODE_ADD_FLAGS; - nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); - if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { - if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) - nflags &= ~MDB_APPEND; - if (!insert) - nflags |= MDB_SPLIT_REPLACE; - rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); - } else { - // There is room already in this leaf page. - rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); - if (rc == 0 && !do_sub && insert) { - // Adjust other cursors pointing to mp - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - unsigned i = mc->mc_top; - MDB_page *mp = mc->mc_pg[i]; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) { - m3->mc_ki[i]++; - } - } - } - } - - if (rc != MDB_SUCCESS) - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - else { - // Now store the actual data in the child DB. Note that we're - // storing the user data in the keys field, so there are strict - // size limits on dupdata. The actual data fields of the child - // DB are all zero size. - if (do_sub) { - int xflags; - put_sub: - xdata.mv_size = 0; - xdata.mv_data = ""; - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (flags & MDB_CURRENT) { - xflags = MDB_CURRENT|MDB_NOSPILL; - } else { - mdb_xcursor_init1(mc, leaf); - xflags = (flags & MDB_NODUPDATA) ? - MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; - } - // converted, write the original data first - if (dkey.mv_size) { - rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); - if (rc) - return rc; - { - // Adjust other cursors pointing to mp - MDB_cursor *m2; - unsigned i = mc->mc_top; - MDB_page *mp = mc->mc_pg[i]; - - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; - if (!(m2->mc_flags & C_INITIALIZED)) continue; - if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) { - mdb_xcursor_init1(m2, leaf); - } - } - } - // we've done our job - dkey.mv_size = 0; - } - if (flags & MDB_APPENDDUP) - xflags |= MDB_APPEND; - rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); - if (flags & F_SUBDATA) { - void *db = NODEDATA(leaf); - memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); - } - } - // sub-writes might have failed so check rc again. - // Don't increment count if we just replaced an existing item. - if (!rc && !(flags & MDB_CURRENT)) - mc->mc_db->md_entries++; - if (flags & MDB_MULTIPLE) { - if (!rc) { - next_mult: - mcount++; - // let caller know how many succeeded, if any - data[1].mv_size = mcount; - if (mcount < dcount) { - data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; - goto more; - } - } - } - } - done: - // If we succeeded and the key didn't exist before, make sure - // the cursor is marked valid. - if (!rc && insert) - mc->mc_flags |= C_INITIALIZED; - return rc; - */ +func (c *Cursor) Delete(key []byte) error { + // TODO: Traverse to the correct node. + // TODO: If missing, exit. + // TODO: Remove node from page. + // TODO: If page is empty then add it to the freelist. return nil } @@ -417,7 +85,6 @@ func (c *RWCursor) newLeafPage() (*page, error) { p.flags = p_leaf | p_dirty p.lower = pageHeaderSize p.upper = c.transaction.db.pageSize - c.leafs += 1 return p, nil } @@ -434,175 +101,6 @@ func (b *RWCursor) newBranchPage() (*page, error) { p.flags = p_branch | p_dirty p.lower = pageHeaderSize p.upper = c.transaction.db.pageSize - c.bucket.branches += 1 return p, nil } - -// newOverflowPage allocates and initialize new overflow pages. -func (b *RWCursor) newOverflowPage(count int) (*page, error) { - // Allocate page. - p, err := c.allocatePage(count) - if err != nil { - return nil, err - } - - // Set flags and bounds. - p.flags = p_overflow | p_dirty - p.lower = pageHeaderSize - p.upper = c.transaction.db.pageSize - c.bucket.overflows += count - - return p, nil -} - -// Allocate page numbers and memory for writing. Maintain me_pglast, -// me_pghead and mt_next_pgno. -// -// If there are free pages available from older transactions, they -// are re-used first. Otherwise allocate a new page at mt_next_pgno. -// Do not modify the freedB, just merge freeDB records into me_pghead[] -// and move me_pglast to say which records were consumed. Only this -// function can create me_pghead and move me_pglast/mt_next_pgno. -// @param[in] mc cursor A cursor handle identifying the transaction and -// database for which we are allocating. -// @param[in] num the number of pages to allocate. -// @param[out] mp Address of the allocated page(s). Requests for multiple pages -// will always be satisfied by a single contiguous chunk of memory. -// @return 0 on success, non-zero on failure. - -// allocatePage allocates a new page. -func (c *RWCursor) allocatePage(count int) (*page, error) { - head := env.pagestate.head - - // TODO? - // If our dirty list is already full, we can't do anything - // if (txn->mt_dirty_room == 0) { - // rc = MDB_TXN_FULL; - // goto fail; - // } - - /* - int rc, retry = INT_MAX; - MDB_txn *txn = mc->mc_txn; - MDB_env *env = txn->mt_env; - pgno_t pgno, *mop = env->me_pghead; - unsigned i, j, k, mop_len = mop ? mop[0] : 0, n2 = num-1; - MDB_page *np; - txnid_t oldest = 0, last; - MDB_cursor_op op; - MDB_cursor m2; - - *mp = NULL; - - - for (op = MDB_FIRST;; op = MDB_NEXT) { - MDB_val key, data; - MDB_node *leaf; - pgno_t *idl, old_id, new_id; - - // Seek a big enough contiguous page range. Prefer - // pages at the tail, just truncating the list. - if (mop_len > n2) { - i = mop_len; - do { - pgno = mop[i]; - if (mop[i-n2] == pgno+n2) - goto search_done; - } while (--i > n2); - if (Max_retries < INT_MAX && --retry < 0) - break; - } - - if (op == MDB_FIRST) { // 1st iteration - // Prepare to fetch more and coalesce - oldest = mdb_find_oldest(txn); - last = env->me_pglast; - mdb_cursor_init(&m2, txn, FREE_DBI, NULL); - if (last) { - op = MDB_SET_RANGE; - key.mv_data = &last; // will look up last+1 - key.mv_size = sizeof(last); - } - } - - last++; - // Do not fetch more if the record will be too recent - if (oldest <= last) - break; - rc = mdb_cursor_get(&m2, &key, NULL, op); - if (rc) { - if (rc == MDB_NOTFOUND) - break; - goto fail; - } - last = *(txnid_t*)key.mv_data; - if (oldest <= last) - break; - np = m2.mc_pg[m2.mc_top]; - leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); - if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS) - return rc; - - idl = (MDB_ID *) data.mv_data; - i = idl[0]; - if (!mop) { - if (!(env->me_pghead = mop = mdb_midl_alloc(i))) { - rc = ENOMEM; - goto fail; - } - } else { - if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0) - goto fail; - mop = env->me_pghead; - } - env->me_pglast = last; - - // Merge in descending sorted order - j = mop_len; - k = mop_len += i; - mop[0] = (pgno_t)-1; - old_id = mop[j]; - while (i) { - new_id = idl[i--]; - for (; old_id < new_id; old_id = mop[--j]) - mop[k--] = old_id; - mop[k--] = new_id; - } - mop[0] = mop_len; - } - - // Use new pages from the map when nothing suitable in the freeDB - i = 0; - pgno = txn->mt_next_pgno; - if (pgno + num >= env->me_maxpg) { - DPUTS("DB size maxed out"); - rc = MDB_MAP_FULL; - goto fail; - } - - search_done: - if (!(np = mdb_page_malloc(txn, num))) { - rc = ENOMEM; - goto fail; - } - if (i) { - mop[0] = mop_len -= num; - // Move any stragglers down - for (j = i-num; j < mop_len; ) - mop[++j] = mop[++i]; - } else { - txn->mt_next_pgno = pgno + num; - } - np->mp_pgno = pgno; - mdb_page_dirty(txn, np); - *mp = np; - - return MDB_SUCCESS; - - fail: - txn->mt_flags |= MDB_TXN_ERROR; - return rc; - */ - return nil -} diff --git a/rwtransaction.go b/rwtransaction.go index cb0cc0f..f28c8b2 100644 --- a/rwtransaction.go +++ b/rwtransaction.go @@ -6,7 +6,7 @@ type RWTransaction struct { Transaction dirtyPages map[int]*page - freePages map[int]*page + freelist []pgno } // TODO: Allocate scratch meta page. @@ -95,7 +95,6 @@ func (t *Transaction) Put(name string, key []byte, value []byte) error { return c.Put(key, value) } - // page returns a reference to the page with a given id. // If page has been written to then a temporary bufferred page is returned. func (t *Transaction) page(id int) *page { @@ -127,3 +126,10 @@ func (t *RWTransaction) DeleteBucket(name string) error { return nil } + +// allocate returns a contiguous block of memory starting at a given page. +func (t *RWTransaction) allocate(count int) (*page, error) { + // TODO: Find a continuous block of free pages. + // TODO: If no free pages are available, resize the mmap to allocate more. + return nil, nil +} diff --git a/stat.go b/stat.go index 2020d9a..b01fa99 100644 --- a/stat.go +++ b/stat.go @@ -1,6 +1,6 @@ package bolt -type stat struct { +type Stat struct { PageSize int Depth int BranchPageCount int diff --git a/transaction.go b/transaction.go index 6319b9e..b79f3e7 100644 --- a/transaction.go +++ b/transaction.go @@ -6,7 +6,7 @@ import ( ) var ( - InvalidTransactionError = &Error{"txn is invalid", nil} + InvalidTransactionError = &Error{"txn is invalid", nil} BucketAlreadyExistsError = &Error{"bucket already exists", nil} ) @@ -17,6 +17,8 @@ const ( ps_last = 8 ) +type txnid uint64 + type Transaction struct { id int db *DB @@ -158,166 +160,6 @@ func (t *Transaction) Stat(name string) *stat { // // // // - -// Save the freelist as of this transaction to the freeDB. -// This changes the freelist. Keep trying until it stabilizes. -func (t *Transaction) saveFreelist() error { - /* - // env->me_pghead[] can grow and shrink during this call. - // env->me_pglast and txn->mt_free_pgs[] can only grow. - // Page numbers cannot disappear from txn->mt_free_pgs[]. - MDB_cursor mc; - MDB_env *env = txn->mt_env; - int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; - txnid_t pglast = 0, head_id = 0; - pgno_t freecnt = 0, *free_pgs, *mop; - ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; - - mdb_cursor_init(&mc, txn, FREE_DBI, NULL); - - if (env->me_pghead) { - // Make sure first page of freeDB is touched and on freelist - rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY); - if (rc && rc != MDB_NOTFOUND) - return rc; - } - - // MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) - clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) - ? SSIZE_MAX : maxfree_1pg; - - for (;;) { - // Come back here after each Put() in case freelist changed - MDB_val key, data; - pgno_t *pgs; - ssize_t j; - - // If using records from freeDB which we have not yet - // deleted, delete them and any we reserved for me_pghead. - while (pglast < env->me_pglast) { - rc = mdb_cursor_first(&mc, &key, NULL); - if (rc) - return rc; - pglast = head_id = *(txnid_t *)key.mv_data; - total_room = head_room = 0; - mdb_tassert(txn, pglast <= env->me_pglast); - rc = mdb_cursor_del(&mc, 0); - if (rc) - return rc; - } - - // Save the IDL of pages freed by this txn, to a single record - if (freecnt < txn->mt_free_pgs[0]) { - if (!freecnt) { - // Make sure last page of freeDB is touched and on freelist - rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); - if (rc && rc != MDB_NOTFOUND) - return rc; - } - free_pgs = txn->mt_free_pgs; - // Write to last page of freeDB - key.mv_size = sizeof(txn->mt_txnid); - key.mv_data = &txn->mt_txnid; - do { - freecnt = free_pgs[0]; - data.mv_size = MDB_IDL_SIZEOF(free_pgs); - rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); - if (rc) - return rc; - // Retry if mt_free_pgs[] grew during the Put() - free_pgs = txn->mt_free_pgs; - } while (freecnt < free_pgs[0]); - mdb_midl_sort(free_pgs); - memcpy(data.mv_data, free_pgs, data.mv_size); - #if (MDB_DEBUG) > 1 - { - unsigned int i = free_pgs[0]; - DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u", - txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); - for (; i; i--) - DPRINTF(("IDL %"Z"u", free_pgs[i])); - } - #endif - continue; - } - - mop = env->me_pghead; - mop_len = mop ? mop[0] : 0; - - // Reserve records for me_pghead[]. Split it if multi-page, - // to avoid searching freeDB for a page range. Use keys in - // range [1,me_pglast]: Smaller than txnid of oldest reader. - if (total_room >= mop_len) { - if (total_room == mop_len || --more < 0) - break; - } else if (head_room >= maxfree_1pg && head_id > 1) { - // Keep current record (overflow page), add a new one - head_id--; - head_room = 0; - } - // (Re)write {key = head_id, IDL length = head_room} - total_room -= head_room; - head_room = mop_len - total_room; - if (head_room > maxfree_1pg && head_id > 1) { - // Overflow multi-page for part of me_pghead - head_room /= head_id; // amortize page sizes - head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); - } else if (head_room < 0) { - // Rare case, not bothering to delete this record - head_room = 0; - } - key.mv_size = sizeof(head_id); - key.mv_data = &head_id; - data.mv_size = (head_room + 1) * sizeof(pgno_t); - rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); - if (rc) - return rc; - // IDL is initially empty, zero out at least the length - pgs = (pgno_t *)data.mv_data; - j = head_room > clean_limit ? head_room : 0; - do { - pgs[j] = 0; - } while (--j >= 0); - total_room += head_room; - } - - // Fill in the reserved me_pghead records - rc = MDB_SUCCESS; - if (mop_len) { - MDB_val key, data; - - mop += mop_len; - rc = mdb_cursor_first(&mc, &key, &data); - for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { - unsigned flags = MDB_CURRENT; - txnid_t id = *(txnid_t *)key.mv_data; - ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; - MDB_ID save; - - mdb_tassert(txn, len >= 0 && id <= env->me_pglast); - key.mv_data = &id; - if (len > mop_len) { - len = mop_len; - data.mv_size = (len + 1) * sizeof(MDB_ID); - flags = 0; - } - data.mv_data = mop -= len; - save = mop[0]; - mop[0] = len; - rc = mdb_cursor_put(&mc, &key, &data, flags); - mop[0] = save; - if (rc || !(mop_len -= len)) - break; - } - } - return rc; - */ - return nil -} - - - - // Return the data associated with a given node. // @param[in] txn The transaction for this operation. // @param[in] leaf The node being read.