diff --git a/cursor.go b/cursor.go index 8399b88..2bf8318 100644 --- a/cursor.go +++ b/cursor.go @@ -9,6 +9,9 @@ package bolt // TODO: #define C_SPLITTING 0x20 /**< Cursor is in page_split */ // TODO: #define C_UNTRACK 0x40 /**< Un-track cursor when closing */ +// TODO: #define MDB_NOSPILL 0x8000 /** Do not spill pages to disk if txn is getting full, may fail instead */ + + type Cursor interface { First() error FirstDup() error @@ -42,3 +45,3557 @@ type cursor struct { page []*page ki []int /**< stack of page indices */ } + +// Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. +// @param[in] mc A cursor handle for the current operation. +// @param[in] pflags Flags of the pages to update: +// P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. +// @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). +// @return 0 on success, non-zero on failure. +func (c *cursor) xkeep(unsigned pflags, int all) int { + /* + enum { Mask = P_SUBP|P_DIRTY|P_KEEP }; + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m3; + MDB_xcursor *mx; + MDB_page *dp, *mp; + MDB_node *leaf; + unsigned i, j; + int rc = MDB_SUCCESS, level; + + // Mark pages seen by cursors + if (mc->mc_flags & C_UNTRACK) + mc = NULL; // will find mc in mt_cursors + for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { + for (; mc; mc=mc->mc_next) { + if (!(mc->mc_flags & C_INITIALIZED)) + continue; + for (m3 = mc;; m3 = &mx->mx_cursor) { + mp = NULL; + for (j=0; jmc_snum; j++) { + mp = m3->mc_pg[j]; + if ((mp->mp_flags & Mask) == pflags) + mp->mp_flags ^= P_KEEP; + } + mx = m3->mc_xcursor; + // Proceed to mx if it is at a sub-database + if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) + break; + if (! (mp && (mp->mp_flags & P_LEAF))) + break; + leaf = NODEPTR(mp, m3->mc_ki[j-1]); + if (!(leaf->mn_flags & F_SUBDATA)) + break; + } + } + if (i == 0) + break; + } + + if (all) { + // Mark dirty root pages + for (i=0; imt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + pgno_t pgno = txn->mt_dbs[i].md_root; + if (pgno == P_INVALID) + continue; + if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) + break; + if ((dp->mp_flags & Mask) == pflags && level <= 1) + dp->mp_flags ^= P_KEEP; + } + } + } + + return rc; + */ + return 0 +} + +// Spill pages from the dirty list back to disk. +// This is intended to prevent running into #MDB_TXN_FULL situations, +// but note that they may still occur in a few cases: +// 1) our estimate of the txn size could be too small. Currently this +// seems unlikely, except with a large number of #MDB_MULTIPLE items. +// 2) child txns may run out of space if their parents dirtied a +// lot of pages and never spilled them. TODO: we probably should do +// a preemptive spill during #mdb_txn_begin() of a child txn, if +// the parent's dirty_room is below a given threshold. +// +// Otherwise, if not using nested txns, it is expected that apps will +// not run into #MDB_TXN_FULL any more. The pages are flushed to disk +// the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. +// If the txn never references them again, they can be left alone. +// If the txn only reads them, they can be used without any fuss. +// If the txn writes them again, they can be dirtied immediately without +// going thru all of the work of #mdb_page_touch(). Such references are +// handled by #mdb_page_unspill(). +// +// Also note, we never spill DB root pages, nor pages of active cursors, +// because we'll need these back again soon anyway. And in nested txns, +// we can't spill a page in a child txn if it was already spilled in a +// parent txn. That would alter the parent txns' data even though +// the child hasn't committed yet, and we'd have no way to undo it if +// the child aborted. +// +// @param[in] m0 cursor A cursor handle identifying the transaction and +// database for which we are checking space. +// @param[in] key For a put operation, the key being stored. +// @param[in] data For a put operation, the data being stored. +// @return 0 on success, non-zero on failure. +func (c *cursor) spill(MDB_val *key, MDB_val *data) int { + /* + MDB_txn *txn = m0->mc_txn; + MDB_page *dp; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned int i, j, need; + int rc; + + if (m0->mc_flags & C_SUB) + return MDB_SUCCESS; + + // Estimate how much space this op will take + i = m0->mc_db->md_depth; + // Named DBs also dirty the main DB + if (m0->mc_dbi > MAIN_DBI) + i += txn->mt_dbs[MAIN_DBI].md_depth; + // For puts, roughly factor in the key+data size + if (key) + i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; + i += i; // double it for good measure + need = i; + + if (txn->mt_dirty_room > i) + return MDB_SUCCESS; + + if (!txn->mt_spill_pgs) { + txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); + if (!txn->mt_spill_pgs) + return ENOMEM; + } else { + // purge deleted slots + MDB_IDL sl = txn->mt_spill_pgs; + unsigned int num = sl[0]; + j=0; + for (i=1; i<=num; i++) { + if (!(sl[i] & 1)) + sl[++j] = sl[i]; + } + sl[0] = j; + } + + // Preserve pages which may soon be dirtied again + if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS) + goto done; + + // Less aggressive spill - we originally spilled the entire dirty list, + // with a few exceptions for cursor pages and DB root pages. But this + // turns out to be a lot of wasted effort because in a large txn many + // of those pages will need to be used again. So now we spill only 1/8th + // of the dirty pages. Testing revealed this to be a good tradeoff, + // better than 1/2, 1/4, or 1/10. + if (need < MDB_IDL_UM_MAX / 8) + need = MDB_IDL_UM_MAX / 8; + + // Save the page IDs of all the pages we're flushing + // flush from the tail forward, this saves a lot of shifting later on. + for (i=dl[0].mid; i && need; i--) { + MDB_ID pn = dl[i].mid << 1; + dp = dl[i].mptr; + if (dp->mp_flags & P_KEEP) + continue; + // Can't spill twice, make sure it's not already in a parent's + // spill list. + if (txn->mt_parent) { + MDB_txn *tx2; + for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { + if (tx2->mt_spill_pgs) { + j = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { + dp->mp_flags |= P_KEEP; + break; + } + } + } + if (tx2) + continue; + } + if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn))) + goto done; + need--; + } + mdb_midl_sort(txn->mt_spill_pgs); + + // Flush the spilled part of dirty list + if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS) + goto done; + + // Reset any dirty pages we kept that page_flush didn't see + rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); + +done: + txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; + return rc; + /* + return 0 +} + +// Allocate page numbers and memory for writing. Maintain me_pglast, +// me_pghead and mt_next_pgno. +// +// If there are free pages available from older transactions, they +// are re-used first. Otherwise allocate a new page at mt_next_pgno. +// Do not modify the freedB, just merge freeDB records into me_pghead[] +// and move me_pglast to say which records were consumed. Only this +// function can create me_pghead and move me_pglast/mt_next_pgno. +// @param[in] mc cursor A cursor handle identifying the transaction and +// database for which we are allocating. +// @param[in] num the number of pages to allocate. +// @param[out] mp Address of the allocated page(s). Requests for multiple pages +// will always be satisfied by a single contiguous chunk of memory. +// @return 0 on success, non-zero on failure. +func (c *cursor) allocPage(int num, MDB_page **mp) { + int rc, retry = INT_MAX; + MDB_txn *txn = mc->mc_txn; + MDB_env *env = txn->mt_env; + pgno_t pgno, *mop = env->me_pghead; + unsigned i, j, k, mop_len = mop ? mop[0] : 0, n2 = num-1; + MDB_page *np; + txnid_t oldest = 0, last; + MDB_cursor_op op; + MDB_cursor m2; + + *mp = NULL; + + // If our dirty list is already full, we can't do anything + if (txn->mt_dirty_room == 0) { + rc = MDB_TXN_FULL; + goto fail; + } + + for (op = MDB_FIRST;; op = MDB_NEXT) { + MDB_val key, data; + MDB_node *leaf; + pgno_t *idl, old_id, new_id; + + // Seek a big enough contiguous page range. Prefer + // pages at the tail, just truncating the list. + if (mop_len > n2) { + i = mop_len; + do { + pgno = mop[i]; + if (mop[i-n2] == pgno+n2) + goto search_done; + } while (--i > n2); + if (Max_retries < INT_MAX && --retry < 0) + break; + } + + if (op == MDB_FIRST) { // 1st iteration + // Prepare to fetch more and coalesce + oldest = mdb_find_oldest(txn); + last = env->me_pglast; + mdb_cursor_init(&m2, txn, FREE_DBI, NULL); + if (last) { + op = MDB_SET_RANGE; + key.mv_data = &last; // will look up last+1 + key.mv_size = sizeof(last); + } + } + + last++; + // Do not fetch more if the record will be too recent + if (oldest <= last) + break; + rc = mdb_cursor_get(&m2, &key, NULL, op); + if (rc) { + if (rc == MDB_NOTFOUND) + break; + goto fail; + } + last = *(txnid_t*)key.mv_data; + if (oldest <= last) + break; + np = m2.mc_pg[m2.mc_top]; + leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); + if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS) + return rc; + + idl = (MDB_ID *) data.mv_data; + i = idl[0]; + if (!mop) { + if (!(env->me_pghead = mop = mdb_midl_alloc(i))) { + rc = ENOMEM; + goto fail; + } + } else { + if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0) + goto fail; + mop = env->me_pghead; + } + env->me_pglast = last; +#if (MDB_DEBUG) > 1 + DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u", + last, txn->mt_dbs[FREE_DBI].md_root, i)); + for (k = i; k; k--) + DPRINTF(("IDL %"Z"u", idl[k])); +#endif + // Merge in descending sorted order + j = mop_len; + k = mop_len += i; + mop[0] = (pgno_t)-1; + old_id = mop[j]; + while (i) { + new_id = idl[i--]; + for (; old_id < new_id; old_id = mop[--j]) + mop[k--] = old_id; + mop[k--] = new_id; + } + mop[0] = mop_len; + } + + // Use new pages from the map when nothing suitable in the freeDB + i = 0; + pgno = txn->mt_next_pgno; + if (pgno + num >= env->me_maxpg) { + DPUTS("DB size maxed out"); + rc = MDB_MAP_FULL; + goto fail; + } + +search_done: + if (env->me_flags & MDB_WRITEMAP) { + np = (MDB_page *)(env->me_map + env->me_psize * pgno); + } else { + if (!(np = mdb_page_malloc(txn, num))) { + rc = ENOMEM; + goto fail; + } + } + if (i) { + mop[0] = mop_len -= num; + // Move any stragglers down + for (j = i-num; j < mop_len; ) + mop[++j] = mop[++i]; + } else { + txn->mt_next_pgno = pgno + num; + } + np->mp_pgno = pgno; + mdb_page_dirty(txn, np); + *mp = np; + + return MDB_SUCCESS; + +fail: + txn->mt_flags |= MDB_TXN_ERROR; + return rc; + */ + return 0 +} + +// Copy the used portions of a non-overflow page. +// @param[in] dst page to copy into +// @param[in] src page to copy from +// @param[in] psize size of a page +func (p *page) copyTo(dst *page, size int) { + /* + enum { Align = sizeof(pgno_t) }; + indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; + + // If page isn't full, just copy the used portion. Adjust + // alignment so memcpy may copy words instead of bytes. + if ((unused &= -Align) && !IS_LEAF2(src)) { + upper &= -Align; + memcpy(dst, src, (lower + (Align-1)) & -Align); + memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), + psize - upper); + } else { + memcpy(dst, src, psize - unused); + } + */ +} + +// Touch a page: make it dirty and re-insert into tree with updated pgno. +// @param[in] mc cursor pointing to the page to be touched +// @return 0 on success, non-zero on failure. +func (c *cursor) page_touch() int { + /* + MDB_page *mp = mc->mc_pg[mc->mc_top], *np; + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m2, *m3; + pgno_t pgno; + int rc; + + if (!F_ISSET(mp->mp_flags, P_DIRTY)) { + if (txn->mt_flags & MDB_TXN_SPILLS) { + np = NULL; + rc = mdb_page_unspill(txn, mp, &np); + if (rc) + goto fail; + if (np) + goto done; + } + if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || + (rc = mdb_page_alloc(mc, 1, &np))) + goto fail; + pgno = np->mp_pgno; + DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc), + mp->mp_pgno, pgno)); + mdb_cassert(mc, mp->mp_pgno != pgno); + mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); + // Update the parent page, if any, to point to the new page + if (mc->mc_top) { + MDB_page *parent = mc->mc_pg[mc->mc_top-1]; + MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]); + SETPGNO(node, pgno); + } else { + mc->mc_db->md_root = pgno; + } + } else if (txn->mt_parent && !IS_SUBP(mp)) { + MDB_ID2 mid, *dl = txn->mt_u.dirty_list; + pgno = mp->mp_pgno; + // If txn has a parent, make sure the page is in our + // dirty list. + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + if (mp != dl[x].mptr) { // bad cursor? + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + return 0; + } + } + mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); + // No - copy it + np = mdb_page_malloc(txn, 1); + if (!np) + return ENOMEM; + mid.mid = pgno; + mid.mptr = np; + rc = mdb_mid2l_insert(dl, &mid); + mdb_cassert(mc, rc == 0); + } else { + return 0; + } + + mdb_page_copy(np, mp, txn->mt_env->me_psize); + np->mp_pgno = pgno; + np->mp_flags |= P_DIRTY; + +done: + // Adjust cursors pointing to mp + mc->mc_pg[mc->mc_top] = np; + m2 = txn->mt_cursors[mc->mc_dbi]; + if (mc->mc_flags & C_SUB) { + for (; m2; m2=m2->mc_next) { + m3 = &m2->mc_xcursor->mx_cursor; + if (m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[mc->mc_top] == mp) + m3->mc_pg[mc->mc_top] = np; + } + } else { + for (; m2; m2=m2->mc_next) { + if (m2->mc_snum < mc->mc_snum) continue; + if (m2->mc_pg[mc->mc_top] == mp) { + m2->mc_pg[mc->mc_top] = np; + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) + { + MDB_node *leaf = NODEPTR(np, mc->mc_ki[mc->mc_top]); + if (!(leaf->mn_flags & F_SUBDATA)) + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } + } + } + } + return 0; + +fail: + txn->mt_flags |= MDB_TXN_ERROR; + return rc; + */ + + return 0 +} + +// Search for key within a page, using binary search. +// Returns the smallest entry larger or equal to the key. +// If exactp is non-null, stores whether the found entry was an exact match +// in *exactp (1 or 0). +// Updates the cursor index with the index of the found entry. +// If no entry larger or equal to the key is found, returns NULL. +func (c *cursor) search(key []byte) (*node, bool) { + /* + unsigned int i = 0, nkeys; + int low, high; + int rc = 0; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_node *node = NULL; + MDB_val nodekey; + MDB_cmp_func *cmp; + DKBUF; + + nkeys = NUMKEYS(mp); + + DPRINTF(("searching %u keys in %s %spage %"Z"u", + nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", + mdb_dbg_pgno(mp))); + + low = IS_LEAF(mp) ? 0 : 1; + high = nkeys - 1; + cmp = mc->mc_dbx->md_cmp; + + // Branch pages have no data, so if using integer keys, + // alignment is guaranteed. Use faster mdb_cmp_int. + if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) { + if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t)) + cmp = mdb_cmp_long; + else + cmp = mdb_cmp_int; + } + + if (IS_LEAF2(mp)) { + nodekey.mv_size = mc->mc_db->md_pad; + node = NODEPTR(mp, 0); // fake + while (low <= high) { + i = (low + high) >> 1; + nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); + rc = cmp(key, &nodekey); + DPRINTF(("found leaf index %u [%s], rc = %i", + i, DKEY(&nodekey), rc)); + if (rc == 0) + break; + if (rc > 0) + low = i + 1; + else + high = i - 1; + } + } else { + while (low <= high) { + i = (low + high) >> 1; + + node = NODEPTR(mp, i); + nodekey.mv_size = NODEKSZ(node); + nodekey.mv_data = NODEKEY(node); + + rc = cmp(key, &nodekey); +#if MDB_DEBUG + if (IS_LEAF(mp)) + DPRINTF(("found leaf index %u [%s], rc = %i", + i, DKEY(&nodekey), rc)); + else + DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i", + i, DKEY(&nodekey), NODEPGNO(node), rc)); +#endif + if (rc == 0) + break; + if (rc > 0) + low = i + 1; + else + high = i - 1; + } + } + + if (rc > 0) { // Found entry is less than the key. + i++; // Skip to get the smallest entry larger than key. + if (!IS_LEAF2(mp)) + node = NODEPTR(mp, i); + } + if (exactp) + *exactp = (rc == 0 && nkeys > 0); + // store the key index + mc->mc_ki[mc->mc_top] = i; + if (i >= nkeys) + // There is no entry larger or equal to the key. + return NULL; + + // nodeptr is fake for LEAF2 + return node; + */ + return nil, false +} + +func (c *cursor) pop() { + /* + if (mc->mc_snum) { +#if MDB_DEBUG + MDB_page *top = mc->mc_pg[mc->mc_top]; +#endif + mc->mc_snum--; + if (mc->mc_snum) + mc->mc_top--; + + DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno, + DDBI(mc), (void *) mc)); + } + */ +} + +/** Push a page onto the top of the cursor's stack. */ +func (c *cursor) push(p *page) error { + /* + DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno, + DDBI(mc), (void *) mc)); + + if (mc->mc_snum >= CURSOR_STACK) { + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CURSOR_FULL; + } + + mc->mc_top = mc->mc_snum++; + mc->mc_pg[mc->mc_top] = mp; + mc->mc_ki[mc->mc_top] = 0; + + return MDB_SUCCESS; + */ + return nil +} + +// Finish #mdb_page_search() / #mdb_page_search_lowest(). +// The cursor is at the root page, set up the rest of it. +func (c *cursor) searchRoot(key []byte, flags int) error { + /* + MDB_page *mp = mc->mc_pg[mc->mc_top]; + int rc; + DKBUF; + + while (IS_BRANCH(mp)) { + MDB_node *node; + indx_t i; + + DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); + mdb_cassert(mc, NUMKEYS(mp) > 1); + DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); + + if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { + i = 0; + if (flags & MDB_PS_LAST) + i = NUMKEYS(mp) - 1; + } else { + int exact; + node = mdb_node_search(mc, key, &exact); + if (node == NULL) + i = NUMKEYS(mp) - 1; + else { + i = mc->mc_ki[mc->mc_top]; + if (!exact) { + mdb_cassert(mc, i > 0); + i--; + } + } + DPRINTF(("following index %u for key [%s]", i, DKEY(key))); + } + + mdb_cassert(mc, i < NUMKEYS(mp)); + node = NODEPTR(mp, i); + + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) + return rc; + + mc->mc_ki[mc->mc_top] = i; + if ((rc = mdb_cursor_push(mc, mp))) + return rc; + + if (flags & MDB_PS_MODIFY) { + if ((rc = mdb_page_touch(mc)) != 0) + return rc; + mp = mc->mc_pg[mc->mc_top]; + } + } + + if (!IS_LEAF(mp)) { + DPRINTF(("internal error, index points to a %02X page!?", + mp->mp_flags)); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + + DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno, + key ? DKEY(key) : "null")); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + return MDB_SUCCESS; + */ + return nil +} + +// Search for the lowest key under the current branch page. +// This just bypasses a NUMKEYS check in the current page +// before calling mdb_page_search_root(), because the callers +// are all in situations where the current page is known to +// be underfilled. +func (c *cursor) searchLowest() error { + /* + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_node *node = NODEPTR(mp, 0); + int rc; + + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) + return rc; + + mc->mc_ki[mc->mc_top] = 0; + if ((rc = mdb_cursor_push(mc, mp))) + return rc; + return mdb_page_search_root(mc, NULL, MDB_PS_FIRST); + */ +} + +// Search for the page a given key should be in. +// Push it and its parent pages on the cursor stack. +// @param[in,out] mc the cursor for this operation. +// @param[in] key the key to search for, or NULL for first/last page. +// @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB +// are touched (updated with new page numbers). +// If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. +// This is used by #mdb_cursor_first() and #mdb_cursor_last(). +// If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. +// @return 0 on success, non-zero on failure. +func (c *cursor) findPage(key []byte, flags int) error { + /* + int rc; + pgno_t root; + + // Make sure the txn is still viable, then find the root from + // the txn's db table and set it as the root of the cursor's stack. + if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) { + DPUTS("transaction has failed, must abort"); + return MDB_BAD_TXN; + } else { + // Make sure we're using an up-to-date root + if (*mc->mc_dbflag & DB_STALE) { + MDB_cursor mc2; + mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); + rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0); + if (rc) + return rc; + { + MDB_val data; + int exact = 0; + uint16_t flags; + MDB_node *leaf = mdb_node_search(&mc2, + &mc->mc_dbx->md_name, &exact); + if (!exact) + return MDB_NOTFOUND; + rc = mdb_node_read(mc->mc_txn, leaf, &data); + if (rc) + return rc; + memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), + sizeof(uint16_t)); + // The txn may not know this DBI, or another process may + // have dropped and recreated the DB with other flags. + if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags) + return MDB_INCOMPATIBLE; + memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); + } + *mc->mc_dbflag &= ~DB_STALE; + } + root = mc->mc_db->md_root; + + if (root == P_INVALID) { // Tree is empty. + DPUTS("tree is empty"); + return MDB_NOTFOUND; + } + } + + mdb_cassert(mc, root > 1); + if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) + if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0], NULL)) != 0) + return rc; + + mc->mc_snum = 1; + mc->mc_top = 0; + + DPRINTF(("db %d root page %"Z"u has flags 0x%X", + DDBI(mc), root, mc->mc_pg[0]->mp_flags)); + + if (flags & MDB_PS_MODIFY) { + if ((rc = mdb_page_touch(mc))) + return rc; + } + + if (flags & MDB_PS_ROOTONLY) + return MDB_SUCCESS; + + return mdb_page_search_root(mc, key, flags); + */ + return nil +} + +func (c *cursor) freeOverflowPage(p *page) error { + /* + MDB_txn *txn = mc->mc_txn; + pgno_t pg = mp->mp_pgno; + unsigned x = 0, ovpages = mp->mp_pages; + MDB_env *env = txn->mt_env; + MDB_IDL sl = txn->mt_spill_pgs; + MDB_ID pn = pg << 1; + int rc; + + DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages)); + // If the page is dirty or on the spill list we just acquired it, + // so we should give it back to our current free list, if any. + // Otherwise put it onto the list of pages we freed in this txn. + // + // Won't create me_pghead: me_pglast must be inited along with it. + // Unsupported in nested txns: They would need to hide the page + // range in ancestor txns' dirty and spilled lists. + if (env->me_pghead && + !txn->mt_parent && + ((mp->mp_flags & P_DIRTY) || + (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) + { + unsigned i, j; + pgno_t *mop; + MDB_ID2 *dl, ix, iy; + rc = mdb_midl_need(&env->me_pghead, ovpages); + if (rc) + return rc; + if (!(mp->mp_flags & P_DIRTY)) { + // This page is no longer spilled + if (x == sl[0]) + sl[0]--; + else + sl[x] |= 1; + goto release; + } + // Remove from dirty list + dl = txn->mt_u.dirty_list; + x = dl[0].mid--; + for (ix = dl[x]; ix.mptr != mp; ix = iy) { + if (x > 1) { + x--; + iy = dl[x]; + dl[x] = ix; + } else { + mdb_cassert(mc, x > 1); + j = ++(dl[0].mid); + dl[j] = ix; // Unsorted. OK when MDB_TXN_ERROR. + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + } + if (!(env->me_flags & MDB_WRITEMAP)) + mdb_dpage_free(env, mp); +release: + // Insert in me_pghead + mop = env->me_pghead; + j = mop[0] + ovpages; + for (i = mop[0]; i && mop[i] < pg; i--) + mop[j--] = mop[i]; + while (j>i) + mop[j--] = pg++; + mop[0] += ovpages; + } else { + rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages); + if (rc) + return rc; + } + mc->mc_db->md_overflow_pages -= ovpages; + return 0; + */ + return nil +} + + +// Find a sibling for a page. +// Replaces the page at the top of the cursor's stack with the +// specified sibling, if one exists. +// @param[in] mc The cursor for this operation. +// @param[in] move_right Non-zero if the right sibling is requested, +// otherwise the left sibling. +// @return 0 on success, non-zero on failure. +func (c *cursor) sibling(moveRight bool) error { + /* + int rc; + MDB_node *indx; + MDB_page *mp; + + if (mc->mc_snum < 2) { + return MDB_NOTFOUND; // root has no siblings + } + + mdb_cursor_pop(mc); + DPRINTF(("parent page is page %"Z"u, index %u", + mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); + + if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) + : (mc->mc_ki[mc->mc_top] == 0)) { + DPRINTF(("no more keys left, moving to %s sibling", + move_right ? "right" : "left")); + if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) { + // undo cursor_pop before returning + mc->mc_top++; + mc->mc_snum++; + return rc; + } + } else { + if (move_right) + mc->mc_ki[mc->mc_top]++; + else + mc->mc_ki[mc->mc_top]--; + DPRINTF(("just moving to %s index key %u", + move_right ? "right" : "left", mc->mc_ki[mc->mc_top])); + } + mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); + + indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) { + // mc will be inconsistent if caller does mc_snum++ as above + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + return rc; + } + + mdb_cursor_push(mc, mp); + if (!move_right) + mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1; + + return MDB_SUCCESS; + */ + return nil +} + +// Move the cursor to the next data item. +func (c *cursor) next(key []byte, data []byte, op int) error { + /* + MDB_page *mp; + MDB_node *leaf; + int rc; + + if (mc->mc_flags & C_EOF) { + return MDB_NOTFOUND; + } + + mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); + + mp = mc->mc_pg[mc->mc_top]; + + if (mc->mc_db->md_flags & MDB_DUPSORT) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_NEXT || op == MDB_NEXT_DUP) { + rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); + if (op != MDB_NEXT || rc != MDB_NOTFOUND) { + if (rc == MDB_SUCCESS) + MDB_GET_KEY(leaf, key); + return rc; + } + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if (op == MDB_NEXT_DUP) + return MDB_NOTFOUND; + } + } + + DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", + mdb_dbg_pgno(mp), (void *) mc)); + if (mc->mc_flags & C_DEL) + goto skip; + + if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { + DPUTS("=====> move to next sibling page"); + if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { + mc->mc_flags |= C_EOF; + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + } else + mc->mc_ki[mc->mc_top]++; + +skip: + DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + } + if (data) { + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + return rc; + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc != MDB_SUCCESS) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; + */ + return nil +} + +// Move the cursor to the previous data item. +func (c *cursor) prev(key []byte, data []byte, op int) error { + /* + MDB_page *mp; + MDB_node *leaf; + int rc; + + mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); + + mp = mc->mc_pg[mc->mc_top]; + + if (mc->mc_db->md_flags & MDB_DUPSORT) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_PREV || op == MDB_PREV_DUP) { + rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); + if (op != MDB_PREV || rc != MDB_NOTFOUND) { + if (rc == MDB_SUCCESS) + MDB_GET_KEY(leaf, key); + return rc; + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if (op == MDB_PREV_DUP) + return MDB_NOTFOUND; + } + } + } + + DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", + mdb_dbg_pgno(mp), (void *) mc)); + + if (mc->mc_ki[mc->mc_top] == 0) { + DPUTS("=====> move to prev sibling page"); + if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; + DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + } else + mc->mc_ki[mc->mc_top]--; + + mc->mc_flags &= ~C_EOF; + + DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + } + if (data) { + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + return rc; + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc != MDB_SUCCESS) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; + */ +} + +// Set the cursor on a specific data item. +// (bool return is whether it is exact). +func (c *cursor) set(key []byte, data []byte, op int) (error, bool) { + /* + int rc; + MDB_page *mp; + MDB_node *leaf = NULL; + DKBUF; + + if (key->mv_size == 0) + return MDB_BAD_VALSIZE; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + + // See if we're already on the right page + if (mc->mc_flags & C_INITIALIZED) { + MDB_val nodekey; + + mp = mc->mc_pg[mc->mc_top]; + if (!NUMKEYS(mp)) { + mc->mc_ki[mc->mc_top] = 0; + return MDB_NOTFOUND; + } + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_size = mc->mc_db->md_pad; + nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); + } else { + leaf = NODEPTR(mp, 0); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + // Probably happens rarely, but first node on the page + // was the one we wanted. + mc->mc_ki[mc->mc_top] = 0; + if (exactp) + *exactp = 1; + goto set1; + } + if (rc > 0) { + unsigned int i; + unsigned int nkeys = NUMKEYS(mp); + if (nkeys > 1) { + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_data = LEAF2KEY(mp, + nkeys-1, nodekey.mv_size); + } else { + leaf = NODEPTR(mp, nkeys-1); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + // last node was the one we wanted + mc->mc_ki[mc->mc_top] = nkeys-1; + if (exactp) + *exactp = 1; + goto set1; + } + if (rc < 0) { + if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { + // This is definitely the right page, skip search_page + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_data = LEAF2KEY(mp, + mc->mc_ki[mc->mc_top], nodekey.mv_size); + } else { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + // current node was the one we wanted + if (exactp) + *exactp = 1; + goto set1; + } + } + rc = 0; + goto set2; + } + } + // If any parents have right-sibs, search. + // Otherwise, there's nothing further. + for (i=0; imc_top; i++) + if (mc->mc_ki[i] < + NUMKEYS(mc->mc_pg[i])-1) + break; + if (i == mc->mc_top) { + // There are no other pages + mc->mc_ki[mc->mc_top] = nkeys; + return MDB_NOTFOUND; + } + } + if (!mc->mc_top) { + // There are no other pages + mc->mc_ki[mc->mc_top] = 0; + if (op == MDB_SET_RANGE) { + rc = 0; + goto set1; + } else + return MDB_NOTFOUND; + } + } + + rc = mdb_page_search(mc, key, 0); + if (rc != MDB_SUCCESS) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + mdb_cassert(mc, IS_LEAF(mp)); + +set2: + leaf = mdb_node_search(mc, key, exactp); + if (exactp != NULL && !*exactp) { + // MDB_SET specified and not an exact match. + return MDB_NOTFOUND; + } + + if (leaf == NULL) { + DPUTS("===> inexact leaf not found, goto sibling"); + if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) + return rc; // no entries matched + mp = mc->mc_pg[mc->mc_top]; + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, 0); + } + +set1: + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + } + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + } else { + int ex2, *ex2p; + if (op == MDB_GET_BOTH) { + ex2p = &ex2; + ex2 = 0; + } else { + ex2p = NULL; + } + rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); + if (rc != MDB_SUCCESS) + return rc; + } + } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { + MDB_val d2; + if ((rc = mdb_node_read(mc->mc_txn, leaf, &d2)) != MDB_SUCCESS) + return rc; + rc = mc->mc_dbx->md_dcmp(data, &d2); + if (rc) { + if (op == MDB_GET_BOTH || rc > 0) + return MDB_NOTFOUND; + rc = 0; + *data = d2; + } + + } else { + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + return rc; + } + } + + // The key already matches in all other cases + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) + MDB_GET_KEY(leaf, key); + DPRINTF(("==> cursor placed on key [%s]", DKEY(key))); + + return rc; + */ + + return nil +} + +// Move the cursor to the first item in the database. +func (c *cursor) first(key []byte, data []byte) error { + /* + int rc; + MDB_node *leaf; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); + if (rc != MDB_SUCCESS) + return rc; + } + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + mc->mc_ki[mc->mc_top] = 0; + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); + return MDB_SUCCESS; + } + + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc) + return rc; + } else { + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + return rc; + } + } + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; + */ + return nil +} + +// Move the cursor to the last item in the database. +func (c *cursor) last() ([]byte, []byte) { + int rc; + MDB_node *leaf; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + + if (!(mc->mc_flags & C_EOF)) { + + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_LAST); + if (rc != MDB_SUCCESS) + return rc; + } + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + } + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; + mc->mc_flags |= C_INITIALIZED|C_EOF; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc) + return rc; + } else { + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +func (c *cursor) Get(key []byte, data []byte, op int) ([]byte, []byte, error) { + /* + int rc; + int exact = 0; + int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); + + if (mc == NULL) + return EINVAL; + + if (mc->mc_txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; + + switch (op) { + case MDB_GET_CURRENT: + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + } else { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + int nkeys = NUMKEYS(mp); + if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { + mc->mc_ki[mc->mc_top] = nkeys; + rc = MDB_NOTFOUND; + break; + } + rc = MDB_SUCCESS; + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + } else { + MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY(leaf, key); + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (mc->mc_flags & C_DEL) + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); + } else { + rc = mdb_node_read(mc->mc_txn, leaf, data); + } + } + } + } + break; + case MDB_GET_BOTH: + case MDB_GET_BOTH_RANGE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (mc->mc_xcursor == NULL) { + rc = MDB_INCOMPATIBLE; + break; + } + // FALLTHRU + case MDB_SET: + case MDB_SET_KEY: + case MDB_SET_RANGE: + if (key == NULL) { + rc = EINVAL; + } else { + rc = mdb_cursor_set(mc, key, data, op, + op == MDB_SET_RANGE ? NULL : &exact); + } + break; + case MDB_GET_MULTIPLE: + if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + rc = MDB_SUCCESS; + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || + (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) + break; + goto fetchm; + case MDB_NEXT_MULTIPLE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdb_cursor_first(mc, key, data); + else + rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); + if (rc == MDB_SUCCESS) { + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + MDB_cursor *mx; +fetchm: + mx = &mc->mc_xcursor->mx_cursor; + data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * + mx->mc_db->md_pad; + data->mv_data = METADATA(mx->mc_pg[mx->mc_top]); + mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; + } else { + rc = MDB_NOTFOUND; + } + } + break; + case MDB_NEXT: + case MDB_NEXT_DUP: + case MDB_NEXT_NODUP: + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdb_cursor_first(mc, key, data); + else + rc = mdb_cursor_next(mc, key, data, op); + break; + case MDB_PREV: + case MDB_PREV_DUP: + case MDB_PREV_NODUP: + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = mdb_cursor_last(mc, key, data); + if (rc) + break; + mc->mc_flags |= C_INITIALIZED; + mc->mc_ki[mc->mc_top]++; + } + rc = mdb_cursor_prev(mc, key, data, op); + break; + case MDB_FIRST: + rc = mdb_cursor_first(mc, key, data); + break; + case MDB_FIRST_DUP: + mfunc = mdb_cursor_first; + mmove: + if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + if (mc->mc_xcursor == NULL) { + rc = MDB_INCOMPATIBLE; + break; + } + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); + break; + case MDB_LAST: + rc = mdb_cursor_last(mc, key, data); + break; + case MDB_LAST_DUP: + mfunc = mdb_cursor_last; + goto mmove; + default: + DPRINTF(("unhandled/unimplemented cursor operation %u", op)); + rc = EINVAL; + break; + } + + if (mc->mc_flags & C_DEL) + mc->mc_flags ^= C_DEL; + + return rc; + */ + return nil, nil, nil +} + +// Touch all the pages in the cursor stack. Set mc_top. +// Makes sure all the pages are writable, before attempting a write operation. +// @param[in] mc The cursor to operate on. +func (c *cursor) touch() error { + /* + int rc = MDB_SUCCESS; + + if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) { + MDB_cursor mc2; + MDB_xcursor mcx; + mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); + rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); + if (rc) + return rc; + *mc->mc_dbflag |= DB_DIRTY; + } + mc->mc_top = 0; + if (mc->mc_snum) { + do { + rc = mdb_page_touch(mc); + } while (!rc && ++(mc->mc_top) < mc->mc_snum); + mc->mc_top = mc->mc_snum-1; + } + return rc; +} + +int +mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, + unsigned int flags) +{ + enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; // internal code + MDB_env *env; + MDB_node *leaf = NULL; + MDB_page *fp, *mp; + uint16_t fp_flags; + MDB_val xdata, *rdata, dkey, olddata; + MDB_db dummy; + int do_sub = 0, insert; + unsigned int mcount = 0, dcount = 0, nospill; + size_t nsize; + int rc, rc2; + unsigned int nflags; + DKBUF; + + if (mc == NULL || key == NULL) + return EINVAL; + + env = mc->mc_txn->mt_env; + + // Check this first so counter will always be zero on any + // early failures. + if (flags & MDB_MULTIPLE) { + dcount = data[1].mv_size; + data[1].mv_size = 0; + if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED)) + return MDB_INCOMPATIBLE; + } + + nospill = flags & MDB_NOSPILL; + flags &= ~MDB_NOSPILL; + + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + if (flags != MDB_CURRENT && key->mv_size-1 >= ENV_MAXKEY(env)) + return MDB_BAD_VALSIZE; + +#if SIZE_MAX > MAXDATASIZE + if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE)) + return MDB_BAD_VALSIZE; +#else + if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env)) + return MDB_BAD_VALSIZE; +#endif + + DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u", + DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size)); + + dkey.mv_size = 0; + + if (flags == MDB_CURRENT) { + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; + rc = MDB_SUCCESS; + } else if (mc->mc_db->md_root == P_INVALID) { + // new database, cursor has nothing to point to + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + rc = MDB_NO_ROOT; + } else { + int exact = 0; + MDB_val d2; + if (flags & MDB_APPEND) { + MDB_val k2; + rc = mdb_cursor_last(mc, &k2, &d2); + if (rc == 0) { + rc = mc->mc_dbx->md_cmp(key, &k2); + if (rc > 0) { + rc = MDB_NOTFOUND; + mc->mc_ki[mc->mc_top]++; + } else { + // new key is <= last key + rc = MDB_KEYEXIST; + } + } + } else { + rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); + } + if ((flags & MDB_NOOVERWRITE) && rc == 0) { + DPRINTF(("duplicate key [%s]", DKEY(key))); + *data = d2; + return MDB_KEYEXIST; + } + if (rc && rc != MDB_NOTFOUND) + return rc; + } + + if (mc->mc_flags & C_DEL) + mc->mc_flags ^= C_DEL; + + // Cursor is positioned, check for room in the dirty list + if (!nospill) { + if (flags & MDB_MULTIPLE) { + rdata = &xdata; + xdata.mv_size = data->mv_size * dcount; + } else { + rdata = data; + } + if ((rc2 = mdb_page_spill(mc, key, rdata))) + return rc2; + } + + if (rc == MDB_NO_ROOT) { + MDB_page *np; + // new database, write a root leaf page + DPUTS("allocating new root leaf page"); + if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { + return rc2; + } + mdb_cursor_push(mc, np); + mc->mc_db->md_root = np->mp_pgno; + mc->mc_db->md_depth++; + *mc->mc_dbflag |= DB_DIRTY; + if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) + == MDB_DUPFIXED) + np->mp_flags |= P_LEAF2; + mc->mc_flags |= C_INITIALIZED; + } else { + // make sure all cursor pages are writable + rc2 = mdb_cursor_touch(mc); + if (rc2) + return rc2; + } + + insert = rc; + if (insert) { + // The key does not exist + DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top])); + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + LEAFSIZE(key, data) > env->me_nodemax) + { + // Too big for a node, insert in sub-DB + fp_flags = P_LEAF|P_DIRTY; + fp = env->me_pbuf; + fp->mp_pad = data->mv_size; // used if MDB_DUPFIXED + fp->mp_lower = fp->mp_upper = olddata.mv_size = PAGEHDRSZ; + goto prep_subDB; + } + } else { + // there's only a key anyway, so this is a no-op + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + unsigned int ksize = mc->mc_db->md_pad; + if (key->mv_size != ksize) + return MDB_BAD_VALSIZE; + if (flags == MDB_CURRENT) { + char *ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); + memcpy(ptr, key->mv_data, ksize); + } + return MDB_SUCCESS; + } + +more: + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + olddata.mv_size = NODEDSZ(leaf); + olddata.mv_data = NODEDATA(leaf); + + // DB has dups? + if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { + // Prepare (sub-)page/sub-DB to accept the new item, + // if needed. fp: old sub-page or a header faking + // it. mp: new (sub-)page. offset: growth in page + // size. xdata: node data with new page or DB. + ssize_t i, offset = 0; + mp = fp = xdata.mv_data = env->me_pbuf; + mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; + + // Was a single item before, must convert now + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + // Just overwrite the current item + if (flags == MDB_CURRENT) + goto current; + +#if UINT_MAX < SIZE_MAX + if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) +#ifdef MISALIGNED_OK + mc->mc_dbx->md_dcmp = mdb_cmp_long; +#else + mc->mc_dbx->md_dcmp = mdb_cmp_cint; +#endif +#endif + // if data matches, skip it + if (!mc->mc_dbx->md_dcmp(data, &olddata)) { + if (flags & MDB_NODUPDATA) + rc = MDB_KEYEXIST; + else if (flags & MDB_MULTIPLE) + goto next_mult; + else + rc = MDB_SUCCESS; + return rc; + } + + // Back up original data item + dkey.mv_size = olddata.mv_size; + dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); + + // Make sub-page header for the dup items, with dummy body + fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP; + fp->mp_lower = PAGEHDRSZ; + xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + fp->mp_flags |= P_LEAF2; + fp->mp_pad = data->mv_size; + xdata.mv_size += 2 * data->mv_size; // leave space for 2 more + } else { + xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + + (dkey.mv_size & 1) + (data->mv_size & 1); + } + fp->mp_upper = xdata.mv_size; + olddata.mv_size = fp->mp_upper; // pretend olddata is fp + } else if (leaf->mn_flags & F_SUBDATA) { + // Data is on sub-DB, just store it + flags |= F_DUPDATA|F_SUBDATA; + goto put_sub; + } else { + // Data is on sub-page + fp = olddata.mv_data; + switch (flags) { + default: + i = -(ssize_t)SIZELEFT(fp); + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + offset = i += (ssize_t) EVEN( + sizeof(indx_t) + NODESIZE + data->mv_size); + } else { + i += offset = fp->mp_pad; + offset *= 4; // space for 4 more + } + if (i > 0) + break; + // FALLTHRU: Sub-page is big enough + case MDB_CURRENT: + fp->mp_flags |= P_DIRTY; + COPY_PGNO(fp->mp_pgno, mp->mp_pgno); + mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; + flags |= F_DUPDATA; + goto put_sub; + } + xdata.mv_size = olddata.mv_size + offset; + } + + fp_flags = fp->mp_flags; + if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { + // Too big for a sub-page, convert to sub-DB + fp_flags &= ~P_SUBP; +prep_subDB: + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + fp_flags |= P_LEAF2; + dummy.md_pad = fp->mp_pad; + dummy.md_flags = MDB_DUPFIXED; + if (mc->mc_db->md_flags & MDB_INTEGERDUP) + dummy.md_flags |= MDB_INTEGERKEY; + } else { + dummy.md_pad = 0; + dummy.md_flags = 0; + } + dummy.md_depth = 1; + dummy.md_branch_pages = 0; + dummy.md_leaf_pages = 1; + dummy.md_overflow_pages = 0; + dummy.md_entries = NUMKEYS(fp); + xdata.mv_size = sizeof(MDB_db); + xdata.mv_data = &dummy; + if ((rc = mdb_page_alloc(mc, 1, &mp))) + return rc; + offset = env->me_psize - olddata.mv_size; + flags |= F_DUPDATA|F_SUBDATA; + dummy.md_root = mp->mp_pgno; + } + if (mp != fp) { + mp->mp_flags = fp_flags | P_DIRTY; + mp->mp_pad = fp->mp_pad; + mp->mp_lower = fp->mp_lower; + mp->mp_upper = fp->mp_upper + offset; + if (fp_flags & P_LEAF2) { + memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad); + } else { + memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, + olddata.mv_size - fp->mp_upper); + for (i = NUMKEYS(fp); --i >= 0; ) + mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; + } + } + + rdata = &xdata; + flags |= F_DUPDATA; + do_sub = 1; + if (!insert) + mdb_node_del(mc, 0); + goto new_sub; + } +current: + // overflow page overwrites need special handling + if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { + MDB_page *omp; + pgno_t pg; + int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); + + memcpy(&pg, olddata.mv_data, sizeof(pg)); + if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0) + return rc2; + ovpages = omp->mp_pages; + + // Is the ov page large enough? + if (ovpages >= dpages) { + if (!(omp->mp_flags & P_DIRTY) && + (level || (env->me_flags & MDB_WRITEMAP))) + { + rc = mdb_page_unspill(mc->mc_txn, omp, &omp); + if (rc) + return rc; + level = 0; // dirty in this txn or clean + } + // Is it dirty? + if (omp->mp_flags & P_DIRTY) { + // yes, overwrite it. Note in this case we don't + // bother to try shrinking the page if the new data + // is smaller than the overflow threshold. + if (level > 1) { + // It is writable only in a parent txn + size_t sz = (size_t) env->me_psize * ovpages, off; + MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); + MDB_ID2 id2; + if (!np) + return ENOMEM; + id2.mid = pg; + id2.mptr = np; + rc = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); + mdb_cassert(mc, rc == 0); + if (!(flags & MDB_RESERVE)) { + // Copy end of page, adjusting alignment so + // compiler may copy words instead of bytes. + off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); + memcpy((size_t *)((char *)np + off), + (size_t *)((char *)omp + off), sz - off); + sz = PAGEHDRSZ; + } + memcpy(np, omp, sz); // Copy beginning of page + omp = np; + } + SETDSZ(leaf, data->mv_size); + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = METADATA(omp); + else + memcpy(METADATA(omp), data->mv_data, data->mv_size); + goto done; + } + } + if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) + return rc2; + } else if (data->mv_size == olddata.mv_size) { + // same size, just replace it. Note that we could + // also reuse this node if the new data is smaller, + // but instead we opt to shrink the node in that case. + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = olddata.mv_data; + else if (data->mv_size) + memcpy(olddata.mv_data, data->mv_data, data->mv_size); + else + memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); + goto done; + } + mdb_node_del(mc, 0); + mc->mc_db->md_entries--; + } + + rdata = data; + +new_sub: + nflags = flags & NODE_ADD_FLAGS; + nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); + if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { + if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) + nflags &= ~MDB_APPEND; + if (!insert) + nflags |= MDB_SPLIT_REPLACE; + rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); + } else { + // There is room already in this leaf page. + rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); + if (rc == 0 && !do_sub && insert) { + // Adjust other cursors pointing to mp + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + unsigned i = mc->mc_top; + MDB_page *mp = mc->mc_pg[i]; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) { + m3->mc_ki[i]++; + } + } + } + } + + if (rc != MDB_SUCCESS) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + else { + // Now store the actual data in the child DB. Note that we're + // storing the user data in the keys field, so there are strict + // size limits on dupdata. The actual data fields of the child + // DB are all zero size. + if (do_sub) { + int xflags; +put_sub: + xdata.mv_size = 0; + xdata.mv_data = ""; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (flags & MDB_CURRENT) { + xflags = MDB_CURRENT|MDB_NOSPILL; + } else { + mdb_xcursor_init1(mc, leaf); + xflags = (flags & MDB_NODUPDATA) ? + MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; + } + // converted, write the original data first + if (dkey.mv_size) { + rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); + if (rc) + return rc; + { + // Adjust other cursors pointing to mp + MDB_cursor *m2; + unsigned i = mc->mc_top; + MDB_page *mp = mc->mc_pg[i]; + + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; + if (!(m2->mc_flags & C_INITIALIZED)) continue; + if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) { + mdb_xcursor_init1(m2, leaf); + } + } + } + // we've done our job + dkey.mv_size = 0; + } + if (flags & MDB_APPENDDUP) + xflags |= MDB_APPEND; + rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); + if (flags & F_SUBDATA) { + void *db = NODEDATA(leaf); + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); + } + } + // sub-writes might have failed so check rc again. + // Don't increment count if we just replaced an existing item. + if (!rc && !(flags & MDB_CURRENT)) + mc->mc_db->md_entries++; + if (flags & MDB_MULTIPLE) { + if (!rc) { +next_mult: + mcount++; + // let caller know how many succeeded, if any + data[1].mv_size = mcount; + if (mcount < dcount) { + data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; + goto more; + } + } + } + } +done: + // If we succeeded and the key didn't exist before, make sure + // the cursor is marked valid. + if (!rc && insert) + mc->mc_flags |= C_INITIALIZED; + return rc; + */ + return nil +} + +func (c *cursor) Del(flags int) error { + /* + MDB_node *leaf; + MDB_page *mp; + int rc; + + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; + + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + return MDB_NOTFOUND; + + if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL))) + return rc; + + rc = mdb_cursor_touch(mc); + if (rc) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (!(flags & MDB_NODUPDATA)) { + if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { + mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } + rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); + // If sub-DB still has entries, we're done + if (mc->mc_xcursor->mx_db.md_entries) { + if (leaf->mn_flags & F_SUBDATA) { + // update subDB info + void *db = NODEDATA(leaf); + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); + } else { + MDB_cursor *m2; + // shrink fake page + mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + // fix other sub-DB cursors pointed at this fake page + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; + if (m2->mc_pg[mc->mc_top] == mp && + m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } + } + mc->mc_db->md_entries--; + mc->mc_flags |= C_DEL; + return rc; + } + // otherwise fall thru and delete the sub-DB + } + + if (leaf->mn_flags & F_SUBDATA) { + // add all the child DB's pages to the free list + rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); + if (rc == MDB_SUCCESS) { + mc->mc_db->md_entries -= + mc->mc_xcursor->mx_db.md_entries; + } + } + } + + return mdb_cursor_del0(mc, leaf); + */ + return nil +} + +// Allocate and initialize new pages for a database. +// @param[in] mc a cursor on the database being added to. +// @param[in] flags flags defining what type of page is being allocated. +// @param[in] num the number of pages to allocate. This is usually 1, +// unless allocating overflow pages for a large record. +// @param[out] mp Address of a page, or NULL on failure. +// @return 0 on success, non-zero on failure. +func (c *cursor) newPage(flags int, num int) ([]*page, error) { + /* + MDB_page *np; + int rc; + + if ((rc = mdb_page_alloc(mc, num, &np))) + return rc; + DPRINTF(("allocated new mpage %"Z"u, page size %u", + np->mp_pgno, mc->mc_txn->mt_env->me_psize)); + np->mp_flags = flags | P_DIRTY; + np->mp_lower = PAGEHDRSZ; + np->mp_upper = mc->mc_txn->mt_env->me_psize; + + if (IS_BRANCH(np)) + mc->mc_db->md_branch_pages++; + else if (IS_LEAF(np)) + mc->mc_db->md_leaf_pages++; + else if (IS_OVERFLOW(np)) { + mc->mc_db->md_overflow_pages += num; + np->mp_pages = num; + } + *mp = np; + + return 0; + */ + return nil, nil +} + +// Add a node to the page pointed to by the cursor. +// @param[in] mc The cursor for this operation. +// @param[in] indx The index on the page where the new node should be added. +// @param[in] key The key for the new node. +// @param[in] data The data for the new node, if any. +// @param[in] pgno The page number, if adding a branch node. +// @param[in] flags Flags for the node. +// @return 0 on success, non-zero on failure. Possible errors are: +//
    +//
  • ENOMEM - failed to allocate overflow pages for the node. +//
  • MDB_PAGE_FULL - there is insufficient room in the page. This error +// should never happen since all callers already calculate the +// page's free space before calling this function. +//
+func (c *cursor) addNode(index int, key []byte, data []byte, pgno int, flags int) error { + /* + unsigned int i; + size_t node_size = NODESIZE; + ssize_t room; + indx_t ofs; + MDB_node *node; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_page *ofp = NULL; // overflow page + DKBUF; + + mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); + + DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]", + IS_LEAF(mp) ? "leaf" : "branch", + IS_SUBP(mp) ? "sub-" : "", + mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, + key ? key->mv_size : 0, key ? DKEY(key) : "null")); + + if (IS_LEAF2(mp)) { + // Move higher keys up one slot. + int ksize = mc->mc_db->md_pad, dif; + char *ptr = LEAF2KEY(mp, indx, ksize); + dif = NUMKEYS(mp) - indx; + if (dif > 0) + memmove(ptr+ksize, ptr, dif*ksize); + // insert new key + memcpy(ptr, key->mv_data, ksize); + + // Just using these for counting + mp->mp_lower += sizeof(indx_t); + mp->mp_upper -= ksize - sizeof(indx_t); + return MDB_SUCCESS; + } + + room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); + if (key != NULL) + node_size += key->mv_size; + if (IS_LEAF(mp)) { + mdb_cassert(mc, data); + if (F_ISSET(flags, F_BIGDATA)) { + // Data already on overflow page. + node_size += sizeof(pgno_t); + } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) { + int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); + int rc; + // Put data on overflow page. + DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page", + data->mv_size, node_size+data->mv_size)); + node_size = EVEN(node_size + sizeof(pgno_t)); + if ((ssize_t)node_size > room) + goto full; + if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) + return rc; + DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno)); + flags |= F_BIGDATA; + goto update; + } else { + node_size += data->mv_size; + } + } + node_size = EVEN(node_size); + if ((ssize_t)node_size > room) + goto full; + +update: + // Move higher pointers up one slot. + for (i = NUMKEYS(mp); i > indx; i--) + mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; + + // Adjust free space offsets. + ofs = mp->mp_upper - node_size; + mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); + mp->mp_ptrs[indx] = ofs; + mp->mp_upper = ofs; + mp->mp_lower += sizeof(indx_t); + + // Write the node data. + node = NODEPTR(mp, indx); + node->mn_ksize = (key == NULL) ? 0 : key->mv_size; + node->mn_flags = flags; + if (IS_LEAF(mp)) + SETDSZ(node,data->mv_size); + else + SETPGNO(node,pgno); + + if (key) + memcpy(NODEKEY(node), key->mv_data, key->mv_size); + + if (IS_LEAF(mp)) { + mdb_cassert(mc, key); + if (ofp == NULL) { + if (F_ISSET(flags, F_BIGDATA)) + memcpy(node->mn_data + key->mv_size, data->mv_data, + sizeof(pgno_t)); + else if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = node->mn_data + key->mv_size; + else + memcpy(node->mn_data + key->mv_size, data->mv_data, + data->mv_size); + } else { + memcpy(node->mn_data + key->mv_size, &ofp->mp_pgno, + sizeof(pgno_t)); + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = METADATA(ofp); + else + memcpy(METADATA(ofp), data->mv_data, data->mv_size); + } + } + + return MDB_SUCCESS; + +full: + DPRINTF(("not enough room in page %"Z"u, got %u ptrs", + mdb_dbg_pgno(mp), NUMKEYS(mp))); + DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room)); + DPRINTF(("node size = %"Z"u", node_size)); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PAGE_FULL; + */ + + return nil +} + +// Delete the specified node from a page. +// @param[in] mp The page to operate on. +// @param[in] indx The index of the node to delete. +// @param[in] ksize The size of a node. Only used if the page is +// part of a #MDB_DUPFIXED database. +func (c *cursor) deleteNode(ksize int) { + /* + MDB_page *mp = mc->mc_pg[mc->mc_top]; + indx_t indx = mc->mc_ki[mc->mc_top]; + unsigned int sz; + indx_t i, j, numkeys, ptr; + MDB_node *node; + char *base; + + DPRINTF(("delete node %u on %s page %"Z"u", indx, + IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp))); + numkeys = NUMKEYS(mp); + mdb_cassert(mc, indx < numkeys); + + if (IS_LEAF2(mp)) { + int x = numkeys - 1 - indx; + base = LEAF2KEY(mp, indx, ksize); + if (x) + memmove(base, base + ksize, x * ksize); + mp->mp_lower -= sizeof(indx_t); + mp->mp_upper += ksize - sizeof(indx_t); + return; + } + + node = NODEPTR(mp, indx); + sz = NODESIZE + node->mn_ksize; + if (IS_LEAF(mp)) { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + sz += sizeof(pgno_t); + else + sz += NODEDSZ(node); + } + sz = EVEN(sz); + + ptr = mp->mp_ptrs[indx]; + for (i = j = 0; i < numkeys; i++) { + if (i != indx) { + mp->mp_ptrs[j] = mp->mp_ptrs[i]; + if (mp->mp_ptrs[i] < ptr) + mp->mp_ptrs[j] += sz; + j++; + } + } + + base = (char *)mp + mp->mp_upper; + memmove(base + sz, base, ptr - mp->mp_upper); + + mp->mp_lower -= sizeof(indx_t); + mp->mp_upper += sz; +*/ +} + +// Initial setup of a sorted-dups cursor. +// Sorted duplicates are implemented as a sub-database for the given key. +// The duplicate data items are actually keys of the sub-database. +// Operations on the duplicate data items are performed using a sub-cursor +// initialized when the sub-database is first accessed. This function does +// the preliminary setup of the sub-cursor, filling in the fields that +// depend only on the parent DB. +// @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. +func (c *cursor) xcursor_init0() { + /* + MDB_xcursor *mx = mc->mc_xcursor; + + mx->mx_cursor.mc_xcursor = NULL; + mx->mx_cursor.mc_txn = mc->mc_txn; + mx->mx_cursor.mc_db = &mx->mx_db; + mx->mx_cursor.mc_dbx = &mx->mx_dbx; + mx->mx_cursor.mc_dbi = mc->mc_dbi; + mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB; + mx->mx_dbx.md_name.mv_size = 0; + mx->mx_dbx.md_name.mv_data = NULL; + mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; + mx->mx_dbx.md_dcmp = NULL; + mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; + */ +} + +// Final setup of a sorted-dups cursor. +// Sets up the fields that depend on the data from the main cursor. +// @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. +// @param[in] node The data containing the #MDB_db record for the +// sorted-dup database. +func (c *cursor) xcursor_init1(n *node) { + /* + MDB_xcursor *mx = mc->mc_xcursor; + + if (node->mn_flags & F_SUBDATA) { + memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); + mx->mx_cursor.mc_pg[0] = 0; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB; + } else { + MDB_page *fp = NODEDATA(node); + mx->mx_db.md_pad = mc->mc_pg[mc->mc_top]->mp_pad; + mx->mx_db.md_flags = 0; + mx->mx_db.md_depth = 1; + mx->mx_db.md_branch_pages = 0; + mx->mx_db.md_leaf_pages = 1; + mx->mx_db.md_overflow_pages = 0; + mx->mx_db.md_entries = NUMKEYS(fp); + COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); + mx->mx_cursor.mc_snum = 1; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; + mx->mx_cursor.mc_pg[0] = fp; + mx->mx_cursor.mc_ki[0] = 0; + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + mx->mx_db.md_flags = MDB_DUPFIXED; + mx->mx_db.md_pad = fp->mp_pad; + if (mc->mc_db->md_flags & MDB_INTEGERDUP) + mx->mx_db.md_flags |= MDB_INTEGERKEY; + } + } + DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, + mx->mx_db.md_root)); + mx->mx_dbflag = DB_VALID|DB_DIRTY; // DB_DIRTY guides mdb_cursor_touch +#if UINT_MAX < SIZE_MAX + if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) +#ifdef MISALIGNED_OK + mx->mx_dbx.md_cmp = mdb_cmp_long; +#else + mx->mx_dbx.md_cmp = mdb_cmp_cint; +#endif +#endif + */ +} + +// Initialize a cursor for a given transaction and database. +func (c *cursor) init(t *transaction, bucket *bucket, mx *xcursor) { + /* + mc->mc_next = NULL; + mc->mc_backup = NULL; + mc->mc_dbi = dbi; + mc->mc_txn = txn; + mc->mc_db = &txn->mt_dbs[dbi]; + mc->mc_dbx = &txn->mt_dbxs[dbi]; + mc->mc_dbflag = &txn->mt_dbflags[dbi]; + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_pg[0] = 0; + mc->mc_flags = 0; + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { + mdb_tassert(txn, mx != NULL); + mc->mc_xcursor = mx; + mdb_xcursor_init0(mc); + } else { + mc->mc_xcursor = NULL; + } + if (*mc->mc_dbflag & DB_STALE) { + mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); + } + */ +} + +// Return the count of duplicate data items for the current key. +func (c *cursor) count() (int, error) { + /* + MDB_node *leaf; + + if (mc == NULL || countp == NULL) + return EINVAL; + + if (mc->mc_xcursor == NULL) + return MDB_INCOMPATIBLE; + + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + *countp = 1; + } else { + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + return EINVAL; + + *countp = mc->mc_xcursor->mx_db.md_entries; + } + return MDB_SUCCESS; + */ + return 0, nil +} + +func (c *cursor) Close() { + /* + if (mc && !mc->mc_backup) { + // remove from txn, if tracked + if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { + MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + while (*prev && *prev != mc) prev = &(*prev)->mc_next; + if (*prev == mc) + *prev = mc->mc_next; + } + free(mc); + } + */ +} + +func (c *cursor) Transaction() Transaction { + /* + if (!mc) return NULL; + return mc->mc_txn; + */ + return nil +} + +func (c *cursor) Bucket() Bucket { + return c.bucket +} + +// Replace the key for a branch node with a new key. +// @param[in] mc Cursor pointing to the node to operate on. +// @param[in] key The new key to use. +// @return 0 on success, non-zero on failure. +func (c *cursor) updateKey(key []byte) error { + /* + MDB_page *mp; + MDB_node *node; + char *base; + size_t len; + int delta, ksize, oksize; + indx_t ptr, i, numkeys, indx; + DKBUF; + + indx = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + node = NODEPTR(mp, indx); + ptr = mp->mp_ptrs[indx]; +#if MDB_DEBUG + { + MDB_val k2; + char kbuf2[DKBUF_MAXKEYSIZE*2+1]; + k2.mv_data = NODEKEY(node); + k2.mv_size = node->mn_ksize; + DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u", + indx, ptr, + mdb_dkey(&k2, kbuf2), + DKEY(key), + mp->mp_pgno)); + } +#endif + + // Sizes must be 2-byte aligned. + ksize = EVEN(key->mv_size); + oksize = EVEN(node->mn_ksize); + delta = ksize - oksize; + + // Shift node contents if EVEN(key length) changed. + if (delta) { + if (delta > 0 && SIZELEFT(mp) < delta) { + pgno_t pgno; + // not enough space left, do a delete and split + DPRINTF(("Not enough room, delta = %d, splitting...", delta)); + pgno = NODEPGNO(node); + mdb_node_del(mc, 0); + return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); + } + + numkeys = NUMKEYS(mp); + for (i = 0; i < numkeys; i++) { + if (mp->mp_ptrs[i] <= ptr) + mp->mp_ptrs[i] -= delta; + } + + base = (char *)mp + mp->mp_upper; + len = ptr - mp->mp_upper + NODESIZE; + memmove(base - delta, base, len); + mp->mp_upper -= delta; + + node = NODEPTR(mp, indx); + } + + // But even if no shift was needed, update ksize + if (node->mn_ksize != key->mv_size) + node->mn_ksize = key->mv_size; + + if (key->mv_size) + memcpy(NODEKEY(node), key->mv_data, key->mv_size); + + return MDB_SUCCESS; + */ + return nil +} + +// Move a node from csrc to cdst. +func (c *cursor) moveNodeTo(dst *cursor) error { + /* + MDB_node *srcnode; + MDB_val key, data; + pgno_t srcpg; + MDB_cursor mn; + int rc; + unsigned short flags; + + DKBUF; + + // Mark src and dst as dirty. + if ((rc = mdb_page_touch(csrc)) || + (rc = mdb_page_touch(cdst))) + return rc; + + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); + data.mv_size = 0; + data.mv_data = NULL; + srcpg = 0; + flags = 0; + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); + mdb_cassert(csrc, !((size_t)srcnode & 1)); + srcpg = NODEPGNO(srcnode); + flags = srcnode->mn_flags; + if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + unsigned int snum = csrc->mc_snum; + MDB_node *s2; + // must find the lowest key below src + mdb_page_search_lowest(csrc); + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(s2); + key.mv_data = NODEKEY(s2); + } + csrc->mc_snum = snum--; + csrc->mc_top = snum; + } else { + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + } + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { + unsigned int snum = cdst->mc_snum; + MDB_node *s2; + MDB_val bkey; + // must find the lowest key below dst + mdb_page_search_lowest(cdst); + if (IS_LEAF2(cdst->mc_pg[cdst->mc_top])) { + bkey.mv_size = cdst->mc_db->md_pad; + bkey.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, bkey.mv_size); + } else { + s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); + bkey.mv_size = NODEKSZ(s2); + bkey.mv_data = NODEKEY(s2); + } + cdst->mc_snum = snum--; + cdst->mc_top = snum; + mdb_cursor_copy(cdst, &mn); + mn.mc_ki[snum] = 0; + rc = mdb_update_key(&mn, &bkey); + if (rc) + return rc; + } + + DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u", + IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", + csrc->mc_ki[csrc->mc_top], + DKEY(&key), + csrc->mc_pg[csrc->mc_top]->mp_pgno, + cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno)); + + // Add the node to the destination page. + rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); + if (rc != MDB_SUCCESS) + return rc; + + // Delete the node from the source page. + mdb_node_del(csrc, key.mv_size); + + { + // Adjust other cursors pointing to mp + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + MDB_page *mp = csrc->mc_pg[csrc->mc_top]; + + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) continue; + if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] == + csrc->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + } + } + } + + // Update the parent separators. + if (csrc->mc_ki[csrc->mc_top] == 0) { + if (csrc->mc_ki[csrc->mc_top-1] != 0) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + DPRINTF(("update separator for source page %"Z"u to [%s]", + csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key))); + mdb_cursor_copy(csrc, &mn); + mn.mc_snum--; + mn.mc_top--; + if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) + return rc; + } + if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + MDB_val nullkey; + indx_t ix = csrc->mc_ki[csrc->mc_top]; + nullkey.mv_size = 0; + csrc->mc_ki[csrc->mc_top] = 0; + rc = mdb_update_key(csrc, &nullkey); + csrc->mc_ki[csrc->mc_top] = ix; + mdb_cassert(csrc, rc == MDB_SUCCESS); + } + } + + if (cdst->mc_ki[cdst->mc_top] == 0) { + if (cdst->mc_ki[cdst->mc_top-1] != 0) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + DPRINTF(("update separator for destination page %"Z"u to [%s]", + cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key))); + mdb_cursor_copy(cdst, &mn); + mn.mc_snum--; + mn.mc_top--; + if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) + return rc; + } + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { + MDB_val nullkey; + indx_t ix = cdst->mc_ki[cdst->mc_top]; + nullkey.mv_size = 0; + cdst->mc_ki[cdst->mc_top] = 0; + rc = mdb_update_key(cdst, &nullkey); + cdst->mc_ki[cdst->mc_top] = ix; + mdb_cassert(csrc, rc == MDB_SUCCESS); + } + } + + return MDB_SUCCESS; + */ + + return nil +} + +// Merge one page into another. +// The nodes from the page pointed to by \b csrc will +// be copied to the page pointed to by \b cdst and then +// the \b csrc page will be freed. +// @param[in] csrc Cursor pointing to the source page. +// @param[in] cdst Cursor pointing to the destination page. +func (c *cursor) mergePage(dst *cursor) error { + /* + int rc; + indx_t i, j; + MDB_node *srcnode; + MDB_val key, data; + unsigned nkeys; + + DPRINTF(("merging page %"Z"u into %"Z"u", csrc->mc_pg[csrc->mc_top]->mp_pgno, + cdst->mc_pg[cdst->mc_top]->mp_pgno)); + + mdb_cassert(csrc, csrc->mc_snum > 1); // can't merge root page + mdb_cassert(csrc, cdst->mc_snum > 1); + + // Mark dst as dirty. + if ((rc = mdb_page_touch(cdst))) + return rc; + + // Move all nodes from src to dst. + j = nkeys = NUMKEYS(cdst->mc_pg[cdst->mc_top]); + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = METADATA(csrc->mc_pg[csrc->mc_top]); + for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) { + rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); + if (rc != MDB_SUCCESS) + return rc; + key.mv_data = (char *)key.mv_data + key.mv_size; + } + } else { + for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], i); + if (i == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + unsigned int snum = csrc->mc_snum; + MDB_node *s2; + // must find the lowest key below src + mdb_page_search_lowest(csrc); + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(s2); + key.mv_data = NODEKEY(s2); + } + csrc->mc_snum = snum--; + csrc->mc_top = snum; + } else { + key.mv_size = srcnode->mn_ksize; + key.mv_data = NODEKEY(srcnode); + } + + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); + if (rc != MDB_SUCCESS) + return rc; + } + } + + DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)", + cdst->mc_pg[cdst->mc_top]->mp_pgno, NUMKEYS(cdst->mc_pg[cdst->mc_top]), + (float)PAGEFILL(cdst->mc_txn->mt_env, cdst->mc_pg[cdst->mc_top]) / 10)); + + // Unlink the src page from parent and add to free list. + csrc->mc_top--; + mdb_node_del(csrc, 0); + if (csrc->mc_ki[csrc->mc_top] == 0) { + key.mv_size = 0; + rc = mdb_update_key(csrc, &key); + if (rc) { + csrc->mc_top++; + return rc; + } + } + csrc->mc_top++; + + rc = mdb_midl_append(&csrc->mc_txn->mt_free_pgs, + csrc->mc_pg[csrc->mc_top]->mp_pgno); + if (rc) + return rc; + if (IS_LEAF(csrc->mc_pg[csrc->mc_top])) + csrc->mc_db->md_leaf_pages--; + else + csrc->mc_db->md_branch_pages--; + { + // Adjust other cursors pointing to mp + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + MDB_page *mp = cdst->mc_pg[cdst->mc_top]; + + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) continue; + if (m3->mc_snum < csrc->mc_snum) continue; + if (m3->mc_pg[csrc->mc_top] == csrc->mc_pg[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = mp; + m3->mc_ki[csrc->mc_top] += nkeys; + } + } + } + mdb_cursor_pop(csrc); + + return mdb_rebalance(csrc); + */ + + return nil +} + +// Copy the contents of a cursor. +// @param[in] csrc The cursor to copy from. +// @param[out] cdst The cursor to copy to. +func (c *cursor) copyTo(dst *cursor) { + /* + unsigned int i; + + cdst->mc_txn = csrc->mc_txn; + cdst->mc_dbi = csrc->mc_dbi; + cdst->mc_db = csrc->mc_db; + cdst->mc_dbx = csrc->mc_dbx; + cdst->mc_snum = csrc->mc_snum; + cdst->mc_top = csrc->mc_top; + cdst->mc_flags = csrc->mc_flags; + + for (i=0; imc_snum; i++) { + cdst->mc_pg[i] = csrc->mc_pg[i]; + cdst->mc_ki[i] = csrc->mc_ki[i]; + } + */ +} + +// Rebalance the tree after a delete operation. +// @param[in] mc Cursor pointing to the page where rebalancing +// should begin. +// @return 0 on success, non-zero on failure. +func (c *cursor) rebalance() error { + /* + MDB_node *node; + int rc; + unsigned int ptop, minkeys; + MDB_cursor mn; + + minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top])); + DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", + IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", + mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); + + if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD && + NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { + DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", + mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); + return MDB_SUCCESS; + } + + if (mc->mc_snum < 2) { + MDB_page *mp = mc->mc_pg[0]; + if (IS_SUBP(mp)) { + DPUTS("Can't rebalance a subpage, ignoring"); + return MDB_SUCCESS; + } + if (NUMKEYS(mp) == 0) { + DPUTS("tree is completely empty"); + mc->mc_db->md_root = P_INVALID; + mc->mc_db->md_depth = 0; + mc->mc_db->md_leaf_pages = 0; + rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (rc) + return rc; + // Adjust cursors pointing to mp + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + { + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[0] == mp) { + m3->mc_snum = 0; + m3->mc_top = 0; + m3->mc_flags &= ~C_INITIALIZED; + } + } + } + } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { + DPUTS("collapsing root page!"); + rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (rc) + return rc; + mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); + rc = mdb_page_get(mc->mc_txn,mc->mc_db->md_root,&mc->mc_pg[0],NULL); + if (rc) + return rc; + mc->mc_db->md_depth--; + mc->mc_db->md_branch_pages--; + mc->mc_ki[0] = mc->mc_ki[1]; + { + // Adjust other cursors pointing to mp + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[0] == mp) { + int i; + m3->mc_snum--; + m3->mc_top--; + for (i=0; imc_snum; i++) { + m3->mc_pg[i] = m3->mc_pg[i+1]; + m3->mc_ki[i] = m3->mc_ki[i+1]; + } + } + } + } + } else + DPUTS("root page doesn't need rebalancing"); + return MDB_SUCCESS; + } + + // The parent (branch page) must have at least 2 pointers, + // otherwise the tree is invalid. + ptop = mc->mc_top-1; + mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); + + // Leaf page fill factor is below the threshold. + // Try to move keys from left or right neighbor, or + // merge with a neighbor page. + + // Find neighbors. + mdb_cursor_copy(mc, &mn); + mn.mc_xcursor = NULL; + + if (mc->mc_ki[ptop] == 0) { + // We're the leftmost leaf in our parent. + DPUTS("reading right neighbor"); + mn.mc_ki[ptop]++; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); + if (rc) + return rc; + mn.mc_ki[mn.mc_top] = 0; + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); + } else { + // There is at least one neighbor to the left. + DPUTS("reading left neighbor"); + mn.mc_ki[ptop]--; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); + if (rc) + return rc; + mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; + mc->mc_ki[mc->mc_top] = 0; + } + + DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", + mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); + + // If the neighbor page is above threshold and has enough keys, + // move one key from it. Otherwise we should try to merge them. + // (A branch page must never have less than 2 keys.) + minkeys = 1 + (IS_BRANCH(mn.mc_pg[mn.mc_top])); + if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) + return mdb_node_move(&mn, mc); + else { + if (mc->mc_ki[ptop] == 0) + rc = mdb_page_merge(&mn, mc); + else { + mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; + rc = mdb_page_merge(mc, &mn); + mdb_cursor_copy(&mn, mc); + } + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + } + return rc; + */ + return nil +} + +// Complete a delete operation started by #mdb_cursor_del(). +func (c *cursor) del0(leaf *node) error { + /* + int rc; + MDB_page *mp; + indx_t ki; + unsigned int nkeys; + + mp = mc->mc_pg[mc->mc_top]; + ki = mc->mc_ki[mc->mc_top]; + + // add overflow pages to free list + if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_BIGDATA)) { + MDB_page *omp; + pgno_t pg; + + memcpy(&pg, NODEDATA(leaf), sizeof(pg)); + if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) || + (rc = mdb_ovpage_free(mc, omp))) + return rc; + } + mdb_node_del(mc, mc->mc_db->md_pad); + mc->mc_db->md_entries--; + rc = mdb_rebalance(mc); + if (rc != MDB_SUCCESS) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + else { + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + mp = mc->mc_pg[mc->mc_top]; + nkeys = NUMKEYS(mp); + + // if mc points past last node in page, find next sibling + if (mc->mc_ki[mc->mc_top] >= nkeys) + mdb_cursor_sibling(mc, 1); + + // Adjust other cursors pointing to mp + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3 == mc || m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] >= ki) { + m3->mc_flags |= C_DEL; + if (m3->mc_ki[mc->mc_top] > ki) + m3->mc_ki[mc->mc_top]--; + } + if (m3->mc_ki[mc->mc_top] >= nkeys) + mdb_cursor_sibling(m3, 1); + } + } + mc->mc_flags |= C_DEL; + } + + return rc; + */ + return nil +} + +// Split a page and insert a new node. +// @param[in,out] mc Cursor pointing to the page and desired insertion index. +// The cursor will be updated to point to the actual page and index where +// the node got inserted after the split. +// @param[in] newkey The key for the newly inserted node. +// @param[in] newdata The data for the newly inserted node. +// @param[in] newpgno The page number, if the new node is a branch node. +// @param[in] nflags The #NODE_ADD_FLAGS for the new node. +// @return 0 on success, non-zero on failure. +func (c *cursor) splitPage(newKey []byte, newData []byte, newpgno int, nflags int) error { + /* + unsigned int flags; + int rc = MDB_SUCCESS, new_root = 0, did_split = 0; + indx_t newindx; + pgno_t pgno = 0; + int i, j, split_indx, nkeys, pmax; + MDB_env *env = mc->mc_txn->mt_env; + MDB_node *node; + MDB_val sepkey, rkey, xdata, *rdata = &xdata; + MDB_page *copy = NULL; + MDB_page *mp, *rp, *pp; + int ptop; + MDB_cursor mn; + DKBUF; + + mp = mc->mc_pg[mc->mc_top]; + newindx = mc->mc_ki[mc->mc_top]; + nkeys = NUMKEYS(mp); + + DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, + DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); + + // Create a right sibling. + if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) + return rc; + DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); + + if (mc->mc_snum < 2) { + if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) + return rc; + // shift current top to make room for new parent + mc->mc_pg[1] = mc->mc_pg[0]; + mc->mc_ki[1] = mc->mc_ki[0]; + mc->mc_pg[0] = pp; + mc->mc_ki[0] = 0; + mc->mc_db->md_root = pp->mp_pgno; + DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno)); + mc->mc_db->md_depth++; + new_root = 1; + + // Add left (implicit) pointer. + if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { + // undo the pre-push + mc->mc_pg[0] = mc->mc_pg[1]; + mc->mc_ki[0] = mc->mc_ki[1]; + mc->mc_db->md_root = mp->mp_pgno; + mc->mc_db->md_depth--; + return rc; + } + mc->mc_snum = 2; + mc->mc_top = 1; + ptop = 0; + } else { + ptop = mc->mc_top-1; + DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); + } + + mc->mc_flags |= C_SPLITTING; + mdb_cursor_copy(mc, &mn); + mn.mc_pg[mn.mc_top] = rp; + mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; + + if (nflags & MDB_APPEND) { + mn.mc_ki[mn.mc_top] = 0; + sepkey = *newkey; + split_indx = newindx; + nkeys = 0; + } else { + + split_indx = (nkeys+1) / 2; + + if (IS_LEAF2(rp)) { + char *split, *ins; + int x; + unsigned int lsize, rsize, ksize; + // Move half of the keys to the right sibling + copy = NULL; + x = mc->mc_ki[mc->mc_top] - split_indx; + ksize = mc->mc_db->md_pad; + split = LEAF2KEY(mp, split_indx, ksize); + rsize = (nkeys - split_indx) * ksize; + lsize = (nkeys - split_indx) * sizeof(indx_t); + mp->mp_lower -= lsize; + rp->mp_lower += lsize; + mp->mp_upper += rsize - lsize; + rp->mp_upper -= rsize - lsize; + sepkey.mv_size = ksize; + if (newindx == split_indx) { + sepkey.mv_data = newkey->mv_data; + } else { + sepkey.mv_data = split; + } + if (x<0) { + ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); + memcpy(rp->mp_ptrs, split, rsize); + sepkey.mv_data = rp->mp_ptrs; + memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); + memcpy(ins, newkey->mv_data, ksize); + mp->mp_lower += sizeof(indx_t); + mp->mp_upper -= ksize - sizeof(indx_t); + } else { + if (x) + memcpy(rp->mp_ptrs, split, x * ksize); + ins = LEAF2KEY(rp, x, ksize); + memcpy(ins, newkey->mv_data, ksize); + memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); + rp->mp_lower += sizeof(indx_t); + rp->mp_upper -= ksize - sizeof(indx_t); + mc->mc_ki[mc->mc_top] = x; + mc->mc_pg[mc->mc_top] = rp; + } + } else { + int psize, nsize, k; + // Maximum free space in an empty page + pmax = env->me_psize - PAGEHDRSZ; + if (IS_LEAF(mp)) + nsize = mdb_leaf_size(env, newkey, newdata); + else + nsize = mdb_branch_size(env, newkey); + nsize = EVEN(nsize); + + // grab a page to hold a temporary copy + copy = mdb_page_malloc(mc->mc_txn, 1); + if (copy == NULL) + return ENOMEM; + copy->mp_pgno = mp->mp_pgno; + copy->mp_flags = mp->mp_flags; + copy->mp_lower = PAGEHDRSZ; + copy->mp_upper = env->me_psize; + + // prepare to insert + for (i=0, j=0; imp_ptrs[j++] = 0; + } + copy->mp_ptrs[j++] = mp->mp_ptrs[i]; + } + + // When items are relatively large the split point needs + // to be checked, because being off-by-one will make the + // difference between success or failure in mdb_node_add. + // + // It's also relevant if a page happens to be laid out + // such that one half of its nodes are all "small" and + // the other half of its nodes are "large." If the new + // item is also "large" and falls on the half with + // "large" nodes, it also may not fit. + // + // As a final tweak, if the new item goes on the last + // spot on the page (and thus, onto the new page), bias + // the split so the new page is emptier than the old page. + // This yields better packing during sequential inserts. + if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) { + // Find split point + psize = 0; + if (newindx <= split_indx || newindx >= nkeys) { + i = 0; j = 1; + k = newindx >= nkeys ? nkeys : split_indx+2; + } else { + i = nkeys; j = -1; + k = split_indx-1; + } + for (; i!=k; i+=j) { + if (i == newindx) { + psize += nsize; + node = NULL; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]); + psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); + if (IS_LEAF(mp)) { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + psize += sizeof(pgno_t); + else + psize += NODEDSZ(node); + } + psize = EVEN(psize); + } + if (psize > pmax || i == k-j) { + split_indx = i + (j<0); + break; + } + } + } + if (split_indx == newindx) { + sepkey.mv_size = newkey->mv_size; + sepkey.mv_data = newkey->mv_data; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx]); + sepkey.mv_size = node->mn_ksize; + sepkey.mv_data = NODEKEY(node); + } + } + } + + DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey))); + + // Copy separator key to the parent. + if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { + mn.mc_snum--; + mn.mc_top--; + did_split = 1; + rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0); + + // root split? + if (mn.mc_snum == mc->mc_snum) { + mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top]; + mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top]; + mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop]; + mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop]; + mc->mc_snum++; + mc->mc_top++; + ptop++; + } + // Right page might now have changed parent. + // Check if left page also changed parent. + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; imc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + mc->mc_pg[ptop] = mn.mc_pg[ptop]; + mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; + } + } else { + mn.mc_top--; + rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); + mn.mc_top++; + } + mc->mc_flags ^= C_SPLITTING; + if (rc != MDB_SUCCESS) { + return rc; + } + if (nflags & MDB_APPEND) { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[mc->mc_top] = 0; + rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); + if (rc) + return rc; + for (i=0; imc_top; i++) + mc->mc_ki[i] = mn.mc_ki[i]; + } else if (!IS_LEAF2(mp)) { + // Move nodes + mc->mc_pg[mc->mc_top] = rp; + i = split_indx; + j = 0; + do { + if (i == newindx) { + rkey.mv_data = newkey->mv_data; + rkey.mv_size = newkey->mv_size; + if (IS_LEAF(mp)) { + rdata = newdata; + } else + pgno = newpgno; + flags = nflags; + // Update index for the new key. + mc->mc_ki[mc->mc_top] = j; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]); + rkey.mv_data = NODEKEY(node); + rkey.mv_size = node->mn_ksize; + if (IS_LEAF(mp)) { + xdata.mv_data = NODEDATA(node); + xdata.mv_size = NODEDSZ(node); + rdata = &xdata; + } else + pgno = NODEPGNO(node); + flags = node->mn_flags; + } + + if (!IS_LEAF(mp) && j == 0) { + // First branch index doesn't need key data. + rkey.mv_size = 0; + } + + rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); + if (rc) { + // return tmp page to freelist + mdb_page_free(env, copy); + return rc; + } + if (i == nkeys) { + i = 0; + j = 0; + mc->mc_pg[mc->mc_top] = copy; + } else { + i++; + j++; + } + } while (i != split_indx); + + nkeys = NUMKEYS(copy); + for (i=0; imp_ptrs[i] = copy->mp_ptrs[i]; + mp->mp_lower = copy->mp_lower; + mp->mp_upper = copy->mp_upper; + memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), + env->me_psize - copy->mp_upper); + + // reset back to original page + if (newindx < split_indx) { + mc->mc_pg[mc->mc_top] = mp; + if (nflags & MDB_RESERVE) { + node = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (!(node->mn_flags & F_BIGDATA)) + newdata->mv_data = NODEDATA(node); + } + } else { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; + // Make sure mc_ki is still valid. + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; imc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + mc->mc_pg[ptop] = mn.mc_pg[ptop]; + mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; + } + } + // return tmp page to freelist + mdb_page_free(env, copy); + } + + { + // Adjust other cursors pointing to mp + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + int fixup = NUMKEYS(mp); + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc) + continue; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_flags & C_SPLITTING) + continue; + if (new_root) { + int k; + // root split + for (k=m3->mc_top; k>=0; k--) { + m3->mc_ki[k+1] = m3->mc_ki[k]; + m3->mc_pg[k+1] = m3->mc_pg[k]; + } + if (m3->mc_ki[0] >= split_indx) { + m3->mc_ki[0] = 1; + } else { + m3->mc_ki[0] = 0; + } + m3->mc_pg[0] = mc->mc_pg[0]; + m3->mc_snum++; + m3->mc_top++; + } + if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) + m3->mc_ki[mc->mc_top]++; + if (m3->mc_ki[mc->mc_top] >= fixup) { + m3->mc_pg[mc->mc_top] = rp; + m3->mc_ki[mc->mc_top] -= fixup; + m3->mc_ki[ptop] = mn.mc_ki[ptop]; + } + } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && + m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { + m3->mc_ki[ptop]++; + } + } + } + DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); + return rc; + */ + return nil +} + +// Add all the DB's pages to the free list. +// @param[in] mc Cursor on the DB to free. +// @param[in] subs non-Zero to check for sub-DBs in this DB. +// @return 0 on success, non-zero on failure. +func (c *cursor) drop0(subs int) error { + /* + int rc; + + rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); + if (rc == MDB_SUCCESS) { + MDB_txn *txn = mc->mc_txn; + MDB_node *ni; + MDB_cursor mx; + unsigned int i; + + // LEAF2 pages have no nodes, cannot have sub-DBs + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) + mdb_cursor_pop(mc); + + mdb_cursor_copy(mc, &mx); + while (mc->mc_snum > 0) { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + unsigned n = NUMKEYS(mp); + if (IS_LEAF(mp)) { + for (i=0; imn_flags & F_BIGDATA) { + MDB_page *omp; + pgno_t pg; + memcpy(&pg, NODEDATA(ni), sizeof(pg)); + rc = mdb_page_get(txn, pg, &omp, NULL); + if (rc != 0) + return rc; + mdb_cassert(mc, IS_OVERFLOW(omp)); + rc = mdb_midl_append_range(&txn->mt_free_pgs, + pg, omp->mp_pages); + if (rc) + return rc; + } else if (subs && (ni->mn_flags & F_SUBDATA)) { + mdb_xcursor_init1(mc, ni); + rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); + if (rc) + return rc; + } + } + } else { + if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) + return rc; + for (i=0; imt_free_pgs, pg); + } + } + if (!mc->mc_top) + break; + mc->mc_ki[mc->mc_top] = i; + rc = mdb_cursor_sibling(mc, 1); + if (rc) { + // no more siblings, go back to beginning + // of previous level. + mdb_cursor_pop(mc); + mc->mc_ki[0] = 0; + for (i=1; imc_snum; i++) { + mc->mc_ki[i] = 0; + mc->mc_pg[i] = mx.mc_pg[i]; + } + } + } + // free it + rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); + } else if (rc == MDB_NOTFOUND) { + rc = MDB_SUCCESS; + } + return rc; + */ + return nil +} diff --git a/db.go b/db.go index 914cd18..2c593e7 100644 --- a/db.go +++ b/db.go @@ -8,12 +8,1177 @@ const ( IntegerDupKey ) +// TODO: #define MDB_FATAL_ERROR 0x80000000U /** Failed to update the meta page. Probably an I/O error. */ +// TODO: #define MDB_ENV_ACTIVE 0x20000000U /** Some fields are initialized. */ +// TODO: #define MDB_ENV_TXKEY 0x10000000U /** me_txkey is set */ +// TODO: #define MDB_LIVE_READER 0x08000000U /** Have liveness lock in reader table */ + +// Only a subset of the @ref mdb_env flags can be changed +// at runtime. Changing other flags requires closing the +// environment and re-opening it with the new flags. +// TODO: #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) +// TODO: #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP| MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD) + type DB interface { } type db struct { + sync.Mutex + file os.File + flags int + pageSize int + osPageSize int + readers []*reader + buckets []*bucket + xbuckets []*bucketx /**< array of static DB info */ + bucketFlags []int /**< array of flags from MDB_db.md_flags */ + path string + mmap []byte + mmapSize int /**< size of the data memory map */ + size int /**< current file size */ + meta1 []byte + meta2 []byte + pbuf []byte + transaction *transaction /**< current write transaction */ + maxPageNumber int /**< me_mapsize / me_psize */ + pageState pageStage /**< state of old pages from freeDB */ + dpages []*page /**< list of malloc'd blocks for re-use */ + freePageNumbers []int /** IDL of pages that became unused in a write txn */ + dirtyPageNumbers []int /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ + maxFreeOnePage int /** Max number of freelist items that can fit in a single overflow page */ + maxNodeSize int /** Max size of a node on a page */ + maxKeySize int /**< max size of a key */ } func NewDB() DB { return &db{} } + +func (db *db) Path() string { + return db.path +} + +func (db *db) freePage(p *page) { + /* + mp->mp_next = env->me_dpages; + VGMEMP_FREE(env, mp); + env->me_dpages = mp; + */ +} + +func (db *db) freeDirtyPage(p *page) { + /* + if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { + mdb_page_free(env, dp); + } else { + // large pages just get freed directly + VGMEMP_FREE(env, dp); + free(dp); + } + */ +} + +func (db *db) freeAllDirtyPages(p *page) { + /* + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned i, n = dl[0].mid; + + for (i = 1; i <= n; i++) { + mdb_dpage_free(env, dl[i].mptr); + } + dl[0].mid = 0; + */ +} + +func (db *db) sync(force bool) error { + /* + int rc = 0; + if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { + if (env->me_flags & MDB_WRITEMAP) { + int flags = ((env->me_flags & MDB_MAPASYNC) && !force) + ? MS_ASYNC : MS_SYNC; + if (MDB_MSYNC(env->me_map, env->me_mapsize, flags)) + rc = ErrCode(); +#ifdef _WIN32 + else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd)) + rc = ErrCode(); +#endif + } else { + if (MDB_FDATASYNC(env->me_fd)) + rc = ErrCode(); + } + } + return rc; + */ + return nil +} + +func (db *db) Transaction(parent *transaction, flags int) (*transaction, error) { + /* + MDB_txn *txn; + MDB_ntxn *ntxn; + int rc, size, tsize = sizeof(MDB_txn); + + if (env->me_flags & MDB_FATAL_ERROR) { + DPUTS("environment had fatal error, must shutdown!"); + return MDB_PANIC; + } + if ((env->me_flags & MDB_RDONLY) && !(flags & MDB_RDONLY)) + return EACCES; + if (parent) { + // Nested transactions: Max 1 child, write txns only, no writemap + if (parent->mt_child || + (flags & MDB_RDONLY) || + (parent->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) || + (env->me_flags & MDB_WRITEMAP)) + { + return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN; + } + tsize = sizeof(MDB_ntxn); + } + size = tsize + env->me_maxdbs * (sizeof(MDB_db)+1); + if (!(flags & MDB_RDONLY)) + size += env->me_maxdbs * sizeof(MDB_cursor *); + + if ((txn = calloc(1, size)) == NULL) { + DPRINTF(("calloc: %s", strerror(ErrCode()))); + return ENOMEM; + } + txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); + if (flags & MDB_RDONLY) { + txn->mt_flags |= MDB_TXN_RDONLY; + txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs); + } else { + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs); + } + txn->mt_env = env; + + if (parent) { + unsigned int i; + txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); + if (!txn->mt_u.dirty_list || + !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX))) + { + free(txn->mt_u.dirty_list); + free(txn); + return ENOMEM; + } + txn->mt_txnid = parent->mt_txnid; + txn->mt_dirty_room = parent->mt_dirty_room; + txn->mt_u.dirty_list[0].mid = 0; + txn->mt_spill_pgs = NULL; + txn->mt_next_pgno = parent->mt_next_pgno; + parent->mt_child = txn; + txn->mt_parent = parent; + txn->mt_numdbs = parent->mt_numdbs; + txn->mt_flags = parent->mt_flags; + txn->mt_dbxs = parent->mt_dbxs; + memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + // Copy parent's mt_dbflags, but clear DB_NEW + for (i=0; imt_numdbs; i++) + txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; + rc = 0; + ntxn = (MDB_ntxn *)txn; + ntxn->mnt_pgstate = env->me_pgstate; // save parent me_pghead & co + if (env->me_pghead) { + size = MDB_IDL_SIZEOF(env->me_pghead); + env->me_pghead = mdb_midl_alloc(env->me_pghead[0]); + if (env->me_pghead) + memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); + else + rc = ENOMEM; + } + if (!rc) + rc = mdb_cursor_shadow(parent, txn); + if (rc) + mdb_txn_reset0(txn, "beginchild-fail"); + } else { + rc = mdb_txn_renew0(txn); + } + if (rc) + free(txn); + else { + *ret = txn; + DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root)); + } + + return rc; + */ + return nil +} + +// Read the environment parameters of a DB environment before +// mapping it into memory. +// @param[in] env the environment handle +// @param[out] meta address of where to store the meta information +// @return 0 on success, non-zero on failure. +func (db *db) readHeader(meta *meta) error { + /* + MDB_metabuf pbuf; + MDB_page *p; + MDB_meta *m; + int i, rc, off; + enum { Size = sizeof(pbuf) }; + + // We don't know the page size yet, so use a minimum value. + // Read both meta pages so we can use the latest one. + + for (i=off=0; i<2; i++, off = meta->mm_psize) { +#ifdef _WIN32 + DWORD len; + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1; + if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF) + rc = 0; +#else + rc = pread(env->me_fd, &pbuf, Size, off); +#endif + if (rc != Size) { + if (rc == 0 && off == 0) + return ENOENT; + rc = rc < 0 ? (int) ErrCode() : MDB_INVALID; + DPRINTF(("read: %s", mdb_strerror(rc))); + return rc; + } + + p = (MDB_page *)&pbuf; + + if (!F_ISSET(p->mp_flags, P_META)) { + DPRINTF(("page %"Z"u not a meta page", p->mp_pgno)); + return MDB_INVALID; + } + + m = METADATA(p); + if (m->mm_magic != MDB_MAGIC) { + DPUTS("meta has invalid magic"); + return MDB_INVALID; + } + + if (m->mm_version != MDB_DATA_VERSION) { + DPRINTF(("database is version %u, expected version %u", + m->mm_version, MDB_DATA_VERSION)); + return MDB_VERSION_MISMATCH; + } + + if (off == 0 || m->mm_txnid > meta->mm_txnid) + *meta = *m; + } + return 0; + */ + return nil +} + +// Write the environment parameters of a freshly created DB environment. +// @param[in] env the environment handle +// @param[out] meta address of where to store the meta information +// @return 0 on success, non-zero on failure. +func (db *db) initMeta(meta *meta) error { + /* + MDB_page *p, *q; + int rc; + unsigned int psize; +#ifdef _WIN32 + DWORD len; + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); +#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ + ov.Offset = pos; \ + rc = WriteFile(fd, ptr, size, &len, &ov); } while(0) +#else + int len; +#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ + len = pwrite(fd, ptr, size, pos); \ + rc = (len >= 0); } while(0) +#endif + + DPUTS("writing new meta page"); + + psize = env->me_psize; + + meta->mm_magic = MDB_MAGIC; + meta->mm_version = MDB_DATA_VERSION; + meta->mm_mapsize = env->me_mapsize; + meta->mm_psize = psize; + meta->mm_last_pg = 1; + meta->mm_flags = env->me_flags & 0xffff; + meta->mm_flags |= MDB_INTEGERKEY; + meta->mm_dbs[0].md_root = P_INVALID; + meta->mm_dbs[1].md_root = P_INVALID; + + p = calloc(2, psize); + p->mp_pgno = 0; + p->mp_flags = P_META; + *(MDB_meta *)METADATA(p) = *meta; + + q = (MDB_page *)((char *)p + psize); + q->mp_pgno = 1; + q->mp_flags = P_META; + *(MDB_meta *)METADATA(q) = *meta; + + DO_PWRITE(rc, env->me_fd, p, psize * 2, len, 0); + if (!rc) + rc = ErrCode(); + else if ((unsigned) len == psize * 2) + rc = MDB_SUCCESS; + else + rc = ENOSPC; + free(p); + return rc; + */ + return nil +} + +// Check both meta pages to see which one is newer. +// @param[in] env the environment handle +// @return meta toggle (0 or 1). +func (db *db) pickMeta() int { + /* + return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid); + */ + return 0 +} + +func (db *db) Create() error { + /* + MDB_env *e; + + e = calloc(1, sizeof(MDB_env)); + if (!e) + return ENOMEM; + + e->me_maxreaders = DEFAULT_READERS; + e->me_maxdbs = e->me_numdbs = 2; + e->me_fd = INVALID_HANDLE_VALUE; + e->me_lfd = INVALID_HANDLE_VALUE; + e->me_mfd = INVALID_HANDLE_VALUE; +#ifdef MDB_USE_POSIX_SEM + e->me_rmutex = SEM_FAILED; + e->me_wmutex = SEM_FAILED; +#endif + e->me_pid = getpid(); + GET_PAGESIZE(e->me_os_psize); + VGMEMP_CREATE(e,0,0); + *env = e; + return MDB_SUCCESS; + */ + return nil +} + +// int mdb_env_map(MDB_env *env, void *addr, int newsize) +func (db *db) openMmap(newsize int) { + /* + MDB_page *p; + unsigned int flags = env->me_flags; +#ifdef _WIN32 + int rc; + HANDLE mh; + LONG sizelo, sizehi; + sizelo = env->me_mapsize & 0xffffffff; + sizehi = env->me_mapsize >> 16 >> 16; // only needed on Win64 + + // Windows won't create mappings for zero length files. + // Just allocate the maxsize right now. + if (newsize) { + if (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo + || !SetEndOfFile(env->me_fd) + || SetFilePointer(env->me_fd, 0, NULL, 0) != 0) + return ErrCode(); + } + mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ? + PAGE_READWRITE : PAGE_READONLY, + sizehi, sizelo, NULL); + if (!mh) + return ErrCode(); + env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ? + FILE_MAP_WRITE : FILE_MAP_READ, + 0, 0, env->me_mapsize, addr); + rc = env->me_map ? 0 : ErrCode(); + CloseHandle(mh); + if (rc) + return rc; +#else + int prot = PROT_READ; + if (flags & MDB_WRITEMAP) { + prot |= PROT_WRITE; + if (ftruncate(env->me_fd, env->me_mapsize) < 0) + return ErrCode(); + } + env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED, + env->me_fd, 0); + if (env->me_map == MAP_FAILED) { + env->me_map = NULL; + return ErrCode(); + } + + if (flags & MDB_NORDAHEAD) { + // Turn off readahead. It's harmful when the DB is larger than RAM. +#ifdef MADV_RANDOM + madvise(env->me_map, env->me_mapsize, MADV_RANDOM); +#else +#ifdef POSIX_MADV_RANDOM + posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); +#endif // POSIX_MADV_RANDOM +#endif // MADV_RANDOM + } +#endif // _WIN32 + + // Can happen because the address argument to mmap() is just a + // hint. mmap() can pick another, e.g. if the range is in use. + // The MAP_FIXED flag would prevent that, but then mmap could + // instead unmap existing pages to make room for the new map. + if (addr && env->me_map != addr) + return EBUSY; // TODO: Make a new MDB_* error code? + + p = (MDB_page *)env->me_map; + env->me_metas[0] = METADATA(p); + env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize); + + return MDB_SUCCESS; + */ + return nil +} + +func (db *db) setMapSize(size int) error { + /* + // If env is already open, caller is responsible for making + // sure there are no active txns. + if (env->me_map) { + int rc; + void *old; + if (env->me_txn) + return EINVAL; + if (!size) + size = env->me_metas[mdb_env_pick_meta(env)]->mm_mapsize; + else if (size < env->me_mapsize) { + // If the configured size is smaller, make sure it's + // still big enough. Silently round up to minimum if not. + size_t minsize = (env->me_metas[mdb_env_pick_meta(env)]->mm_last_pg + 1) * env->me_psize; + if (size < minsize) + size = minsize; + } + munmap(env->me_map, env->me_mapsize); + env->me_mapsize = size; + old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; + rc = mdb_env_map(env, old, 1); + if (rc) + return rc; + } + env->me_mapsize = size; + if (env->me_psize) + env->me_maxpg = env->me_mapsize / env->me_psize; + return MDB_SUCCESS; + */ + return nil +} + +func (db *db) setMaxBucketCount(count int) error { + /* + if (env->me_map) + return EINVAL; + env->me_maxdbs = dbs + 2; // Named databases + main and free DB + return MDB_SUCCESS; + */ + return nil +} + +func (db *db) setMaxReaderCount(count int) error { + /* + if (env->me_map || readers < 1) + return EINVAL; + env->me_maxreaders = readers; + return MDB_SUCCESS; + */ +} + +func (db *db) getMaxReaderCount(count int) (int, error) { + /* + if (!env || !readers) + return EINVAL; + *readers = env->me_maxreaders; + return MDB_SUCCESS; + */ + return 0, nil +} + +// Further setup required for opening an MDB environment +func (db *db) open() error { + /* + unsigned int flags = env->me_flags; + int i, newenv = 0, rc; + MDB_meta meta; + +#ifdef _WIN32 + // See if we should use QueryLimited + rc = GetVersion(); + if ((rc & 0xff) > 5) + env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION; + else + env->me_pidquery = PROCESS_QUERY_INFORMATION; +#endif // _WIN32 + + memset(&meta, 0, sizeof(meta)); + + if ((i = mdb_env_read_header(env, &meta)) != 0) { + if (i != ENOENT) + return i; + DPUTS("new mdbenv"); + newenv = 1; + env->me_psize = env->me_os_psize; + if (env->me_psize > MAX_PAGESIZE) + env->me_psize = MAX_PAGESIZE; + } else { + env->me_psize = meta.mm_psize; + } + + // Was a mapsize configured? + if (!env->me_mapsize) { + // If this is a new environment, take the default, + // else use the size recorded in the existing env. + env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize; + } else if (env->me_mapsize < meta.mm_mapsize) { + // If the configured size is smaller, make sure it's + // still big enough. Silently round up to minimum if not. + size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; + if (env->me_mapsize < minsize) + env->me_mapsize = minsize; + } + + rc = mdb_env_map(env, meta.mm_address, newenv); + if (rc) + return rc; + + if (newenv) { + if (flags & MDB_FIXEDMAP) + meta.mm_address = env->me_map; + i = mdb_env_init_meta(env, &meta); + if (i != MDB_SUCCESS) { + return i; + } + } + + env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; + env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) + - sizeof(indx_t); +#if !(MDB_MAXKEYSIZE) + env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); +#endif + env->me_maxpg = env->me_mapsize / env->me_psize; + +#if MDB_DEBUG + { + int toggle = mdb_env_pick_meta(env); + MDB_db *db = &env->me_metas[toggle]->mm_dbs[MAIN_DBI]; + + DPRINTF(("opened database version %u, pagesize %u", + env->me_metas[0]->mm_version, env->me_psize)); + DPRINTF(("using meta page %d", toggle)); + DPRINTF(("depth: %u", db->md_depth)); + DPRINTF(("entries: %"Z"u", db->md_entries)); + DPRINTF(("branch pages: %"Z"u", db->md_branch_pages)); + DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages)); + DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages)); + DPRINTF(("root: %"Z"u", db->md_root)); + } +#endif + + return MDB_SUCCESS; + */ + return nil +} + +func (db *db) Open(path string, flags int, mode uint) error { + /* + int oflags, rc, len, excl = -1; + char *lpath, *dpath; + + if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) + return EINVAL; + + len = strlen(path); + if (flags & MDB_NOSUBDIR) { + rc = len + sizeof(LOCKSUFF) + len + 1; + } else { + rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME); + } + lpath = malloc(rc); + if (!lpath) + return ENOMEM; + if (flags & MDB_NOSUBDIR) { + dpath = lpath + len + sizeof(LOCKSUFF); + sprintf(lpath, "%s" LOCKSUFF, path); + strcpy(dpath, path); + } else { + dpath = lpath + len + sizeof(LOCKNAME); + sprintf(lpath, "%s" LOCKNAME, path); + sprintf(dpath, "%s" DATANAME, path); + } + + rc = MDB_SUCCESS; + flags |= env->me_flags; + if (flags & MDB_RDONLY) { + // silently ignore WRITEMAP when we're only getting read access + flags &= ~MDB_WRITEMAP; + } else { + if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) && + (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) + rc = ENOMEM; + } + env->me_flags = flags |= MDB_ENV_ACTIVE; + if (rc) + goto leave; + + env->me_path = strdup(path); + env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); + env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); + if (!(env->me_dbxs && env->me_path && env->me_dbflags)) { + rc = ENOMEM; + goto leave; + } + + // For RDONLY, get lockfile after we know datafile exists + if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { + rc = mdb_env_setup_locks(env, lpath, mode, &excl); + if (rc) + goto leave; + } + +#ifdef _WIN32 + if (F_ISSET(flags, MDB_RDONLY)) { + oflags = GENERIC_READ; + len = OPEN_EXISTING; + } else { + oflags = GENERIC_READ|GENERIC_WRITE; + len = OPEN_ALWAYS; + } + mode = FILE_ATTRIBUTE_NORMAL; + env->me_fd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, + NULL, len, mode, NULL); +#else + if (F_ISSET(flags, MDB_RDONLY)) + oflags = O_RDONLY; + else + oflags = O_RDWR | O_CREAT; + + env->me_fd = open(dpath, oflags, mode); +#endif + if (env->me_fd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } + + if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { + rc = mdb_env_setup_locks(env, lpath, mode, &excl); + if (rc) + goto leave; + } + + if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) { + if (flags & (MDB_RDONLY|MDB_WRITEMAP)) { + env->me_mfd = env->me_fd; + } else { + // Synchronous fd for meta writes. Needed even with + // MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. +#ifdef _WIN32 + len = OPEN_EXISTING; + env->me_mfd = CreateFile(dpath, oflags, + FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, + mode | FILE_FLAG_WRITE_THROUGH, NULL); +#else + oflags &= ~O_CREAT; + env->me_mfd = open(dpath, oflags | MDB_DSYNC, mode); +#endif + if (env->me_mfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } + } + DPRINTF(("opened dbenv %p", (void *) env)); + if (excl > 0) { + rc = mdb_env_share_locks(env, &excl); + if (rc) + goto leave; + } + if (!((flags & MDB_RDONLY) || + (env->me_pbuf = calloc(1, env->me_psize)))) + rc = ENOMEM; + } + +leave: + if (rc) { + mdb_env_close0(env, excl); + } + free(lpath); + return rc; + */ + return nil +} + +// Destroy resources from mdb_env_open(), clear our readers & DBIs +func (db *db) close0(excl) { + /* + int i; + + if (!(env->me_flags & MDB_ENV_ACTIVE)) + return; + + // Doing this here since me_dbxs may not exist during mdb_env_close + for (i = env->me_maxdbs; --i > MAIN_DBI; ) + free(env->me_dbxs[i].md_name.mv_data); + + free(env->me_pbuf); + free(env->me_dbflags); + free(env->me_dbxs); + free(env->me_path); + free(env->me_dirty_list); + mdb_midl_free(env->me_free_pgs); + + if (env->me_flags & MDB_ENV_TXKEY) { + pthread_key_delete(env->me_txkey); +#ifdef _WIN32 + // Delete our key from the global list + for (i=0; ime_txkey) { + mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1]; + mdb_tls_nkeys--; + break; + } +#endif + } + + if (env->me_map) { + munmap(env->me_map, env->me_mapsize); + } + if (env->me_mfd != env->me_fd && env->me_mfd != INVALID_HANDLE_VALUE) + (void) close(env->me_mfd); + if (env->me_fd != INVALID_HANDLE_VALUE) + (void) close(env->me_fd); + if (env->me_txns) { + MDB_PID_T pid = env->me_pid; + // Clearing readers is done in this function because + // me_txkey with its destructor must be disabled first. + for (i = env->me_numreaders; --i >= 0; ) + if (env->me_txns->mti_readers[i].mr_pid == pid) + env->me_txns->mti_readers[i].mr_pid = 0; +#ifdef _WIN32 + if (env->me_rmutex) { + CloseHandle(env->me_rmutex); + if (env->me_wmutex) CloseHandle(env->me_wmutex); + } + // Windows automatically destroys the mutexes when + // the last handle closes. +#elif defined(MDB_USE_POSIX_SEM) + if (env->me_rmutex != SEM_FAILED) { + sem_close(env->me_rmutex); + if (env->me_wmutex != SEM_FAILED) + sem_close(env->me_wmutex); + // If we have the filelock: If we are the + // only remaining user, clean up semaphores. + if (excl == 0) + mdb_env_excl_lock(env, &excl); + if (excl > 0) { + sem_unlink(env->me_txns->mti_rmname); + sem_unlink(env->me_txns->mti_wmname); + } + } +#endif + munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); + } + if (env->me_lfd != INVALID_HANDLE_VALUE) { +#ifdef _WIN32 + if (excl >= 0) { + // Unlock the lockfile. Windows would have unlocked it + // after closing anyway, but not necessarily at once. + UnlockFile(env->me_lfd, 0, 0, 1, 0); + } +#endif + (void) close(env->me_lfd); + } + + env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); + */ +} + +func (db *db) copyfd(handle int) error { + /* + MDB_txn *txn = NULL; + int rc; + size_t wsize; + char *ptr; +#ifdef _WIN32 + DWORD len, w2; +#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) +#else + ssize_t len; + size_t w2; +#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) +#endif + + // Do the lock/unlock of the reader mutex before starting the + // write txn. Otherwise other read txns could block writers. + rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) + return rc; + + if (env->me_txns) { + // We must start the actual read txn after blocking writers + mdb_txn_reset0(txn, "reset-stage1"); + + // Temporarily block writers until we snapshot the meta pages + LOCK_MUTEX_W(env); + + rc = mdb_txn_renew0(txn); + if (rc) { + UNLOCK_MUTEX_W(env); + goto leave; + } + } + + wsize = env->me_psize * 2; + ptr = env->me_map; + w2 = wsize; + while (w2 > 0) { + DO_WRITE(rc, fd, ptr, w2, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + w2 -= len; + continue; + } else { + // Non-blocking or async handles are not supported + rc = EIO; + break; + } + } + if (env->me_txns) + UNLOCK_MUTEX_W(env); + + if (rc) + goto leave; + + wsize = txn->mt_next_pgno * env->me_psize - wsize; + while (wsize > 0) { + if (wsize > MAX_WRITE) + w2 = MAX_WRITE; + else + w2 = wsize; + DO_WRITE(rc, fd, ptr, w2, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + wsize -= len; + continue; + } else { + rc = EIO; + break; + } + } + +leave: + mdb_txn_abort(txn); + return rc; +} + +int +mdb_env_copy(MDB_env *env, const char *path) +{ + int rc, len; + char *lpath; + HANDLE newfd = INVALID_HANDLE_VALUE; + + if (env->me_flags & MDB_NOSUBDIR) { + lpath = (char *)path; + } else { + len = strlen(path); + len += sizeof(DATANAME); + lpath = malloc(len); + if (!lpath) + return ENOMEM; + sprintf(lpath, "%s" DATANAME, path); + } + + // The destination path must exist, but the destination file must not. + // We don't want the OS to cache the writes, since the source data is + // already in the OS cache. +#ifdef _WIN32 + newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, + FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); +#else + newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); +#endif + if (newfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } + +#ifdef O_DIRECT + // Set O_DIRECT if the file system supports it + if ((rc = fcntl(newfd, F_GETFL)) != -1) + (void) fcntl(newfd, F_SETFL, rc | O_DIRECT); +#endif +#ifdef F_NOCACHE // __APPLE__ + rc = fcntl(newfd, F_NOCACHE, 1); + if (rc) { + rc = ErrCode(); + goto leave; + } +#endif + + rc = mdb_env_copyfd(env, newfd); + +leave: + if (!(env->me_flags & MDB_NOSUBDIR)) + free(lpath); + if (newfd != INVALID_HANDLE_VALUE) + if (close(newfd) < 0 && rc == MDB_SUCCESS) + rc = ErrCode(); + + return rc; + */ + return nil +} + +func (db *db) Close() { + /* + MDB_page *dp; + + if (env == NULL) + return; + + VGMEMP_DESTROY(env); + while ((dp = env->me_dpages) != NULL) { + VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); + env->me_dpages = dp->mp_next; + free(dp); + } + + mdb_env_close0(env, 0); + free(env); + */ +} + +// Calculate the size of a leaf node. +// The size depends on the environment's page size; if a data item +// is too large it will be put onto an overflow page and the node +// size will only include the key and not the data. Sizes are always +// rounded up to an even number of bytes, to guarantee 2-byte alignment +// of the #MDB_node headers. +// @param[in] env The environment handle. +// @param[in] key The key for the node. +// @param[in] data The data for the node. +// @return The number of bytes needed to store the node. +func (db *db) LeafSize(key []byte, data []byte) int { + /* + size_t sz; + + sz = LEAFSIZE(key, data); + if (sz > env->me_nodemax) { + // put on overflow page + sz -= data->mv_size - sizeof(pgno_t); + } + + return EVEN(sz + sizeof(indx_t)); + */ + return 0 +} + +// Calculate the size of a branch node. +// The size should depend on the environment's page size but since +// we currently don't support spilling large keys onto overflow +// pages, it's simply the size of the #MDB_node header plus the +// size of the key. Sizes are always rounded up to an even number +// of bytes, to guarantee 2-byte alignment of the #MDB_node headers. +// @param[in] env The environment handle. +// @param[in] key The key for the node. +// @return The number of bytes needed to store the node. +func (db *db) BranchSize(key []byte) int { + /* + size_t sz; + + sz = INDXSIZE(key); + if (sz > env->me_nodemax) { + // put on overflow page + // not implemented + // sz -= key->size - sizeof(pgno_t); + } + + return sz + sizeof(indx_t); + */ + return 0 +} + +func (db *db) SetFlags(flag int, onoff bool) error { + /* + if ((flag & CHANGEABLE) != flag) + return EINVAL; + if (onoff) + env->me_flags |= flag; + else + env->me_flags &= ~flag; + return MDB_SUCCESS; + */ + return nil +} + +func (db *db) Flags() int { + return db.flags +} + + +func (db *db) Stat() *Stat + /* + int toggle; + + if (env == NULL || arg == NULL) + return EINVAL; + + toggle = mdb_env_pick_meta(env); + stat := &Stat{} + stat->ms_psize = env->me_psize; + stat->ms_depth = db->md_depth; + stat->ms_branch_pages = db->md_branch_pages; + stat->ms_leaf_pages = db->md_leaf_pages; + stat->ms_overflow_pages = db->md_overflow_pages; + stat->ms_entries = db->md_entries; + + //return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], stat); + return stat + */ +} + +func (db *db) Info() *Info { + /* + int toggle; + + if (env == NULL || arg == NULL) + return EINVAL; + + toggle = mdb_env_pick_meta(env); + arg->me_mapaddr = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : 0; + arg->me_mapsize = env->me_mapsize; + arg->me_maxreaders = env->me_maxreaders; + + // me_numreaders may be zero if this process never used any readers. Use + // the shared numreader count if it exists. + arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : env->me_numreaders; + + arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg; + arg->me_last_txnid = env->me_metas[toggle]->mm_txnid; + return MDB_SUCCESS; + */ + return nil +} + +// TODO: Move to bucket.go +func (db *db) CloseBucket(b Bucket) { + /* + char *ptr; + if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs) + return; + ptr = env->me_dbxs[dbi].md_name.mv_data; + env->me_dbxs[dbi].md_name.mv_data = NULL; + env->me_dbxs[dbi].md_name.mv_size = 0; + env->me_dbflags[dbi] = 0; + free(ptr); + */ +} + +//int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) +func (db *db) getReaderList() error { + /* + unsigned int i, rdrs; + MDB_reader *mr; + char buf[64]; + int rc = 0, first = 1; + + if (!env || !func) + return -1; + if (!env->me_txns) { + return func("(no reader locks)\n", ctx); + } + rdrs = env->me_txns->mti_numreaders; + mr = env->me_txns->mti_readers; + for (i=0; ime_txns) + return MDB_SUCCESS; + rdrs = env->me_txns->mti_numreaders; + pids = malloc((rdrs+1) * sizeof(MDB_PID_T)); + if (!pids) + return ENOMEM; + pids[0] = 0; + mr = env->me_txns->mti_readers; + for (i=0; ime_pid) { + pid = mr[i].mr_pid; + if (mdb_pid_insert(pids, pid) == 0) { + if (!mdb_reader_pid(env, Pidcheck, pid)) { + LOCK_MUTEX_R(env); + // Recheck, a new process may have reused pid + if (!mdb_reader_pid(env, Pidcheck, pid)) { + for (j=i; jmv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } } // TODO: #define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); } + +// Compact the main page after deleting a node on a subpage. +// @param[in] mp The main page to operate on. +// @param[in] indx The index of the subpage on the main page. +func (n *node) shrink(index int) { + /* + MDB_node *node; + MDB_page *sp, *xp; + char *base; + int nsize, delta; + indx_t i, numkeys, ptr; + + node = NODEPTR(mp, indx); + sp = (MDB_page *)NODEDATA(node); + delta = SIZELEFT(sp); + xp = (MDB_page *)((char *)sp + delta); + + // shift subpage upward + if (IS_LEAF2(sp)) { + nsize = NUMKEYS(sp) * sp->mp_pad; + if (nsize & 1) + return; // do not make the node uneven-sized + memmove(METADATA(xp), METADATA(sp), nsize); + } else { + int i; + numkeys = NUMKEYS(sp); + for (i=numkeys-1; i>=0; i--) + xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; + } + xp->mp_upper = sp->mp_lower; + xp->mp_lower = sp->mp_lower; + xp->mp_flags = sp->mp_flags; + xp->mp_pad = sp->mp_pad; + COPY_PGNO(xp->mp_pgno, mp->mp_pgno); + + nsize = NODEDSZ(node) - delta; + SETDSZ(node, nsize); + + // shift lower nodes upward + ptr = mp->mp_ptrs[indx]; + numkeys = NUMKEYS(mp); + for (i = 0; i < numkeys; i++) { + if (mp->mp_ptrs[i] <= ptr) + mp->mp_ptrs[i] += delta; + } + + base = (char *)mp + mp->mp_upper; + memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node)); + mp->mp_upper += delta; + */ +} diff --git a/page.go b/page.go index 3f1e6b3..fe639fa 100644 --- a/page.go +++ b/page.go @@ -17,6 +17,26 @@ const ( KeepPage = 0x8000 /**< leave this page alone during spill */ ) +// maxCommitPages is the maximum number of pages to commit in one writev() call. +const maxCommitPages 64 + +/* max bytes to write in one call */ +const maxWriteByteCount 0x80000000U // TODO: #define MAX_WRITE 0x80000000U >> (sizeof(ssize_t) == 4)) + +// TODO: +// #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES +// #undef MDB_COMMIT_PAGES +// #define MDB_COMMIT_PAGES IOV_MAX +// #endif + +// TODO: #define MDB_PS_MODIFY 1 +// TODO: #define MDB_PS_ROOTONLY 2 +// TODO: #define MDB_PS_FIRST 4 +// TODO: #define MDB_PS_LAST 8 + +// TODO: #define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ + + type page struct { header struct { id int @@ -28,6 +48,11 @@ type page struct { metadata []byte } +type pageState struct { + head int /**< Reclaimed freeDB pages, or NULL before use */ + last int /**< ID of last used record, or 0 if !mf_pghead */ +} + // nodeCount returns the number of nodes on the page. func (p *page) nodeCount() int { return 0 // (p.header.lower - unsafe.Sizeof(p.header) >> 1 @@ -42,3 +67,4 @@ func (p *page) remainingSize() int { func (p *page) remainingSize() int { return p.header.upper - p.header.lower } + diff --git a/transaction.go b/transaction.go index 74118d1..1064e8e 100644 --- a/transaction.go +++ b/transaction.go @@ -31,3 +31,1509 @@ type transaction struct { // Implicit from slices? TODO: MDB_dbi mt_numdbs; mt_dirty_room int } + +// ntxn represents a nested transaction. +type ntxn struct { + transaction *transaction /**< the transaction */ + pageState pageState /**< parent transaction's saved freestate */ +} + + +func (t *transaction) allocPage(num int) *page { + /* + MDB_env *env = txn->mt_env; + MDB_page *ret = env->me_dpages; + size_t psize = env->me_psize, sz = psize, off; + // For ! #MDB_NOMEMINIT, psize counts how much to init. + // For a single page alloc, we init everything after the page header. + // For multi-page, we init the final page; if the caller needed that + // many pages they will be filling in at least up to the last page. + if (num == 1) { + if (ret) { + VGMEMP_ALLOC(env, ret, sz); + VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); + env->me_dpages = ret->mp_next; + return ret; + } + psize -= off = PAGEHDRSZ; + } else { + sz *= num; + off = sz - psize; + } + if ((ret = malloc(sz)) != NULL) { + VGMEMP_ALLOC(env, ret, sz); + if (!(env->me_flags & MDB_NOMEMINIT)) { + memset((char *)ret + off, 0, psize); + ret->mp_pad = 0; + } + } else { + txn->mt_flags |= MDB_TXN_ERROR; + } + return ret; + */ + return nil +} + +// Find oldest txnid still referenced. Expects txn->mt_txnid > 0. +func (t *transaction) oldest() int { + /* + int i; + txnid_t mr, oldest = txn->mt_txnid - 1; + if (txn->mt_env->me_txns) { + MDB_reader *r = txn->mt_env->me_txns->mti_readers; + for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { + if (r[i].mr_pid) { + mr = r[i].mr_txnid; + if (oldest > mr) + oldest = mr; + } + } + } + return oldest; + */ + return 0 +} + +// Add a page to the txn's dirty list +func (t *transaction) dirty(p *page) { + /* + MDB_ID2 mid; + int rc, (*insert)(MDB_ID2L, MDB_ID2 *); + + if (txn->mt_env->me_flags & MDB_WRITEMAP) { + insert = mdb_mid2l_append; + } else { + insert = mdb_mid2l_insert; + } + mid.mid = mp->mp_pgno; + mid.mptr = mp; + rc = insert(txn->mt_u.dirty_list, &mid); + mdb_tassert(txn, rc == 0); + txn->mt_dirty_room--; + */ +} + +// Pull a page off the txn's spill list, if present. +// If a page being referenced was spilled to disk in this txn, bring +// it back and make it dirty/writable again. +// @param[in] txn the transaction handle. +// @param[in] mp the page being referenced. It must not be dirty. +// @param[out] ret the writable page, if any. ret is unchanged if +// mp wasn't spilled. +func (t *transaction) unspill(p *page) *page { + /* + MDB_env *env = txn->mt_env; + const MDB_txn *tx2; + unsigned x; + pgno_t pgno = mp->mp_pgno, pn = pgno << 1; + + for (tx2 = txn; tx2; tx2=tx2->mt_parent) { + if (!tx2->mt_spill_pgs) + continue; + x = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { + MDB_page *np; + int num; + if (txn->mt_dirty_room == 0) + return MDB_TXN_FULL; + if (IS_OVERFLOW(mp)) + num = mp->mp_pages; + else + num = 1; + if (env->me_flags & MDB_WRITEMAP) { + np = mp; + } else { + np = mdb_page_malloc(txn, num); + if (!np) + return ENOMEM; + if (num > 1) + memcpy(np, mp, num * env->me_psize); + else + mdb_page_copy(np, mp, env->me_psize); + } + if (tx2 == txn) { + // If in current txn, this page is no longer spilled. + // If it happens to be the last page, truncate the spill list. + // Otherwise mark it as deleted by setting the LSB. + if (x == txn->mt_spill_pgs[0]) + txn->mt_spill_pgs[0]--; + else + txn->mt_spill_pgs[x] |= 1; + } // otherwise, if belonging to a parent txn, the + // page remains spilled until child commits + + mdb_page_dirty(txn, np); + np->mp_flags |= P_DIRTY; + *ret = np; + break; + } + } + return MDB_SUCCESS; + */ + return nil +} + +// Back up parent txn's cursors, then grab the originals for tracking +func (t *transaction) shadow(dst *transaction) error { + /* + MDB_cursor *mc, *bk; + MDB_xcursor *mx; + size_t size; + int i; + + for (i = src->mt_numdbs; --i >= 0; ) { + if ((mc = src->mt_cursors[i]) != NULL) { + size = sizeof(MDB_cursor); + if (mc->mc_xcursor) + size += sizeof(MDB_xcursor); + for (; mc; mc = bk->mc_next) { + bk = malloc(size); + if (!bk) + return ENOMEM; + *bk = *mc; + mc->mc_backup = bk; + mc->mc_db = &dst->mt_dbs[i]; + // Kill pointers into src - and dst to reduce abuse: The + // user may not use mc until dst ends. Otherwise we'd... + mc->mc_txn = NULL; // ...set this to dst + mc->mc_dbflag = NULL; // ...and &dst->mt_dbflags[i] + if ((mx = mc->mc_xcursor) != NULL) { + *(MDB_xcursor *)(bk+1) = *mx; + mx->mx_cursor.mc_txn = NULL; // ...and dst. + } + mc->mc_next = dst->mt_cursors[i]; + dst->mt_cursors[i] = mc; + } + } + } + return MDB_SUCCESS; + */ + return nil +} + +// Close this write txn's cursors, give parent txn's cursors back to parent. +// @param[in] txn the transaction handle. +// @param[in] merge true to keep changes to parent cursors, false to revert. +// @return 0 on success, non-zero on failure. +func (t *transaction) closeCursors(merge bool) { + /* + MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; + MDB_xcursor *mx; + int i; + + for (i = txn->mt_numdbs; --i >= 0; ) { + for (mc = cursors[i]; mc; mc = next) { + next = mc->mc_next; + if ((bk = mc->mc_backup) != NULL) { + if (merge) { + // Commit changes to parent txn + mc->mc_next = bk->mc_next; + mc->mc_backup = bk->mc_backup; + mc->mc_txn = bk->mc_txn; + mc->mc_db = bk->mc_db; + mc->mc_dbflag = bk->mc_dbflag; + if ((mx = mc->mc_xcursor) != NULL) + mx->mx_cursor.mc_txn = bk->mc_txn; + } else { + // Abort nested txn + *mc = *bk; + if ((mx = mc->mc_xcursor) != NULL) + *mx = *(MDB_xcursor *)(bk+1); + } + mc = bk; + } + // Only malloced cursors are permanently tracked. + free(mc); + } + cursors[i] = NULL; + } + */ +} + +// Common code for #mdb_txn_begin() and #mdb_txn_renew(). +// @param[in] txn the transaction handle to initialize +// @return 0 on success, non-zero on failure. +func (t *transaction) renew() error { + /* + MDB_env *env = txn->mt_env; + MDB_txninfo *ti = env->me_txns; + MDB_meta *meta; + unsigned int i, nr; + uint16_t x; + int rc, new_notls = 0; + + // Setup db info + txn->mt_numdbs = env->me_numdbs; + txn->mt_dbxs = env->me_dbxs; // mostly static anyway + + if (txn->mt_flags & MDB_TXN_RDONLY) { + if (!ti) { + meta = env->me_metas[ mdb_env_pick_meta(env) ]; + txn->mt_txnid = meta->mm_txnid; + txn->mt_u.reader = NULL; + } else { + MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader : + pthread_getspecific(env->me_txkey); + if (r) { + if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) + return MDB_BAD_RSLOT; + } else { + MDB_PID_T pid = env->me_pid; + pthread_t tid = pthread_self(); + + if (!(env->me_flags & MDB_LIVE_READER)) { + rc = mdb_reader_pid(env, Pidset, pid); + if (rc) + return rc; + env->me_flags |= MDB_LIVE_READER; + } + + LOCK_MUTEX_R(env); + nr = ti->mti_numreaders; + for (i=0; imti_readers[i].mr_pid == 0) + break; + if (i == env->me_maxreaders) { + UNLOCK_MUTEX_R(env); + return MDB_READERS_FULL; + } + ti->mti_readers[i].mr_pid = pid; + ti->mti_readers[i].mr_tid = tid; + if (i == nr) + ti->mti_numreaders = ++nr; + // Save numreaders for un-mutexed mdb_env_close() + env->me_numreaders = nr; + UNLOCK_MUTEX_R(env); + + r = &ti->mti_readers[i]; + new_notls = (env->me_flags & MDB_NOTLS); + if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { + r->mr_pid = 0; + return rc; + } + } + txn->mt_txnid = r->mr_txnid = ti->mti_txnid; + txn->mt_u.reader = r; + meta = env->me_metas[txn->mt_txnid & 1]; + } + } else { + if (ti) { + LOCK_MUTEX_W(env); + + txn->mt_txnid = ti->mti_txnid; + meta = env->me_metas[txn->mt_txnid & 1]; + } else { + meta = env->me_metas[ mdb_env_pick_meta(env) ]; + txn->mt_txnid = meta->mm_txnid; + } + txn->mt_txnid++; +#if MDB_DEBUG + if (txn->mt_txnid == mdb_debug_start) + mdb_debug = 1; +#endif + txn->mt_dirty_room = MDB_IDL_UM_MAX; + txn->mt_u.dirty_list = env->me_dirty_list; + txn->mt_u.dirty_list[0].mid = 0; + txn->mt_free_pgs = env->me_free_pgs; + txn->mt_free_pgs[0] = 0; + txn->mt_spill_pgs = NULL; + env->me_txn = txn; + } + + // Copy the DB info and flags + memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db)); + + // Moved to here to avoid a data race in read TXNs + txn->mt_next_pgno = meta->mm_last_pg+1; + + for (i=2; imt_numdbs; i++) { + x = env->me_dbflags[i]; + txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; + txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0; + } + txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID; + + if (env->me_maxpg < txn->mt_next_pgno) { + mdb_txn_reset0(txn, "renew0-mapfail"); + if (new_notls) { + txn->mt_u.reader->mr_pid = 0; + txn->mt_u.reader = NULL; + } + return MDB_MAP_RESIZED; + } + + return MDB_SUCCESS; + */ + return nil +} + +func (t *transaction) Renew() error { + /* + int rc; + + if (!txn || txn->mt_dbxs) // A reset txn has mt_dbxs==NULL + return EINVAL; + + if (txn->mt_env->me_flags & MDB_FATAL_ERROR) { + DPUTS("environment had fatal error, must shutdown!"); + return MDB_PANIC; + } + + rc = mdb_txn_renew0(txn); + if (rc == MDB_SUCCESS) { + DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); + } + return rc; + */ + return nil +} + +func (t *transaction) DB() DB { + return t.db +} + +// Export or close DBI handles opened in this txn. +func (t *transaction) updateBuckets(keep bool) { + /* + int i; + MDB_dbi n = txn->mt_numdbs; + MDB_env *env = txn->mt_env; + unsigned char *tdbflags = txn->mt_dbflags; + + for (i = n; --i >= 2;) { + if (tdbflags[i] & DB_NEW) { + if (keep) { + env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; + } else { + char *ptr = env->me_dbxs[i].md_name.mv_data; + env->me_dbxs[i].md_name.mv_data = NULL; + env->me_dbxs[i].md_name.mv_size = 0; + env->me_dbflags[i] = 0; + free(ptr); + } + } + } + if (keep && env->me_numdbs < n) + env->me_numdbs = n; + */ +} + +// Common code for #mdb_txn_reset() and #mdb_txn_abort(). +// May be called twice for readonly txns: First reset it, then abort. +// @param[in] txn the transaction handle to reset +// @param[in] act why the transaction is being reset +func (t *transaction) reset(act string) { + /* + MDB_env *env = txn->mt_env; + + // Close any DBI handles opened in this txn + mdb_dbis_update(txn, 0); + + DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + if (txn->mt_u.reader) { + txn->mt_u.reader->mr_txnid = (txnid_t)-1; + if (!(env->me_flags & MDB_NOTLS)) + txn->mt_u.reader = NULL; // txn does not own reader + } + txn->mt_numdbs = 0; // close nothing if called again + txn->mt_dbxs = NULL; // mark txn as reset + } else { + mdb_cursors_close(txn, 0); + + if (!(env->me_flags & MDB_WRITEMAP)) { + mdb_dlist_free(txn); + } + mdb_midl_free(env->me_pghead); + + if (txn->mt_parent) { + txn->mt_parent->mt_child = NULL; + env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; + mdb_midl_free(txn->mt_free_pgs); + mdb_midl_free(txn->mt_spill_pgs); + free(txn->mt_u.dirty_list); + return; + } + + if (mdb_midl_shrink(&txn->mt_free_pgs)) + env->me_free_pgs = txn->mt_free_pgs; + env->me_pghead = NULL; + env->me_pglast = 0; + + env->me_txn = NULL; + // The writer mutex was locked in mdb_txn_begin. + if (env->me_txns) + UNLOCK_MUTEX_W(env); + } + */ +} + +func (t *transaction) Reset() { + /* + if (txn == NULL) + return; + + // This call is only valid for read-only txns + if (!(txn->mt_flags & MDB_TXN_RDONLY)) + return; + + mdb_txn_reset0(txn, "reset"); + */ +} + +func (t *transaction) Abort() { + /* + if (txn == NULL) + return; + + if (txn->mt_child) + mdb_txn_abort(txn->mt_child); + + mdb_txn_reset0(txn, "abort"); + // Free reader slot tied to this txn (if MDB_NOTLS && writable FS) + if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader) + txn->mt_u.reader->mr_pid = 0; + + free(txn); + */ +} + +// Save the freelist as of this transaction to the freeDB. +// This changes the freelist. Keep trying until it stabilizes. +func (t *transaction) saveFreelist() error { + /* + // env->me_pghead[] can grow and shrink during this call. + // env->me_pglast and txn->mt_free_pgs[] can only grow. + // Page numbers cannot disappear from txn->mt_free_pgs[]. + MDB_cursor mc; + MDB_env *env = txn->mt_env; + int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; + txnid_t pglast = 0, head_id = 0; + pgno_t freecnt = 0, *free_pgs, *mop; + ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; + + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + + if (env->me_pghead) { + // Make sure first page of freeDB is touched and on freelist + rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY); + if (rc && rc != MDB_NOTFOUND) + return rc; + } + + // MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) + clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) + ? SSIZE_MAX : maxfree_1pg; + + for (;;) { + // Come back here after each Put() in case freelist changed + MDB_val key, data; + pgno_t *pgs; + ssize_t j; + + // If using records from freeDB which we have not yet + // deleted, delete them and any we reserved for me_pghead. + while (pglast < env->me_pglast) { + rc = mdb_cursor_first(&mc, &key, NULL); + if (rc) + return rc; + pglast = head_id = *(txnid_t *)key.mv_data; + total_room = head_room = 0; + mdb_tassert(txn, pglast <= env->me_pglast); + rc = mdb_cursor_del(&mc, 0); + if (rc) + return rc; + } + + // Save the IDL of pages freed by this txn, to a single record + if (freecnt < txn->mt_free_pgs[0]) { + if (!freecnt) { + // Make sure last page of freeDB is touched and on freelist + rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); + if (rc && rc != MDB_NOTFOUND) + return rc; + } + free_pgs = txn->mt_free_pgs; + // Write to last page of freeDB + key.mv_size = sizeof(txn->mt_txnid); + key.mv_data = &txn->mt_txnid; + do { + freecnt = free_pgs[0]; + data.mv_size = MDB_IDL_SIZEOF(free_pgs); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (rc) + return rc; + // Retry if mt_free_pgs[] grew during the Put() + free_pgs = txn->mt_free_pgs; + } while (freecnt < free_pgs[0]); + mdb_midl_sort(free_pgs); + memcpy(data.mv_data, free_pgs, data.mv_size); +#if (MDB_DEBUG) > 1 + { + unsigned int i = free_pgs[0]; + DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u", + txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); + for (; i; i--) + DPRINTF(("IDL %"Z"u", free_pgs[i])); + } +#endif + continue; + } + + mop = env->me_pghead; + mop_len = mop ? mop[0] : 0; + + // Reserve records for me_pghead[]. Split it if multi-page, + // to avoid searching freeDB for a page range. Use keys in + // range [1,me_pglast]: Smaller than txnid of oldest reader. + if (total_room >= mop_len) { + if (total_room == mop_len || --more < 0) + break; + } else if (head_room >= maxfree_1pg && head_id > 1) { + // Keep current record (overflow page), add a new one + head_id--; + head_room = 0; + } + // (Re)write {key = head_id, IDL length = head_room} + total_room -= head_room; + head_room = mop_len - total_room; + if (head_room > maxfree_1pg && head_id > 1) { + // Overflow multi-page for part of me_pghead + head_room /= head_id; // amortize page sizes + head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); + } else if (head_room < 0) { + // Rare case, not bothering to delete this record + head_room = 0; + } + key.mv_size = sizeof(head_id); + key.mv_data = &head_id; + data.mv_size = (head_room + 1) * sizeof(pgno_t); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (rc) + return rc; + // IDL is initially empty, zero out at least the length + pgs = (pgno_t *)data.mv_data; + j = head_room > clean_limit ? head_room : 0; + do { + pgs[j] = 0; + } while (--j >= 0); + total_room += head_room; + } + + // Fill in the reserved me_pghead records + rc = MDB_SUCCESS; + if (mop_len) { + MDB_val key, data; + + mop += mop_len; + rc = mdb_cursor_first(&mc, &key, &data); + for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { + unsigned flags = MDB_CURRENT; + txnid_t id = *(txnid_t *)key.mv_data; + ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; + MDB_ID save; + + mdb_tassert(txn, len >= 0 && id <= env->me_pglast); + key.mv_data = &id; + if (len > mop_len) { + len = mop_len; + data.mv_size = (len + 1) * sizeof(MDB_ID); + flags = 0; + } + data.mv_data = mop -= len; + save = mop[0]; + mop[0] = len; + rc = mdb_cursor_put(&mc, &key, &data, flags); + mop[0] = save; + if (rc || !(mop_len -= len)) + break; + } + } + return rc; + */ + return nil +} + +// Flush (some) dirty pages to the map, after clearing their dirty flag. +// @param[in] txn the transaction that's being committed +// @param[in] keep number of initial pages in dirty_list to keep dirty. +// @return 0 on success, non-zero on failure. +func (t *transaction) flush(keep bool) { + /* + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned psize = env->me_psize, j; + int i, pagecount = dl[0].mid, rc; + size_t size = 0, pos = 0; + pgno_t pgno = 0; + MDB_page *dp = NULL; +#ifdef _WIN32 + OVERLAPPED ov; +#else + struct iovec iov[MDB_COMMIT_PAGES]; + ssize_t wpos = 0, wsize = 0, wres; + size_t next_pos = 1; // impossible pos, so pos != next_pos + int n = 0; +#endif + + j = i = keep; + + if (env->me_flags & MDB_WRITEMAP) { + // Clear dirty flags + while (++i <= pagecount) { + dp = dl[i].mptr; + // Don't flush this page yet + if (dp->mp_flags & P_KEEP) { + dp->mp_flags ^= P_KEEP; + dl[++j] = dl[i]; + continue; + } + dp->mp_flags &= ~P_DIRTY; + } + goto done; + } + + // Write the pages + for (;;) { + if (++i <= pagecount) { + dp = dl[i].mptr; + // Don't flush this page yet + if (dp->mp_flags & P_KEEP) { + dp->mp_flags ^= P_KEEP; + dl[i].mid = 0; + continue; + } + pgno = dl[i].mid; + // clear dirty flag + dp->mp_flags &= ~P_DIRTY; + pos = pgno * psize; + size = psize; + if (IS_OVERFLOW(dp)) size *= dp->mp_pages; + } +#ifdef _WIN32 + else break; + + // Windows actually supports scatter/gather I/O, but only on + // unbuffered file handles. Since we're relying on the OS page + // cache for all our data, that's self-defeating. So we just + // write pages one at a time. We use the ov structure to set + // the write offset, to at least save the overhead of a Seek + // system call. + DPRINTF(("committing page %"Z"u", pgno)); + memset(&ov, 0, sizeof(ov)); + ov.Offset = pos & 0xffffffff; + ov.OffsetHigh = pos >> 16 >> 16; + if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) { + rc = ErrCode(); + DPRINTF(("WriteFile: %d", rc)); + return rc; + } +#else + // Write up to MDB_COMMIT_PAGES dirty pages at a time. + if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { + if (n) { + // Write previous page(s) +#ifdef MDB_USE_PWRITEV + wres = pwritev(env->me_fd, iov, n, wpos); +#else + if (n == 1) { + wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); + } else { + if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { + rc = ErrCode(); + DPRINTF(("lseek: %s", strerror(rc))); + return rc; + } + wres = writev(env->me_fd, iov, n); + } +#endif + if (wres != wsize) { + if (wres < 0) { + rc = ErrCode(); + DPRINTF(("Write error: %s", strerror(rc))); + } else { + rc = EIO; // TODO: Use which error code? + DPUTS("short write, filesystem full?"); + } + return rc; + } + n = 0; + } + if (i > pagecount) + break; + wpos = pos; + wsize = 0; + } + DPRINTF(("committing page %"Z"u", pgno)); + next_pos = pos + size; + iov[n].iov_len = size; + iov[n].iov_base = (char *)dp; + wsize += size; + n++; +#endif // _WIN32 + } + + for (i = keep; ++i <= pagecount; ) { + dp = dl[i].mptr; + // This is a page we skipped above + if (!dl[i].mid) { + dl[++j] = dl[i]; + dl[j].mid = dp->mp_pgno; + continue; + } + mdb_dpage_free(env, dp); + } + +done: + i--; + txn->mt_dirty_room += i - j; + dl[0].mid = j; + return MDB_SUCCESS; +} + +int +mdb_txn_commit(MDB_txn *txn) +{ + int rc; + unsigned int i; + MDB_env *env; + + if (txn == NULL || txn->mt_env == NULL) + return EINVAL; + + if (txn->mt_child) { + rc = mdb_txn_commit(txn->mt_child); + txn->mt_child = NULL; + if (rc) + goto fail; + } + + env = txn->mt_env; + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + mdb_dbis_update(txn, 1); + txn->mt_numdbs = 2; // so txn_abort() doesn't close any new handles + mdb_txn_abort(txn); + return MDB_SUCCESS; + } + + if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) { + DPUTS("error flag is set, can't commit"); + if (txn->mt_parent) + txn->mt_parent->mt_flags |= MDB_TXN_ERROR; + rc = MDB_BAD_TXN; + goto fail; + } + + if (txn->mt_parent) { + MDB_txn *parent = txn->mt_parent; + MDB_ID2L dst, src; + MDB_IDL pspill; + unsigned x, y, len, ps_len; + + // Append our free list to parent's + rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); + if (rc) + goto fail; + mdb_midl_free(txn->mt_free_pgs); + // Failures after this must either undo the changes + // to the parent or set MDB_TXN_ERROR in the parent. + + parent->mt_next_pgno = txn->mt_next_pgno; + parent->mt_flags = txn->mt_flags; + + // Merge our cursors into parent's and close them + mdb_cursors_close(txn, 1); + + // Update parent's DB table. + memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + parent->mt_numdbs = txn->mt_numdbs; + parent->mt_dbflags[0] = txn->mt_dbflags[0]; + parent->mt_dbflags[1] = txn->mt_dbflags[1]; + for (i=2; imt_numdbs; i++) { + // preserve parent's DB_NEW status + x = parent->mt_dbflags[i] & DB_NEW; + parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; + } + + dst = parent->mt_u.dirty_list; + src = txn->mt_u.dirty_list; + // Remove anything in our dirty list from parent's spill list + if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { + x = y = ps_len; + pspill[0] = (pgno_t)-1; + // Mark our dirty pages as deleted in parent spill list + for (i=0, len=src[0].mid; ++i <= len; ) { + MDB_ID pn = src[i].mid << 1; + while (pn > pspill[x]) + x--; + if (pn == pspill[x]) { + pspill[x] = 1; + y = --x; + } + } + // Squash deleted pagenums if we deleted any + for (x=y; ++x <= ps_len; ) + if (!(pspill[x] & 1)) + pspill[++y] = pspill[x]; + pspill[0] = y; + } + + // Find len = length of merging our dirty list with parent's + x = dst[0].mid; + dst[0].mid = 0; // simplify loops + if (parent->mt_parent) { + len = x + src[0].mid; + y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; + for (i = x; y && i; y--) { + pgno_t yp = src[y].mid; + while (yp < dst[i].mid) + i--; + if (yp == dst[i].mid) { + i--; + len--; + } + } + } else { // Simplify the above for single-ancestor case + len = MDB_IDL_UM_MAX - txn->mt_dirty_room; + } + // Merge our dirty list with parent's + y = src[0].mid; + for (i = len; y; dst[i--] = src[y--]) { + pgno_t yp = src[y].mid; + while (yp < dst[x].mid) + dst[i--] = dst[x--]; + if (yp == dst[x].mid) + free(dst[x--].mptr); + } + mdb_tassert(txn, i == x); + dst[0].mid = len; + free(txn->mt_u.dirty_list); + parent->mt_dirty_room = txn->mt_dirty_room; + if (txn->mt_spill_pgs) { + if (parent->mt_spill_pgs) { + // TODO: Prevent failure here, so parent does not fail + rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); + if (rc) + parent->mt_flags |= MDB_TXN_ERROR; + mdb_midl_free(txn->mt_spill_pgs); + mdb_midl_sort(parent->mt_spill_pgs); + } else { + parent->mt_spill_pgs = txn->mt_spill_pgs; + } + } + + parent->mt_child = NULL; + mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); + free(txn); + return rc; + } + + if (txn != env->me_txn) { + DPUTS("attempt to commit unknown transaction"); + rc = EINVAL; + goto fail; + } + + mdb_cursors_close(txn, 0); + + if (!txn->mt_u.dirty_list[0].mid && + !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) + goto done; + + DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); + + // Update DB root pointers + if (txn->mt_numdbs > 2) { + MDB_cursor mc; + MDB_dbi i; + MDB_val data; + data.mv_size = sizeof(MDB_db); + + mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); + for (i = 2; i < txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + data.mv_data = &txn->mt_dbs[i]; + rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0); + if (rc) + goto fail; + } + } + } + + rc = mdb_freelist_save(txn); + if (rc) + goto fail; + + mdb_midl_free(env->me_pghead); + env->me_pghead = NULL; + if (mdb_midl_shrink(&txn->mt_free_pgs)) + env->me_free_pgs = txn->mt_free_pgs; + +#if (MDB_DEBUG) > 2 + mdb_audit(txn); +#endif + + if ((rc = mdb_page_flush(txn, 0)) || + (rc = mdb_env_sync(env, 0)) || + (rc = mdb_env_write_meta(txn))) + goto fail; + +done: + env->me_pglast = 0; + env->me_txn = NULL; + mdb_dbis_update(txn, 1); + + if (env->me_txns) + UNLOCK_MUTEX_W(env); + free(txn); + + return MDB_SUCCESS; + +fail: + mdb_txn_abort(txn); + return rc; + */ + return nil +} + +// Update the environment info to commit a transaction. +// @param[in] txn the transaction that's being committed +// @return 0 on success, non-zero on failure. +func (t *transaction) writeMeta() error { + /* + MDB_env *env; + MDB_meta meta, metab, *mp; + off_t off; + int rc, len, toggle; + char *ptr; + HANDLE mfd; +#ifdef _WIN32 + OVERLAPPED ov; +#else + int r2; +#endif + + toggle = txn->mt_txnid & 1; + DPRINTF(("writing meta page %d for root page %"Z"u", + toggle, txn->mt_dbs[MAIN_DBI].md_root)); + + env = txn->mt_env; + mp = env->me_metas[toggle]; + + if (env->me_flags & MDB_WRITEMAP) { + // Persist any increases of mapsize config + if (env->me_mapsize > mp->mm_mapsize) + mp->mm_mapsize = env->me_mapsize; + mp->mm_dbs[0] = txn->mt_dbs[0]; + mp->mm_dbs[1] = txn->mt_dbs[1]; + mp->mm_last_pg = txn->mt_next_pgno - 1; + mp->mm_txnid = txn->mt_txnid; + if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { + unsigned meta_size = env->me_psize; + rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; + ptr = env->me_map; + if (toggle) { +#ifndef _WIN32 // POSIX msync() requires ptr = start of OS page + if (meta_size < env->me_os_psize) + meta_size += meta_size; + else +#endif + ptr += meta_size; + } + if (MDB_MSYNC(ptr, meta_size, rc)) { + rc = ErrCode(); + goto fail; + } + } + goto done; + } + metab.mm_txnid = env->me_metas[toggle]->mm_txnid; + metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg; + + ptr = (char *)&meta; + if (env->me_mapsize > mp->mm_mapsize) { + // Persist any increases of mapsize config + meta.mm_mapsize = env->me_mapsize; + off = offsetof(MDB_meta, mm_mapsize); + } else { + off = offsetof(MDB_meta, mm_dbs[0].md_depth); + } + len = sizeof(MDB_meta) - off; + + ptr += off; + meta.mm_dbs[0] = txn->mt_dbs[0]; + meta.mm_dbs[1] = txn->mt_dbs[1]; + meta.mm_last_pg = txn->mt_next_pgno - 1; + meta.mm_txnid = txn->mt_txnid; + + if (toggle) + off += env->me_psize; + off += PAGEHDRSZ; + + // Write to the SYNC fd + mfd = env->me_flags & (MDB_NOSYNC|MDB_NOMETASYNC) ? + env->me_fd : env->me_mfd; +#ifdef _WIN32 + { + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov)) + rc = -1; + } +#else + rc = pwrite(mfd, ptr, len, off); +#endif + if (rc != len) { + rc = rc < 0 ? ErrCode() : EIO; + DPUTS("write failed, disk error?"); + // On a failure, the pagecache still contains the new data. + // Write some old data back, to prevent it from being used. + // Use the non-SYNC fd; we know it will fail anyway. + meta.mm_last_pg = metab.mm_last_pg; + meta.mm_txnid = metab.mm_txnid; +#ifdef _WIN32 + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + WriteFile(env->me_fd, ptr, len, NULL, &ov); +#else + r2 = pwrite(env->me_fd, ptr, len, off); + (void)r2; // Silence warnings. We don't care about pwrite's return value +#endif +fail: + env->me_flags |= MDB_FATAL_ERROR; + return rc; + } +done: + // Memory ordering issues are irrelevant; since the entire writer + // is wrapped by wmutex, all of these changes will become visible + // after the wmutex is unlocked. Since the DB is multi-version, + // readers will get consistent data regardless of how fresh or + // how stale their view of these values is. + if (env->me_txns) + env->me_txns->mti_txnid = txn->mt_txnid; + + return MDB_SUCCESS; + */ + return nil +} + +// Find the address of the page corresponding to a given page number. +// @param[in] txn the transaction for this access. +// @param[in] pgno the page number for the page to retrieve. +// @param[out] ret address of a pointer where the page's address will be stored. +// @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. +// @return 0 on success, non-zero on failure. +func (t *transaction) getPage(id int) (*page, int, error) { + /* + MDB_env *env = txn->mt_env; + MDB_page *p = NULL; + int level; + + if (!((txn->mt_flags & MDB_TXN_RDONLY) | (env->me_flags & MDB_WRITEMAP))) { + MDB_txn *tx2 = txn; + level = 1; + do { + MDB_ID2L dl = tx2->mt_u.dirty_list; + unsigned x; + // Spilled pages were dirtied in this txn and flushed + // because the dirty list got full. Bring this page + // back in from the map (but don't unspill it here, + // leave that unless page_touch happens again). + if (tx2->mt_spill_pgs) { + MDB_ID pn = pgno << 1; + x = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { + p = (MDB_page *)(env->me_map + env->me_psize * pgno); + goto done; + } + } + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + p = dl[x].mptr; + goto done; + } + } + level++; + } while ((tx2 = tx2->mt_parent) != NULL); + } + + if (pgno < txn->mt_next_pgno) { + level = 0; + p = (MDB_page *)(env->me_map + env->me_psize * pgno); + } else { + DPRINTF(("page %"Z"u not found", pgno)); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PAGE_NOTFOUND; + } + +done: + *ret = p; + if (lvl) + *lvl = level; + return MDB_SUCCESS; + */ + + return nil, 0, nil +} + +// Return the data associated with a given node. +// @param[in] txn The transaction for this operation. +// @param[in] leaf The node being read. +// @param[out] data Updated to point to the node's data. +// @return 0 on success, non-zero on failure. +func (t *transaction) readNode(leaf *node, data []byte) error { + /* + MDB_page *omp; // overflow page + pgno_t pgno; + int rc; + + if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { + data->mv_size = NODEDSZ(leaf); + data->mv_data = NODEDATA(leaf); + return MDB_SUCCESS; + } + + // Read overflow data. + data->mv_size = NODEDSZ(leaf); + memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); + if ((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0) { + DPRINTF(("read overflow page %"Z"u failed", pgno)); + return rc; + } + data->mv_data = METADATA(omp); + + return MDB_SUCCESS; + */ + return nil +} + +func (t *transaction) Get(bucket Bucket, key []byte) ([]byte, error) { + /* + MDB_cursor mc; + MDB_xcursor mx; + int exact = 0; + DKBUF; + + if (key == NULL || data == NULL) + return EINVAL; + + DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); + + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + return EINVAL; + + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; + + mdb_cursor_init(&mc, txn, dbi, &mx); + return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); + */ + return nil, nil +} + +func (t *transaction) Cursor(b Bucket) (Cursor, error) { + /* + MDB_cursor *mc; + size_t size = sizeof(MDB_cursor); + + if (txn == NULL || ret == NULL || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + return EINVAL; + + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; + + // Allow read access to the freelist + if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EINVAL; + + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) + size += sizeof(MDB_xcursor); + + if ((mc = malloc(size)) != NULL) { + mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); + if (txn->mt_cursors) { + mc->mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = mc; + mc->mc_flags |= C_UNTRACK; + } + } else { + return ENOMEM; + } + + *ret = mc; + + return MDB_SUCCESS; + */ + return nil +} + +func (t *transaction) Renew(c Cursor) error { + /* + if (txn == NULL || mc == NULL || mc->mc_dbi >= txn->mt_numdbs) + return EINVAL; + + if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) + return EINVAL; + + mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); + return MDB_SUCCESS; + */ +} + +func (t *transaction) Delete(b *bucket, key []byte, data []byte) error { + /* + MDB_cursor mc; + MDB_xcursor mx; + MDB_cursor_op op; + MDB_val rdata, *xdata; + int rc, exact; + DKBUF; + + if (key == NULL) + return EINVAL; + + DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key))); + + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + return EINVAL; + + if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) + return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + mdb_cursor_init(&mc, txn, dbi, &mx); + + exact = 0; + if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { + // must ignore any data + data = NULL; + } + if (data) { + op = MDB_GET_BOTH; + rdata = *data; + xdata = &rdata; + } else { + op = MDB_SET; + xdata = NULL; + } + rc = mdb_cursor_set(&mc, key, xdata, op, &exact); + if (rc == 0) { + // let mdb_page_split know about this cursor if needed: + // delete will trigger a rebalance; if it needs to move + // a node from one page to another, it will have to + // update the parent's separator key(s). If the new sepkey + // is larger than the current one, the parent page may + // run out of space, triggering a split. We need this + // cursor to be consistent until the end of the rebalance. + mc.mc_flags |= C_UNTRACK; + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; + rc = mdb_cursor_del(&mc, data ? 0 : MDB_NODUPDATA); + txn->mt_cursors[dbi] = mc.mc_next; + } + return rc; + */ + return nil +} + +func (t *transaction) Put(b Bucket, key []byte, data []byte, flags int) error { + /* + MDB_cursor mc; + MDB_xcursor mx; + + if (key == NULL || data == NULL) + return EINVAL; + + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + return EINVAL; + + if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags) + return EINVAL; + + mdb_cursor_init(&mc, txn, dbi, &mx); + return mdb_cursor_put(&mc, key, data, flags); + */ +} + +func (t *transaction) Bucket(name string, flags int) (Bucket, error) { + /* + MDB_val key, data; + MDB_dbi i; + MDB_cursor mc; + int rc, dbflag, exact; + unsigned int unused = 0; + size_t len; + + if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) { + mdb_default_cmp(txn, FREE_DBI); + } + + if ((flags & VALID_FLAGS) != flags) + return EINVAL; + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; + + // main DB? + if (!name) { + *dbi = MAIN_DBI; + if (flags & PERSISTENT_FLAGS) { + uint16_t f2 = flags & PERSISTENT_FLAGS; + // make sure flag changes get committed + if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { + txn->mt_dbs[MAIN_DBI].md_flags |= f2; + txn->mt_flags |= MDB_TXN_DIRTY; + } + } + mdb_default_cmp(txn, MAIN_DBI); + return MDB_SUCCESS; + } + + if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { + mdb_default_cmp(txn, MAIN_DBI); + } + + // Is the DB already open? + len = strlen(name); + for (i=2; imt_numdbs; i++) { + if (!txn->mt_dbxs[i].md_name.mv_size) { + // Remember this free slot + if (!unused) unused = i; + continue; + } + if (len == txn->mt_dbxs[i].md_name.mv_size && + !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { + *dbi = i; + return MDB_SUCCESS; + } + } + + // If no free slot and max hit, fail + if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs) + return MDB_DBS_FULL; + + // Cannot mix named databases with some mainDB flags + if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY)) + return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; + + // Find the DB info + dbflag = DB_NEW|DB_VALID; + exact = 0; + key.mv_size = len; + key.mv_data = (void *)name; + mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); + rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); + if (rc == MDB_SUCCESS) { + // make sure this is actually a DB + MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + if (!(node->mn_flags & F_SUBDATA)) + return MDB_INCOMPATIBLE; + } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) { + // Create if requested + MDB_db dummy; + data.mv_size = sizeof(MDB_db); + data.mv_data = &dummy; + memset(&dummy, 0, sizeof(dummy)); + dummy.md_root = P_INVALID; + dummy.md_flags = flags & PERSISTENT_FLAGS; + rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA); + dbflag |= DB_DIRTY; + } + + // OK, got info, add to table + if (rc == MDB_SUCCESS) { + unsigned int slot = unused ? unused : txn->mt_numdbs; + txn->mt_dbxs[slot].md_name.mv_data = strdup(name); + txn->mt_dbxs[slot].md_name.mv_size = len; + txn->mt_dbxs[slot].md_rel = NULL; + txn->mt_dbflags[slot] = dbflag; + memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); + *dbi = slot; + mdb_default_cmp(txn, slot); + if (!unused) { + txn->mt_numdbs++; + } + } + + return rc; + */ + return nil, nil +} + +func (t *transaction) Stat(b Bucket) *Stat { + if (txn == NULL || arg == NULL || dbi >= txn->mt_numdbs) + return EINVAL; + + if (txn->mt_dbflags[dbi] & DB_STALE) { + MDB_cursor mc; + MDB_xcursor mx; + /* Stale, must read the DB's root. cursor_init does it for us. */ + mdb_cursor_init(&mc, txn, dbi, &mx); + } + return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); +} + +func (t *transaction) BucketFlags(b Bucket) (int, error) { + /* + // We could return the flags for the FREE_DBI too but what's the point? + if (txn == NULL || dbi < MAIN_DBI || dbi >= txn->mt_numdbs) + return EINVAL; + *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; + return MDB_SUCCESS; + */ + return 0, nil +} + +func (t *transaction) Drop(b Bucket int del) error { + /* + MDB_cursor *mc, *m2; + int rc; + + if (!txn || !dbi || dbi >= txn->mt_numdbs || (unsigned)del > 1 || !(txn->mt_dbflags[dbi] & DB_VALID)) + return EINVAL; + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EACCES; + + rc = mdb_cursor_open(txn, dbi, &mc); + if (rc) + return rc; + + rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); + // Invalidate the dropped DB's cursors + for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) + m2->mc_flags &= ~(C_INITIALIZED|C_EOF); + if (rc) + goto leave; + + // Can't delete the main DB + if (del && dbi > MAIN_DBI) { + rc = mdb_del(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL); + if (!rc) { + txn->mt_dbflags[dbi] = DB_STALE; + mdb_dbi_close(txn->mt_env, dbi); + } + } else { + // reset the DB record, mark it dirty + txn->mt_dbflags[dbi] |= DB_DIRTY; + txn->mt_dbs[dbi].md_depth = 0; + txn->mt_dbs[dbi].md_branch_pages = 0; + txn->mt_dbs[dbi].md_leaf_pages = 0; + txn->mt_dbs[dbi].md_overflow_pages = 0; + txn->mt_dbs[dbi].md_entries = 0; + txn->mt_dbs[dbi].md_root = P_INVALID; + + txn->mt_flags |= MDB_TXN_DIRTY; + } +leave: + mdb_cursor_close(mc); + return rc; + */ + return nil +} diff --git a/xcursor.go b/xcursor.go index fa0e915..35571ab 100644 --- a/xcursor.go +++ b/xcursor.go @@ -6,3 +6,4 @@ type xcursor struct { bucketx *bucketx bucketFlag int } +