From ee24437bfcb34dbf2549ecd26adc972c1eb7dc16 Mon Sep 17 00:00:00 2001 From: Ben Johnson Date: Sat, 11 Jan 2014 22:51:01 -0700 Subject: [PATCH] Initial db.open. --- Makefile | 20 + bucket.go | 16 +- cursor.go | 5763 ++++++++++++++++++++++++------------------------ db.go | 1432 ++++++------ db_test.go | 27 + error.go | 42 +- meta.go | 29 +- node.go | 76 +- os.go | 5 + page.go | 48 +- reader.go | 2 +- transaction.go | 2372 ++++++++++---------- 12 files changed, 4872 insertions(+), 4960 deletions(-) create mode 100644 Makefile create mode 100644 db_test.go create mode 100644 os.go diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..decd960 --- /dev/null +++ b/Makefile @@ -0,0 +1,20 @@ +PKG=./... +TEST=. +BENCH=. +COVERPROFILE=/tmp/c.out + +bench: benchpreq + go test -v -test.bench=$(BENCH) ./.bench + +cover: fmt + go test -coverprofile=$(COVERPROFILE) . + go tool cover -html=$(COVERPROFILE) + rm $(COVERPROFILE) + +fmt: + @go fmt ./... + +test: fmt + @go test -v -cover -test.run=$(TEST) $(PKG) + +.PHONY: bench cover fmt test diff --git a/bucket.go b/bucket.go index c1a7d27..6e7b791 100644 --- a/bucket.go +++ b/bucket.go @@ -7,12 +7,12 @@ package bolt // TODO: #define MAIN_DBI 1 type Bucket struct { - pad uint32 - flags uint16 - depth uint16 - branches pgno - leafs pgno - overflows pgno - entries uint64 - root pgno + pad uint32 + flags uint16 + depth uint16 + branches pgno + leafs pgno + overflows pgno + entries uint64 + root pgno } diff --git a/cursor.go b/cursor.go index 9faf5bc..bd85a3b 100644 --- a/cursor.go +++ b/cursor.go @@ -11,7 +11,6 @@ package bolt // TODO: #define MDB_NOSPILL 0x8000 /** Do not spill pages to disk if txn is getting full, may fail instead */ - type Cursor interface { First() error FirstDup() error @@ -32,24 +31,24 @@ type Cursor interface { type cursor struct { flags int - next *cursor + _next *cursor backup *cursor xcursor *xcursor transaction *transaction bucketId int - bucket *bucket - bucketx *bucketx - bucketFlag int - snum int - top int - page []*page - ki []int /**< stack of page indices */ + bucket *Bucket + // bucketx *bucketx + bucketFlag int + snum int + top int + page []*page + ki []int /**< stack of page indices */ } type xcursor struct { - cursor cursor - bucket *bucket - bucketx *bucketx + cursor cursor + bucket *Bucket + // bucketx *bucketx bucketFlag int } @@ -59,64 +58,64 @@ type xcursor struct { // P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. // @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). // @return 0 on success, non-zero on failure. -func (c *cursor) xkeep(unsigned pflags, int all) int { +func (c *cursor) xkeep(pflags int, all int) error { /* - enum { Mask = P_SUBP|P_DIRTY|P_KEEP }; - MDB_txn *txn = mc->mc_txn; - MDB_cursor *m3; - MDB_xcursor *mx; - MDB_page *dp, *mp; - MDB_node *leaf; - unsigned i, j; - int rc = MDB_SUCCESS, level; + enum { Mask = P_SUBP|P_DIRTY|P_KEEP }; + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m3; + MDB_xcursor *mx; + MDB_page *dp, *mp; + MDB_node *leaf; + unsigned i, j; + int rc = MDB_SUCCESS, level; - // Mark pages seen by cursors - if (mc->mc_flags & C_UNTRACK) - mc = NULL; // will find mc in mt_cursors - for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { - for (; mc; mc=mc->mc_next) { - if (!(mc->mc_flags & C_INITIALIZED)) - continue; - for (m3 = mc;; m3 = &mx->mx_cursor) { - mp = NULL; - for (j=0; jmc_snum; j++) { - mp = m3->mc_pg[j]; - if ((mp->mp_flags & Mask) == pflags) - mp->mp_flags ^= P_KEEP; - } - mx = m3->mc_xcursor; - // Proceed to mx if it is at a sub-database - if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) - break; - if (! (mp && (mp->mp_flags & P_LEAF))) - break; - leaf = NODEPTR(mp, m3->mc_ki[j-1]); - if (!(leaf->mn_flags & F_SUBDATA)) - break; - } - } - if (i == 0) - break; - } - - if (all) { - // Mark dirty root pages - for (i=0; imt_numdbs; i++) { - if (txn->mt_dbflags[i] & DB_DIRTY) { - pgno_t pgno = txn->mt_dbs[i].md_root; - if (pgno == P_INVALID) + // Mark pages seen by cursors + if (mc->mc_flags & C_UNTRACK) + mc = NULL; // will find mc in mt_cursors + for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { + for (; mc; mc=mc->mc_next) { + if (!(mc->mc_flags & C_INITIALIZED)) continue; - if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) - break; - if ((dp->mp_flags & Mask) == pflags && level <= 1) - dp->mp_flags ^= P_KEEP; + for (m3 = mc;; m3 = &mx->mx_cursor) { + mp = NULL; + for (j=0; jmc_snum; j++) { + mp = m3->mc_pg[j]; + if ((mp->mp_flags & Mask) == pflags) + mp->mp_flags ^= P_KEEP; + } + mx = m3->mc_xcursor; + // Proceed to mx if it is at a sub-database + if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) + break; + if (! (mp && (mp->mp_flags & P_LEAF))) + break; + leaf = NODEPTR(mp, m3->mc_ki[j-1]); + if (!(leaf->mn_flags & F_SUBDATA)) + break; + } + } + if (i == 0) + break; + } + + if (all) { + // Mark dirty root pages + for (i=0; imt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + pgno_t pgno = txn->mt_dbs[i].md_root; + if (pgno == P_INVALID) + continue; + if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) + break; + if ((dp->mp_flags & Mask) == pflags && level <= 1) + dp->mp_flags ^= P_KEEP; + } } } - } - return rc; + return rc; */ - return 0 + return nil } // Spill pages from the dirty list back to disk. @@ -150,254 +149,254 @@ func (c *cursor) xkeep(unsigned pflags, int all) int { // @param[in] key For a put operation, the key being stored. // @param[in] data For a put operation, the data being stored. // @return 0 on success, non-zero on failure. -func (c *cursor) spill(MDB_val *key, MDB_val *data) int { +func (c *cursor) spill(key []byte, data []byte) error { /* - MDB_txn *txn = m0->mc_txn; - MDB_page *dp; - MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned int i, j, need; - int rc; + MDB_txn *txn = m0->mc_txn; + MDB_page *dp; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned int i, j, need; + int rc; - if (m0->mc_flags & C_SUB) - return MDB_SUCCESS; + if (m0->mc_flags & C_SUB) + return MDB_SUCCESS; - // Estimate how much space this op will take - i = m0->mc_db->md_depth; - // Named DBs also dirty the main DB - if (m0->mc_dbi > MAIN_DBI) - i += txn->mt_dbs[MAIN_DBI].md_depth; - // For puts, roughly factor in the key+data size - if (key) - i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; - i += i; // double it for good measure - need = i; + // Estimate how much space this op will take + i = m0->mc_db->md_depth; + // Named DBs also dirty the main DB + if (m0->mc_dbi > MAIN_DBI) + i += txn->mt_dbs[MAIN_DBI].md_depth; + // For puts, roughly factor in the key+data size + if (key) + i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; + i += i; // double it for good measure + need = i; - if (txn->mt_dirty_room > i) - return MDB_SUCCESS; + if (txn->mt_dirty_room > i) + return MDB_SUCCESS; - if (!txn->mt_spill_pgs) { - txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); - if (!txn->mt_spill_pgs) - return ENOMEM; - } else { - // purge deleted slots - MDB_IDL sl = txn->mt_spill_pgs; - unsigned int num = sl[0]; - j=0; - for (i=1; i<=num; i++) { - if (!(sl[i] & 1)) - sl[++j] = sl[i]; + if (!txn->mt_spill_pgs) { + txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); + if (!txn->mt_spill_pgs) + return ENOMEM; + } else { + // purge deleted slots + MDB_IDL sl = txn->mt_spill_pgs; + unsigned int num = sl[0]; + j=0; + for (i=1; i<=num; i++) { + if (!(sl[i] & 1)) + sl[++j] = sl[i]; + } + sl[0] = j; + } + + // Preserve pages which may soon be dirtied again + if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS) + goto done; + + // Less aggressive spill - we originally spilled the entire dirty list, + // with a few exceptions for cursor pages and DB root pages. But this + // turns out to be a lot of wasted effort because in a large txn many + // of those pages will need to be used again. So now we spill only 1/8th + // of the dirty pages. Testing revealed this to be a good tradeoff, + // better than 1/2, 1/4, or 1/10. + if (need < MDB_IDL_UM_MAX / 8) + need = MDB_IDL_UM_MAX / 8; + + // Save the page IDs of all the pages we're flushing + // flush from the tail forward, this saves a lot of shifting later on. + for (i=dl[0].mid; i && need; i--) { + MDB_ID pn = dl[i].mid << 1; + dp = dl[i].mptr; + if (dp->mp_flags & P_KEEP) + continue; + // Can't spill twice, make sure it's not already in a parent's + // spill list. + if (txn->mt_parent) { + MDB_txn *tx2; + for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { + if (tx2->mt_spill_pgs) { + j = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { + dp->mp_flags |= P_KEEP; + break; + } + } + } + if (tx2) + continue; + } + if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn))) + goto done; + need--; + } + mdb_midl_sort(txn->mt_spill_pgs); + + // Flush the spilled part of dirty list + if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS) + goto done; + + // Reset any dirty pages we kept that page_flush didn't see + rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); + + done: + txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; + return rc; + /* + return 0 } - sl[0] = j; - } - // Preserve pages which may soon be dirtied again - if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS) - goto done; + // Allocate page numbers and memory for writing. Maintain me_pglast, + // me_pghead and mt_next_pgno. + // + // If there are free pages available from older transactions, they + // are re-used first. Otherwise allocate a new page at mt_next_pgno. + // Do not modify the freedB, just merge freeDB records into me_pghead[] + // and move me_pglast to say which records were consumed. Only this + // function can create me_pghead and move me_pglast/mt_next_pgno. + // @param[in] mc cursor A cursor handle identifying the transaction and + // database for which we are allocating. + // @param[in] num the number of pages to allocate. + // @param[out] mp Address of the allocated page(s). Requests for multiple pages + // will always be satisfied by a single contiguous chunk of memory. + // @return 0 on success, non-zero on failure. + func (c *cursor) allocPage(int num, MDB_page **mp) { + int rc, retry = INT_MAX; + MDB_txn *txn = mc->mc_txn; + MDB_env *env = txn->mt_env; + pgno_t pgno, *mop = env->me_pghead; + unsigned i, j, k, mop_len = mop ? mop[0] : 0, n2 = num-1; + MDB_page *np; + txnid_t oldest = 0, last; + MDB_cursor_op op; + MDB_cursor m2; - // Less aggressive spill - we originally spilled the entire dirty list, - // with a few exceptions for cursor pages and DB root pages. But this - // turns out to be a lot of wasted effort because in a large txn many - // of those pages will need to be used again. So now we spill only 1/8th - // of the dirty pages. Testing revealed this to be a good tradeoff, - // better than 1/2, 1/4, or 1/10. - if (need < MDB_IDL_UM_MAX / 8) - need = MDB_IDL_UM_MAX / 8; + *mp = NULL; - // Save the page IDs of all the pages we're flushing - // flush from the tail forward, this saves a lot of shifting later on. - for (i=dl[0].mid; i && need; i--) { - MDB_ID pn = dl[i].mid << 1; - dp = dl[i].mptr; - if (dp->mp_flags & P_KEEP) - continue; - // Can't spill twice, make sure it's not already in a parent's - // spill list. - if (txn->mt_parent) { - MDB_txn *tx2; - for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { - if (tx2->mt_spill_pgs) { - j = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { - dp->mp_flags |= P_KEEP; + // If our dirty list is already full, we can't do anything + if (txn->mt_dirty_room == 0) { + rc = MDB_TXN_FULL; + goto fail; + } + + for (op = MDB_FIRST;; op = MDB_NEXT) { + MDB_val key, data; + MDB_node *leaf; + pgno_t *idl, old_id, new_id; + + // Seek a big enough contiguous page range. Prefer + // pages at the tail, just truncating the list. + if (mop_len > n2) { + i = mop_len; + do { + pgno = mop[i]; + if (mop[i-n2] == pgno+n2) + goto search_done; + } while (--i > n2); + if (Max_retries < INT_MAX && --retry < 0) break; + } + + if (op == MDB_FIRST) { // 1st iteration + // Prepare to fetch more and coalesce + oldest = mdb_find_oldest(txn); + last = env->me_pglast; + mdb_cursor_init(&m2, txn, FREE_DBI, NULL); + if (last) { + op = MDB_SET_RANGE; + key.mv_data = &last; // will look up last+1 + key.mv_size = sizeof(last); } } + + last++; + // Do not fetch more if the record will be too recent + if (oldest <= last) + break; + rc = mdb_cursor_get(&m2, &key, NULL, op); + if (rc) { + if (rc == MDB_NOTFOUND) + break; + goto fail; + } + last = *(txnid_t*)key.mv_data; + if (oldest <= last) + break; + np = m2.mc_pg[m2.mc_top]; + leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); + if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS) + return rc; + + idl = (MDB_ID *) data.mv_data; + i = idl[0]; + if (!mop) { + if (!(env->me_pghead = mop = mdb_midl_alloc(i))) { + rc = ENOMEM; + goto fail; + } + } else { + if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0) + goto fail; + mop = env->me_pghead; + } + env->me_pglast = last; + #if (MDB_DEBUG) > 1 + DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u", + last, txn->mt_dbs[FREE_DBI].md_root, i)); + for (k = i; k; k--) + DPRINTF(("IDL %"Z"u", idl[k])); + #endif + // Merge in descending sorted order + j = mop_len; + k = mop_len += i; + mop[0] = (pgno_t)-1; + old_id = mop[j]; + while (i) { + new_id = idl[i--]; + for (; old_id < new_id; old_id = mop[--j]) + mop[k--] = old_id; + mop[k--] = new_id; + } + mop[0] = mop_len; } - if (tx2) - continue; - } - if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn))) - goto done; - need--; - } - mdb_midl_sort(txn->mt_spill_pgs); - // Flush the spilled part of dirty list - if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS) - goto done; - - // Reset any dirty pages we kept that page_flush didn't see - rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); - -done: - txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; - return rc; - /* - return 0 -} - -// Allocate page numbers and memory for writing. Maintain me_pglast, -// me_pghead and mt_next_pgno. -// -// If there are free pages available from older transactions, they -// are re-used first. Otherwise allocate a new page at mt_next_pgno. -// Do not modify the freedB, just merge freeDB records into me_pghead[] -// and move me_pglast to say which records were consumed. Only this -// function can create me_pghead and move me_pglast/mt_next_pgno. -// @param[in] mc cursor A cursor handle identifying the transaction and -// database for which we are allocating. -// @param[in] num the number of pages to allocate. -// @param[out] mp Address of the allocated page(s). Requests for multiple pages -// will always be satisfied by a single contiguous chunk of memory. -// @return 0 on success, non-zero on failure. -func (c *cursor) allocPage(int num, MDB_page **mp) { - int rc, retry = INT_MAX; - MDB_txn *txn = mc->mc_txn; - MDB_env *env = txn->mt_env; - pgno_t pgno, *mop = env->me_pghead; - unsigned i, j, k, mop_len = mop ? mop[0] : 0, n2 = num-1; - MDB_page *np; - txnid_t oldest = 0, last; - MDB_cursor_op op; - MDB_cursor m2; - - *mp = NULL; - - // If our dirty list is already full, we can't do anything - if (txn->mt_dirty_room == 0) { - rc = MDB_TXN_FULL; - goto fail; - } - - for (op = MDB_FIRST;; op = MDB_NEXT) { - MDB_val key, data; - MDB_node *leaf; - pgno_t *idl, old_id, new_id; - - // Seek a big enough contiguous page range. Prefer - // pages at the tail, just truncating the list. - if (mop_len > n2) { - i = mop_len; - do { - pgno = mop[i]; - if (mop[i-n2] == pgno+n2) - goto search_done; - } while (--i > n2); - if (Max_retries < INT_MAX && --retry < 0) - break; - } - - if (op == MDB_FIRST) { // 1st iteration - // Prepare to fetch more and coalesce - oldest = mdb_find_oldest(txn); - last = env->me_pglast; - mdb_cursor_init(&m2, txn, FREE_DBI, NULL); - if (last) { - op = MDB_SET_RANGE; - key.mv_data = &last; // will look up last+1 - key.mv_size = sizeof(last); + // Use new pages from the map when nothing suitable in the freeDB + i = 0; + pgno = txn->mt_next_pgno; + if (pgno + num >= env->me_maxpg) { + DPUTS("DB size maxed out"); + rc = MDB_MAP_FULL; + goto fail; } - } - last++; - // Do not fetch more if the record will be too recent - if (oldest <= last) - break; - rc = mdb_cursor_get(&m2, &key, NULL, op); - if (rc) { - if (rc == MDB_NOTFOUND) - break; - goto fail; - } - last = *(txnid_t*)key.mv_data; - if (oldest <= last) - break; - np = m2.mc_pg[m2.mc_top]; - leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); - if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS) + search_done: + if (env->me_flags & MDB_WRITEMAP) { + np = (MDB_page *)(env->me_map + env->me_psize * pgno); + } else { + if (!(np = mdb_page_malloc(txn, num))) { + rc = ENOMEM; + goto fail; + } + } + if (i) { + mop[0] = mop_len -= num; + // Move any stragglers down + for (j = i-num; j < mop_len; ) + mop[++j] = mop[++i]; + } else { + txn->mt_next_pgno = pgno + num; + } + np->mp_pgno = pgno; + mdb_page_dirty(txn, np); + *mp = np; + + return MDB_SUCCESS; + + fail: + txn->mt_flags |= MDB_TXN_ERROR; return rc; - - idl = (MDB_ID *) data.mv_data; - i = idl[0]; - if (!mop) { - if (!(env->me_pghead = mop = mdb_midl_alloc(i))) { - rc = ENOMEM; - goto fail; - } - } else { - if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0) - goto fail; - mop = env->me_pghead; - } - env->me_pglast = last; -#if (MDB_DEBUG) > 1 - DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u", - last, txn->mt_dbs[FREE_DBI].md_root, i)); - for (k = i; k; k--) - DPRINTF(("IDL %"Z"u", idl[k])); -#endif - // Merge in descending sorted order - j = mop_len; - k = mop_len += i; - mop[0] = (pgno_t)-1; - old_id = mop[j]; - while (i) { - new_id = idl[i--]; - for (; old_id < new_id; old_id = mop[--j]) - mop[k--] = old_id; - mop[k--] = new_id; - } - mop[0] = mop_len; - } - - // Use new pages from the map when nothing suitable in the freeDB - i = 0; - pgno = txn->mt_next_pgno; - if (pgno + num >= env->me_maxpg) { - DPUTS("DB size maxed out"); - rc = MDB_MAP_FULL; - goto fail; - } - -search_done: - if (env->me_flags & MDB_WRITEMAP) { - np = (MDB_page *)(env->me_map + env->me_psize * pgno); - } else { - if (!(np = mdb_page_malloc(txn, num))) { - rc = ENOMEM; - goto fail; - } - } - if (i) { - mop[0] = mop_len -= num; - // Move any stragglers down - for (j = i-num; j < mop_len; ) - mop[++j] = mop[++i]; - } else { - txn->mt_next_pgno = pgno + num; - } - np->mp_pgno = pgno; - mdb_page_dirty(txn, np); - *mp = np; - - return MDB_SUCCESS; - -fail: - txn->mt_flags |= MDB_TXN_ERROR; - return rc; */ - return 0 + return nil } // Copy the used portions of a non-overflow page. @@ -406,19 +405,19 @@ fail: // @param[in] psize size of a page func (p *page) copyTo(dst *page, size int) { /* - enum { Align = sizeof(pgno_t) }; - indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; + enum { Align = sizeof(pgno_t) }; + indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; - // If page isn't full, just copy the used portion. Adjust - // alignment so memcpy may copy words instead of bytes. - if ((unused &= -Align) && !IS_LEAF2(src)) { - upper &= -Align; - memcpy(dst, src, (lower + (Align-1)) & -Align); - memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), - psize - upper); - } else { - memcpy(dst, src, psize - unused); - } + // If page isn't full, just copy the used portion. Adjust + // alignment so memcpy may copy words instead of bytes. + if ((unused &= -Align) && !IS_LEAF2(src)) { + upper &= -Align; + memcpy(dst, src, (lower + (Align-1)) & -Align); + memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), + psize - upper); + } else { + memcpy(dst, src, psize - unused); + } */ } @@ -427,101 +426,101 @@ func (p *page) copyTo(dst *page, size int) { // @return 0 on success, non-zero on failure. func (c *cursor) page_touch() int { /* - MDB_page *mp = mc->mc_pg[mc->mc_top], *np; - MDB_txn *txn = mc->mc_txn; - MDB_cursor *m2, *m3; - pgno_t pgno; - int rc; + MDB_page *mp = mc->mc_pg[mc->mc_top], *np; + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m2, *m3; + pgno_t pgno; + int rc; - if (!F_ISSET(mp->mp_flags, P_DIRTY)) { - if (txn->mt_flags & MDB_TXN_SPILLS) { - np = NULL; - rc = mdb_page_unspill(txn, mp, &np); - if (rc) - goto fail; - if (np) - goto done; - } - if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || - (rc = mdb_page_alloc(mc, 1, &np))) - goto fail; - pgno = np->mp_pgno; - DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc), - mp->mp_pgno, pgno)); - mdb_cassert(mc, mp->mp_pgno != pgno); - mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); - // Update the parent page, if any, to point to the new page - if (mc->mc_top) { - MDB_page *parent = mc->mc_pg[mc->mc_top-1]; - MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]); - SETPGNO(node, pgno); - } else { - mc->mc_db->md_root = pgno; - } - } else if (txn->mt_parent && !IS_SUBP(mp)) { - MDB_ID2 mid, *dl = txn->mt_u.dirty_list; - pgno = mp->mp_pgno; - // If txn has a parent, make sure the page is in our - // dirty list. - if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - if (mp != dl[x].mptr) { // bad cursor? - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CORRUPTED; + if (!F_ISSET(mp->mp_flags, P_DIRTY)) { + if (txn->mt_flags & MDB_TXN_SPILLS) { + np = NULL; + rc = mdb_page_unspill(txn, mp, &np); + if (rc) + goto fail; + if (np) + goto done; } + if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || + (rc = mdb_page_alloc(mc, 1, &np))) + goto fail; + pgno = np->mp_pgno; + DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc), + mp->mp_pgno, pgno)); + mdb_cassert(mc, mp->mp_pgno != pgno); + mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); + // Update the parent page, if any, to point to the new page + if (mc->mc_top) { + MDB_page *parent = mc->mc_pg[mc->mc_top-1]; + MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]); + SETPGNO(node, pgno); + } else { + mc->mc_db->md_root = pgno; + } + } else if (txn->mt_parent && !IS_SUBP(mp)) { + MDB_ID2 mid, *dl = txn->mt_u.dirty_list; + pgno = mp->mp_pgno; + // If txn has a parent, make sure the page is in our + // dirty list. + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + if (mp != dl[x].mptr) { // bad cursor? + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + return 0; + } + } + mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); + // No - copy it + np = mdb_page_malloc(txn, 1); + if (!np) + return ENOMEM; + mid.mid = pgno; + mid.mptr = np; + rc = mdb_mid2l_insert(dl, &mid); + mdb_cassert(mc, rc == 0); + } else { return 0; } - } - mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); - // No - copy it - np = mdb_page_malloc(txn, 1); - if (!np) - return ENOMEM; - mid.mid = pgno; - mid.mptr = np; - rc = mdb_mid2l_insert(dl, &mid); - mdb_cassert(mc, rc == 0); - } else { - return 0; - } - mdb_page_copy(np, mp, txn->mt_env->me_psize); - np->mp_pgno = pgno; - np->mp_flags |= P_DIRTY; + mdb_page_copy(np, mp, txn->mt_env->me_psize); + np->mp_pgno = pgno; + np->mp_flags |= P_DIRTY; -done: - // Adjust cursors pointing to mp - mc->mc_pg[mc->mc_top] = np; - m2 = txn->mt_cursors[mc->mc_dbi]; - if (mc->mc_flags & C_SUB) { - for (; m2; m2=m2->mc_next) { - m3 = &m2->mc_xcursor->mx_cursor; - if (m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[mc->mc_top] == mp) - m3->mc_pg[mc->mc_top] = np; - } - } else { - for (; m2; m2=m2->mc_next) { - if (m2->mc_snum < mc->mc_snum) continue; - if (m2->mc_pg[mc->mc_top] == mp) { - m2->mc_pg[mc->mc_top] = np; - if ((mc->mc_db->md_flags & MDB_DUPSORT) && - m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) - { - MDB_node *leaf = NODEPTR(np, mc->mc_ki[mc->mc_top]); - if (!(leaf->mn_flags & F_SUBDATA)) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + done: + // Adjust cursors pointing to mp + mc->mc_pg[mc->mc_top] = np; + m2 = txn->mt_cursors[mc->mc_dbi]; + if (mc->mc_flags & C_SUB) { + for (; m2; m2=m2->mc_next) { + m3 = &m2->mc_xcursor->mx_cursor; + if (m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[mc->mc_top] == mp) + m3->mc_pg[mc->mc_top] = np; + } + } else { + for (; m2; m2=m2->mc_next) { + if (m2->mc_snum < mc->mc_snum) continue; + if (m2->mc_pg[mc->mc_top] == mp) { + m2->mc_pg[mc->mc_top] = np; + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) + { + MDB_node *leaf = NODEPTR(np, mc->mc_ki[mc->mc_top]); + if (!(leaf->mn_flags & F_SUBDATA)) + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } + } } } - } - } - return 0; + return 0; -fail: - txn->mt_flags |= MDB_TXN_ERROR; - return rc; + fail: + txn->mt_flags |= MDB_TXN_ERROR; + return rc; */ return 0 @@ -535,127 +534,127 @@ fail: // If no entry larger or equal to the key is found, returns NULL. func (c *cursor) search(key []byte) (*node, bool) { /* - unsigned int i = 0, nkeys; - int low, high; - int rc = 0; - MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_node *node = NULL; - MDB_val nodekey; - MDB_cmp_func *cmp; - DKBUF; + unsigned int i = 0, nkeys; + int low, high; + int rc = 0; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_node *node = NULL; + MDB_val nodekey; + MDB_cmp_func *cmp; + DKBUF; - nkeys = NUMKEYS(mp); + nkeys = NUMKEYS(mp); - DPRINTF(("searching %u keys in %s %spage %"Z"u", - nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", - mdb_dbg_pgno(mp))); + DPRINTF(("searching %u keys in %s %spage %"Z"u", + nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", + mdb_dbg_pgno(mp))); - low = IS_LEAF(mp) ? 0 : 1; - high = nkeys - 1; - cmp = mc->mc_dbx->md_cmp; + low = IS_LEAF(mp) ? 0 : 1; + high = nkeys - 1; + cmp = mc->mc_dbx->md_cmp; - // Branch pages have no data, so if using integer keys, - // alignment is guaranteed. Use faster mdb_cmp_int. - if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) { - if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t)) - cmp = mdb_cmp_long; - else - cmp = mdb_cmp_int; - } + // Branch pages have no data, so if using integer keys, + // alignment is guaranteed. Use faster mdb_cmp_int. + if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) { + if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t)) + cmp = mdb_cmp_long; + else + cmp = mdb_cmp_int; + } - if (IS_LEAF2(mp)) { - nodekey.mv_size = mc->mc_db->md_pad; - node = NODEPTR(mp, 0); // fake - while (low <= high) { - i = (low + high) >> 1; - nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); - rc = cmp(key, &nodekey); - DPRINTF(("found leaf index %u [%s], rc = %i", - i, DKEY(&nodekey), rc)); - if (rc == 0) - break; - if (rc > 0) - low = i + 1; - else - high = i - 1; - } - } else { - while (low <= high) { - i = (low + high) >> 1; + if (IS_LEAF2(mp)) { + nodekey.mv_size = mc->mc_db->md_pad; + node = NODEPTR(mp, 0); // fake + while (low <= high) { + i = (low + high) >> 1; + nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); + rc = cmp(key, &nodekey); + DPRINTF(("found leaf index %u [%s], rc = %i", + i, DKEY(&nodekey), rc)); + if (rc == 0) + break; + if (rc > 0) + low = i + 1; + else + high = i - 1; + } + } else { + while (low <= high) { + i = (low + high) >> 1; - node = NODEPTR(mp, i); - nodekey.mv_size = NODEKSZ(node); - nodekey.mv_data = NODEKEY(node); + node = NODEPTR(mp, i); + nodekey.mv_size = NODEKSZ(node); + nodekey.mv_data = NODEKEY(node); - rc = cmp(key, &nodekey); -#if MDB_DEBUG - if (IS_LEAF(mp)) - DPRINTF(("found leaf index %u [%s], rc = %i", - i, DKEY(&nodekey), rc)); - else - DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i", - i, DKEY(&nodekey), NODEPGNO(node), rc)); -#endif - if (rc == 0) - break; - if (rc > 0) - low = i + 1; - else - high = i - 1; - } - } + rc = cmp(key, &nodekey); + #if MDB_DEBUG + if (IS_LEAF(mp)) + DPRINTF(("found leaf index %u [%s], rc = %i", + i, DKEY(&nodekey), rc)); + else + DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i", + i, DKEY(&nodekey), NODEPGNO(node), rc)); + #endif + if (rc == 0) + break; + if (rc > 0) + low = i + 1; + else + high = i - 1; + } + } - if (rc > 0) { // Found entry is less than the key. - i++; // Skip to get the smallest entry larger than key. - if (!IS_LEAF2(mp)) - node = NODEPTR(mp, i); - } - if (exactp) - *exactp = (rc == 0 && nkeys > 0); - // store the key index - mc->mc_ki[mc->mc_top] = i; - if (i >= nkeys) - // There is no entry larger or equal to the key. - return NULL; + if (rc > 0) { // Found entry is less than the key. + i++; // Skip to get the smallest entry larger than key. + if (!IS_LEAF2(mp)) + node = NODEPTR(mp, i); + } + if (exactp) + *exactp = (rc == 0 && nkeys > 0); + // store the key index + mc->mc_ki[mc->mc_top] = i; + if (i >= nkeys) + // There is no entry larger or equal to the key. + return NULL; - // nodeptr is fake for LEAF2 - return node; + // nodeptr is fake for LEAF2 + return node; */ return nil, false } func (c *cursor) pop() { /* - if (mc->mc_snum) { -#if MDB_DEBUG - MDB_page *top = mc->mc_pg[mc->mc_top]; -#endif - mc->mc_snum--; - if (mc->mc_snum) - mc->mc_top--; + if (mc->mc_snum) { + #if MDB_DEBUG + MDB_page *top = mc->mc_pg[mc->mc_top]; + #endif + mc->mc_snum--; + if (mc->mc_snum) + mc->mc_top--; - DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno, - DDBI(mc), (void *) mc)); - } + DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno, + DDBI(mc), (void *) mc)); + } */ } /** Push a page onto the top of the cursor's stack. */ func (c *cursor) push(p *page) error { /* - DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno, - DDBI(mc), (void *) mc)); + DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno, + DDBI(mc), (void *) mc)); - if (mc->mc_snum >= CURSOR_STACK) { - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CURSOR_FULL; - } + if (mc->mc_snum >= CURSOR_STACK) { + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CURSOR_FULL; + } - mc->mc_top = mc->mc_snum++; - mc->mc_pg[mc->mc_top] = mp; - mc->mc_ki[mc->mc_top] = 0; + mc->mc_top = mc->mc_snum++; + mc->mc_pg[mc->mc_top] = mp; + mc->mc_ki[mc->mc_top] = 0; - return MDB_SUCCESS; + return MDB_SUCCESS; */ return nil } @@ -664,67 +663,67 @@ func (c *cursor) push(p *page) error { // The cursor is at the root page, set up the rest of it. func (c *cursor) searchRoot(key []byte, flags int) error { /* - MDB_page *mp = mc->mc_pg[mc->mc_top]; - int rc; - DKBUF; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + int rc; + DKBUF; - while (IS_BRANCH(mp)) { - MDB_node *node; - indx_t i; + while (IS_BRANCH(mp)) { + MDB_node *node; + indx_t i; - DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); - mdb_cassert(mc, NUMKEYS(mp) > 1); - DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); + DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); + mdb_cassert(mc, NUMKEYS(mp) > 1); + DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); - if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { - i = 0; - if (flags & MDB_PS_LAST) - i = NUMKEYS(mp) - 1; - } else { - int exact; - node = mdb_node_search(mc, key, &exact); - if (node == NULL) - i = NUMKEYS(mp) - 1; - else { - i = mc->mc_ki[mc->mc_top]; - if (!exact) { - mdb_cassert(mc, i > 0); - i--; + if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { + i = 0; + if (flags & MDB_PS_LAST) + i = NUMKEYS(mp) - 1; + } else { + int exact; + node = mdb_node_search(mc, key, &exact); + if (node == NULL) + i = NUMKEYS(mp) - 1; + else { + i = mc->mc_ki[mc->mc_top]; + if (!exact) { + mdb_cassert(mc, i > 0); + i--; + } } + DPRINTF(("following index %u for key [%s]", i, DKEY(key))); } - DPRINTF(("following index %u for key [%s]", i, DKEY(key))); - } - mdb_cassert(mc, i < NUMKEYS(mp)); - node = NODEPTR(mp, i); + mdb_cassert(mc, i < NUMKEYS(mp)); + node = NODEPTR(mp, i); - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) - return rc; - - mc->mc_ki[mc->mc_top] = i; - if ((rc = mdb_cursor_push(mc, mp))) - return rc; - - if (flags & MDB_PS_MODIFY) { - if ((rc = mdb_page_touch(mc)) != 0) + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) return rc; - mp = mc->mc_pg[mc->mc_top]; + + mc->mc_ki[mc->mc_top] = i; + if ((rc = mdb_cursor_push(mc, mp))) + return rc; + + if (flags & MDB_PS_MODIFY) { + if ((rc = mdb_page_touch(mc)) != 0) + return rc; + mp = mc->mc_pg[mc->mc_top]; + } } - } - if (!IS_LEAF(mp)) { - DPRINTF(("internal error, index points to a %02X page!?", - mp->mp_flags)); - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CORRUPTED; - } + if (!IS_LEAF(mp)) { + DPRINTF(("internal error, index points to a %02X page!?", + mp->mp_flags)); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } - DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno, - key ? DKEY(key) : "null")); - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; + DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno, + key ? DKEY(key) : "null")); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; - return MDB_SUCCESS; + return MDB_SUCCESS; */ return nil } @@ -736,18 +735,19 @@ func (c *cursor) searchRoot(key []byte, flags int) error { // be underfilled. func (c *cursor) searchLowest() error { /* - MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_node *node = NODEPTR(mp, 0); - int rc; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_node *node = NODEPTR(mp, 0); + int rc; - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) - return rc; + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) + return rc; - mc->mc_ki[mc->mc_top] = 0; - if ((rc = mdb_cursor_push(mc, mp))) - return rc; - return mdb_page_search_root(mc, NULL, MDB_PS_FIRST); + mc->mc_ki[mc->mc_top] = 0; + if ((rc = mdb_cursor_push(mc, mp))) + return rc; + return mdb_page_search_root(mc, NULL, MDB_PS_FIRST); */ + return nil } // Search for the page a given key should be in. @@ -762,151 +762,150 @@ func (c *cursor) searchLowest() error { // @return 0 on success, non-zero on failure. func (c *cursor) findPage(key []byte, flags int) error { /* - int rc; - pgno_t root; + int rc; + pgno_t root; - // Make sure the txn is still viable, then find the root from - // the txn's db table and set it as the root of the cursor's stack. - if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) { - DPUTS("transaction has failed, must abort"); - return MDB_BAD_TXN; - } else { - // Make sure we're using an up-to-date root - if (*mc->mc_dbflag & DB_STALE) { - MDB_cursor mc2; - mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); - rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0); - if (rc) - return rc; - { - MDB_val data; - int exact = 0; - uint16_t flags; - MDB_node *leaf = mdb_node_search(&mc2, - &mc->mc_dbx->md_name, &exact); - if (!exact) - return MDB_NOTFOUND; - rc = mdb_node_read(mc->mc_txn, leaf, &data); + // Make sure the txn is still viable, then find the root from + // the txn's db table and set it as the root of the cursor's stack. + if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) { + DPUTS("transaction has failed, must abort"); + return MDB_BAD_TXN; + } else { + // Make sure we're using an up-to-date root + if (*mc->mc_dbflag & DB_STALE) { + MDB_cursor mc2; + mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); + rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0); if (rc) return rc; - memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), - sizeof(uint16_t)); - // The txn may not know this DBI, or another process may - // have dropped and recreated the DB with other flags. - if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags) - return MDB_INCOMPATIBLE; - memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); - } - *mc->mc_dbflag &= ~DB_STALE; + { + MDB_val data; + int exact = 0; + uint16_t flags; + MDB_node *leaf = mdb_node_search(&mc2, + &mc->mc_dbx->md_name, &exact); + if (!exact) + return MDB_NOTFOUND; + rc = mdb_node_read(mc->mc_txn, leaf, &data); + if (rc) + return rc; + memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), + sizeof(uint16_t)); + // The txn may not know this DBI, or another process may + // have dropped and recreated the DB with other flags. + if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags) + return MDB_INCOMPATIBLE; + memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); + } + *mc->mc_dbflag &= ~DB_STALE; + } + root = mc->mc_db->md_root; + + if (root == P_INVALID) { // Tree is empty. + DPUTS("tree is empty"); + return MDB_NOTFOUND; + } } - root = mc->mc_db->md_root; - if (root == P_INVALID) { // Tree is empty. - DPUTS("tree is empty"); - return MDB_NOTFOUND; + mdb_cassert(mc, root > 1); + if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) + if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0], NULL)) != 0) + return rc; + + mc->mc_snum = 1; + mc->mc_top = 0; + + DPRINTF(("db %d root page %"Z"u has flags 0x%X", + DDBI(mc), root, mc->mc_pg[0]->mp_flags)); + + if (flags & MDB_PS_MODIFY) { + if ((rc = mdb_page_touch(mc))) + return rc; } - } - mdb_cassert(mc, root > 1); - if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) - if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0], NULL)) != 0) - return rc; + if (flags & MDB_PS_ROOTONLY) + return MDB_SUCCESS; - mc->mc_snum = 1; - mc->mc_top = 0; - - DPRINTF(("db %d root page %"Z"u has flags 0x%X", - DDBI(mc), root, mc->mc_pg[0]->mp_flags)); - - if (flags & MDB_PS_MODIFY) { - if ((rc = mdb_page_touch(mc))) - return rc; - } - - if (flags & MDB_PS_ROOTONLY) - return MDB_SUCCESS; - - return mdb_page_search_root(mc, key, flags); + return mdb_page_search_root(mc, key, flags); */ return nil } func (c *cursor) freeOverflowPage(p *page) error { /* - MDB_txn *txn = mc->mc_txn; - pgno_t pg = mp->mp_pgno; - unsigned x = 0, ovpages = mp->mp_pages; - MDB_env *env = txn->mt_env; - MDB_IDL sl = txn->mt_spill_pgs; - MDB_ID pn = pg << 1; - int rc; + MDB_txn *txn = mc->mc_txn; + pgno_t pg = mp->mp_pgno; + unsigned x = 0, ovpages = mp->mp_pages; + MDB_env *env = txn->mt_env; + MDB_IDL sl = txn->mt_spill_pgs; + MDB_ID pn = pg << 1; + int rc; - DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages)); - // If the page is dirty or on the spill list we just acquired it, - // so we should give it back to our current free list, if any. - // Otherwise put it onto the list of pages we freed in this txn. - // - // Won't create me_pghead: me_pglast must be inited along with it. - // Unsupported in nested txns: They would need to hide the page - // range in ancestor txns' dirty and spilled lists. - if (env->me_pghead && - !txn->mt_parent && - ((mp->mp_flags & P_DIRTY) || - (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) - { - unsigned i, j; - pgno_t *mop; - MDB_ID2 *dl, ix, iy; - rc = mdb_midl_need(&env->me_pghead, ovpages); - if (rc) - return rc; - if (!(mp->mp_flags & P_DIRTY)) { - // This page is no longer spilled - if (x == sl[0]) - sl[0]--; - else - sl[x] |= 1; - goto release; - } - // Remove from dirty list - dl = txn->mt_u.dirty_list; - x = dl[0].mid--; - for (ix = dl[x]; ix.mptr != mp; ix = iy) { - if (x > 1) { - x--; - iy = dl[x]; - dl[x] = ix; + DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages)); + // If the page is dirty or on the spill list we just acquired it, + // so we should give it back to our current free list, if any. + // Otherwise put it onto the list of pages we freed in this txn. + // + // Won't create me_pghead: me_pglast must be inited along with it. + // Unsupported in nested txns: They would need to hide the page + // range in ancestor txns' dirty and spilled lists. + if (env->me_pghead && + !txn->mt_parent && + ((mp->mp_flags & P_DIRTY) || + (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) + { + unsigned i, j; + pgno_t *mop; + MDB_ID2 *dl, ix, iy; + rc = mdb_midl_need(&env->me_pghead, ovpages); + if (rc) + return rc; + if (!(mp->mp_flags & P_DIRTY)) { + // This page is no longer spilled + if (x == sl[0]) + sl[0]--; + else + sl[x] |= 1; + goto release; + } + // Remove from dirty list + dl = txn->mt_u.dirty_list; + x = dl[0].mid--; + for (ix = dl[x]; ix.mptr != mp; ix = iy) { + if (x > 1) { + x--; + iy = dl[x]; + dl[x] = ix; + } else { + mdb_cassert(mc, x > 1); + j = ++(dl[0].mid); + dl[j] = ix; // Unsorted. OK when MDB_TXN_ERROR. + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + } + if (!(env->me_flags & MDB_WRITEMAP)) + mdb_dpage_free(env, mp); + release: + // Insert in me_pghead + mop = env->me_pghead; + j = mop[0] + ovpages; + for (i = mop[0]; i && mop[i] < pg; i--) + mop[j--] = mop[i]; + while (j>i) + mop[j--] = pg++; + mop[0] += ovpages; } else { - mdb_cassert(mc, x > 1); - j = ++(dl[0].mid); - dl[j] = ix; // Unsorted. OK when MDB_TXN_ERROR. - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CORRUPTED; + rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages); + if (rc) + return rc; } - } - if (!(env->me_flags & MDB_WRITEMAP)) - mdb_dpage_free(env, mp); -release: - // Insert in me_pghead - mop = env->me_pghead; - j = mop[0] + ovpages; - for (i = mop[0]; i && mop[i] < pg; i--) - mop[j--] = mop[i]; - while (j>i) - mop[j--] = pg++; - mop[0] += ovpages; - } else { - rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages); - if (rc) - return rc; - } - mc->mc_db->md_overflow_pages -= ovpages; - return 0; + mc->mc_db->md_overflow_pages -= ovpages; + return 0; */ return nil } - // Find a sibling for a page. // Replaces the page at the top of the cursor's stack with the // specified sibling, if one exists. @@ -916,50 +915,50 @@ release: // @return 0 on success, non-zero on failure. func (c *cursor) sibling(moveRight bool) error { /* - int rc; - MDB_node *indx; - MDB_page *mp; + int rc; + MDB_node *indx; + MDB_page *mp; - if (mc->mc_snum < 2) { - return MDB_NOTFOUND; // root has no siblings - } + if (mc->mc_snum < 2) { + return MDB_NOTFOUND; // root has no siblings + } - mdb_cursor_pop(mc); - DPRINTF(("parent page is page %"Z"u, index %u", - mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); + mdb_cursor_pop(mc); + DPRINTF(("parent page is page %"Z"u, index %u", + mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); - if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) - : (mc->mc_ki[mc->mc_top] == 0)) { - DPRINTF(("no more keys left, moving to %s sibling", - move_right ? "right" : "left")); - if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) { - // undo cursor_pop before returning - mc->mc_top++; - mc->mc_snum++; + if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) + : (mc->mc_ki[mc->mc_top] == 0)) { + DPRINTF(("no more keys left, moving to %s sibling", + move_right ? "right" : "left")); + if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) { + // undo cursor_pop before returning + mc->mc_top++; + mc->mc_snum++; + return rc; + } + } else { + if (move_right) + mc->mc_ki[mc->mc_top]++; + else + mc->mc_ki[mc->mc_top]--; + DPRINTF(("just moving to %s index key %u", + move_right ? "right" : "left", mc->mc_ki[mc->mc_top])); + } + mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); + + indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) { + // mc will be inconsistent if caller does mc_snum++ as above + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); return rc; } - } else { - if (move_right) - mc->mc_ki[mc->mc_top]++; - else - mc->mc_ki[mc->mc_top]--; - DPRINTF(("just moving to %s index key %u", - move_right ? "right" : "left", mc->mc_ki[mc->mc_top])); - } - mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); - indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) { - // mc will be inconsistent if caller does mc_snum++ as above - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - return rc; - } + mdb_cursor_push(mc, mp); + if (!move_right) + mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1; - mdb_cursor_push(mc, mp); - if (!move_right) - mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1; - - return MDB_SUCCESS; + return MDB_SUCCESS; */ return nil } @@ -967,81 +966,81 @@ func (c *cursor) sibling(moveRight bool) error { // Move the cursor to the next data item. func (c *cursor) next(key []byte, data []byte, op int) error { /* - MDB_page *mp; - MDB_node *leaf; - int rc; + MDB_page *mp; + MDB_node *leaf; + int rc; - if (mc->mc_flags & C_EOF) { - return MDB_NOTFOUND; - } + if (mc->mc_flags & C_EOF) { + return MDB_NOTFOUND; + } - mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); + mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); - mp = mc->mc_pg[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; - if (mc->mc_db->md_flags & MDB_DUPSORT) { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_NEXT || op == MDB_NEXT_DUP) { - rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); - if (op != MDB_NEXT || rc != MDB_NOTFOUND) { - if (rc == MDB_SUCCESS) - MDB_GET_KEY(leaf, key); - return rc; + if (mc->mc_db->md_flags & MDB_DUPSORT) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_NEXT || op == MDB_NEXT_DUP) { + rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); + if (op != MDB_NEXT || rc != MDB_NOTFOUND) { + if (rc == MDB_SUCCESS) + MDB_GET_KEY(leaf, key); + return rc; + } + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if (op == MDB_NEXT_DUP) + return MDB_NOTFOUND; } } - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (op == MDB_NEXT_DUP) - return MDB_NOTFOUND; - } - } - DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", - mdb_dbg_pgno(mp), (void *) mc)); - if (mc->mc_flags & C_DEL) - goto skip; + DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", + mdb_dbg_pgno(mp), (void *) mc)); + if (mc->mc_flags & C_DEL) + goto skip; - if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { - DPUTS("=====> move to next sibling page"); - if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { - mc->mc_flags |= C_EOF; - return rc; - } - mp = mc->mc_pg[mc->mc_top]; - DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); - } else - mc->mc_ki[mc->mc_top]++; + if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { + DPUTS("=====> move to next sibling page"); + if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { + mc->mc_flags |= C_EOF; + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + } else + mc->mc_ki[mc->mc_top]++; -skip: - DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", - mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); + skip: + DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } - mdb_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - } - if (data) { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) - return rc; + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + } + if (data) { + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + return rc; - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc != MDB_SUCCESS) - return rc; - } - } + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc != MDB_SUCCESS) + return rc; + } + } - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; */ return nil } @@ -1049,516 +1048,520 @@ skip: // Move the cursor to the previous data item. func (c *cursor) prev(key []byte, data []byte, op int) error { /* - MDB_page *mp; - MDB_node *leaf; - int rc; + MDB_page *mp; + MDB_node *leaf; + int rc; - mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); + mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); - mp = mc->mc_pg[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; - if (mc->mc_db->md_flags & MDB_DUPSORT) { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_PREV || op == MDB_PREV_DUP) { - rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); - if (op != MDB_PREV || rc != MDB_NOTFOUND) { - if (rc == MDB_SUCCESS) - MDB_GET_KEY(leaf, key); - return rc; + if (mc->mc_db->md_flags & MDB_DUPSORT) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_PREV || op == MDB_PREV_DUP) { + rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); + if (op != MDB_PREV || rc != MDB_NOTFOUND) { + if (rc == MDB_SUCCESS) + MDB_GET_KEY(leaf, key); + return rc; + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if (op == MDB_PREV_DUP) + return MDB_NOTFOUND; } - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (op == MDB_PREV_DUP) - return MDB_NOTFOUND; } } - } - DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", - mdb_dbg_pgno(mp), (void *) mc)); + DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", + mdb_dbg_pgno(mp), (void *) mc)); - if (mc->mc_ki[mc->mc_top] == 0) { - DPUTS("=====> move to prev sibling page"); - if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { - return rc; + if (mc->mc_ki[mc->mc_top] == 0) { + DPUTS("=====> move to prev sibling page"); + if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; + DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + } else + mc->mc_ki[mc->mc_top]--; + + mc->mc_flags &= ~C_EOF; + + DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; } - mp = mc->mc_pg[mc->mc_top]; - mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; - DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); - } else - mc->mc_ki[mc->mc_top]--; - mc->mc_flags &= ~C_EOF; - - DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", - mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); - - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } - - mdb_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - } - if (data) { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) - return rc; + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc != MDB_SUCCESS) - return rc; + mdb_xcursor_init1(mc, leaf); } - } + if (data) { + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + return rc; - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc != MDB_SUCCESS) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; */ + return nil } // Set the cursor on a specific data item. // (bool return is whether it is exact). func (c *cursor) set(key []byte, data []byte, op int) (error, bool) { /* - int rc; - MDB_page *mp; - MDB_node *leaf = NULL; - DKBUF; + int rc; + MDB_page *mp; + MDB_node *leaf = NULL; + DKBUF; - if (key->mv_size == 0) - return MDB_BAD_VALSIZE; + if (key->mv_size == 0) + return MDB_BAD_VALSIZE; - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - // See if we're already on the right page - if (mc->mc_flags & C_INITIALIZED) { - MDB_val nodekey; + // See if we're already on the right page + if (mc->mc_flags & C_INITIALIZED) { + MDB_val nodekey; - mp = mc->mc_pg[mc->mc_top]; - if (!NUMKEYS(mp)) { - mc->mc_ki[mc->mc_top] = 0; - return MDB_NOTFOUND; - } - if (mp->mp_flags & P_LEAF2) { - nodekey.mv_size = mc->mc_db->md_pad; - nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); - } else { - leaf = NODEPTR(mp, 0); - MDB_GET_KEY2(leaf, nodekey); - } - rc = mc->mc_dbx->md_cmp(key, &nodekey); - if (rc == 0) { - // Probably happens rarely, but first node on the page - // was the one we wanted. - mc->mc_ki[mc->mc_top] = 0; - if (exactp) - *exactp = 1; - goto set1; - } - if (rc > 0) { - unsigned int i; - unsigned int nkeys = NUMKEYS(mp); - if (nkeys > 1) { + mp = mc->mc_pg[mc->mc_top]; + if (!NUMKEYS(mp)) { + mc->mc_ki[mc->mc_top] = 0; + return MDB_NOTFOUND; + } if (mp->mp_flags & P_LEAF2) { - nodekey.mv_data = LEAF2KEY(mp, - nkeys-1, nodekey.mv_size); + nodekey.mv_size = mc->mc_db->md_pad; + nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); } else { - leaf = NODEPTR(mp, nkeys-1); + leaf = NODEPTR(mp, 0); MDB_GET_KEY2(leaf, nodekey); } rc = mc->mc_dbx->md_cmp(key, &nodekey); if (rc == 0) { - // last node was the one we wanted - mc->mc_ki[mc->mc_top] = nkeys-1; + // Probably happens rarely, but first node on the page + // was the one we wanted. + mc->mc_ki[mc->mc_top] = 0; if (exactp) *exactp = 1; goto set1; } - if (rc < 0) { - if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { - // This is definitely the right page, skip search_page + if (rc > 0) { + unsigned int i; + unsigned int nkeys = NUMKEYS(mp); + if (nkeys > 1) { if (mp->mp_flags & P_LEAF2) { nodekey.mv_data = LEAF2KEY(mp, - mc->mc_ki[mc->mc_top], nodekey.mv_size); + nkeys-1, nodekey.mv_size); } else { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + leaf = NODEPTR(mp, nkeys-1); MDB_GET_KEY2(leaf, nodekey); } rc = mc->mc_dbx->md_cmp(key, &nodekey); if (rc == 0) { - // current node was the one we wanted + // last node was the one we wanted + mc->mc_ki[mc->mc_top] = nkeys-1; if (exactp) *exactp = 1; goto set1; } + if (rc < 0) { + if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { + // This is definitely the right page, skip search_page + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_data = LEAF2KEY(mp, + mc->mc_ki[mc->mc_top], nodekey.mv_size); + } else { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + // current node was the one we wanted + if (exactp) + *exactp = 1; + goto set1; + } + } + rc = 0; + goto set2; + } } - rc = 0; - goto set2; + // If any parents have right-sibs, search. + // Otherwise, there's nothing further. + for (i=0; imc_top; i++) + if (mc->mc_ki[i] < + NUMKEYS(mc->mc_pg[i])-1) + break; + if (i == mc->mc_top) { + // There are no other pages + mc->mc_ki[mc->mc_top] = nkeys; + return MDB_NOTFOUND; + } + } + if (!mc->mc_top) { + // There are no other pages + mc->mc_ki[mc->mc_top] = 0; + if (op == MDB_SET_RANGE) { + rc = 0; + goto set1; + } else + return MDB_NOTFOUND; } } - // If any parents have right-sibs, search. - // Otherwise, there's nothing further. - for (i=0; imc_top; i++) - if (mc->mc_ki[i] < - NUMKEYS(mc->mc_pg[i])-1) - break; - if (i == mc->mc_top) { - // There are no other pages - mc->mc_ki[mc->mc_top] = nkeys; + + rc = mdb_page_search(mc, key, 0); + if (rc != MDB_SUCCESS) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + mdb_cassert(mc, IS_LEAF(mp)); + + set2: + leaf = mdb_node_search(mc, key, exactp); + if (exactp != NULL && !*exactp) { + // MDB_SET specified and not an exact match. return MDB_NOTFOUND; } - } - if (!mc->mc_top) { - // There are no other pages - mc->mc_ki[mc->mc_top] = 0; - if (op == MDB_SET_RANGE) { - rc = 0; - goto set1; - } else - return MDB_NOTFOUND; - } - } - rc = mdb_page_search(mc, key, 0); - if (rc != MDB_SUCCESS) - return rc; + if (leaf == NULL) { + DPUTS("===> inexact leaf not found, goto sibling"); + if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) + return rc; // no entries matched + mp = mc->mc_pg[mc->mc_top]; + mdb_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, 0); + } - mp = mc->mc_pg[mc->mc_top]; - mdb_cassert(mc, IS_LEAF(mp)); + set1: + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; -set2: - leaf = mdb_node_search(mc, key, exactp); - if (exactp != NULL && !*exactp) { - // MDB_SET specified and not an exact match. - return MDB_NOTFOUND; - } + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } - if (leaf == NULL) { - DPUTS("===> inexact leaf not found, goto sibling"); - if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) - return rc; // no entries matched - mp = mc->mc_pg[mc->mc_top]; - mdb_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, 0); - } + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + } + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + } else { + int ex2, *ex2p; + if (op == MDB_GET_BOTH) { + ex2p = &ex2; + ex2 = 0; + } else { + ex2p = NULL; + } + rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); + if (rc != MDB_SUCCESS) + return rc; + } + } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { + MDB_val d2; + if ((rc = mdb_node_read(mc->mc_txn, leaf, &d2)) != MDB_SUCCESS) + return rc; + rc = mc->mc_dbx->md_dcmp(data, &d2); + if (rc) { + if (op == MDB_GET_BOTH || rc > 0) + return MDB_NOTFOUND; + rc = 0; + *data = d2; + } -set1: - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - } - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - } else { - int ex2, *ex2p; - if (op == MDB_GET_BOTH) { - ex2p = &ex2; - ex2 = 0; } else { - ex2p = NULL; + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + return rc; } - rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); - if (rc != MDB_SUCCESS) - return rc; - } - } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { - MDB_val d2; - if ((rc = mdb_node_read(mc->mc_txn, leaf, &d2)) != MDB_SUCCESS) - return rc; - rc = mc->mc_dbx->md_dcmp(data, &d2); - if (rc) { - if (op == MDB_GET_BOTH || rc > 0) - return MDB_NOTFOUND; - rc = 0; - *data = d2; } - } else { - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) - return rc; - } - } + // The key already matches in all other cases + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) + MDB_GET_KEY(leaf, key); + DPRINTF(("==> cursor placed on key [%s]", DKEY(key))); - // The key already matches in all other cases - if (op == MDB_SET_RANGE || op == MDB_SET_KEY) - MDB_GET_KEY(leaf, key); - DPRINTF(("==> cursor placed on key [%s]", DKEY(key))); - - return rc; + return rc; */ - return nil + return nil, false } // Move the cursor to the first item in the database. func (c *cursor) first(key []byte, data []byte) error { /* - int rc; - MDB_node *leaf; + int rc; + MDB_node *leaf; - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); - if (rc != MDB_SUCCESS) - return rc; - } - mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - - leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - - mc->mc_ki[mc->mc_top] = 0; - - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); - return MDB_SUCCESS; - } - - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc) - return rc; - } else { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); + if (rc != MDB_SUCCESS) return rc; } - } - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + mc->mc_ki[mc->mc_top] = 0; + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); + return MDB_SUCCESS; + } + + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc) + return rc; + } else { + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + return rc; + } + } + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; */ return nil } // Move the cursor to the last item in the database. func (c *cursor) last() ([]byte, []byte) { - int rc; - MDB_node *leaf; + /* + int rc; + MDB_node *leaf; - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (!(mc->mc_flags & C_EOF)) { + if (!(mc->mc_flags & C_EOF)) { + + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_LAST); + if (rc != MDB_SUCCESS) + return rc; + } + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdb_page_search(mc, NULL, MDB_PS_LAST); - if (rc != MDB_SUCCESS) - return rc; } - mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; + mc->mc_flags |= C_INITIALIZED|C_EOF; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - } - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; - mc->mc_flags |= C_INITIALIZED|C_EOF; - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (rc) + return rc; + } else { + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) + return rc; + } + } + + MDB_GET_KEY(leaf, key); return MDB_SUCCESS; - } - - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (rc) - return rc; - } else { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) - return rc; - } - } - - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; + */ + return nil, nil } func (c *cursor) Get(key []byte, data []byte, op int) ([]byte, []byte, error) { /* - int rc; - int exact = 0; - int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); + int rc; + int exact = 0; + int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); - if (mc == NULL) - return EINVAL; + if (mc == NULL) + return EINVAL; - if (mc->mc_txn->mt_flags & MDB_TXN_ERROR) - return MDB_BAD_TXN; + if (mc->mc_txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; - switch (op) { - case MDB_GET_CURRENT: - if (!(mc->mc_flags & C_INITIALIZED)) { - rc = EINVAL; - } else { - MDB_page *mp = mc->mc_pg[mc->mc_top]; - int nkeys = NUMKEYS(mp); - if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { - mc->mc_ki[mc->mc_top] = nkeys; - rc = MDB_NOTFOUND; - break; - } - rc = MDB_SUCCESS; - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_pad; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - } else { - MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - MDB_GET_KEY(leaf, key); - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (mc->mc_flags & C_DEL) - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); + switch (op) { + case MDB_GET_CURRENT: + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + } else { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + int nkeys = NUMKEYS(mp); + if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { + mc->mc_ki[mc->mc_top] = nkeys; + rc = MDB_NOTFOUND; + break; + } + rc = MDB_SUCCESS; + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_pad; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); } else { - rc = mdb_node_read(mc->mc_txn, leaf, data); + MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY(leaf, key); + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (mc->mc_flags & C_DEL) + mdb_xcursor_init1(mc, leaf); + rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); + } else { + rc = mdb_node_read(mc->mc_txn, leaf, data); + } + } } } - } - } - break; - case MDB_GET_BOTH: - case MDB_GET_BOTH_RANGE: - if (data == NULL) { - rc = EINVAL; - break; - } - if (mc->mc_xcursor == NULL) { - rc = MDB_INCOMPATIBLE; - break; - } - // FALLTHRU - case MDB_SET: - case MDB_SET_KEY: - case MDB_SET_RANGE: - if (key == NULL) { - rc = EINVAL; - } else { - rc = mdb_cursor_set(mc, key, data, op, - op == MDB_SET_RANGE ? NULL : &exact); - } - break; - case MDB_GET_MULTIPLE: - if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { - rc = EINVAL; - break; - } - if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { - rc = MDB_INCOMPATIBLE; - break; - } - rc = MDB_SUCCESS; - if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || - (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) - break; - goto fetchm; - case MDB_NEXT_MULTIPLE: - if (data == NULL) { - rc = EINVAL; - break; - } - if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { - rc = MDB_INCOMPATIBLE; - break; - } - if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdb_cursor_first(mc, key, data); - else - rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); - if (rc == MDB_SUCCESS) { - if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - MDB_cursor *mx; -fetchm: - mx = &mc->mc_xcursor->mx_cursor; - data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * - mx->mc_db->md_pad; - data->mv_data = METADATA(mx->mc_pg[mx->mc_top]); - mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; - } else { - rc = MDB_NOTFOUND; - } - } - break; - case MDB_NEXT: - case MDB_NEXT_DUP: - case MDB_NEXT_NODUP: - if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdb_cursor_first(mc, key, data); - else - rc = mdb_cursor_next(mc, key, data, op); - break; - case MDB_PREV: - case MDB_PREV_DUP: - case MDB_PREV_NODUP: - if (!(mc->mc_flags & C_INITIALIZED)) { - rc = mdb_cursor_last(mc, key, data); - if (rc) break; - mc->mc_flags |= C_INITIALIZED; - mc->mc_ki[mc->mc_top]++; - } - rc = mdb_cursor_prev(mc, key, data, op); - break; - case MDB_FIRST: - rc = mdb_cursor_first(mc, key, data); - break; - case MDB_FIRST_DUP: - mfunc = mdb_cursor_first; - mmove: - if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { - rc = EINVAL; - break; - } - if (mc->mc_xcursor == NULL) { - rc = MDB_INCOMPATIBLE; - break; - } - if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { - rc = EINVAL; - break; - } - rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); - break; - case MDB_LAST: - rc = mdb_cursor_last(mc, key, data); - break; - case MDB_LAST_DUP: - mfunc = mdb_cursor_last; - goto mmove; - default: - DPRINTF(("unhandled/unimplemented cursor operation %u", op)); - rc = EINVAL; - break; - } + case MDB_GET_BOTH: + case MDB_GET_BOTH_RANGE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (mc->mc_xcursor == NULL) { + rc = MDB_INCOMPATIBLE; + break; + } + // FALLTHRU + case MDB_SET: + case MDB_SET_KEY: + case MDB_SET_RANGE: + if (key == NULL) { + rc = EINVAL; + } else { + rc = mdb_cursor_set(mc, key, data, op, + op == MDB_SET_RANGE ? NULL : &exact); + } + break; + case MDB_GET_MULTIPLE: + if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + rc = MDB_SUCCESS; + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || + (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) + break; + goto fetchm; + case MDB_NEXT_MULTIPLE: + if (data == NULL) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdb_cursor_first(mc, key, data); + else + rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); + if (rc == MDB_SUCCESS) { + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + MDB_cursor *mx; + fetchm: + mx = &mc->mc_xcursor->mx_cursor; + data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * + mx->mc_db->md_pad; + data->mv_data = METADATA(mx->mc_pg[mx->mc_top]); + mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; + } else { + rc = MDB_NOTFOUND; + } + } + break; + case MDB_NEXT: + case MDB_NEXT_DUP: + case MDB_NEXT_NODUP: + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdb_cursor_first(mc, key, data); + else + rc = mdb_cursor_next(mc, key, data, op); + break; + case MDB_PREV: + case MDB_PREV_DUP: + case MDB_PREV_NODUP: + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = mdb_cursor_last(mc, key, data); + if (rc) + break; + mc->mc_flags |= C_INITIALIZED; + mc->mc_ki[mc->mc_top]++; + } + rc = mdb_cursor_prev(mc, key, data, op); + break; + case MDB_FIRST: + rc = mdb_cursor_first(mc, key, data); + break; + case MDB_FIRST_DUP: + mfunc = mdb_cursor_first; + mmove: + if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + if (mc->mc_xcursor == NULL) { + rc = MDB_INCOMPATIBLE; + break; + } + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { + rc = EINVAL; + break; + } + rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); + break; + case MDB_LAST: + rc = mdb_cursor_last(mc, key, data); + break; + case MDB_LAST_DUP: + mfunc = mdb_cursor_last; + goto mmove; + default: + DPRINTF(("unhandled/unimplemented cursor operation %u", op)); + rc = EINVAL; + break; + } - if (mc->mc_flags & C_DEL) - mc->mc_flags ^= C_DEL; + if (mc->mc_flags & C_DEL) + mc->mc_flags ^= C_DEL; - return rc; + return rc; */ return nil, nil, nil } @@ -1568,576 +1571,576 @@ fetchm: // @param[in] mc The cursor to operate on. func (c *cursor) touch() error { /* - int rc = MDB_SUCCESS; + int rc = MDB_SUCCESS; - if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) { - MDB_cursor mc2; - MDB_xcursor mcx; - mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); - rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); - if (rc) - return rc; - *mc->mc_dbflag |= DB_DIRTY; - } - mc->mc_top = 0; - if (mc->mc_snum) { - do { - rc = mdb_page_touch(mc); - } while (!rc && ++(mc->mc_top) < mc->mc_snum); - mc->mc_top = mc->mc_snum-1; - } - return rc; -} - -int -mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, - unsigned int flags) -{ - enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; // internal code - MDB_env *env; - MDB_node *leaf = NULL; - MDB_page *fp, *mp; - uint16_t fp_flags; - MDB_val xdata, *rdata, dkey, olddata; - MDB_db dummy; - int do_sub = 0, insert; - unsigned int mcount = 0, dcount = 0, nospill; - size_t nsize; - int rc, rc2; - unsigned int nflags; - DKBUF; - - if (mc == NULL || key == NULL) - return EINVAL; - - env = mc->mc_txn->mt_env; - - // Check this first so counter will always be zero on any - // early failures. - if (flags & MDB_MULTIPLE) { - dcount = data[1].mv_size; - data[1].mv_size = 0; - if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED)) - return MDB_INCOMPATIBLE; - } - - nospill = flags & MDB_NOSPILL; - flags &= ~MDB_NOSPILL; - - if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) - return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - - if (flags != MDB_CURRENT && key->mv_size-1 >= ENV_MAXKEY(env)) - return MDB_BAD_VALSIZE; - -#if SIZE_MAX > MAXDATASIZE - if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE)) - return MDB_BAD_VALSIZE; -#else - if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env)) - return MDB_BAD_VALSIZE; -#endif - - DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u", - DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size)); - - dkey.mv_size = 0; - - if (flags == MDB_CURRENT) { - if (!(mc->mc_flags & C_INITIALIZED)) - return EINVAL; - rc = MDB_SUCCESS; - } else if (mc->mc_db->md_root == P_INVALID) { - // new database, cursor has nothing to point to - mc->mc_snum = 0; - mc->mc_top = 0; - mc->mc_flags &= ~C_INITIALIZED; - rc = MDB_NO_ROOT; - } else { - int exact = 0; - MDB_val d2; - if (flags & MDB_APPEND) { - MDB_val k2; - rc = mdb_cursor_last(mc, &k2, &d2); - if (rc == 0) { - rc = mc->mc_dbx->md_cmp(key, &k2); - if (rc > 0) { - rc = MDB_NOTFOUND; - mc->mc_ki[mc->mc_top]++; - } else { - // new key is <= last key - rc = MDB_KEYEXIST; - } + if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) { + MDB_cursor mc2; + MDB_xcursor mcx; + mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); + rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); + if (rc) + return rc; + *mc->mc_dbflag |= DB_DIRTY; + } + mc->mc_top = 0; + if (mc->mc_snum) { + do { + rc = mdb_page_touch(mc); + } while (!rc && ++(mc->mc_top) < mc->mc_snum); + mc->mc_top = mc->mc_snum-1; } - } else { - rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); - } - if ((flags & MDB_NOOVERWRITE) && rc == 0) { - DPRINTF(("duplicate key [%s]", DKEY(key))); - *data = d2; - return MDB_KEYEXIST; - } - if (rc && rc != MDB_NOTFOUND) return rc; - } - - if (mc->mc_flags & C_DEL) - mc->mc_flags ^= C_DEL; - - // Cursor is positioned, check for room in the dirty list - if (!nospill) { - if (flags & MDB_MULTIPLE) { - rdata = &xdata; - xdata.mv_size = data->mv_size * dcount; - } else { - rdata = data; } - if ((rc2 = mdb_page_spill(mc, key, rdata))) - return rc2; - } - if (rc == MDB_NO_ROOT) { - MDB_page *np; - // new database, write a root leaf page - DPUTS("allocating new root leaf page"); - if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { - return rc2; - } - mdb_cursor_push(mc, np); - mc->mc_db->md_root = np->mp_pgno; - mc->mc_db->md_depth++; - *mc->mc_dbflag |= DB_DIRTY; - if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) - == MDB_DUPFIXED) - np->mp_flags |= P_LEAF2; - mc->mc_flags |= C_INITIALIZED; - } else { - // make sure all cursor pages are writable - rc2 = mdb_cursor_touch(mc); - if (rc2) - return rc2; - } - - insert = rc; - if (insert) { - // The key does not exist - DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top])); - if ((mc->mc_db->md_flags & MDB_DUPSORT) && - LEAFSIZE(key, data) > env->me_nodemax) + int + mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, + unsigned int flags) { - // Too big for a node, insert in sub-DB - fp_flags = P_LEAF|P_DIRTY; - fp = env->me_pbuf; - fp->mp_pad = data->mv_size; // used if MDB_DUPFIXED - fp->mp_lower = fp->mp_upper = olddata.mv_size = PAGEHDRSZ; - goto prep_subDB; - } - } else { - // there's only a key anyway, so this is a no-op - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - unsigned int ksize = mc->mc_db->md_pad; - if (key->mv_size != ksize) + enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; // internal code + MDB_env *env; + MDB_node *leaf = NULL; + MDB_page *fp, *mp; + uint16_t fp_flags; + MDB_val xdata, *rdata, dkey, olddata; + MDB_db dummy; + int do_sub = 0, insert; + unsigned int mcount = 0, dcount = 0, nospill; + size_t nsize; + int rc, rc2; + unsigned int nflags; + DKBUF; + + if (mc == NULL || key == NULL) + return EINVAL; + + env = mc->mc_txn->mt_env; + + // Check this first so counter will always be zero on any + // early failures. + if (flags & MDB_MULTIPLE) { + dcount = data[1].mv_size; + data[1].mv_size = 0; + if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED)) + return MDB_INCOMPATIBLE; + } + + nospill = flags & MDB_NOSPILL; + flags &= ~MDB_NOSPILL; + + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + if (flags != MDB_CURRENT && key->mv_size-1 >= ENV_MAXKEY(env)) return MDB_BAD_VALSIZE; + + #if SIZE_MAX > MAXDATASIZE + if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE)) + return MDB_BAD_VALSIZE; + #else + if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env)) + return MDB_BAD_VALSIZE; + #endif + + DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u", + DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size)); + + dkey.mv_size = 0; + if (flags == MDB_CURRENT) { - char *ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); - memcpy(ptr, key->mv_data, ksize); - } - return MDB_SUCCESS; - } - -more: - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - olddata.mv_size = NODEDSZ(leaf); - olddata.mv_data = NODEDATA(leaf); - - // DB has dups? - if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { - // Prepare (sub-)page/sub-DB to accept the new item, - // if needed. fp: old sub-page or a header faking - // it. mp: new (sub-)page. offset: growth in page - // size. xdata: node data with new page or DB. - ssize_t i, offset = 0; - mp = fp = xdata.mv_data = env->me_pbuf; - mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; - - // Was a single item before, must convert now - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - // Just overwrite the current item - if (flags == MDB_CURRENT) - goto current; - -#if UINT_MAX < SIZE_MAX - if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) -#ifdef MISALIGNED_OK - mc->mc_dbx->md_dcmp = mdb_cmp_long; -#else - mc->mc_dbx->md_dcmp = mdb_cmp_cint; -#endif -#endif - // if data matches, skip it - if (!mc->mc_dbx->md_dcmp(data, &olddata)) { - if (flags & MDB_NODUPDATA) - rc = MDB_KEYEXIST; - else if (flags & MDB_MULTIPLE) - goto next_mult; - else - rc = MDB_SUCCESS; - return rc; - } - - // Back up original data item - dkey.mv_size = olddata.mv_size; - dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); - - // Make sub-page header for the dup items, with dummy body - fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP; - fp->mp_lower = PAGEHDRSZ; - xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - fp->mp_flags |= P_LEAF2; - fp->mp_pad = data->mv_size; - xdata.mv_size += 2 * data->mv_size; // leave space for 2 more - } else { - xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + - (dkey.mv_size & 1) + (data->mv_size & 1); - } - fp->mp_upper = xdata.mv_size; - olddata.mv_size = fp->mp_upper; // pretend olddata is fp - } else if (leaf->mn_flags & F_SUBDATA) { - // Data is on sub-DB, just store it - flags |= F_DUPDATA|F_SUBDATA; - goto put_sub; + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; + rc = MDB_SUCCESS; + } else if (mc->mc_db->md_root == P_INVALID) { + // new database, cursor has nothing to point to + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + rc = MDB_NO_ROOT; } else { - // Data is on sub-page - fp = olddata.mv_data; - switch (flags) { - default: - i = -(ssize_t)SIZELEFT(fp); - if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { - offset = i += (ssize_t) EVEN( - sizeof(indx_t) + NODESIZE + data->mv_size); - } else { - i += offset = fp->mp_pad; - offset *= 4; // space for 4 more + int exact = 0; + MDB_val d2; + if (flags & MDB_APPEND) { + MDB_val k2; + rc = mdb_cursor_last(mc, &k2, &d2); + if (rc == 0) { + rc = mc->mc_dbx->md_cmp(key, &k2); + if (rc > 0) { + rc = MDB_NOTFOUND; + mc->mc_ki[mc->mc_top]++; + } else { + // new key is <= last key + rc = MDB_KEYEXIST; + } } - if (i > 0) - break; - // FALLTHRU: Sub-page is big enough - case MDB_CURRENT: - fp->mp_flags |= P_DIRTY; - COPY_PGNO(fp->mp_pgno, mp->mp_pgno); - mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; - flags |= F_DUPDATA; - goto put_sub; - } - xdata.mv_size = olddata.mv_size + offset; - } - - fp_flags = fp->mp_flags; - if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { - // Too big for a sub-page, convert to sub-DB - fp_flags &= ~P_SUBP; -prep_subDB: - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - fp_flags |= P_LEAF2; - dummy.md_pad = fp->mp_pad; - dummy.md_flags = MDB_DUPFIXED; - if (mc->mc_db->md_flags & MDB_INTEGERDUP) - dummy.md_flags |= MDB_INTEGERKEY; - } else { - dummy.md_pad = 0; - dummy.md_flags = 0; - } - dummy.md_depth = 1; - dummy.md_branch_pages = 0; - dummy.md_leaf_pages = 1; - dummy.md_overflow_pages = 0; - dummy.md_entries = NUMKEYS(fp); - xdata.mv_size = sizeof(MDB_db); - xdata.mv_data = &dummy; - if ((rc = mdb_page_alloc(mc, 1, &mp))) - return rc; - offset = env->me_psize - olddata.mv_size; - flags |= F_DUPDATA|F_SUBDATA; - dummy.md_root = mp->mp_pgno; - } - if (mp != fp) { - mp->mp_flags = fp_flags | P_DIRTY; - mp->mp_pad = fp->mp_pad; - mp->mp_lower = fp->mp_lower; - mp->mp_upper = fp->mp_upper + offset; - if (fp_flags & P_LEAF2) { - memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad); } else { - memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, - olddata.mv_size - fp->mp_upper); - for (i = NUMKEYS(fp); --i >= 0; ) - mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; + rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); } - } - - rdata = &xdata; - flags |= F_DUPDATA; - do_sub = 1; - if (!insert) - mdb_node_del(mc, 0); - goto new_sub; - } -current: - // overflow page overwrites need special handling - if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { - MDB_page *omp; - pgno_t pg; - int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); - - memcpy(&pg, olddata.mv_data, sizeof(pg)); - if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0) - return rc2; - ovpages = omp->mp_pages; - - // Is the ov page large enough? - if (ovpages >= dpages) { - if (!(omp->mp_flags & P_DIRTY) && - (level || (env->me_flags & MDB_WRITEMAP))) - { - rc = mdb_page_unspill(mc->mc_txn, omp, &omp); - if (rc) + if ((flags & MDB_NOOVERWRITE) && rc == 0) { + DPRINTF(("duplicate key [%s]", DKEY(key))); + *data = d2; + return MDB_KEYEXIST; + } + if (rc && rc != MDB_NOTFOUND) return rc; - level = 0; // dirty in this txn or clean - } - // Is it dirty? - if (omp->mp_flags & P_DIRTY) { - // yes, overwrite it. Note in this case we don't - // bother to try shrinking the page if the new data - // is smaller than the overflow threshold. - if (level > 1) { - // It is writable only in a parent txn - size_t sz = (size_t) env->me_psize * ovpages, off; - MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); - MDB_ID2 id2; - if (!np) - return ENOMEM; - id2.mid = pg; - id2.mptr = np; - rc = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); - mdb_cassert(mc, rc == 0); - if (!(flags & MDB_RESERVE)) { - // Copy end of page, adjusting alignment so - // compiler may copy words instead of bytes. - off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); - memcpy((size_t *)((char *)np + off), - (size_t *)((char *)omp + off), sz - off); - sz = PAGEHDRSZ; - } - memcpy(np, omp, sz); // Copy beginning of page - omp = np; - } - SETDSZ(leaf, data->mv_size); - if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = METADATA(omp); - else - memcpy(METADATA(omp), data->mv_data, data->mv_size); - goto done; - } } - if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) - return rc2; - } else if (data->mv_size == olddata.mv_size) { - // same size, just replace it. Note that we could - // also reuse this node if the new data is smaller, - // but instead we opt to shrink the node in that case. - if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = olddata.mv_data; - else if (data->mv_size) - memcpy(olddata.mv_data, data->mv_data, data->mv_size); - else - memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); - goto done; - } - mdb_node_del(mc, 0); - mc->mc_db->md_entries--; - } - rdata = data; + if (mc->mc_flags & C_DEL) + mc->mc_flags ^= C_DEL; -new_sub: - nflags = flags & NODE_ADD_FLAGS; - nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); - if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { - if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) - nflags &= ~MDB_APPEND; - if (!insert) - nflags |= MDB_SPLIT_REPLACE; - rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); - } else { - // There is room already in this leaf page. - rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); - if (rc == 0 && !do_sub && insert) { - // Adjust other cursors pointing to mp - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - unsigned i = mc->mc_top; - MDB_page *mp = mc->mc_pg[i]; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) { - m3->mc_ki[i]++; + // Cursor is positioned, check for room in the dirty list + if (!nospill) { + if (flags & MDB_MULTIPLE) { + rdata = &xdata; + xdata.mv_size = data->mv_size * dcount; + } else { + rdata = data; } + if ((rc2 = mdb_page_spill(mc, key, rdata))) + return rc2; } - } - } - if (rc != MDB_SUCCESS) - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - else { - // Now store the actual data in the child DB. Note that we're - // storing the user data in the keys field, so there are strict - // size limits on dupdata. The actual data fields of the child - // DB are all zero size. - if (do_sub) { - int xflags; -put_sub: - xdata.mv_size = 0; - xdata.mv_data = ""; - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (flags & MDB_CURRENT) { - xflags = MDB_CURRENT|MDB_NOSPILL; + if (rc == MDB_NO_ROOT) { + MDB_page *np; + // new database, write a root leaf page + DPUTS("allocating new root leaf page"); + if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { + return rc2; + } + mdb_cursor_push(mc, np); + mc->mc_db->md_root = np->mp_pgno; + mc->mc_db->md_depth++; + *mc->mc_dbflag |= DB_DIRTY; + if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) + == MDB_DUPFIXED) + np->mp_flags |= P_LEAF2; + mc->mc_flags |= C_INITIALIZED; } else { - mdb_xcursor_init1(mc, leaf); - xflags = (flags & MDB_NODUPDATA) ? - MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; + // make sure all cursor pages are writable + rc2 = mdb_cursor_touch(mc); + if (rc2) + return rc2; } - // converted, write the original data first - if (dkey.mv_size) { - rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); - if (rc) - return rc; + + insert = rc; + if (insert) { + // The key does not exist + DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top])); + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + LEAFSIZE(key, data) > env->me_nodemax) { + // Too big for a node, insert in sub-DB + fp_flags = P_LEAF|P_DIRTY; + fp = env->me_pbuf; + fp->mp_pad = data->mv_size; // used if MDB_DUPFIXED + fp->mp_lower = fp->mp_upper = olddata.mv_size = PAGEHDRSZ; + goto prep_subDB; + } + } else { + // there's only a key anyway, so this is a no-op + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + unsigned int ksize = mc->mc_db->md_pad; + if (key->mv_size != ksize) + return MDB_BAD_VALSIZE; + if (flags == MDB_CURRENT) { + char *ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); + memcpy(ptr, key->mv_data, ksize); + } + return MDB_SUCCESS; + } + + more: + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + olddata.mv_size = NODEDSZ(leaf); + olddata.mv_data = NODEDATA(leaf); + + // DB has dups? + if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { + // Prepare (sub-)page/sub-DB to accept the new item, + // if needed. fp: old sub-page or a header faking + // it. mp: new (sub-)page. offset: growth in page + // size. xdata: node data with new page or DB. + ssize_t i, offset = 0; + mp = fp = xdata.mv_data = env->me_pbuf; + mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; + + // Was a single item before, must convert now + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + // Just overwrite the current item + if (flags == MDB_CURRENT) + goto current; + + #if UINT_MAX < SIZE_MAX + if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) + #ifdef MISALIGNED_OK + mc->mc_dbx->md_dcmp = mdb_cmp_long; + #else + mc->mc_dbx->md_dcmp = mdb_cmp_cint; + #endif + #endif + // if data matches, skip it + if (!mc->mc_dbx->md_dcmp(data, &olddata)) { + if (flags & MDB_NODUPDATA) + rc = MDB_KEYEXIST; + else if (flags & MDB_MULTIPLE) + goto next_mult; + else + rc = MDB_SUCCESS; + return rc; + } + + // Back up original data item + dkey.mv_size = olddata.mv_size; + dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); + + // Make sub-page header for the dup items, with dummy body + fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP; + fp->mp_lower = PAGEHDRSZ; + xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + fp->mp_flags |= P_LEAF2; + fp->mp_pad = data->mv_size; + xdata.mv_size += 2 * data->mv_size; // leave space for 2 more + } else { + xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + + (dkey.mv_size & 1) + (data->mv_size & 1); + } + fp->mp_upper = xdata.mv_size; + olddata.mv_size = fp->mp_upper; // pretend olddata is fp + } else if (leaf->mn_flags & F_SUBDATA) { + // Data is on sub-DB, just store it + flags |= F_DUPDATA|F_SUBDATA; + goto put_sub; + } else { + // Data is on sub-page + fp = olddata.mv_data; + switch (flags) { + default: + i = -(ssize_t)SIZELEFT(fp); + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + offset = i += (ssize_t) EVEN( + sizeof(indx_t) + NODESIZE + data->mv_size); + } else { + i += offset = fp->mp_pad; + offset *= 4; // space for 4 more + } + if (i > 0) + break; + // FALLTHRU: Sub-page is big enough + case MDB_CURRENT: + fp->mp_flags |= P_DIRTY; + COPY_PGNO(fp->mp_pgno, mp->mp_pgno); + mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; + flags |= F_DUPDATA; + goto put_sub; + } + xdata.mv_size = olddata.mv_size + offset; + } + + fp_flags = fp->mp_flags; + if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { + // Too big for a sub-page, convert to sub-DB + fp_flags &= ~P_SUBP; + prep_subDB: + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + fp_flags |= P_LEAF2; + dummy.md_pad = fp->mp_pad; + dummy.md_flags = MDB_DUPFIXED; + if (mc->mc_db->md_flags & MDB_INTEGERDUP) + dummy.md_flags |= MDB_INTEGERKEY; + } else { + dummy.md_pad = 0; + dummy.md_flags = 0; + } + dummy.md_depth = 1; + dummy.md_branch_pages = 0; + dummy.md_leaf_pages = 1; + dummy.md_overflow_pages = 0; + dummy.md_entries = NUMKEYS(fp); + xdata.mv_size = sizeof(MDB_db); + xdata.mv_data = &dummy; + if ((rc = mdb_page_alloc(mc, 1, &mp))) + return rc; + offset = env->me_psize - olddata.mv_size; + flags |= F_DUPDATA|F_SUBDATA; + dummy.md_root = mp->mp_pgno; + } + if (mp != fp) { + mp->mp_flags = fp_flags | P_DIRTY; + mp->mp_pad = fp->mp_pad; + mp->mp_lower = fp->mp_lower; + mp->mp_upper = fp->mp_upper + offset; + if (fp_flags & P_LEAF2) { + memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad); + } else { + memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, + olddata.mv_size - fp->mp_upper); + for (i = NUMKEYS(fp); --i >= 0; ) + mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; + } + } + + rdata = &xdata; + flags |= F_DUPDATA; + do_sub = 1; + if (!insert) + mdb_node_del(mc, 0); + goto new_sub; + } + current: + // overflow page overwrites need special handling + if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { + MDB_page *omp; + pgno_t pg; + int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); + + memcpy(&pg, olddata.mv_data, sizeof(pg)); + if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0) + return rc2; + ovpages = omp->mp_pages; + + // Is the ov page large enough? + if (ovpages >= dpages) { + if (!(omp->mp_flags & P_DIRTY) && + (level || (env->me_flags & MDB_WRITEMAP))) + { + rc = mdb_page_unspill(mc->mc_txn, omp, &omp); + if (rc) + return rc; + level = 0; // dirty in this txn or clean + } + // Is it dirty? + if (omp->mp_flags & P_DIRTY) { + // yes, overwrite it. Note in this case we don't + // bother to try shrinking the page if the new data + // is smaller than the overflow threshold. + if (level > 1) { + // It is writable only in a parent txn + size_t sz = (size_t) env->me_psize * ovpages, off; + MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); + MDB_ID2 id2; + if (!np) + return ENOMEM; + id2.mid = pg; + id2.mptr = np; + rc = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); + mdb_cassert(mc, rc == 0); + if (!(flags & MDB_RESERVE)) { + // Copy end of page, adjusting alignment so + // compiler may copy words instead of bytes. + off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); + memcpy((size_t *)((char *)np + off), + (size_t *)((char *)omp + off), sz - off); + sz = PAGEHDRSZ; + } + memcpy(np, omp, sz); // Copy beginning of page + omp = np; + } + SETDSZ(leaf, data->mv_size); + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = METADATA(omp); + else + memcpy(METADATA(omp), data->mv_data, data->mv_size); + goto done; + } + } + if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) + return rc2; + } else if (data->mv_size == olddata.mv_size) { + // same size, just replace it. Note that we could + // also reuse this node if the new data is smaller, + // but instead we opt to shrink the node in that case. + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = olddata.mv_data; + else if (data->mv_size) + memcpy(olddata.mv_data, data->mv_data, data->mv_size); + else + memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); + goto done; + } + mdb_node_del(mc, 0); + mc->mc_db->md_entries--; + } + + rdata = data; + + new_sub: + nflags = flags & NODE_ADD_FLAGS; + nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); + if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { + if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) + nflags &= ~MDB_APPEND; + if (!insert) + nflags |= MDB_SPLIT_REPLACE; + rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); + } else { + // There is room already in this leaf page. + rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); + if (rc == 0 && !do_sub && insert) { // Adjust other cursors pointing to mp - MDB_cursor *m2; + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; unsigned i = mc->mc_top; MDB_page *mp = mc->mc_pg[i]; - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; - if (!(m2->mc_flags & C_INITIALIZED)) continue; - if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) { - mdb_xcursor_init1(m2, leaf); + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) { + m3->mc_ki[i]++; } } } - // we've done our job - dkey.mv_size = 0; } - if (flags & MDB_APPENDDUP) - xflags |= MDB_APPEND; - rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); - if (flags & F_SUBDATA) { - void *db = NODEDATA(leaf); - memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); - } - } - // sub-writes might have failed so check rc again. - // Don't increment count if we just replaced an existing item. - if (!rc && !(flags & MDB_CURRENT)) - mc->mc_db->md_entries++; - if (flags & MDB_MULTIPLE) { - if (!rc) { -next_mult: - mcount++; - // let caller know how many succeeded, if any - data[1].mv_size = mcount; - if (mcount < dcount) { - data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; - goto more; + + if (rc != MDB_SUCCESS) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + else { + // Now store the actual data in the child DB. Note that we're + // storing the user data in the keys field, so there are strict + // size limits on dupdata. The actual data fields of the child + // DB are all zero size. + if (do_sub) { + int xflags; + put_sub: + xdata.mv_size = 0; + xdata.mv_data = ""; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (flags & MDB_CURRENT) { + xflags = MDB_CURRENT|MDB_NOSPILL; + } else { + mdb_xcursor_init1(mc, leaf); + xflags = (flags & MDB_NODUPDATA) ? + MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; + } + // converted, write the original data first + if (dkey.mv_size) { + rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); + if (rc) + return rc; + { + // Adjust other cursors pointing to mp + MDB_cursor *m2; + unsigned i = mc->mc_top; + MDB_page *mp = mc->mc_pg[i]; + + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; + if (!(m2->mc_flags & C_INITIALIZED)) continue; + if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) { + mdb_xcursor_init1(m2, leaf); + } + } + } + // we've done our job + dkey.mv_size = 0; + } + if (flags & MDB_APPENDDUP) + xflags |= MDB_APPEND; + rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); + if (flags & F_SUBDATA) { + void *db = NODEDATA(leaf); + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); + } + } + // sub-writes might have failed so check rc again. + // Don't increment count if we just replaced an existing item. + if (!rc && !(flags & MDB_CURRENT)) + mc->mc_db->md_entries++; + if (flags & MDB_MULTIPLE) { + if (!rc) { + next_mult: + mcount++; + // let caller know how many succeeded, if any + data[1].mv_size = mcount; + if (mcount < dcount) { + data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; + goto more; + } + } } } - } - } -done: - // If we succeeded and the key didn't exist before, make sure - // the cursor is marked valid. - if (!rc && insert) - mc->mc_flags |= C_INITIALIZED; - return rc; + done: + // If we succeeded and the key didn't exist before, make sure + // the cursor is marked valid. + if (!rc && insert) + mc->mc_flags |= C_INITIALIZED; + return rc; */ return nil } func (c *cursor) Del(flags int) error { /* - MDB_node *leaf; - MDB_page *mp; - int rc; + MDB_node *leaf; + MDB_page *mp; + int rc; - if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) - return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - if (!(mc->mc_flags & C_INITIALIZED)) - return EINVAL; + if (!(mc->mc_flags & C_INITIALIZED)) + return EINVAL; - if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) - return MDB_NOTFOUND; + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + return MDB_NOTFOUND; - if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL))) - return rc; + if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL))) + return rc; - rc = mdb_cursor_touch(mc); - if (rc) - return rc; + rc = mdb_cursor_touch(mc); + if (rc) + return rc; - mp = mc->mc_pg[mc->mc_top]; - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + mp = mc->mc_pg[mc->mc_top]; + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (!(flags & MDB_NODUPDATA)) { - if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { - mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } - rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); - // If sub-DB still has entries, we're done - if (mc->mc_xcursor->mx_db.md_entries) { - if (leaf->mn_flags & F_SUBDATA) { - // update subDB info - void *db = NODEDATA(leaf); - memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); - } else { - MDB_cursor *m2; - // shrink fake page - mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (!(flags & MDB_NODUPDATA)) { + if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - // fix other sub-DB cursors pointed at this fake page - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; - if (m2->mc_pg[mc->mc_top] == mp && - m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } } - mc->mc_db->md_entries--; - mc->mc_flags |= C_DEL; - return rc; + rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); + // If sub-DB still has entries, we're done + if (mc->mc_xcursor->mx_db.md_entries) { + if (leaf->mn_flags & F_SUBDATA) { + // update subDB info + void *db = NODEDATA(leaf); + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); + } else { + MDB_cursor *m2; + // shrink fake page + mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + // fix other sub-DB cursors pointed at this fake page + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; + if (m2->mc_pg[mc->mc_top] == mp && + m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } + } + mc->mc_db->md_entries--; + mc->mc_flags |= C_DEL; + return rc; + } + // otherwise fall thru and delete the sub-DB } - // otherwise fall thru and delete the sub-DB - } - if (leaf->mn_flags & F_SUBDATA) { - // add all the child DB's pages to the free list - rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); - if (rc == MDB_SUCCESS) { - mc->mc_db->md_entries -= - mc->mc_xcursor->mx_db.md_entries; + if (leaf->mn_flags & F_SUBDATA) { + // add all the child DB's pages to the free list + rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); + if (rc == MDB_SUCCESS) { + mc->mc_db->md_entries -= + mc->mc_xcursor->mx_db.md_entries; + } } } - } - return mdb_cursor_del0(mc, leaf); + return mdb_cursor_del0(mc, leaf); */ return nil } @@ -2151,28 +2154,28 @@ func (c *cursor) Del(flags int) error { // @return 0 on success, non-zero on failure. func (c *cursor) newPage(flags int, num int) ([]*page, error) { /* - MDB_page *np; - int rc; + MDB_page *np; + int rc; - if ((rc = mdb_page_alloc(mc, num, &np))) - return rc; - DPRINTF(("allocated new mpage %"Z"u, page size %u", - np->mp_pgno, mc->mc_txn->mt_env->me_psize)); - np->mp_flags = flags | P_DIRTY; - np->mp_lower = PAGEHDRSZ; - np->mp_upper = mc->mc_txn->mt_env->me_psize; + if ((rc = mdb_page_alloc(mc, num, &np))) + return rc; + DPRINTF(("allocated new mpage %"Z"u, page size %u", + np->mp_pgno, mc->mc_txn->mt_env->me_psize)); + np->mp_flags = flags | P_DIRTY; + np->mp_lower = PAGEHDRSZ; + np->mp_upper = mc->mc_txn->mt_env->me_psize; - if (IS_BRANCH(np)) - mc->mc_db->md_branch_pages++; - else if (IS_LEAF(np)) - mc->mc_db->md_leaf_pages++; - else if (IS_OVERFLOW(np)) { - mc->mc_db->md_overflow_pages += num; - np->mp_pages = num; - } - *mp = np; + if (IS_BRANCH(np)) + mc->mc_db->md_branch_pages++; + else if (IS_LEAF(np)) + mc->mc_db->md_leaf_pages++; + else if (IS_OVERFLOW(np)) { + mc->mc_db->md_overflow_pages += num; + np->mp_pages = num; + } + *mp = np; - return 0; + return 0; */ return nil, nil } @@ -2193,123 +2196,123 @@ func (c *cursor) newPage(flags int, num int) ([]*page, error) { // func (c *cursor) addNode(index int, key []byte, data []byte, pgno int, flags int) error { /* - unsigned int i; - size_t node_size = NODESIZE; - ssize_t room; - indx_t ofs; - MDB_node *node; - MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_page *ofp = NULL; // overflow page - DKBUF; + unsigned int i; + size_t node_size = NODESIZE; + ssize_t room; + indx_t ofs; + MDB_node *node; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_page *ofp = NULL; // overflow page + DKBUF; - mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); + mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); - DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]", - IS_LEAF(mp) ? "leaf" : "branch", - IS_SUBP(mp) ? "sub-" : "", - mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, - key ? key->mv_size : 0, key ? DKEY(key) : "null")); + DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]", + IS_LEAF(mp) ? "leaf" : "branch", + IS_SUBP(mp) ? "sub-" : "", + mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, + key ? key->mv_size : 0, key ? DKEY(key) : "null")); - if (IS_LEAF2(mp)) { - // Move higher keys up one slot. - int ksize = mc->mc_db->md_pad, dif; - char *ptr = LEAF2KEY(mp, indx, ksize); - dif = NUMKEYS(mp) - indx; - if (dif > 0) - memmove(ptr+ksize, ptr, dif*ksize); - // insert new key - memcpy(ptr, key->mv_data, ksize); + if (IS_LEAF2(mp)) { + // Move higher keys up one slot. + int ksize = mc->mc_db->md_pad, dif; + char *ptr = LEAF2KEY(mp, indx, ksize); + dif = NUMKEYS(mp) - indx; + if (dif > 0) + memmove(ptr+ksize, ptr, dif*ksize); + // insert new key + memcpy(ptr, key->mv_data, ksize); - // Just using these for counting - mp->mp_lower += sizeof(indx_t); - mp->mp_upper -= ksize - sizeof(indx_t); - return MDB_SUCCESS; - } + // Just using these for counting + mp->mp_lower += sizeof(indx_t); + mp->mp_upper -= ksize - sizeof(indx_t); + return MDB_SUCCESS; + } - room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); - if (key != NULL) - node_size += key->mv_size; - if (IS_LEAF(mp)) { - mdb_cassert(mc, data); - if (F_ISSET(flags, F_BIGDATA)) { - // Data already on overflow page. - node_size += sizeof(pgno_t); - } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) { - int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); - int rc; - // Put data on overflow page. - DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page", - data->mv_size, node_size+data->mv_size)); - node_size = EVEN(node_size + sizeof(pgno_t)); + room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); + if (key != NULL) + node_size += key->mv_size; + if (IS_LEAF(mp)) { + mdb_cassert(mc, data); + if (F_ISSET(flags, F_BIGDATA)) { + // Data already on overflow page. + node_size += sizeof(pgno_t); + } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) { + int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); + int rc; + // Put data on overflow page. + DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page", + data->mv_size, node_size+data->mv_size)); + node_size = EVEN(node_size + sizeof(pgno_t)); + if ((ssize_t)node_size > room) + goto full; + if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) + return rc; + DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno)); + flags |= F_BIGDATA; + goto update; + } else { + node_size += data->mv_size; + } + } + node_size = EVEN(node_size); if ((ssize_t)node_size > room) goto full; - if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) - return rc; - DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno)); - flags |= F_BIGDATA; - goto update; - } else { - node_size += data->mv_size; - } - } - node_size = EVEN(node_size); - if ((ssize_t)node_size > room) - goto full; -update: - // Move higher pointers up one slot. - for (i = NUMKEYS(mp); i > indx; i--) - mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; + update: + // Move higher pointers up one slot. + for (i = NUMKEYS(mp); i > indx; i--) + mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; - // Adjust free space offsets. - ofs = mp->mp_upper - node_size; - mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); - mp->mp_ptrs[indx] = ofs; - mp->mp_upper = ofs; - mp->mp_lower += sizeof(indx_t); + // Adjust free space offsets. + ofs = mp->mp_upper - node_size; + mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); + mp->mp_ptrs[indx] = ofs; + mp->mp_upper = ofs; + mp->mp_lower += sizeof(indx_t); - // Write the node data. - node = NODEPTR(mp, indx); - node->mn_ksize = (key == NULL) ? 0 : key->mv_size; - node->mn_flags = flags; - if (IS_LEAF(mp)) - SETDSZ(node,data->mv_size); - else - SETPGNO(node,pgno); - - if (key) - memcpy(NODEKEY(node), key->mv_data, key->mv_size); - - if (IS_LEAF(mp)) { - mdb_cassert(mc, key); - if (ofp == NULL) { - if (F_ISSET(flags, F_BIGDATA)) - memcpy(node->mn_data + key->mv_size, data->mv_data, - sizeof(pgno_t)); - else if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = node->mn_data + key->mv_size; + // Write the node data. + node = NODEPTR(mp, indx); + node->mn_ksize = (key == NULL) ? 0 : key->mv_size; + node->mn_flags = flags; + if (IS_LEAF(mp)) + SETDSZ(node,data->mv_size); else - memcpy(node->mn_data + key->mv_size, data->mv_data, - data->mv_size); - } else { - memcpy(node->mn_data + key->mv_size, &ofp->mp_pgno, - sizeof(pgno_t)); - if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = METADATA(ofp); - else - memcpy(METADATA(ofp), data->mv_data, data->mv_size); - } - } + SETPGNO(node,pgno); - return MDB_SUCCESS; + if (key) + memcpy(NODEKEY(node), key->mv_data, key->mv_size); -full: - DPRINTF(("not enough room in page %"Z"u, got %u ptrs", - mdb_dbg_pgno(mp), NUMKEYS(mp))); - DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room)); - DPRINTF(("node size = %"Z"u", node_size)); - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PAGE_FULL; + if (IS_LEAF(mp)) { + mdb_cassert(mc, key); + if (ofp == NULL) { + if (F_ISSET(flags, F_BIGDATA)) + memcpy(node->mn_data + key->mv_size, data->mv_data, + sizeof(pgno_t)); + else if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = node->mn_data + key->mv_size; + else + memcpy(node->mn_data + key->mv_size, data->mv_data, + data->mv_size); + } else { + memcpy(node->mn_data + key->mv_size, &ofp->mp_pgno, + sizeof(pgno_t)); + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = METADATA(ofp); + else + memcpy(METADATA(ofp), data->mv_data, data->mv_size); + } + } + + return MDB_SUCCESS; + + full: + DPRINTF(("not enough room in page %"Z"u, got %u ptrs", + mdb_dbg_pgno(mp), NUMKEYS(mp))); + DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room)); + DPRINTF(("node size = %"Z"u", node_size)); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PAGE_FULL; */ return nil @@ -2322,54 +2325,54 @@ full: // part of a #MDB_DUPFIXED database. func (c *cursor) deleteNode(ksize int) { /* - MDB_page *mp = mc->mc_pg[mc->mc_top]; - indx_t indx = mc->mc_ki[mc->mc_top]; - unsigned int sz; - indx_t i, j, numkeys, ptr; - MDB_node *node; - char *base; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + indx_t indx = mc->mc_ki[mc->mc_top]; + unsigned int sz; + indx_t i, j, numkeys, ptr; + MDB_node *node; + char *base; - DPRINTF(("delete node %u on %s page %"Z"u", indx, - IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp))); - numkeys = NUMKEYS(mp); - mdb_cassert(mc, indx < numkeys); + DPRINTF(("delete node %u on %s page %"Z"u", indx, + IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp))); + numkeys = NUMKEYS(mp); + mdb_cassert(mc, indx < numkeys); - if (IS_LEAF2(mp)) { - int x = numkeys - 1 - indx; - base = LEAF2KEY(mp, indx, ksize); - if (x) - memmove(base, base + ksize, x * ksize); - mp->mp_lower -= sizeof(indx_t); - mp->mp_upper += ksize - sizeof(indx_t); - return; - } - - node = NODEPTR(mp, indx); - sz = NODESIZE + node->mn_ksize; - if (IS_LEAF(mp)) { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - sz += sizeof(pgno_t); - else - sz += NODEDSZ(node); - } - sz = EVEN(sz); - - ptr = mp->mp_ptrs[indx]; - for (i = j = 0; i < numkeys; i++) { - if (i != indx) { - mp->mp_ptrs[j] = mp->mp_ptrs[i]; - if (mp->mp_ptrs[i] < ptr) - mp->mp_ptrs[j] += sz; - j++; + if (IS_LEAF2(mp)) { + int x = numkeys - 1 - indx; + base = LEAF2KEY(mp, indx, ksize); + if (x) + memmove(base, base + ksize, x * ksize); + mp->mp_lower -= sizeof(indx_t); + mp->mp_upper += ksize - sizeof(indx_t); + return; } - } - base = (char *)mp + mp->mp_upper; - memmove(base + sz, base, ptr - mp->mp_upper); + node = NODEPTR(mp, indx); + sz = NODESIZE + node->mn_ksize; + if (IS_LEAF(mp)) { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + sz += sizeof(pgno_t); + else + sz += NODEDSZ(node); + } + sz = EVEN(sz); - mp->mp_lower -= sizeof(indx_t); - mp->mp_upper += sz; -*/ + ptr = mp->mp_ptrs[indx]; + for (i = j = 0; i < numkeys; i++) { + if (i != indx) { + mp->mp_ptrs[j] = mp->mp_ptrs[i]; + if (mp->mp_ptrs[i] < ptr) + mp->mp_ptrs[j] += sz; + j++; + } + } + + base = (char *)mp + mp->mp_upper; + memmove(base + sz, base, ptr - mp->mp_upper); + + mp->mp_lower -= sizeof(indx_t); + mp->mp_upper += sz; + */ } // Initial setup of a sorted-dups cursor. @@ -2382,22 +2385,22 @@ func (c *cursor) deleteNode(ksize int) { // @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. func (c *cursor) xcursor_init0() { /* - MDB_xcursor *mx = mc->mc_xcursor; + MDB_xcursor *mx = mc->mc_xcursor; - mx->mx_cursor.mc_xcursor = NULL; - mx->mx_cursor.mc_txn = mc->mc_txn; - mx->mx_cursor.mc_db = &mx->mx_db; - mx->mx_cursor.mc_dbx = &mx->mx_dbx; - mx->mx_cursor.mc_dbi = mc->mc_dbi; - mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; - mx->mx_cursor.mc_snum = 0; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB; - mx->mx_dbx.md_name.mv_size = 0; - mx->mx_dbx.md_name.mv_data = NULL; - mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; - mx->mx_dbx.md_dcmp = NULL; - mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; + mx->mx_cursor.mc_xcursor = NULL; + mx->mx_cursor.mc_txn = mc->mc_txn; + mx->mx_cursor.mc_db = &mx->mx_db; + mx->mx_cursor.mc_dbx = &mx->mx_dbx; + mx->mx_cursor.mc_dbi = mc->mc_dbi; + mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB; + mx->mx_dbx.md_name.mv_size = 0; + mx->mx_dbx.md_name.mv_data = NULL; + mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; + mx->mx_dbx.md_dcmp = NULL; + mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; */ } @@ -2408,126 +2411,126 @@ func (c *cursor) xcursor_init0() { // sorted-dup database. func (c *cursor) xcursor_init1(n *node) { /* - MDB_xcursor *mx = mc->mc_xcursor; + MDB_xcursor *mx = mc->mc_xcursor; - if (node->mn_flags & F_SUBDATA) { - memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); - mx->mx_cursor.mc_pg[0] = 0; - mx->mx_cursor.mc_snum = 0; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB; - } else { - MDB_page *fp = NODEDATA(node); - mx->mx_db.md_pad = mc->mc_pg[mc->mc_top]->mp_pad; - mx->mx_db.md_flags = 0; - mx->mx_db.md_depth = 1; - mx->mx_db.md_branch_pages = 0; - mx->mx_db.md_leaf_pages = 1; - mx->mx_db.md_overflow_pages = 0; - mx->mx_db.md_entries = NUMKEYS(fp); - COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); - mx->mx_cursor.mc_snum = 1; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; - mx->mx_cursor.mc_pg[0] = fp; - mx->mx_cursor.mc_ki[0] = 0; - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - mx->mx_db.md_flags = MDB_DUPFIXED; - mx->mx_db.md_pad = fp->mp_pad; - if (mc->mc_db->md_flags & MDB_INTEGERDUP) - mx->mx_db.md_flags |= MDB_INTEGERKEY; - } - } - DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, - mx->mx_db.md_root)); - mx->mx_dbflag = DB_VALID|DB_DIRTY; // DB_DIRTY guides mdb_cursor_touch -#if UINT_MAX < SIZE_MAX - if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) -#ifdef MISALIGNED_OK - mx->mx_dbx.md_cmp = mdb_cmp_long; -#else - mx->mx_dbx.md_cmp = mdb_cmp_cint; -#endif -#endif + if (node->mn_flags & F_SUBDATA) { + memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); + mx->mx_cursor.mc_pg[0] = 0; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB; + } else { + MDB_page *fp = NODEDATA(node); + mx->mx_db.md_pad = mc->mc_pg[mc->mc_top]->mp_pad; + mx->mx_db.md_flags = 0; + mx->mx_db.md_depth = 1; + mx->mx_db.md_branch_pages = 0; + mx->mx_db.md_leaf_pages = 1; + mx->mx_db.md_overflow_pages = 0; + mx->mx_db.md_entries = NUMKEYS(fp); + COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); + mx->mx_cursor.mc_snum = 1; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; + mx->mx_cursor.mc_pg[0] = fp; + mx->mx_cursor.mc_ki[0] = 0; + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + mx->mx_db.md_flags = MDB_DUPFIXED; + mx->mx_db.md_pad = fp->mp_pad; + if (mc->mc_db->md_flags & MDB_INTEGERDUP) + mx->mx_db.md_flags |= MDB_INTEGERKEY; + } + } + DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, + mx->mx_db.md_root)); + mx->mx_dbflag = DB_VALID|DB_DIRTY; // DB_DIRTY guides mdb_cursor_touch + #if UINT_MAX < SIZE_MAX + if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) + #ifdef MISALIGNED_OK + mx->mx_dbx.md_cmp = mdb_cmp_long; + #else + mx->mx_dbx.md_cmp = mdb_cmp_cint; + #endif + #endif */ } // Initialize a cursor for a given transaction and database. -func (c *cursor) init(t *transaction, bucket *bucket, mx *xcursor) { +func (c *cursor) init(t *transaction, bucket *Bucket, mx *xcursor) { /* - mc->mc_next = NULL; - mc->mc_backup = NULL; - mc->mc_dbi = dbi; - mc->mc_txn = txn; - mc->mc_db = &txn->mt_dbs[dbi]; - mc->mc_dbx = &txn->mt_dbxs[dbi]; - mc->mc_dbflag = &txn->mt_dbflags[dbi]; - mc->mc_snum = 0; - mc->mc_top = 0; - mc->mc_pg[0] = 0; - mc->mc_flags = 0; - if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { - mdb_tassert(txn, mx != NULL); - mc->mc_xcursor = mx; - mdb_xcursor_init0(mc); - } else { - mc->mc_xcursor = NULL; - } - if (*mc->mc_dbflag & DB_STALE) { - mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); - } + mc->mc_next = NULL; + mc->mc_backup = NULL; + mc->mc_dbi = dbi; + mc->mc_txn = txn; + mc->mc_db = &txn->mt_dbs[dbi]; + mc->mc_dbx = &txn->mt_dbxs[dbi]; + mc->mc_dbflag = &txn->mt_dbflags[dbi]; + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_pg[0] = 0; + mc->mc_flags = 0; + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { + mdb_tassert(txn, mx != NULL); + mc->mc_xcursor = mx; + mdb_xcursor_init0(mc); + } else { + mc->mc_xcursor = NULL; + } + if (*mc->mc_dbflag & DB_STALE) { + mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); + } */ } // Return the count of duplicate data items for the current key. func (c *cursor) count() (int, error) { /* - MDB_node *leaf; + MDB_node *leaf; - if (mc == NULL || countp == NULL) - return EINVAL; - - if (mc->mc_xcursor == NULL) - return MDB_INCOMPATIBLE; - - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - *countp = 1; - } else { - if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + if (mc == NULL || countp == NULL) return EINVAL; - *countp = mc->mc_xcursor->mx_db.md_entries; - } - return MDB_SUCCESS; + if (mc->mc_xcursor == NULL) + return MDB_INCOMPATIBLE; + + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + *countp = 1; + } else { + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + return EINVAL; + + *countp = mc->mc_xcursor->mx_db.md_entries; + } + return MDB_SUCCESS; */ return 0, nil } func (c *cursor) Close() { /* - if (mc && !mc->mc_backup) { - // remove from txn, if tracked - if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { - MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; - while (*prev && *prev != mc) prev = &(*prev)->mc_next; - if (*prev == mc) - *prev = mc->mc_next; + if (mc && !mc->mc_backup) { + // remove from txn, if tracked + if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { + MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + while (*prev && *prev != mc) prev = &(*prev)->mc_next; + if (*prev == mc) + *prev = mc->mc_next; + } + free(mc); } - free(mc); - } */ } func (c *cursor) Transaction() Transaction { /* - if (!mc) return NULL; - return mc->mc_txn; + if (!mc) return NULL; + return mc->mc_txn; */ return nil } -func (c *cursor) Bucket() Bucket { +func (c *cursor) Bucket() *Bucket { return c.bucket } @@ -2537,70 +2540,70 @@ func (c *cursor) Bucket() Bucket { // @return 0 on success, non-zero on failure. func (c *cursor) updateKey(key []byte) error { /* - MDB_page *mp; - MDB_node *node; - char *base; - size_t len; - int delta, ksize, oksize; - indx_t ptr, i, numkeys, indx; - DKBUF; + MDB_page *mp; + MDB_node *node; + char *base; + size_t len; + int delta, ksize, oksize; + indx_t ptr, i, numkeys, indx; + DKBUF; - indx = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - node = NODEPTR(mp, indx); - ptr = mp->mp_ptrs[indx]; -#if MDB_DEBUG - { - MDB_val k2; - char kbuf2[DKBUF_MAXKEYSIZE*2+1]; - k2.mv_data = NODEKEY(node); - k2.mv_size = node->mn_ksize; - DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u", - indx, ptr, - mdb_dkey(&k2, kbuf2), - DKEY(key), - mp->mp_pgno)); - } -#endif + indx = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + node = NODEPTR(mp, indx); + ptr = mp->mp_ptrs[indx]; + #if MDB_DEBUG + { + MDB_val k2; + char kbuf2[DKBUF_MAXKEYSIZE*2+1]; + k2.mv_data = NODEKEY(node); + k2.mv_size = node->mn_ksize; + DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u", + indx, ptr, + mdb_dkey(&k2, kbuf2), + DKEY(key), + mp->mp_pgno)); + } + #endif - // Sizes must be 2-byte aligned. - ksize = EVEN(key->mv_size); - oksize = EVEN(node->mn_ksize); - delta = ksize - oksize; + // Sizes must be 2-byte aligned. + ksize = EVEN(key->mv_size); + oksize = EVEN(node->mn_ksize); + delta = ksize - oksize; - // Shift node contents if EVEN(key length) changed. - if (delta) { - if (delta > 0 && SIZELEFT(mp) < delta) { - pgno_t pgno; - // not enough space left, do a delete and split - DPRINTF(("Not enough room, delta = %d, splitting...", delta)); - pgno = NODEPGNO(node); - mdb_node_del(mc, 0); - return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); - } + // Shift node contents if EVEN(key length) changed. + if (delta) { + if (delta > 0 && SIZELEFT(mp) < delta) { + pgno_t pgno; + // not enough space left, do a delete and split + DPRINTF(("Not enough room, delta = %d, splitting...", delta)); + pgno = NODEPGNO(node); + mdb_node_del(mc, 0); + return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); + } - numkeys = NUMKEYS(mp); - for (i = 0; i < numkeys; i++) { - if (mp->mp_ptrs[i] <= ptr) - mp->mp_ptrs[i] -= delta; - } + numkeys = NUMKEYS(mp); + for (i = 0; i < numkeys; i++) { + if (mp->mp_ptrs[i] <= ptr) + mp->mp_ptrs[i] -= delta; + } - base = (char *)mp + mp->mp_upper; - len = ptr - mp->mp_upper + NODESIZE; - memmove(base - delta, base, len); - mp->mp_upper -= delta; + base = (char *)mp + mp->mp_upper; + len = ptr - mp->mp_upper + NODESIZE; + memmove(base - delta, base, len); + mp->mp_upper -= delta; - node = NODEPTR(mp, indx); - } + node = NODEPTR(mp, indx); + } - // But even if no shift was needed, update ksize - if (node->mn_ksize != key->mv_size) - node->mn_ksize = key->mv_size; + // But even if no shift was needed, update ksize + if (node->mn_ksize != key->mv_size) + node->mn_ksize = key->mv_size; - if (key->mv_size) - memcpy(NODEKEY(node), key->mv_data, key->mv_size); + if (key->mv_size) + memcpy(NODEKEY(node), key->mv_data, key->mv_size); - return MDB_SUCCESS; + return MDB_SUCCESS; */ return nil } @@ -2608,214 +2611,33 @@ func (c *cursor) updateKey(key []byte) error { // Move a node from csrc to cdst. func (c *cursor) moveNodeTo(dst *cursor) error { /* - MDB_node *srcnode; - MDB_val key, data; - pgno_t srcpg; - MDB_cursor mn; - int rc; - unsigned short flags; + MDB_node *srcnode; + MDB_val key, data; + pgno_t srcpg; + MDB_cursor mn; + int rc; + unsigned short flags; - DKBUF; + DKBUF; - // Mark src and dst as dirty. - if ((rc = mdb_page_touch(csrc)) || - (rc = mdb_page_touch(cdst))) - return rc; - - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_pad; - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); - data.mv_size = 0; - data.mv_data = NULL; - srcpg = 0; - flags = 0; - } else { - srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); - mdb_cassert(csrc, !((size_t)srcnode & 1)); - srcpg = NODEPGNO(srcnode); - flags = srcnode->mn_flags; - if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { - unsigned int snum = csrc->mc_snum; - MDB_node *s2; - // must find the lowest key below src - mdb_page_search_lowest(csrc); - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_pad; - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); - } else { - s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); - key.mv_size = NODEKSZ(s2); - key.mv_data = NODEKEY(s2); - } - csrc->mc_snum = snum--; - csrc->mc_top = snum; - } else { - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); - } - data.mv_size = NODEDSZ(srcnode); - data.mv_data = NODEDATA(srcnode); - } - if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { - unsigned int snum = cdst->mc_snum; - MDB_node *s2; - MDB_val bkey; - // must find the lowest key below dst - mdb_page_search_lowest(cdst); - if (IS_LEAF2(cdst->mc_pg[cdst->mc_top])) { - bkey.mv_size = cdst->mc_db->md_pad; - bkey.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, bkey.mv_size); - } else { - s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); - bkey.mv_size = NODEKSZ(s2); - bkey.mv_data = NODEKEY(s2); - } - cdst->mc_snum = snum--; - cdst->mc_top = snum; - mdb_cursor_copy(cdst, &mn); - mn.mc_ki[snum] = 0; - rc = mdb_update_key(&mn, &bkey); - if (rc) + // Mark src and dst as dirty. + if ((rc = mdb_page_touch(csrc)) || + (rc = mdb_page_touch(cdst))) return rc; - } - DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u", - IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", - csrc->mc_ki[csrc->mc_top], - DKEY(&key), - csrc->mc_pg[csrc->mc_top]->mp_pgno, - cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno)); - - // Add the node to the destination page. - rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); - if (rc != MDB_SUCCESS) - return rc; - - // Delete the node from the source page. - mdb_node_del(csrc, key.mv_size); - - { - // Adjust other cursors pointing to mp - MDB_cursor *m2, *m3; - MDB_dbi dbi = csrc->mc_dbi; - MDB_page *mp = csrc->mc_pg[csrc->mc_top]; - - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == csrc) continue; - if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] == - csrc->mc_ki[csrc->mc_top]) { - m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; - m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - } - } - } - - // Update the parent separators. - if (csrc->mc_ki[csrc->mc_top] == 0) { - if (csrc->mc_ki[csrc->mc_top-1] != 0) { - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); - } else { - srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); - } - DPRINTF(("update separator for source page %"Z"u to [%s]", - csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key))); - mdb_cursor_copy(csrc, &mn); - mn.mc_snum--; - mn.mc_top--; - if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) - return rc; - } - if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { - MDB_val nullkey; - indx_t ix = csrc->mc_ki[csrc->mc_top]; - nullkey.mv_size = 0; - csrc->mc_ki[csrc->mc_top] = 0; - rc = mdb_update_key(csrc, &nullkey); - csrc->mc_ki[csrc->mc_top] = ix; - mdb_cassert(csrc, rc == MDB_SUCCESS); - } - } - - if (cdst->mc_ki[cdst->mc_top] == 0) { - if (cdst->mc_ki[cdst->mc_top-1] != 0) { - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); - } else { - srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); - } - DPRINTF(("update separator for destination page %"Z"u to [%s]", - cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key))); - mdb_cursor_copy(cdst, &mn); - mn.mc_snum--; - mn.mc_top--; - if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) - return rc; - } - if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { - MDB_val nullkey; - indx_t ix = cdst->mc_ki[cdst->mc_top]; - nullkey.mv_size = 0; - cdst->mc_ki[cdst->mc_top] = 0; - rc = mdb_update_key(cdst, &nullkey); - cdst->mc_ki[cdst->mc_top] = ix; - mdb_cassert(csrc, rc == MDB_SUCCESS); - } - } - - return MDB_SUCCESS; - */ - - return nil -} - -// Merge one page into another. -// The nodes from the page pointed to by \b csrc will -// be copied to the page pointed to by \b cdst and then -// the \b csrc page will be freed. -// @param[in] csrc Cursor pointing to the source page. -// @param[in] cdst Cursor pointing to the destination page. -func (c *cursor) mergePage(dst *cursor) error { - /* - int rc; - indx_t i, j; - MDB_node *srcnode; - MDB_val key, data; - unsigned nkeys; - - DPRINTF(("merging page %"Z"u into %"Z"u", csrc->mc_pg[csrc->mc_top]->mp_pgno, - cdst->mc_pg[cdst->mc_top]->mp_pgno)); - - mdb_cassert(csrc, csrc->mc_snum > 1); // can't merge root page - mdb_cassert(csrc, cdst->mc_snum > 1); - - // Mark dst as dirty. - if ((rc = mdb_page_touch(cdst))) - return rc; - - // Move all nodes from src to dst. - j = nkeys = NUMKEYS(cdst->mc_pg[cdst->mc_top]); - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_pad; - key.mv_data = METADATA(csrc->mc_pg[csrc->mc_top]); - for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) { - rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); - if (rc != MDB_SUCCESS) - return rc; - key.mv_data = (char *)key.mv_data + key.mv_size; - } - } else { - for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) { - srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], i); - if (i == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); + data.mv_size = 0; + data.mv_data = NULL; + srcpg = 0; + flags = 0; + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); + mdb_cassert(csrc, !((size_t)srcnode & 1)); + srcpg = NODEPGNO(srcnode); + flags = srcnode->mn_flags; + if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { unsigned int snum = csrc->mc_snum; MDB_node *s2; // must find the lowest key below src @@ -2831,65 +2653,246 @@ func (c *cursor) mergePage(dst *cursor) error { csrc->mc_snum = snum--; csrc->mc_top = snum; } else { - key.mv_size = srcnode->mn_ksize; + key.mv_size = NODEKSZ(srcnode); key.mv_data = NODEKEY(srcnode); } - data.mv_size = NODEDSZ(srcnode); data.mv_data = NODEDATA(srcnode); - rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); - if (rc != MDB_SUCCESS) + } + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { + unsigned int snum = cdst->mc_snum; + MDB_node *s2; + MDB_val bkey; + // must find the lowest key below dst + mdb_page_search_lowest(cdst); + if (IS_LEAF2(cdst->mc_pg[cdst->mc_top])) { + bkey.mv_size = cdst->mc_db->md_pad; + bkey.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, bkey.mv_size); + } else { + s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); + bkey.mv_size = NODEKSZ(s2); + bkey.mv_data = NODEKEY(s2); + } + cdst->mc_snum = snum--; + cdst->mc_top = snum; + mdb_cursor_copy(cdst, &mn); + mn.mc_ki[snum] = 0; + rc = mdb_update_key(&mn, &bkey); + if (rc) return rc; } - } - DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)", - cdst->mc_pg[cdst->mc_top]->mp_pgno, NUMKEYS(cdst->mc_pg[cdst->mc_top]), - (float)PAGEFILL(cdst->mc_txn->mt_env, cdst->mc_pg[cdst->mc_top]) / 10)); + DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u", + IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", + csrc->mc_ki[csrc->mc_top], + DKEY(&key), + csrc->mc_pg[csrc->mc_top]->mp_pgno, + cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno)); - // Unlink the src page from parent and add to free list. - csrc->mc_top--; - mdb_node_del(csrc, 0); - if (csrc->mc_ki[csrc->mc_top] == 0) { - key.mv_size = 0; - rc = mdb_update_key(csrc, &key); - if (rc) { - csrc->mc_top++; + // Add the node to the destination page. + rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); + if (rc != MDB_SUCCESS) return rc; - } - } - csrc->mc_top++; - rc = mdb_midl_append(&csrc->mc_txn->mt_free_pgs, - csrc->mc_pg[csrc->mc_top]->mp_pgno); - if (rc) - return rc; - if (IS_LEAF(csrc->mc_pg[csrc->mc_top])) - csrc->mc_db->md_leaf_pages--; - else - csrc->mc_db->md_branch_pages--; - { - // Adjust other cursors pointing to mp - MDB_cursor *m2, *m3; - MDB_dbi dbi = csrc->mc_dbi; - MDB_page *mp = cdst->mc_pg[cdst->mc_top]; + // Delete the node from the source page. + mdb_node_del(csrc, key.mv_size); - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == csrc) continue; - if (m3->mc_snum < csrc->mc_snum) continue; - if (m3->mc_pg[csrc->mc_top] == csrc->mc_pg[csrc->mc_top]) { - m3->mc_pg[csrc->mc_top] = mp; - m3->mc_ki[csrc->mc_top] += nkeys; + { + // Adjust other cursors pointing to mp + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + MDB_page *mp = csrc->mc_pg[csrc->mc_top]; + + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) continue; + if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] == + csrc->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + } } } - } - mdb_cursor_pop(csrc); - return mdb_rebalance(csrc); + // Update the parent separators. + if (csrc->mc_ki[csrc->mc_top] == 0) { + if (csrc->mc_ki[csrc->mc_top-1] != 0) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + DPRINTF(("update separator for source page %"Z"u to [%s]", + csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key))); + mdb_cursor_copy(csrc, &mn); + mn.mc_snum--; + mn.mc_top--; + if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) + return rc; + } + if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + MDB_val nullkey; + indx_t ix = csrc->mc_ki[csrc->mc_top]; + nullkey.mv_size = 0; + csrc->mc_ki[csrc->mc_top] = 0; + rc = mdb_update_key(csrc, &nullkey); + csrc->mc_ki[csrc->mc_top] = ix; + mdb_cassert(csrc, rc == MDB_SUCCESS); + } + } + + if (cdst->mc_ki[cdst->mc_top] == 0) { + if (cdst->mc_ki[cdst->mc_top-1] != 0) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + DPRINTF(("update separator for destination page %"Z"u to [%s]", + cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key))); + mdb_cursor_copy(cdst, &mn); + mn.mc_snum--; + mn.mc_top--; + if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS) + return rc; + } + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { + MDB_val nullkey; + indx_t ix = cdst->mc_ki[cdst->mc_top]; + nullkey.mv_size = 0; + cdst->mc_ki[cdst->mc_top] = 0; + rc = mdb_update_key(cdst, &nullkey); + cdst->mc_ki[cdst->mc_top] = ix; + mdb_cassert(csrc, rc == MDB_SUCCESS); + } + } + + return MDB_SUCCESS; + */ + + return nil +} + +// Merge one page into another. +// The nodes from the page pointed to by \b csrc will +// be copied to the page pointed to by \b cdst and then +// the \b csrc page will be freed. +// @param[in] csrc Cursor pointing to the source page. +// @param[in] cdst Cursor pointing to the destination page. +func (c *cursor) mergePage(dst *cursor) error { + /* + int rc; + indx_t i, j; + MDB_node *srcnode; + MDB_val key, data; + unsigned nkeys; + + DPRINTF(("merging page %"Z"u into %"Z"u", csrc->mc_pg[csrc->mc_top]->mp_pgno, + cdst->mc_pg[cdst->mc_top]->mp_pgno)); + + mdb_cassert(csrc, csrc->mc_snum > 1); // can't merge root page + mdb_cassert(csrc, cdst->mc_snum > 1); + + // Mark dst as dirty. + if ((rc = mdb_page_touch(cdst))) + return rc; + + // Move all nodes from src to dst. + j = nkeys = NUMKEYS(cdst->mc_pg[cdst->mc_top]); + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = METADATA(csrc->mc_pg[csrc->mc_top]); + for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) { + rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); + if (rc != MDB_SUCCESS) + return rc; + key.mv_data = (char *)key.mv_data + key.mv_size; + } + } else { + for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], i); + if (i == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + unsigned int snum = csrc->mc_snum; + MDB_node *s2; + // must find the lowest key below src + mdb_page_search_lowest(csrc); + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_pad; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(s2); + key.mv_data = NODEKEY(s2); + } + csrc->mc_snum = snum--; + csrc->mc_top = snum; + } else { + key.mv_size = srcnode->mn_ksize; + key.mv_data = NODEKEY(srcnode); + } + + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); + if (rc != MDB_SUCCESS) + return rc; + } + } + + DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)", + cdst->mc_pg[cdst->mc_top]->mp_pgno, NUMKEYS(cdst->mc_pg[cdst->mc_top]), + (float)PAGEFILL(cdst->mc_txn->mt_env, cdst->mc_pg[cdst->mc_top]) / 10)); + + // Unlink the src page from parent and add to free list. + csrc->mc_top--; + mdb_node_del(csrc, 0); + if (csrc->mc_ki[csrc->mc_top] == 0) { + key.mv_size = 0; + rc = mdb_update_key(csrc, &key); + if (rc) { + csrc->mc_top++; + return rc; + } + } + csrc->mc_top++; + + rc = mdb_midl_append(&csrc->mc_txn->mt_free_pgs, + csrc->mc_pg[csrc->mc_top]->mp_pgno); + if (rc) + return rc; + if (IS_LEAF(csrc->mc_pg[csrc->mc_top])) + csrc->mc_db->md_leaf_pages--; + else + csrc->mc_db->md_branch_pages--; + { + // Adjust other cursors pointing to mp + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + MDB_page *mp = cdst->mc_pg[cdst->mc_top]; + + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) continue; + if (m3->mc_snum < csrc->mc_snum) continue; + if (m3->mc_pg[csrc->mc_top] == csrc->mc_pg[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = mp; + m3->mc_ki[csrc->mc_top] += nkeys; + } + } + } + mdb_cursor_pop(csrc); + + return mdb_rebalance(csrc); */ return nil @@ -2900,20 +2903,20 @@ func (c *cursor) mergePage(dst *cursor) error { // @param[out] cdst The cursor to copy to. func (c *cursor) copyTo(dst *cursor) { /* - unsigned int i; + unsigned int i; - cdst->mc_txn = csrc->mc_txn; - cdst->mc_dbi = csrc->mc_dbi; - cdst->mc_db = csrc->mc_db; - cdst->mc_dbx = csrc->mc_dbx; - cdst->mc_snum = csrc->mc_snum; - cdst->mc_top = csrc->mc_top; - cdst->mc_flags = csrc->mc_flags; + cdst->mc_txn = csrc->mc_txn; + cdst->mc_dbi = csrc->mc_dbi; + cdst->mc_db = csrc->mc_db; + cdst->mc_dbx = csrc->mc_dbx; + cdst->mc_snum = csrc->mc_snum; + cdst->mc_top = csrc->mc_top; + cdst->mc_flags = csrc->mc_flags; - for (i=0; imc_snum; i++) { - cdst->mc_pg[i] = csrc->mc_pg[i]; - cdst->mc_ki[i] = csrc->mc_ki[i]; - } + for (i=0; imc_snum; i++) { + cdst->mc_pg[i] = csrc->mc_pg[i]; + cdst->mc_ki[i] = csrc->mc_ki[i]; + } */ } @@ -2923,154 +2926,154 @@ func (c *cursor) copyTo(dst *cursor) { // @return 0 on success, non-zero on failure. func (c *cursor) rebalance() error { /* - MDB_node *node; - int rc; - unsigned int ptop, minkeys; - MDB_cursor mn; + MDB_node *node; + int rc; + unsigned int ptop, minkeys; + MDB_cursor mn; - minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top])); - DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", - IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", - mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), - (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); + minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top])); + DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", + IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", + mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); - if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD && - NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { - DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", - mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); - return MDB_SUCCESS; - } - - if (mc->mc_snum < 2) { - MDB_page *mp = mc->mc_pg[0]; - if (IS_SUBP(mp)) { - DPUTS("Can't rebalance a subpage, ignoring"); + if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD && + NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { + DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", + mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); return MDB_SUCCESS; } - if (NUMKEYS(mp) == 0) { - DPUTS("tree is completely empty"); - mc->mc_db->md_root = P_INVALID; - mc->mc_db->md_depth = 0; - mc->mc_db->md_leaf_pages = 0; - rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); - if (rc) - return rc; - // Adjust cursors pointing to mp - mc->mc_snum = 0; - mc->mc_top = 0; - mc->mc_flags &= ~C_INITIALIZED; - { - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[0] == mp) { - m3->mc_snum = 0; - m3->mc_top = 0; - m3->mc_flags &= ~C_INITIALIZED; - } - } + if (mc->mc_snum < 2) { + MDB_page *mp = mc->mc_pg[0]; + if (IS_SUBP(mp)) { + DPUTS("Can't rebalance a subpage, ignoring"); + return MDB_SUCCESS; } - } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { - DPUTS("collapsing root page!"); - rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); - if (rc) - return rc; - mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); - rc = mdb_page_get(mc->mc_txn,mc->mc_db->md_root,&mc->mc_pg[0],NULL); - if (rc) - return rc; - mc->mc_db->md_depth--; - mc->mc_db->md_branch_pages--; - mc->mc_ki[0] = mc->mc_ki[1]; - { - // Adjust other cursors pointing to mp - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; + if (NUMKEYS(mp) == 0) { + DPUTS("tree is completely empty"); + mc->mc_db->md_root = P_INVALID; + mc->mc_db->md_depth = 0; + mc->mc_db->md_leaf_pages = 0; + rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (rc) + return rc; + // Adjust cursors pointing to mp + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + { + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[0] == mp) { - int i; - m3->mc_snum--; - m3->mc_top--; - for (i=0; imc_snum; i++) { - m3->mc_pg[i] = m3->mc_pg[i+1]; - m3->mc_ki[i] = m3->mc_ki[i+1]; + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[0] == mp) { + m3->mc_snum = 0; + m3->mc_top = 0; + m3->mc_flags &= ~C_INITIALIZED; } } } - } - } else - DPUTS("root page doesn't need rebalancing"); - return MDB_SUCCESS; - } + } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { + DPUTS("collapsing root page!"); + rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (rc) + return rc; + mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); + rc = mdb_page_get(mc->mc_txn,mc->mc_db->md_root,&mc->mc_pg[0],NULL); + if (rc) + return rc; + mc->mc_db->md_depth--; + mc->mc_db->md_branch_pages--; + mc->mc_ki[0] = mc->mc_ki[1]; + { + // Adjust other cursors pointing to mp + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; - // The parent (branch page) must have at least 2 pointers, - // otherwise the tree is invalid. - ptop = mc->mc_top-1; - mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); - - // Leaf page fill factor is below the threshold. - // Try to move keys from left or right neighbor, or - // merge with a neighbor page. - - // Find neighbors. - mdb_cursor_copy(mc, &mn); - mn.mc_xcursor = NULL; - - if (mc->mc_ki[ptop] == 0) { - // We're the leftmost leaf in our parent. - DPUTS("reading right neighbor"); - mn.mc_ki[ptop]++; - node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); - if (rc) - return rc; - mn.mc_ki[mn.mc_top] = 0; - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); - } else { - // There is at least one neighbor to the left. - DPUTS("reading left neighbor"); - mn.mc_ki[ptop]--; - node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); - if (rc) - return rc; - mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; - mc->mc_ki[mc->mc_top] = 0; - } - - DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", - mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), - (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); - - // If the neighbor page is above threshold and has enough keys, - // move one key from it. Otherwise we should try to merge them. - // (A branch page must never have less than 2 keys.) - minkeys = 1 + (IS_BRANCH(mn.mc_pg[mn.mc_top])); - if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) - return mdb_node_move(&mn, mc); - else { - if (mc->mc_ki[ptop] == 0) - rc = mdb_page_merge(&mn, mc); - else { - mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; - rc = mdb_page_merge(mc, &mn); - mdb_cursor_copy(&mn, mc); + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[0] == mp) { + int i; + m3->mc_snum--; + m3->mc_top--; + for (i=0; imc_snum; i++) { + m3->mc_pg[i] = m3->mc_pg[i+1]; + m3->mc_ki[i] = m3->mc_ki[i+1]; + } + } + } + } + } else + DPUTS("root page doesn't need rebalancing"); + return MDB_SUCCESS; } - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - } - return rc; + + // The parent (branch page) must have at least 2 pointers, + // otherwise the tree is invalid. + ptop = mc->mc_top-1; + mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); + + // Leaf page fill factor is below the threshold. + // Try to move keys from left or right neighbor, or + // merge with a neighbor page. + + // Find neighbors. + mdb_cursor_copy(mc, &mn); + mn.mc_xcursor = NULL; + + if (mc->mc_ki[ptop] == 0) { + // We're the leftmost leaf in our parent. + DPUTS("reading right neighbor"); + mn.mc_ki[ptop]++; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); + if (rc) + return rc; + mn.mc_ki[mn.mc_top] = 0; + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); + } else { + // There is at least one neighbor to the left. + DPUTS("reading left neighbor"); + mn.mc_ki[ptop]--; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); + if (rc) + return rc; + mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; + mc->mc_ki[mc->mc_top] = 0; + } + + DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", + mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); + + // If the neighbor page is above threshold and has enough keys, + // move one key from it. Otherwise we should try to merge them. + // (A branch page must never have less than 2 keys.) + minkeys = 1 + (IS_BRANCH(mn.mc_pg[mn.mc_top])); + if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) + return mdb_node_move(&mn, mc); + else { + if (mc->mc_ki[ptop] == 0) + rc = mdb_page_merge(&mn, mc); + else { + mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; + rc = mdb_page_merge(mc, &mn); + mdb_cursor_copy(&mn, mc); + } + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + } + return rc; */ return nil } @@ -3078,61 +3081,61 @@ func (c *cursor) rebalance() error { // Complete a delete operation started by #mdb_cursor_del(). func (c *cursor) del0(leaf *node) error { /* - int rc; - MDB_page *mp; - indx_t ki; - unsigned int nkeys; - - mp = mc->mc_pg[mc->mc_top]; - ki = mc->mc_ki[mc->mc_top]; - - // add overflow pages to free list - if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_BIGDATA)) { - MDB_page *omp; - pgno_t pg; - - memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) || - (rc = mdb_ovpage_free(mc, omp))) - return rc; - } - mdb_node_del(mc, mc->mc_db->md_pad); - mc->mc_db->md_entries--; - rc = mdb_rebalance(mc); - if (rc != MDB_SUCCESS) - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - else { - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; + int rc; + MDB_page *mp; + indx_t ki; + unsigned int nkeys; mp = mc->mc_pg[mc->mc_top]; - nkeys = NUMKEYS(mp); + ki = mc->mc_ki[mc->mc_top]; - // if mc points past last node in page, find next sibling - if (mc->mc_ki[mc->mc_top] >= nkeys) - mdb_cursor_sibling(mc, 1); + // add overflow pages to free list + if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_BIGDATA)) { + MDB_page *omp; + pgno_t pg; - // Adjust other cursors pointing to mp - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3 == mc || m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] >= ki) { - m3->mc_flags |= C_DEL; - if (m3->mc_ki[mc->mc_top] > ki) - m3->mc_ki[mc->mc_top]--; - } - if (m3->mc_ki[mc->mc_top] >= nkeys) - mdb_cursor_sibling(m3, 1); - } + memcpy(&pg, NODEDATA(leaf), sizeof(pg)); + if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) || + (rc = mdb_ovpage_free(mc, omp))) + return rc; } - mc->mc_flags |= C_DEL; - } + mdb_node_del(mc, mc->mc_db->md_pad); + mc->mc_db->md_entries--; + rc = mdb_rebalance(mc); + if (rc != MDB_SUCCESS) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + else { + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; - return rc; + mp = mc->mc_pg[mc->mc_top]; + nkeys = NUMKEYS(mp); + + // if mc points past last node in page, find next sibling + if (mc->mc_ki[mc->mc_top] >= nkeys) + mdb_cursor_sibling(mc, 1); + + // Adjust other cursors pointing to mp + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3 == mc || m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] >= ki) { + m3->mc_flags |= C_DEL; + if (m3->mc_ki[mc->mc_top] > ki) + m3->mc_ki[mc->mc_top]--; + } + if (m3->mc_ki[mc->mc_top] >= nkeys) + mdb_cursor_sibling(m3, 1); + } + } + mc->mc_flags |= C_DEL; + } + + return rc; */ return nil } @@ -3148,319 +3151,223 @@ func (c *cursor) del0(leaf *node) error { // @return 0 on success, non-zero on failure. func (c *cursor) splitPage(newKey []byte, newData []byte, newpgno int, nflags int) error { /* - unsigned int flags; - int rc = MDB_SUCCESS, new_root = 0, did_split = 0; - indx_t newindx; - pgno_t pgno = 0; - int i, j, split_indx, nkeys, pmax; - MDB_env *env = mc->mc_txn->mt_env; - MDB_node *node; - MDB_val sepkey, rkey, xdata, *rdata = &xdata; - MDB_page *copy = NULL; - MDB_page *mp, *rp, *pp; - int ptop; - MDB_cursor mn; - DKBUF; + unsigned int flags; + int rc = MDB_SUCCESS, new_root = 0, did_split = 0; + indx_t newindx; + pgno_t pgno = 0; + int i, j, split_indx, nkeys, pmax; + MDB_env *env = mc->mc_txn->mt_env; + MDB_node *node; + MDB_val sepkey, rkey, xdata, *rdata = &xdata; + MDB_page *copy = NULL; + MDB_page *mp, *rp, *pp; + int ptop; + MDB_cursor mn; + DKBUF; - mp = mc->mc_pg[mc->mc_top]; - newindx = mc->mc_ki[mc->mc_top]; - nkeys = NUMKEYS(mp); + mp = mc->mc_pg[mc->mc_top]; + newindx = mc->mc_ki[mc->mc_top]; + nkeys = NUMKEYS(mp); - DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", - IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, - DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); + DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, + DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); - // Create a right sibling. - if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) - return rc; - DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); - - if (mc->mc_snum < 2) { - if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) + // Create a right sibling. + if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) return rc; - // shift current top to make room for new parent - mc->mc_pg[1] = mc->mc_pg[0]; - mc->mc_ki[1] = mc->mc_ki[0]; - mc->mc_pg[0] = pp; - mc->mc_ki[0] = 0; - mc->mc_db->md_root = pp->mp_pgno; - DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno)); - mc->mc_db->md_depth++; - new_root = 1; + DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); - // Add left (implicit) pointer. - if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { - // undo the pre-push - mc->mc_pg[0] = mc->mc_pg[1]; - mc->mc_ki[0] = mc->mc_ki[1]; - mc->mc_db->md_root = mp->mp_pgno; - mc->mc_db->md_depth--; - return rc; - } - mc->mc_snum = 2; - mc->mc_top = 1; - ptop = 0; - } else { - ptop = mc->mc_top-1; - DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); - } + if (mc->mc_snum < 2) { + if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) + return rc; + // shift current top to make room for new parent + mc->mc_pg[1] = mc->mc_pg[0]; + mc->mc_ki[1] = mc->mc_ki[0]; + mc->mc_pg[0] = pp; + mc->mc_ki[0] = 0; + mc->mc_db->md_root = pp->mp_pgno; + DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno)); + mc->mc_db->md_depth++; + new_root = 1; - mc->mc_flags |= C_SPLITTING; - mdb_cursor_copy(mc, &mn); - mn.mc_pg[mn.mc_top] = rp; - mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; - - if (nflags & MDB_APPEND) { - mn.mc_ki[mn.mc_top] = 0; - sepkey = *newkey; - split_indx = newindx; - nkeys = 0; - } else { - - split_indx = (nkeys+1) / 2; - - if (IS_LEAF2(rp)) { - char *split, *ins; - int x; - unsigned int lsize, rsize, ksize; - // Move half of the keys to the right sibling - copy = NULL; - x = mc->mc_ki[mc->mc_top] - split_indx; - ksize = mc->mc_db->md_pad; - split = LEAF2KEY(mp, split_indx, ksize); - rsize = (nkeys - split_indx) * ksize; - lsize = (nkeys - split_indx) * sizeof(indx_t); - mp->mp_lower -= lsize; - rp->mp_lower += lsize; - mp->mp_upper += rsize - lsize; - rp->mp_upper -= rsize - lsize; - sepkey.mv_size = ksize; - if (newindx == split_indx) { - sepkey.mv_data = newkey->mv_data; - } else { - sepkey.mv_data = split; - } - if (x<0) { - ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); - memcpy(rp->mp_ptrs, split, rsize); - sepkey.mv_data = rp->mp_ptrs; - memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); - memcpy(ins, newkey->mv_data, ksize); - mp->mp_lower += sizeof(indx_t); - mp->mp_upper -= ksize - sizeof(indx_t); - } else { - if (x) - memcpy(rp->mp_ptrs, split, x * ksize); - ins = LEAF2KEY(rp, x, ksize); - memcpy(ins, newkey->mv_data, ksize); - memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); - rp->mp_lower += sizeof(indx_t); - rp->mp_upper -= ksize - sizeof(indx_t); - mc->mc_ki[mc->mc_top] = x; - mc->mc_pg[mc->mc_top] = rp; - } - } else { - int psize, nsize, k; - // Maximum free space in an empty page - pmax = env->me_psize - PAGEHDRSZ; - if (IS_LEAF(mp)) - nsize = mdb_leaf_size(env, newkey, newdata); - else - nsize = mdb_branch_size(env, newkey); - nsize = EVEN(nsize); - - // grab a page to hold a temporary copy - copy = mdb_page_malloc(mc->mc_txn, 1); - if (copy == NULL) - return ENOMEM; - copy->mp_pgno = mp->mp_pgno; - copy->mp_flags = mp->mp_flags; - copy->mp_lower = PAGEHDRSZ; - copy->mp_upper = env->me_psize; - - // prepare to insert - for (i=0, j=0; imp_ptrs[j++] = 0; - } - copy->mp_ptrs[j++] = mp->mp_ptrs[i]; - } - - // When items are relatively large the split point needs - // to be checked, because being off-by-one will make the - // difference between success or failure in mdb_node_add. - // - // It's also relevant if a page happens to be laid out - // such that one half of its nodes are all "small" and - // the other half of its nodes are "large." If the new - // item is also "large" and falls on the half with - // "large" nodes, it also may not fit. - // - // As a final tweak, if the new item goes on the last - // spot on the page (and thus, onto the new page), bias - // the split so the new page is emptier than the old page. - // This yields better packing during sequential inserts. - if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) { - // Find split point - psize = 0; - if (newindx <= split_indx || newindx >= nkeys) { - i = 0; j = 1; - k = newindx >= nkeys ? nkeys : split_indx+2; - } else { - i = nkeys; j = -1; - k = split_indx-1; - } - for (; i!=k; i+=j) { - if (i == newindx) { - psize += nsize; - node = NULL; - } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]); - psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); - if (IS_LEAF(mp)) { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - psize += sizeof(pgno_t); - else - psize += NODEDSZ(node); - } - psize = EVEN(psize); - } - if (psize > pmax || i == k-j) { - split_indx = i + (j<0); - break; - } - } - } - if (split_indx == newindx) { - sepkey.mv_size = newkey->mv_size; - sepkey.mv_data = newkey->mv_data; - } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx]); - sepkey.mv_size = node->mn_ksize; - sepkey.mv_data = NODEKEY(node); - } - } - } - - DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey))); - - // Copy separator key to the parent. - if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { - mn.mc_snum--; - mn.mc_top--; - did_split = 1; - rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0); - - // root split? - if (mn.mc_snum == mc->mc_snum) { - mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top]; - mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top]; - mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop]; - mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop]; - mc->mc_snum++; - mc->mc_top++; - ptop++; - } - // Right page might now have changed parent. - // Check if left page also changed parent. - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { - for (i=0; imc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; - } - mc->mc_pg[ptop] = mn.mc_pg[ptop]; - mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; - } - } else { - mn.mc_top--; - rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); - mn.mc_top++; - } - mc->mc_flags ^= C_SPLITTING; - if (rc != MDB_SUCCESS) { - return rc; - } - if (nflags & MDB_APPEND) { - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[mc->mc_top] = 0; - rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); - if (rc) - return rc; - for (i=0; imc_top; i++) - mc->mc_ki[i] = mn.mc_ki[i]; - } else if (!IS_LEAF2(mp)) { - // Move nodes - mc->mc_pg[mc->mc_top] = rp; - i = split_indx; - j = 0; - do { - if (i == newindx) { - rkey.mv_data = newkey->mv_data; - rkey.mv_size = newkey->mv_size; - if (IS_LEAF(mp)) { - rdata = newdata; - } else - pgno = newpgno; - flags = nflags; - // Update index for the new key. - mc->mc_ki[mc->mc_top] = j; - } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]); - rkey.mv_data = NODEKEY(node); - rkey.mv_size = node->mn_ksize; - if (IS_LEAF(mp)) { - xdata.mv_data = NODEDATA(node); - xdata.mv_size = NODEDSZ(node); - rdata = &xdata; - } else - pgno = NODEPGNO(node); - flags = node->mn_flags; - } - - if (!IS_LEAF(mp) && j == 0) { - // First branch index doesn't need key data. - rkey.mv_size = 0; - } - - rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); - if (rc) { - // return tmp page to freelist - mdb_page_free(env, copy); + // Add left (implicit) pointer. + if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { + // undo the pre-push + mc->mc_pg[0] = mc->mc_pg[1]; + mc->mc_ki[0] = mc->mc_ki[1]; + mc->mc_db->md_root = mp->mp_pgno; + mc->mc_db->md_depth--; return rc; } - if (i == nkeys) { - i = 0; - j = 0; - mc->mc_pg[mc->mc_top] = copy; - } else { - i++; - j++; - } - } while (i != split_indx); - - nkeys = NUMKEYS(copy); - for (i=0; imp_ptrs[i] = copy->mp_ptrs[i]; - mp->mp_lower = copy->mp_lower; - mp->mp_upper = copy->mp_upper; - memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), - env->me_psize - copy->mp_upper); - - // reset back to original page - if (newindx < split_indx) { - mc->mc_pg[mc->mc_top] = mp; - if (nflags & MDB_RESERVE) { - node = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (!(node->mn_flags & F_BIGDATA)) - newdata->mv_data = NODEDATA(node); - } + mc->mc_snum = 2; + mc->mc_top = 1; + ptop = 0; } else { - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[ptop]++; - // Make sure mc_ki is still valid. + ptop = mc->mc_top-1; + DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); + } + + mc->mc_flags |= C_SPLITTING; + mdb_cursor_copy(mc, &mn); + mn.mc_pg[mn.mc_top] = rp; + mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; + + if (nflags & MDB_APPEND) { + mn.mc_ki[mn.mc_top] = 0; + sepkey = *newkey; + split_indx = newindx; + nkeys = 0; + } else { + + split_indx = (nkeys+1) / 2; + + if (IS_LEAF2(rp)) { + char *split, *ins; + int x; + unsigned int lsize, rsize, ksize; + // Move half of the keys to the right sibling + copy = NULL; + x = mc->mc_ki[mc->mc_top] - split_indx; + ksize = mc->mc_db->md_pad; + split = LEAF2KEY(mp, split_indx, ksize); + rsize = (nkeys - split_indx) * ksize; + lsize = (nkeys - split_indx) * sizeof(indx_t); + mp->mp_lower -= lsize; + rp->mp_lower += lsize; + mp->mp_upper += rsize - lsize; + rp->mp_upper -= rsize - lsize; + sepkey.mv_size = ksize; + if (newindx == split_indx) { + sepkey.mv_data = newkey->mv_data; + } else { + sepkey.mv_data = split; + } + if (x<0) { + ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); + memcpy(rp->mp_ptrs, split, rsize); + sepkey.mv_data = rp->mp_ptrs; + memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); + memcpy(ins, newkey->mv_data, ksize); + mp->mp_lower += sizeof(indx_t); + mp->mp_upper -= ksize - sizeof(indx_t); + } else { + if (x) + memcpy(rp->mp_ptrs, split, x * ksize); + ins = LEAF2KEY(rp, x, ksize); + memcpy(ins, newkey->mv_data, ksize); + memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); + rp->mp_lower += sizeof(indx_t); + rp->mp_upper -= ksize - sizeof(indx_t); + mc->mc_ki[mc->mc_top] = x; + mc->mc_pg[mc->mc_top] = rp; + } + } else { + int psize, nsize, k; + // Maximum free space in an empty page + pmax = env->me_psize - PAGEHDRSZ; + if (IS_LEAF(mp)) + nsize = mdb_leaf_size(env, newkey, newdata); + else + nsize = mdb_branch_size(env, newkey); + nsize = EVEN(nsize); + + // grab a page to hold a temporary copy + copy = mdb_page_malloc(mc->mc_txn, 1); + if (copy == NULL) + return ENOMEM; + copy->mp_pgno = mp->mp_pgno; + copy->mp_flags = mp->mp_flags; + copy->mp_lower = PAGEHDRSZ; + copy->mp_upper = env->me_psize; + + // prepare to insert + for (i=0, j=0; imp_ptrs[j++] = 0; + } + copy->mp_ptrs[j++] = mp->mp_ptrs[i]; + } + + // When items are relatively large the split point needs + // to be checked, because being off-by-one will make the + // difference between success or failure in mdb_node_add. + // + // It's also relevant if a page happens to be laid out + // such that one half of its nodes are all "small" and + // the other half of its nodes are "large." If the new + // item is also "large" and falls on the half with + // "large" nodes, it also may not fit. + // + // As a final tweak, if the new item goes on the last + // spot on the page (and thus, onto the new page), bias + // the split so the new page is emptier than the old page. + // This yields better packing during sequential inserts. + if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) { + // Find split point + psize = 0; + if (newindx <= split_indx || newindx >= nkeys) { + i = 0; j = 1; + k = newindx >= nkeys ? nkeys : split_indx+2; + } else { + i = nkeys; j = -1; + k = split_indx-1; + } + for (; i!=k; i+=j) { + if (i == newindx) { + psize += nsize; + node = NULL; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]); + psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); + if (IS_LEAF(mp)) { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + psize += sizeof(pgno_t); + else + psize += NODEDSZ(node); + } + psize = EVEN(psize); + } + if (psize > pmax || i == k-j) { + split_indx = i + (j<0); + break; + } + } + } + if (split_indx == newindx) { + sepkey.mv_size = newkey->mv_size; + sepkey.mv_data = newkey->mv_data; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx]); + sepkey.mv_size = node->mn_ksize; + sepkey.mv_data = NODEKEY(node); + } + } + } + + DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey))); + + // Copy separator key to the parent. + if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { + mn.mc_snum--; + mn.mc_top--; + did_split = 1; + rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0); + + // root split? + if (mn.mc_snum == mc->mc_snum) { + mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top]; + mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top]; + mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop]; + mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop]; + mc->mc_snum++; + mc->mc_top++; + ptop++; + } + // Right page might now have changed parent. + // Check if left page also changed parent. if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { for (i=0; imc_pg[i] = mn.mc_pg[i]; mc->mc_ki[i] = mn.mc_ki[i]; @@ -3468,60 +3375,156 @@ func (c *cursor) splitPage(newKey []byte, newData []byte, newpgno int, nflags in mc->mc_pg[ptop] = mn.mc_pg[ptop]; mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; } + } else { + mn.mc_top--; + rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); + mn.mc_top++; } - // return tmp page to freelist - mdb_page_free(env, copy); - } - - { - // Adjust other cursors pointing to mp - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - int fixup = NUMKEYS(mp); - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == mc) - continue; - if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_flags & C_SPLITTING) - continue; - if (new_root) { - int k; - // root split - for (k=m3->mc_top; k>=0; k--) { - m3->mc_ki[k+1] = m3->mc_ki[k]; - m3->mc_pg[k+1] = m3->mc_pg[k]; - } - if (m3->mc_ki[0] >= split_indx) { - m3->mc_ki[0] = 1; + mc->mc_flags ^= C_SPLITTING; + if (rc != MDB_SUCCESS) { + return rc; + } + if (nflags & MDB_APPEND) { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[mc->mc_top] = 0; + rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); + if (rc) + return rc; + for (i=0; imc_top; i++) + mc->mc_ki[i] = mn.mc_ki[i]; + } else if (!IS_LEAF2(mp)) { + // Move nodes + mc->mc_pg[mc->mc_top] = rp; + i = split_indx; + j = 0; + do { + if (i == newindx) { + rkey.mv_data = newkey->mv_data; + rkey.mv_size = newkey->mv_size; + if (IS_LEAF(mp)) { + rdata = newdata; + } else + pgno = newpgno; + flags = nflags; + // Update index for the new key. + mc->mc_ki[mc->mc_top] = j; } else { - m3->mc_ki[0] = 0; + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]); + rkey.mv_data = NODEKEY(node); + rkey.mv_size = node->mn_ksize; + if (IS_LEAF(mp)) { + xdata.mv_data = NODEDATA(node); + xdata.mv_size = NODEDSZ(node); + rdata = &xdata; + } else + pgno = NODEPGNO(node); + flags = node->mn_flags; + } + + if (!IS_LEAF(mp) && j == 0) { + // First branch index doesn't need key data. + rkey.mv_size = 0; + } + + rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); + if (rc) { + // return tmp page to freelist + mdb_page_free(env, copy); + return rc; + } + if (i == nkeys) { + i = 0; + j = 0; + mc->mc_pg[mc->mc_top] = copy; + } else { + i++; + j++; + } + } while (i != split_indx); + + nkeys = NUMKEYS(copy); + for (i=0; imp_ptrs[i] = copy->mp_ptrs[i]; + mp->mp_lower = copy->mp_lower; + mp->mp_upper = copy->mp_upper; + memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), + env->me_psize - copy->mp_upper); + + // reset back to original page + if (newindx < split_indx) { + mc->mc_pg[mc->mc_top] = mp; + if (nflags & MDB_RESERVE) { + node = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (!(node->mn_flags & F_BIGDATA)) + newdata->mv_data = NODEDATA(node); + } + } else { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; + // Make sure mc_ki is still valid. + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; imc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + mc->mc_pg[ptop] = mn.mc_pg[ptop]; + mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; } - m3->mc_pg[0] = mc->mc_pg[0]; - m3->mc_snum++; - m3->mc_top++; } - if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) - m3->mc_ki[mc->mc_top]++; - if (m3->mc_ki[mc->mc_top] >= fixup) { - m3->mc_pg[mc->mc_top] = rp; - m3->mc_ki[mc->mc_top] -= fixup; - m3->mc_ki[ptop] = mn.mc_ki[ptop]; + // return tmp page to freelist + mdb_page_free(env, copy); + } + + { + // Adjust other cursors pointing to mp + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + int fixup = NUMKEYS(mp); + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc) + continue; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_flags & C_SPLITTING) + continue; + if (new_root) { + int k; + // root split + for (k=m3->mc_top; k>=0; k--) { + m3->mc_ki[k+1] = m3->mc_ki[k]; + m3->mc_pg[k+1] = m3->mc_pg[k]; + } + if (m3->mc_ki[0] >= split_indx) { + m3->mc_ki[0] = 1; + } else { + m3->mc_ki[0] = 0; + } + m3->mc_pg[0] = mc->mc_pg[0]; + m3->mc_snum++; + m3->mc_top++; + } + if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) + m3->mc_ki[mc->mc_top]++; + if (m3->mc_ki[mc->mc_top] >= fixup) { + m3->mc_pg[mc->mc_top] = rp; + m3->mc_ki[mc->mc_top] -= fixup; + m3->mc_ki[ptop] = mn.mc_ki[ptop]; + } + } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && + m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { + m3->mc_ki[ptop]++; } - } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && - m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { - m3->mc_ki[ptop]++; } } - } - DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); - return rc; + DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); + return rc; */ return nil } @@ -3532,77 +3535,77 @@ func (c *cursor) splitPage(newKey []byte, newData []byte, newpgno int, nflags in // @return 0 on success, non-zero on failure. func (c *cursor) drop0(subs int) error { /* - int rc; + int rc; - rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); - if (rc == MDB_SUCCESS) { - MDB_txn *txn = mc->mc_txn; - MDB_node *ni; - MDB_cursor mx; - unsigned int i; + rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); + if (rc == MDB_SUCCESS) { + MDB_txn *txn = mc->mc_txn; + MDB_node *ni; + MDB_cursor mx; + unsigned int i; - // LEAF2 pages have no nodes, cannot have sub-DBs - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) - mdb_cursor_pop(mc); + // LEAF2 pages have no nodes, cannot have sub-DBs + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) + mdb_cursor_pop(mc); - mdb_cursor_copy(mc, &mx); - while (mc->mc_snum > 0) { - MDB_page *mp = mc->mc_pg[mc->mc_top]; - unsigned n = NUMKEYS(mp); - if (IS_LEAF(mp)) { - for (i=0; imn_flags & F_BIGDATA) { - MDB_page *omp; + mdb_cursor_copy(mc, &mx); + while (mc->mc_snum > 0) { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + unsigned n = NUMKEYS(mp); + if (IS_LEAF(mp)) { + for (i=0; imn_flags & F_BIGDATA) { + MDB_page *omp; + pgno_t pg; + memcpy(&pg, NODEDATA(ni), sizeof(pg)); + rc = mdb_page_get(txn, pg, &omp, NULL); + if (rc != 0) + return rc; + mdb_cassert(mc, IS_OVERFLOW(omp)); + rc = mdb_midl_append_range(&txn->mt_free_pgs, + pg, omp->mp_pages); + if (rc) + return rc; + } else if (subs && (ni->mn_flags & F_SUBDATA)) { + mdb_xcursor_init1(mc, ni); + rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); + if (rc) + return rc; + } + } + } else { + if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) + return rc; + for (i=0; imt_free_pgs, - pg, omp->mp_pages); - if (rc) - return rc; - } else if (subs && (ni->mn_flags & F_SUBDATA)) { - mdb_xcursor_init1(mc, ni); - rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); - if (rc) - return rc; + ni = NODEPTR(mp, i); + pg = NODEPGNO(ni); + // free it + mdb_midl_xappend(txn->mt_free_pgs, pg); } } - } else { - if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) - return rc; - for (i=0; imt_free_pgs, pg); - } - } - if (!mc->mc_top) - break; - mc->mc_ki[mc->mc_top] = i; - rc = mdb_cursor_sibling(mc, 1); - if (rc) { - // no more siblings, go back to beginning - // of previous level. - mdb_cursor_pop(mc); - mc->mc_ki[0] = 0; - for (i=1; imc_snum; i++) { - mc->mc_ki[i] = 0; - mc->mc_pg[i] = mx.mc_pg[i]; + if (!mc->mc_top) + break; + mc->mc_ki[mc->mc_top] = i; + rc = mdb_cursor_sibling(mc, 1); + if (rc) { + // no more siblings, go back to beginning + // of previous level. + mdb_cursor_pop(mc); + mc->mc_ki[0] = 0; + for (i=1; imc_snum; i++) { + mc->mc_ki[i] = 0; + mc->mc_pg[i] = mx.mc_pg[i]; + } } } + // free it + rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); + } else if (rc == MDB_NOTFOUND) { + rc = MDB_SUCCESS; } - // free it - rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); - } else if (rc == MDB_NOTFOUND) { - rc = MDB_SUCCESS; - } - return rc; + return rc; */ return nil } diff --git a/db.go b/db.go index 3ab9f86..bf9b39f 100644 --- a/db.go +++ b/db.go @@ -1,5 +1,12 @@ package bolt +import ( + "os" + "sync" + "syscall" + "unsafe" +) + const ( NoSync = iota NoMetaSync @@ -8,59 +15,54 @@ const ( IntegerDupKey ) -var DatabaseAlreadyOpenedError = &Error{"Database already open"} +var DatabaseAlreadyOpenedError = &Error{"Database already open", nil} // TODO: #define MDB_FATAL_ERROR 0x80000000U /** Failed to update the meta page. Probably an I/O error. */ // TODO: #define MDB_ENV_ACTIVE 0x20000000U /** Some fields are initialized. */ // TODO: #define MDB_ENV_TXKEY 0x10000000U /** me_txkey is set */ // TODO: #define MDB_LIVE_READER 0x08000000U /** Have liveness lock in reader table */ -type DB interface { - syncEnabled bool - metaSyncEnabled bool -} - -type db struct { +type DB struct { sync.Mutex opened bool - file os.File - metafile os.File - buf []byte + file *os.File + metafile *os.File + data []byte + buf []byte + meta0 *meta + meta1 *meta pageSize int - readers []*reader - buckets []*bucket - xbuckets []*bucketx /**< array of static DB info */ - bucketFlags []int /**< array of flags from MDB_db.md_flags */ - path string - mmap []byte - mmapSize int /**< size of the data memory map */ - size int /**< current file size */ - meta1 []byte - meta2 []byte - pbuf []byte - transaction *transaction /**< current write transaction */ - maxPageNumber int /**< me_mapsize / me_psize */ - pageState pageStage /**< state of old pages from freeDB */ - dpages []*page /**< list of malloc'd blocks for re-use */ - freePages []int /** IDL of pages that became unused in a write txn */ - dirtyPages []int /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ - maxFreeOnePage int /** Max number of freelist items that can fit in a single overflow page */ - maxNodeSize int /** Max size of a node on a page */ - maxKeySize int /**< max size of a key */ + readers []*reader + buckets []*Bucket + // xbuckets []*bucketx /**< array of static DB info */ + bucketFlags []int /**< array of flags from MDB_db.md_flags */ + path string + mmapSize int /**< size of the data memory map */ + size int /**< current file size */ + pbuf []byte + transaction *transaction /**< current write transaction */ + maxPageNumber int /**< me_mapsize / me_psize */ + pageState pageState /**< state of old pages from freeDB */ + dpages []*page /**< list of malloc'd blocks for re-use */ + freePages []int /** IDL of pages that became unused in a write txn */ + dirtyPages []int /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ + maxFreeOnePage int /** Max number of freelist items that can fit in a single overflow page */ + maxPageDataSize int + maxNodeSize int /** Max size of a node on a page */ + maxKeySize int /**< max size of a key */ } - -func NewDB() DB { - return &db{} +func NewDB() *DB { + return &DB{} } -func (db *db) Path() string { +func (db *DB) Path() string { return db.path } -func (db *db) Open(path string, mode os.FileMode) error { +func (db *DB) Open(path string, mode os.FileMode) error { var err error db.Lock() defer db.Unlock() @@ -72,24 +74,24 @@ func (db *db) Open(path string, mode os.FileMode) error { // Open data file and separate sync handler for metadata writes. db.path = path - if db.file, err = os.OpenFile(db.path, O_RDWR | O_CREAT, mode); err != nil { + if db.file, err = os.OpenFile(db.path, os.O_RDWR|os.O_CREATE, mode); err != nil { db.close() return err } - if db.metafile, err = os.OpenFile(db.path, O_RDWR | O_SYNC, mode); err != nil { + if db.metafile, err = os.OpenFile(db.path, os.O_RDWR|os.O_SYNC, mode); err != nil { db.close() return err } // Read enough data to get both meta pages. var m, m0, m1 *meta - var buf [headerSize + unsafe.Sizeof(meta)]byte - if _, err := db.file.ReadAt(buf, 0); err == nil { + var buf [pageHeaderSize + int(unsafe.Sizeof(meta{}))]byte + if _, err := db.file.ReadAt(buf[:], 0); err == nil { if m0, _ = db.page(buf[:], 0).meta(); m0 != nil { - db.pageSize = m0.free.pad + db.pageSize = int(m0.free.pad) } } - if _, err := db.file.ReadAt(buf, db.pageSize); err == nil { + if _, err := db.file.ReadAt(buf[:], int64(db.pageSize)); err == nil { m1, _ = db.page(buf[:], 0).meta() } if m0 != nil && m1 != nil { @@ -102,27 +104,16 @@ func (db *db) Open(path string, mode os.FileMode) error { // Initialize the page size for new environments. if m == nil { - db.pageSize = os.Getpagesize() - if db.pageSize > maxPageSize { - db.pageSize = maxPageSize + if err := db.init(); err != nil { + return err } } - // TODO: Check mapsize. - /* - // Was a mapsize configured? - if (!env->me_mapsize) { - // If this is a new environment, take the default, - // else use the size recorded in the existing env. - env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize; - } else if (env->me_mapsize < meta.mm_mapsize) { - // If the configured size is smaller, make sure it's - // still big enough. Silently round up to minimum if not. - size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; - if (env->me_mapsize < minsize) - env->me_mapsize = minsize; - } - */ + // Initialize db fields. + db.buf = make([]byte, db.pageSize) + db.maxPageDataSize = ((db.pageSize - pageHeaderSize) / int(unsafe.Sizeof(pgno(0)))) - 1 + db.maxNodeSize = (((db.pageSize - pageHeaderSize) / minKeyCount) & -2) - int(unsafe.Sizeof(indx(0))) + // TODO?: env->me_maxpg = env->me_mapsize / env->me_psize; // Memory map the data file. if err := db.mmap(); err != nil { @@ -130,725 +121,584 @@ func (db *db) Open(path string, mode os.FileMode) error { return err } - // Initialize the buffer. - db.buf = make([]byte, db.pageSize) + // TODO: Initialize meta. + // if (newenv) { + // i = mdb_env_init_meta(env, &meta); + // if (i != MDB_SUCCESS) { + // return i; + // } + // } // Mark the database as opened and return. db.opened = true return nil } -// Read the meta pages and return the latest. -func (db *db) readMeta() *meta { - m := &meta{} - m.read() +// int mdb_env_map(MDB_env *env, void *addr, int newsize) +func (db *DB) mmap() error { + var err error - /* - if ((i = mdb_env_read_header(env, &meta)) != 0) { - if (i != ENOENT) - return i; - DPUTS("new mdbenv"); - newenv = 1; - env->me_psize = env->me_os_psize; - if (env->me_psize > MAX_PAGESIZE) - env->me_psize = MAX_PAGESIZE; + // Determine the map size based on the file size. + var size int + if info, err := os.Stat(db.file.Name()); err != nil { + return err + } else if info.Size() < int64(db.pageSize*2) { + return &Error{"file size too small", nil} } else { - env->me_psize = meta.mm_psize; + size = int(info.Size()) } - - rc = mdb_env_map(env, meta.mm_address, newenv); - if (rc) - return rc; - - if (newenv) { - if (flags & MDB_FIXEDMAP) - meta.mm_address = env->me_map; - i = mdb_env_init_meta(env, &meta); - if (i != MDB_SUCCESS) { - return i; - } + // Memory-map the data file as a byte slice. + if db.data, err = syscall.Mmap(int(db.file.Fd()), 0, size, syscall.PROT_READ, syscall.MAP_SHARED); err != nil { + return err } - env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) - - sizeof(indx_t); -#if !(MDB_MAXKEYSIZE) - env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); -#endif - env->me_maxpg = env->me_mapsize / env->me_psize; + // TODO?: If nordahead, then: madvise(env->me_map, env->me_mapsize, MADV_RANDOM); -#if MDB_DEBUG - { - int toggle = mdb_env_pick_meta(env); - MDB_db *db = &env->me_metas[toggle]->mm_dbs[MAIN_DBI]; - - DPRINTF(("opened database version %u, pagesize %u", - env->me_metas[0]->mm_version, env->me_psize)); - DPRINTF(("using meta page %d", toggle)); - DPRINTF(("depth: %u", db->md_depth)); - DPRINTF(("entries: %"Z"u", db->md_entries)); - DPRINTF(("branch pages: %"Z"u", db->md_branch_pages)); - DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages)); - DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages)); - DPRINTF(("root: %"Z"u", db->md_root)); + // Save references to the meta pages. + if db.meta0, err = db.page(db.data, 0).meta(); err != nil { + return &Error{"meta0 error", err} + } + if db.meta1, err = db.page(db.data, 1).meta(); err != nil { + return &Error{"meta1 error", err} } -#endif - return MDB_SUCCESS; - */ return nil } +// init creates a new database file and initializes its meta pages. +func (db *DB) init() error { + // Set the page size to the OS page size unless that is larger than max page size. + db.pageSize = os.Getpagesize() + if db.pageSize > maxPageSize { + db.pageSize = maxPageSize + } + + // Create two meta pages on a buffer. + buf := make([]byte, db.pageSize*2) + for i := 0; i < 2; i++ { + p := db.page(buf[:], i) + p.id = pgno(i) + p.initMeta(db.pageSize) + } + + // Write the buffer to our data file. + if _, err := db.metafile.WriteAt(buf, 0); err != nil { + return err + } + + return nil +} + +func (db *DB) close() { + // TODO +} + // page retrieves a page reference from a given byte array based on the current page size. -func (db *db) page(b []byte, id int) *page { - return (*page)(unsafe.Pointer(b[id * db.pageSize])) +func (db *DB) page(b []byte, id int) *page { + return (*page)(unsafe.Pointer(&b[id*db.pageSize])) } - - - - // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ CONVERTED ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ // - - - - - - -func (db *db) freePage(p *page) { +func (db *DB) freePage(p *page) { /* - mp->mp_next = env->me_dpages; - VGMEMP_FREE(env, mp); - env->me_dpages = mp; + mp->mp_next = env->me_dpages; + VGMEMP_FREE(env, mp); + env->me_dpages = mp; */ } -func (db *db) freeDirtyPage(p *page) { +func (db *DB) freeDirtyPage(p *page) { /* - if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { - mdb_page_free(env, dp); - } else { - // large pages just get freed directly - VGMEMP_FREE(env, dp); - free(dp); - } - */ -} - -func (db *db) freeAllDirtyPages(p *page) { - /* - MDB_env *env = txn->mt_env; - MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned i, n = dl[0].mid; - - for (i = 1; i <= n; i++) { - mdb_dpage_free(env, dl[i].mptr); - } - dl[0].mid = 0; - */ -} - -func (db *db) sync(force bool) error { - /* - int rc = 0; - if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { - if (env->me_flags & MDB_WRITEMAP) { - int flags = ((env->me_flags & MDB_MAPASYNC) && !force) - ? MS_ASYNC : MS_SYNC; - if (MDB_MSYNC(env->me_map, env->me_mapsize, flags)) - rc = ErrCode(); -#ifdef _WIN32 - else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd)) - rc = ErrCode(); -#endif + if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { + mdb_page_free(env, dp); } else { - if (MDB_FDATASYNC(env->me_fd)) - rc = ErrCode(); + // large pages just get freed directly + VGMEMP_FREE(env, dp); + free(dp); } - } - return rc; + */ +} + +func (db *DB) freeAllDirtyPages(p *page) { + /* + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned i, n = dl[0].mid; + + for (i = 1; i <= n; i++) { + mdb_dpage_free(env, dl[i].mptr); + } + dl[0].mid = 0; + */ +} + +func (db *DB) sync(force bool) error { + /* + int rc = 0; + if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { + if (env->me_flags & MDB_WRITEMAP) { + int flags = ((env->me_flags & MDB_MAPASYNC) && !force) + ? MS_ASYNC : MS_SYNC; + if (MDB_MSYNC(env->me_map, env->me_mapsize, flags)) + rc = ErrCode(); + #ifdef _WIN32 + else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd)) + rc = ErrCode(); + #endif + } else { + if (MDB_FDATASYNC(env->me_fd)) + rc = ErrCode(); + } + } + return rc; */ return nil } -func (db *db) Transaction(parent *transaction, flags int) (*transaction, error) { +func (db *DB) Transaction(parent *transaction, flags int) (*transaction, error) { /* - MDB_txn *txn; - MDB_ntxn *ntxn; - int rc, size, tsize = sizeof(MDB_txn); + MDB_txn *txn; + MDB_ntxn *ntxn; + int rc, size, tsize = sizeof(MDB_txn); - if (env->me_flags & MDB_FATAL_ERROR) { - DPUTS("environment had fatal error, must shutdown!"); - return MDB_PANIC; - } - if ((env->me_flags & MDB_RDONLY) && !(flags & MDB_RDONLY)) - return EACCES; - if (parent) { - // Nested transactions: Max 1 child, write txns only, no writemap - if (parent->mt_child || - (flags & MDB_RDONLY) || - (parent->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) || - (env->me_flags & MDB_WRITEMAP)) - { - return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN; + if (env->me_flags & MDB_FATAL_ERROR) { + DPUTS("environment had fatal error, must shutdown!"); + return MDB_PANIC; } - tsize = sizeof(MDB_ntxn); - } - size = tsize + env->me_maxdbs * (sizeof(MDB_db)+1); - if (!(flags & MDB_RDONLY)) - size += env->me_maxdbs * sizeof(MDB_cursor *); + if ((env->me_flags & MDB_RDONLY) && !(flags & MDB_RDONLY)) + return EACCES; + if (parent) { + // Nested transactions: Max 1 child, write txns only, no writemap + if (parent->mt_child || + (flags & MDB_RDONLY) || + (parent->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) || + (env->me_flags & MDB_WRITEMAP)) + { + return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN; + } + tsize = sizeof(MDB_ntxn); + } + size = tsize + env->me_maxdbs * (sizeof(MDB_db)+1); + if (!(flags & MDB_RDONLY)) + size += env->me_maxdbs * sizeof(MDB_cursor *); - if ((txn = calloc(1, size)) == NULL) { - DPRINTF(("calloc: %s", strerror(ErrCode()))); - return ENOMEM; - } - txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); - if (flags & MDB_RDONLY) { - txn->mt_flags |= MDB_TXN_RDONLY; - txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs); - } else { - txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs); - } - txn->mt_env = env; - - if (parent) { - unsigned int i; - txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); - if (!txn->mt_u.dirty_list || - !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX))) - { - free(txn->mt_u.dirty_list); - free(txn); + if ((txn = calloc(1, size)) == NULL) { + DPRINTF(("calloc: %s", strerror(ErrCode()))); return ENOMEM; } - txn->mt_txnid = parent->mt_txnid; - txn->mt_dirty_room = parent->mt_dirty_room; - txn->mt_u.dirty_list[0].mid = 0; - txn->mt_spill_pgs = NULL; - txn->mt_next_pgno = parent->mt_next_pgno; - parent->mt_child = txn; - txn->mt_parent = parent; - txn->mt_numdbs = parent->mt_numdbs; - txn->mt_flags = parent->mt_flags; - txn->mt_dbxs = parent->mt_dbxs; - memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); - // Copy parent's mt_dbflags, but clear DB_NEW - for (i=0; imt_numdbs; i++) - txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; - rc = 0; - ntxn = (MDB_ntxn *)txn; - ntxn->mnt_pgstate = env->me_pgstate; // save parent me_pghead & co - if (env->me_pghead) { - size = MDB_IDL_SIZEOF(env->me_pghead); - env->me_pghead = mdb_midl_alloc(env->me_pghead[0]); - if (env->me_pghead) - memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); - else - rc = ENOMEM; + txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); + if (flags & MDB_RDONLY) { + txn->mt_flags |= MDB_TXN_RDONLY; + txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs); + } else { + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs); + } + txn->mt_env = env; + + if (parent) { + unsigned int i; + txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); + if (!txn->mt_u.dirty_list || + !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX))) + { + free(txn->mt_u.dirty_list); + free(txn); + return ENOMEM; + } + txn->mt_txnid = parent->mt_txnid; + txn->mt_dirty_room = parent->mt_dirty_room; + txn->mt_u.dirty_list[0].mid = 0; + txn->mt_spill_pgs = NULL; + txn->mt_next_pgno = parent->mt_next_pgno; + parent->mt_child = txn; + txn->mt_parent = parent; + txn->mt_numdbs = parent->mt_numdbs; + txn->mt_flags = parent->mt_flags; + txn->mt_dbxs = parent->mt_dbxs; + memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + // Copy parent's mt_dbflags, but clear DB_NEW + for (i=0; imt_numdbs; i++) + txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; + rc = 0; + ntxn = (MDB_ntxn *)txn; + ntxn->mnt_pgstate = env->me_pgstate; // save parent me_pghead & co + if (env->me_pghead) { + size = MDB_IDL_SIZEOF(env->me_pghead); + env->me_pghead = mdb_midl_alloc(env->me_pghead[0]); + if (env->me_pghead) + memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); + else + rc = ENOMEM; + } + if (!rc) + rc = mdb_cursor_shadow(parent, txn); + if (rc) + mdb_txn_reset0(txn, "beginchild-fail"); + } else { + rc = mdb_txn_renew0(txn); } - if (!rc) - rc = mdb_cursor_shadow(parent, txn); if (rc) - mdb_txn_reset0(txn, "beginchild-fail"); - } else { - rc = mdb_txn_renew0(txn); - } - if (rc) - free(txn); - else { - *ret = txn; - DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', - (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root)); - } + free(txn); + else { + *ret = txn; + DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root)); + } - return rc; + return rc; */ - return nil -} - -// Write the environment parameters of a freshly created DB environment. -// @param[in] env the environment handle -// @param[out] meta address of where to store the meta information -// @return 0 on success, non-zero on failure. -func (db *db) initMeta(meta *meta) error { - /* - MDB_page *p, *q; - int rc; - unsigned int psize; -#ifdef _WIN32 - DWORD len; - OVERLAPPED ov; - memset(&ov, 0, sizeof(ov)); -#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ - ov.Offset = pos; \ - rc = WriteFile(fd, ptr, size, &len, &ov); } while(0) -#else - int len; -#define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ - len = pwrite(fd, ptr, size, pos); \ - rc = (len >= 0); } while(0) -#endif - - DPUTS("writing new meta page"); - - psize = env->me_psize; - - meta->mm_magic = MDB_MAGIC; - meta->mm_version = MDB_DATA_VERSION; - meta->mm_mapsize = env->me_mapsize; - meta->mm_psize = psize; - meta->mm_last_pg = 1; - meta->mm_flags = env->me_flags & 0xffff; - meta->mm_flags |= MDB_INTEGERKEY; - meta->mm_dbs[0].md_root = P_INVALID; - meta->mm_dbs[1].md_root = P_INVALID; - - p = calloc(2, psize); - p->mp_pgno = 0; - p->mp_flags = P_META; - *(MDB_meta *)METADATA(p) = *meta; - - q = (MDB_page *)((char *)p + psize); - q->mp_pgno = 1; - q->mp_flags = P_META; - *(MDB_meta *)METADATA(q) = *meta; - - DO_PWRITE(rc, env->me_fd, p, psize * 2, len, 0); - if (!rc) - rc = ErrCode(); - else if ((unsigned) len == psize * 2) - rc = MDB_SUCCESS; - else - rc = ENOSPC; - free(p); - return rc; - */ - return nil + return nil, nil } // Check both meta pages to see which one is newer. // @param[in] env the environment handle // @return meta toggle (0 or 1). -func (db *db) pickMeta() int { +func (db *DB) pickMeta() int { /* - return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid); + return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid); */ return 0 } -func (db *db) Create() error { +func (db *DB) Create() error { /* - MDB_env *e; + MDB_env *e; - e = calloc(1, sizeof(MDB_env)); - if (!e) - return ENOMEM; + e = calloc(1, sizeof(MDB_env)); + if (!e) + return ENOMEM; - e->me_maxreaders = DEFAULT_READERS; - e->me_maxdbs = e->me_numdbs = 2; - e->me_fd = INVALID_HANDLE_VALUE; - e->me_lfd = INVALID_HANDLE_VALUE; - e->me_mfd = INVALID_HANDLE_VALUE; -#ifdef MDB_USE_POSIX_SEM - e->me_rmutex = SEM_FAILED; - e->me_wmutex = SEM_FAILED; -#endif - e->me_pid = getpid(); - GET_PAGESIZE(e->me_os_psize); - VGMEMP_CREATE(e,0,0); - *env = e; - return MDB_SUCCESS; + e->me_maxreaders = DEFAULT_READERS; + e->me_maxdbs = e->me_numdbs = 2; + e->me_fd = INVALID_HANDLE_VALUE; + e->me_lfd = INVALID_HANDLE_VALUE; + e->me_mfd = INVALID_HANDLE_VALUE; + #ifdef MDB_USE_POSIX_SEM + e->me_rmutex = SEM_FAILED; + e->me_wmutex = SEM_FAILED; + #endif + e->me_pid = getpid(); + GET_PAGESIZE(e->me_os_psize); + VGMEMP_CREATE(e,0,0); + *env = e; + return MDB_SUCCESS; */ return nil } -// int mdb_env_map(MDB_env *env, void *addr, int newsize) -func (db *db) mmap(newsize int) error { +func (db *DB) setMapSize(size int) error { /* - MDB_page *p; - unsigned int flags = env->me_flags; -#ifdef _WIN32 - int rc; - HANDLE mh; - LONG sizelo, sizehi; - sizelo = env->me_mapsize & 0xffffffff; - sizehi = env->me_mapsize >> 16 >> 16; // only needed on Win64 - - // Windows won't create mappings for zero length files. - // Just allocate the maxsize right now. - if (newsize) { - if (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo - || !SetEndOfFile(env->me_fd) - || SetFilePointer(env->me_fd, 0, NULL, 0) != 0) - return ErrCode(); - } - mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ? - PAGE_READWRITE : PAGE_READONLY, - sizehi, sizelo, NULL); - if (!mh) - return ErrCode(); - env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ? - FILE_MAP_WRITE : FILE_MAP_READ, - 0, 0, env->me_mapsize, addr); - rc = env->me_map ? 0 : ErrCode(); - CloseHandle(mh); - if (rc) - return rc; -#else - int prot = PROT_READ; - if (flags & MDB_WRITEMAP) { - prot |= PROT_WRITE; - if (ftruncate(env->me_fd, env->me_mapsize) < 0) - return ErrCode(); - } - env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED, - env->me_fd, 0); - if (env->me_map == MAP_FAILED) { - env->me_map = NULL; - return ErrCode(); - } - - if (flags & MDB_NORDAHEAD) { - // Turn off readahead. It's harmful when the DB is larger than RAM. -#ifdef MADV_RANDOM - madvise(env->me_map, env->me_mapsize, MADV_RANDOM); -#else -#ifdef POSIX_MADV_RANDOM - posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); -#endif // POSIX_MADV_RANDOM -#endif // MADV_RANDOM - } -#endif // _WIN32 - - // Can happen because the address argument to mmap() is just a - // hint. mmap() can pick another, e.g. if the range is in use. - // The MAP_FIXED flag would prevent that, but then mmap could - // instead unmap existing pages to make room for the new map. - if (addr && env->me_map != addr) - return EBUSY; // TODO: Make a new MDB_* error code? - - p = (MDB_page *)env->me_map; - env->me_metas[0] = METADATA(p); - env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize); - - return MDB_SUCCESS; - */ - return nil -} - -func (db *db) setMapSize(size int) error { - /* - // If env is already open, caller is responsible for making - // sure there are no active txns. - if (env->me_map) { - int rc; - void *old; - if (env->me_txn) - return EINVAL; - if (!size) - size = env->me_metas[mdb_env_pick_meta(env)]->mm_mapsize; - else if (size < env->me_mapsize) { - // If the configured size is smaller, make sure it's - // still big enough. Silently round up to minimum if not. - size_t minsize = (env->me_metas[mdb_env_pick_meta(env)]->mm_last_pg + 1) * env->me_psize; - if (size < minsize) - size = minsize; + // If env is already open, caller is responsible for making + // sure there are no active txns. + if (env->me_map) { + int rc; + void *old; + if (env->me_txn) + return EINVAL; + if (!size) + size = env->me_metas[mdb_env_pick_meta(env)]->mm_mapsize; + else if (size < env->me_mapsize) { + // If the configured size is smaller, make sure it's + // still big enough. Silently round up to minimum if not. + size_t minsize = (env->me_metas[mdb_env_pick_meta(env)]->mm_last_pg + 1) * env->me_psize; + if (size < minsize) + size = minsize; + } + munmap(env->me_map, env->me_mapsize); + env->me_mapsize = size; + old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; + rc = mdb_env_map(env, old, 1); + if (rc) + return rc; } - munmap(env->me_map, env->me_mapsize); env->me_mapsize = size; - old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; - rc = mdb_env_map(env, old, 1); - if (rc) - return rc; - } - env->me_mapsize = size; - if (env->me_psize) - env->me_maxpg = env->me_mapsize / env->me_psize; - return MDB_SUCCESS; + if (env->me_psize) + env->me_maxpg = env->me_mapsize / env->me_psize; + return MDB_SUCCESS; */ return nil } -func (db *db) setMaxBucketCount(count int) error { +func (db *DB) setMaxBucketCount(count int) error { /* - if (env->me_map) - return EINVAL; - env->me_maxdbs = dbs + 2; // Named databases + main and free DB - return MDB_SUCCESS; + if (env->me_map) + return EINVAL; + env->me_maxdbs = dbs + 2; // Named databases + main and free DB + return MDB_SUCCESS; */ return nil } -func (db *db) setMaxReaderCount(count int) error { +func (db *DB) setMaxReaderCount(count int) error { /* - if (env->me_map || readers < 1) - return EINVAL; - env->me_maxreaders = readers; - return MDB_SUCCESS; + if (env->me_map || readers < 1) + return EINVAL; + env->me_maxreaders = readers; + return MDB_SUCCESS; */ + return nil } -func (db *db) getMaxReaderCount(count int) (int, error) { +func (db *DB) getMaxReaderCount(count int) (int, error) { /* - if (!env || !readers) - return EINVAL; - *readers = env->me_maxreaders; - return MDB_SUCCESS; + if (!env || !readers) + return EINVAL; + *readers = env->me_maxreaders; + return MDB_SUCCESS; */ return 0, nil } - // Destroy resources from mdb_env_open(), clear our readers & DBIs -func (db *db) close0(excl) { +func (db *DB) close0(excl int) { /* - int i; + int i; - if (!(env->me_flags & MDB_ENV_ACTIVE)) - return; + if (!(env->me_flags & MDB_ENV_ACTIVE)) + return; - // Doing this here since me_dbxs may not exist during mdb_env_close - for (i = env->me_maxdbs; --i > MAIN_DBI; ) - free(env->me_dbxs[i].md_name.mv_data); + // Doing this here since me_dbxs may not exist during mdb_env_close + for (i = env->me_maxdbs; --i > MAIN_DBI; ) + free(env->me_dbxs[i].md_name.mv_data); - free(env->me_pbuf); - free(env->me_dbflags); - free(env->me_dbxs); - free(env->me_path); - free(env->me_dirty_list); - mdb_midl_free(env->me_free_pgs); + free(env->me_pbuf); + free(env->me_dbflags); + free(env->me_dbxs); + free(env->me_path); + free(env->me_dirty_list); + mdb_midl_free(env->me_free_pgs); - if (env->me_flags & MDB_ENV_TXKEY) { - pthread_key_delete(env->me_txkey); -#ifdef _WIN32 - // Delete our key from the global list - for (i=0; ime_txkey) { - mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1]; - mdb_tls_nkeys--; - break; + if (env->me_flags & MDB_ENV_TXKEY) { + pthread_key_delete(env->me_txkey); + #ifdef _WIN32 + // Delete our key from the global list + for (i=0; ime_txkey) { + mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1]; + mdb_tls_nkeys--; + break; + } + #endif } -#endif - } - if (env->me_map) { - munmap(env->me_map, env->me_mapsize); - } - if (env->me_mfd != env->me_fd && env->me_mfd != INVALID_HANDLE_VALUE) - (void) close(env->me_mfd); - if (env->me_fd != INVALID_HANDLE_VALUE) - (void) close(env->me_fd); - if (env->me_txns) { - MDB_PID_T pid = env->me_pid; - // Clearing readers is done in this function because - // me_txkey with its destructor must be disabled first. - for (i = env->me_numreaders; --i >= 0; ) - if (env->me_txns->mti_readers[i].mr_pid == pid) - env->me_txns->mti_readers[i].mr_pid = 0; -#ifdef _WIN32 - if (env->me_rmutex) { - CloseHandle(env->me_rmutex); - if (env->me_wmutex) CloseHandle(env->me_wmutex); - } - // Windows automatically destroys the mutexes when - // the last handle closes. -#elif defined(MDB_USE_POSIX_SEM) - if (env->me_rmutex != SEM_FAILED) { - sem_close(env->me_rmutex); - if (env->me_wmutex != SEM_FAILED) - sem_close(env->me_wmutex); - // If we have the filelock: If we are the - // only remaining user, clean up semaphores. - if (excl == 0) - mdb_env_excl_lock(env, &excl); - if (excl > 0) { - sem_unlink(env->me_txns->mti_rmname); - sem_unlink(env->me_txns->mti_wmname); + if (env->me_map) { + munmap(env->me_map, env->me_mapsize); + } + if (env->me_mfd != env->me_fd && env->me_mfd != INVALID_HANDLE_VALUE) + (void) close(env->me_mfd); + if (env->me_fd != INVALID_HANDLE_VALUE) + (void) close(env->me_fd); + if (env->me_txns) { + MDB_PID_T pid = env->me_pid; + // Clearing readers is done in this function because + // me_txkey with its destructor must be disabled first. + for (i = env->me_numreaders; --i >= 0; ) + if (env->me_txns->mti_readers[i].mr_pid == pid) + env->me_txns->mti_readers[i].mr_pid = 0; + #ifdef _WIN32 + if (env->me_rmutex) { + CloseHandle(env->me_rmutex); + if (env->me_wmutex) CloseHandle(env->me_wmutex); + } + // Windows automatically destroys the mutexes when + // the last handle closes. + #elif defined(MDB_USE_POSIX_SEM) + if (env->me_rmutex != SEM_FAILED) { + sem_close(env->me_rmutex); + if (env->me_wmutex != SEM_FAILED) + sem_close(env->me_wmutex); + // If we have the filelock: If we are the + // only remaining user, clean up semaphores. + if (excl == 0) + mdb_env_excl_lock(env, &excl); + if (excl > 0) { + sem_unlink(env->me_txns->mti_rmname); + sem_unlink(env->me_txns->mti_wmname); + } + } + #endif + munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); + } + if (env->me_lfd != INVALID_HANDLE_VALUE) { + #ifdef _WIN32 + if (excl >= 0) { + // Unlock the lockfile. Windows would have unlocked it + // after closing anyway, but not necessarily at once. + UnlockFile(env->me_lfd, 0, 0, 1, 0); + } + #endif + (void) close(env->me_lfd); } - } -#endif - munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); - } - if (env->me_lfd != INVALID_HANDLE_VALUE) { -#ifdef _WIN32 - if (excl >= 0) { - // Unlock the lockfile. Windows would have unlocked it - // after closing anyway, but not necessarily at once. - UnlockFile(env->me_lfd, 0, 0, 1, 0); - } -#endif - (void) close(env->me_lfd); - } - env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); + env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); */ } -func (db *db) copyfd(handle int) error { +func (db *DB) copyfd(handle int) error { /* - MDB_txn *txn = NULL; - int rc; - size_t wsize; - char *ptr; -#ifdef _WIN32 - DWORD len, w2; -#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) -#else - ssize_t len; - size_t w2; -#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) -#endif + MDB_txn *txn = NULL; + int rc; + size_t wsize; + char *ptr; + #ifdef _WIN32 + DWORD len, w2; + #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) + #else + ssize_t len; + size_t w2; + #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) + #endif - // Do the lock/unlock of the reader mutex before starting the - // write txn. Otherwise other read txns could block writers. - rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); - if (rc) - return rc; + // Do the lock/unlock of the reader mutex before starting the + // write txn. Otherwise other read txns could block writers. + rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) + return rc; - if (env->me_txns) { - // We must start the actual read txn after blocking writers - mdb_txn_reset0(txn, "reset-stage1"); + if (env->me_txns) { + // We must start the actual read txn after blocking writers + mdb_txn_reset0(txn, "reset-stage1"); - // Temporarily block writers until we snapshot the meta pages - LOCK_MUTEX_W(env); + // Temporarily block writers until we snapshot the meta pages + LOCK_MUTEX_W(env); - rc = mdb_txn_renew0(txn); - if (rc) { - UNLOCK_MUTEX_W(env); - goto leave; - } - } + rc = mdb_txn_renew0(txn); + if (rc) { + UNLOCK_MUTEX_W(env); + goto leave; + } + } - wsize = env->me_psize * 2; - ptr = env->me_map; - w2 = wsize; - while (w2 > 0) { - DO_WRITE(rc, fd, ptr, w2, len); - if (!rc) { - rc = ErrCode(); - break; - } else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - w2 -= len; - continue; - } else { - // Non-blocking or async handles are not supported - rc = EIO; - break; - } - } - if (env->me_txns) - UNLOCK_MUTEX_W(env); - - if (rc) - goto leave; - - wsize = txn->mt_next_pgno * env->me_psize - wsize; - while (wsize > 0) { - if (wsize > MAX_WRITE) - w2 = MAX_WRITE; - else + wsize = env->me_psize * 2; + ptr = env->me_map; w2 = wsize; - DO_WRITE(rc, fd, ptr, w2, len); - if (!rc) { - rc = ErrCode(); - break; - } else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - wsize -= len; - continue; - } else { - rc = EIO; - break; + while (w2 > 0) { + DO_WRITE(rc, fd, ptr, w2, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + w2 -= len; + continue; + } else { + // Non-blocking or async handles are not supported + rc = EIO; + break; + } + } + if (env->me_txns) + UNLOCK_MUTEX_W(env); + + if (rc) + goto leave; + + wsize = txn->mt_next_pgno * env->me_psize - wsize; + while (wsize > 0) { + if (wsize > MAX_WRITE) + w2 = MAX_WRITE; + else + w2 = wsize; + DO_WRITE(rc, fd, ptr, w2, len); + if (!rc) { + rc = ErrCode(); + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + wsize -= len; + continue; + } else { + rc = EIO; + break; + } + } + + leave: + mdb_txn_abort(txn); + return rc; } - } -leave: - mdb_txn_abort(txn); - return rc; -} + int + mdb_env_copy(MDB_env *env, const char *path) + { + int rc, len; + char *lpath; + HANDLE newfd = INVALID_HANDLE_VALUE; -int -mdb_env_copy(MDB_env *env, const char *path) -{ - int rc, len; - char *lpath; - HANDLE newfd = INVALID_HANDLE_VALUE; + if (env->me_flags & MDB_NOSUBDIR) { + lpath = (char *)path; + } else { + len = strlen(path); + len += sizeof(DATANAME); + lpath = malloc(len); + if (!lpath) + return ENOMEM; + sprintf(lpath, "%s" DATANAME, path); + } - if (env->me_flags & MDB_NOSUBDIR) { - lpath = (char *)path; - } else { - len = strlen(path); - len += sizeof(DATANAME); - lpath = malloc(len); - if (!lpath) - return ENOMEM; - sprintf(lpath, "%s" DATANAME, path); - } + // The destination path must exist, but the destination file must not. + // We don't want the OS to cache the writes, since the source data is + // already in the OS cache. + #ifdef _WIN32 + newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, + FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); + #else + newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); + #endif + if (newfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } - // The destination path must exist, but the destination file must not. - // We don't want the OS to cache the writes, since the source data is - // already in the OS cache. -#ifdef _WIN32 - newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, - FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); -#else - newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); -#endif - if (newfd == INVALID_HANDLE_VALUE) { - rc = ErrCode(); - goto leave; - } + #ifdef O_DIRECT + // Set O_DIRECT if the file system supports it + if ((rc = fcntl(newfd, F_GETFL)) != -1) + (void) fcntl(newfd, F_SETFL, rc | O_DIRECT); + #endif + #ifdef F_NOCACHE // __APPLE__ + rc = fcntl(newfd, F_NOCACHE, 1); + if (rc) { + rc = ErrCode(); + goto leave; + } + #endif -#ifdef O_DIRECT - // Set O_DIRECT if the file system supports it - if ((rc = fcntl(newfd, F_GETFL)) != -1) - (void) fcntl(newfd, F_SETFL, rc | O_DIRECT); -#endif -#ifdef F_NOCACHE // __APPLE__ - rc = fcntl(newfd, F_NOCACHE, 1); - if (rc) { - rc = ErrCode(); - goto leave; - } -#endif + rc = mdb_env_copyfd(env, newfd); - rc = mdb_env_copyfd(env, newfd); + leave: + if (!(env->me_flags & MDB_NOSUBDIR)) + free(lpath); + if (newfd != INVALID_HANDLE_VALUE) + if (close(newfd) < 0 && rc == MDB_SUCCESS) + rc = ErrCode(); -leave: - if (!(env->me_flags & MDB_NOSUBDIR)) - free(lpath); - if (newfd != INVALID_HANDLE_VALUE) - if (close(newfd) < 0 && rc == MDB_SUCCESS) - rc = ErrCode(); - - return rc; + return rc; */ return nil } -func (db *db) Close() { +func (db *DB) Close() { /* - MDB_page *dp; + MDB_page *dp; - if (env == NULL) - return; + if (env == NULL) + return; - VGMEMP_DESTROY(env); - while ((dp = env->me_dpages) != NULL) { - VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); - env->me_dpages = dp->mp_next; - free(dp); - } + VGMEMP_DESTROY(env); + while ((dp = env->me_dpages) != NULL) { + VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); + env->me_dpages = dp->mp_next; + free(dp); + } - mdb_env_close0(env, 0); - free(env); + mdb_env_close0(env, 0); + free(env); */ } @@ -862,17 +712,17 @@ func (db *db) Close() { // @param[in] key The key for the node. // @param[in] data The data for the node. // @return The number of bytes needed to store the node. -func (db *db) LeafSize(key []byte, data []byte) int { +func (db *DB) LeafSize(key []byte, data []byte) int { /* - size_t sz; + size_t sz; - sz = LEAFSIZE(key, data); - if (sz > env->me_nodemax) { - // put on overflow page - sz -= data->mv_size - sizeof(pgno_t); - } + sz = LEAFSIZE(key, data); + if (sz > env->me_nodemax) { + // put on overflow page + sz -= data->mv_size - sizeof(pgno_t); + } - return EVEN(sz + sizeof(indx_t)); + return EVEN(sz + sizeof(indx_t)); */ return 0 } @@ -886,183 +736,179 @@ func (db *db) LeafSize(key []byte, data []byte) int { // @param[in] env The environment handle. // @param[in] key The key for the node. // @return The number of bytes needed to store the node. -func (db *db) BranchSize(key []byte) int { +func (db *DB) BranchSize(key []byte) int { /* - size_t sz; + size_t sz; - sz = INDXSIZE(key); - if (sz > env->me_nodemax) { - // put on overflow page - // not implemented - // sz -= key->size - sizeof(pgno_t); - } + sz = INDXSIZE(key); + if (sz > env->me_nodemax) { + // put on overflow page + // not implemented + // sz -= key->size - sizeof(pgno_t); + } - return sz + sizeof(indx_t); + return sz + sizeof(indx_t); */ return 0 } -func (db *db) SetFlags(flag int, onoff bool) error { +func (db *DB) SetFlags(flag int, onoff bool) error { /* - if ((flag & CHANGEABLE) != flag) - return EINVAL; - if (onoff) - env->me_flags |= flag; - else - env->me_flags &= ~flag; - return MDB_SUCCESS; + if ((flag & CHANGEABLE) != flag) + return EINVAL; + if (onoff) + env->me_flags |= flag; + else + env->me_flags &= ~flag; + return MDB_SUCCESS; */ return nil } -func (db *db) Flags() int { - return db.flags -} - - -func (db *db) Stat() *Stat +func (db *DB) Stat() *Stat { /* - int toggle; + int toggle; - if (env == NULL || arg == NULL) - return EINVAL; + if (env == NULL || arg == NULL) + return EINVAL; - toggle = mdb_env_pick_meta(env); - stat := &Stat{} - stat->ms_psize = env->me_psize; - stat->ms_depth = db->md_depth; - stat->ms_branch_pages = db->md_branch_pages; - stat->ms_leaf_pages = db->md_leaf_pages; - stat->ms_overflow_pages = db->md_overflow_pages; - stat->ms_entries = db->md_entries; + toggle = mdb_env_pick_meta(env); + stat := &Stat{} + stat->ms_psize = env->me_psize; + stat->ms_depth = db->md_depth; + stat->ms_branch_pages = db->md_branch_pages; + stat->ms_leaf_pages = db->md_leaf_pages; + stat->ms_overflow_pages = db->md_overflow_pages; + stat->ms_entries = db->md_entries; - //return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], stat); - return stat + //return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], stat); + return stat */ + return nil } -func (db *db) Info() *Info { +func (db *DB) Info() *Info { /* - int toggle; + int toggle; - if (env == NULL || arg == NULL) - return EINVAL; + if (env == NULL || arg == NULL) + return EINVAL; - toggle = mdb_env_pick_meta(env); - arg->me_mapaddr = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : 0; - arg->me_mapsize = env->me_mapsize; - arg->me_maxreaders = env->me_maxreaders; + toggle = mdb_env_pick_meta(env); + arg->me_mapaddr = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : 0; + arg->me_mapsize = env->me_mapsize; + arg->me_maxreaders = env->me_maxreaders; - // me_numreaders may be zero if this process never used any readers. Use - // the shared numreader count if it exists. - arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : env->me_numreaders; + // me_numreaders may be zero if this process never used any readers. Use + // the shared numreader count if it exists. + arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : env->me_numreaders; - arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg; - arg->me_last_txnid = env->me_metas[toggle]->mm_txnid; - return MDB_SUCCESS; + arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg; + arg->me_last_txnid = env->me_metas[toggle]->mm_txnid; + return MDB_SUCCESS; */ return nil } // TODO: Move to bucket.go -func (db *db) CloseBucket(b Bucket) { +func (db *DB) CloseBucket(b Bucket) { /* - char *ptr; - if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs) - return; - ptr = env->me_dbxs[dbi].md_name.mv_data; - env->me_dbxs[dbi].md_name.mv_data = NULL; - env->me_dbxs[dbi].md_name.mv_size = 0; - env->me_dbflags[dbi] = 0; - free(ptr); + char *ptr; + if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs) + return; + ptr = env->me_dbxs[dbi].md_name.mv_data; + env->me_dbxs[dbi].md_name.mv_data = NULL; + env->me_dbxs[dbi].md_name.mv_size = 0; + env->me_dbflags[dbi] = 0; + free(ptr); */ } //int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) -func (db *db) getReaderList() error { +func (db *DB) getReaderList() error { /* - unsigned int i, rdrs; - MDB_reader *mr; - char buf[64]; - int rc = 0, first = 1; + unsigned int i, rdrs; + MDB_reader *mr; + char buf[64]; + int rc = 0, first = 1; - if (!env || !func) - return -1; - if (!env->me_txns) { - return func("(no reader locks)\n", ctx); - } - rdrs = env->me_txns->mti_numreaders; - mr = env->me_txns->mti_readers; - for (i=0; ime_txns) { + return func("(no reader locks)\n", ctx); + } + rdrs = env->me_txns->mti_numreaders; + mr = env->me_txns->mti_readers; + for (i=0; ime_txns) - return MDB_SUCCESS; - rdrs = env->me_txns->mti_numreaders; - pids = malloc((rdrs+1) * sizeof(MDB_PID_T)); - if (!pids) - return ENOMEM; - pids[0] = 0; - mr = env->me_txns->mti_readers; - for (i=0; ime_pid) { - pid = mr[i].mr_pid; - if (mdb_pid_insert(pids, pid) == 0) { - if (!mdb_reader_pid(env, Pidcheck, pid)) { - LOCK_MUTEX_R(env); - // Recheck, a new process may have reused pid + if (!env) + return EINVAL; + if (dead) + *dead = 0; + if (!env->me_txns) + return MDB_SUCCESS; + rdrs = env->me_txns->mti_numreaders; + pids = malloc((rdrs+1) * sizeof(MDB_PID_T)); + if (!pids) + return ENOMEM; + pids[0] = 0; + mr = env->me_txns->mti_readers; + for (i=0; ime_pid) { + pid = mr[i].mr_pid; + if (mdb_pid_insert(pids, pid) == 0) { if (!mdb_reader_pid(env, Pidcheck, pid)) { - for (j=i; jmm_txnid > meta->mm_txnid) - *meta = *m; - } - return 0; + if (off == 0 || m->mm_txnid > meta->mm_txnid) + *meta = *m; + } + return 0; */ return nil } diff --git a/node.go b/node.go index 928e0c9..bb7b6ad 100644 --- a/node.go +++ b/node.go @@ -44,48 +44,48 @@ func (n *node) size() int { // @param[in] indx The index of the subpage on the main page. func (n *node) shrink(index int) { /* - MDB_node *node; - MDB_page *sp, *xp; - char *base; - int nsize, delta; - indx_t i, numkeys, ptr; + MDB_node *node; + MDB_page *sp, *xp; + char *base; + int nsize, delta; + indx_t i, numkeys, ptr; - node = NODEPTR(mp, indx); - sp = (MDB_page *)NODEDATA(node); - delta = SIZELEFT(sp); - xp = (MDB_page *)((char *)sp + delta); + node = NODEPTR(mp, indx); + sp = (MDB_page *)NODEDATA(node); + delta = SIZELEFT(sp); + xp = (MDB_page *)((char *)sp + delta); - // shift subpage upward - if (IS_LEAF2(sp)) { - nsize = NUMKEYS(sp) * sp->mp_pad; - if (nsize & 1) - return; // do not make the node uneven-sized - memmove(METADATA(xp), METADATA(sp), nsize); - } else { - int i; - numkeys = NUMKEYS(sp); - for (i=numkeys-1; i>=0; i--) - xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; - } - xp->mp_upper = sp->mp_lower; - xp->mp_lower = sp->mp_lower; - xp->mp_flags = sp->mp_flags; - xp->mp_pad = sp->mp_pad; - COPY_PGNO(xp->mp_pgno, mp->mp_pgno); + // shift subpage upward + if (IS_LEAF2(sp)) { + nsize = NUMKEYS(sp) * sp->mp_pad; + if (nsize & 1) + return; // do not make the node uneven-sized + memmove(METADATA(xp), METADATA(sp), nsize); + } else { + int i; + numkeys = NUMKEYS(sp); + for (i=numkeys-1; i>=0; i--) + xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; + } + xp->mp_upper = sp->mp_lower; + xp->mp_lower = sp->mp_lower; + xp->mp_flags = sp->mp_flags; + xp->mp_pad = sp->mp_pad; + COPY_PGNO(xp->mp_pgno, mp->mp_pgno); - nsize = NODEDSZ(node) - delta; - SETDSZ(node, nsize); + nsize = NODEDSZ(node) - delta; + SETDSZ(node, nsize); - // shift lower nodes upward - ptr = mp->mp_ptrs[indx]; - numkeys = NUMKEYS(mp); - for (i = 0; i < numkeys; i++) { - if (mp->mp_ptrs[i] <= ptr) - mp->mp_ptrs[i] += delta; - } + // shift lower nodes upward + ptr = mp->mp_ptrs[indx]; + numkeys = NUMKEYS(mp); + for (i = 0; i < numkeys; i++) { + if (mp->mp_ptrs[i] <= ptr) + mp->mp_ptrs[i] += delta; + } - base = (char *)mp + mp->mp_upper; - memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node)); - mp->mp_upper += delta; + base = (char *)mp + mp->mp_upper; + memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node)); + mp->mp_upper += delta; */ } diff --git a/os.go b/os.go new file mode 100644 index 0000000..266091b --- /dev/null +++ b/os.go @@ -0,0 +1,5 @@ +package bolt + +import ( + _ "os" +) diff --git a/page.go b/page.go index c811eeb..522f113 100644 --- a/page.go +++ b/page.go @@ -5,9 +5,11 @@ import ( ) const maxPageSize = 0x8000 +const minKeyCount = 2 var _page page -const headerSize = unsafe.Offsetof(_page.ptr) + +const pageHeaderSize = int(unsafe.Offsetof(_page.ptr)) const minPageKeys = 2 const fillThreshold = 250 // 25% @@ -20,13 +22,15 @@ const ( p_dirty = 0x10 /**< dirty page, also set for #P_SUBP pages */ p_sub = 0x40 p_keep = 0x8000 /**< leave this page alone during spill */ + + p_invalid = ^pgno(0) ) // maxCommitPages is the maximum number of pages to commit in one writev() call. -const maxCommitPages 64 +const maxCommitPages = 64 /* max bytes to write in one call */ -const maxWriteByteCount 0x80000000U // TODO: #define MAX_WRITE 0x80000000U >> (sizeof(ssize_t) == 4)) +const maxWriteByteCount uint = 0x80000000 // TODO: #define MAX_WRITE 0x80000000U >> (sizeof(ssize_t) == 4)) // TODO: // #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES @@ -42,26 +46,28 @@ const maxWriteByteCount 0x80000000U // TODO: #define MAX_WRITE 0x80000000U >> // TODO: #define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ type pgno uint64 +type txnid uint64 +type indx uint16 type page struct { id pgno flags int - lower int - upper int + lower indx + upper indx overflow int ptr int } type pageState struct { - head int /**< Reclaimed freeDB pages, or NULL before use */ - last int /**< ID of last used record, or 0 if !mf_pghead */ + head int /**< Reclaimed freeDB pages, or NULL before use */ + last int /**< ID of last used record, or 0 if !mf_pghead */ } // meta returns a pointer to the metadata section of the page. func (p *page) meta() (*meta, error) { // Exit if page is not a meta page. - if (p.flags & p_meta) != 0 { - return InvalidMetaPageError + if (p.flags & p_meta) == 0 { + return nil, InvalidMetaPageError } // Cast the meta section and validate before returning. @@ -72,12 +78,17 @@ func (p *page) meta() (*meta, error) { return m, nil } - - - - - - +// initMeta initializes a page as a new meta page. +func (p *page) initMeta(pageSize int) { + p.flags = p_meta + m := (*meta)(unsafe.Pointer(&p.ptr)) + m.magic = magic + m.version = version + m.free.pad = uint32(pageSize) + m.pgno = 1 + m.free.root = p_invalid + m.main.root = p_invalid +} // nodeCount returns the number of nodes on the page. func (p *page) nodeCount() int { @@ -86,10 +97,5 @@ func (p *page) nodeCount() int { // remainingSize returns the number of bytes left in the page. func (p *page) remainingSize() int { - return p.header.upper - p.header.lower -} - -// remainingSize returns the number of bytes left in the page. -func (p *page) remainingSize() int { - return p.header.upper - p.header.lower + return int(p.upper - p.lower) } diff --git a/reader.go b/reader.go index c51517c..73dbc40 100644 --- a/reader.go +++ b/reader.go @@ -1,5 +1,5 @@ package bolt type reader struct { - int transactionID + txnid int } diff --git a/transaction.go b/transaction.go index 1064e8e..de3bd0d 100644 --- a/transaction.go +++ b/transaction.go @@ -16,7 +16,7 @@ type Transaction interface { type transaction struct { id int flags int - db *db + db *DB parent *transaction child *transaction nextPageNumber int @@ -25,7 +25,7 @@ type transaction struct { dirtyList []int reader *reader // TODO: bucketxs []*bucketx - buckets []*bucket + buckets []*Bucket bucketFlags []int cursors []*cursor // Implicit from slices? TODO: MDB_dbi mt_numdbs; @@ -35,41 +35,40 @@ type transaction struct { // ntxn represents a nested transaction. type ntxn struct { transaction *transaction /**< the transaction */ - pageState pageState /**< parent transaction's saved freestate */ + pageState pageState /**< parent transaction's saved freestate */ } - func (t *transaction) allocPage(num int) *page { /* - MDB_env *env = txn->mt_env; - MDB_page *ret = env->me_dpages; - size_t psize = env->me_psize, sz = psize, off; - // For ! #MDB_NOMEMINIT, psize counts how much to init. - // For a single page alloc, we init everything after the page header. - // For multi-page, we init the final page; if the caller needed that - // many pages they will be filling in at least up to the last page. - if (num == 1) { - if (ret) { + MDB_env *env = txn->mt_env; + MDB_page *ret = env->me_dpages; + size_t psize = env->me_psize, sz = psize, off; + // For ! #MDB_NOMEMINIT, psize counts how much to init. + // For a single page alloc, we init everything after the page header. + // For multi-page, we init the final page; if the caller needed that + // many pages they will be filling in at least up to the last page. + if (num == 1) { + if (ret) { + VGMEMP_ALLOC(env, ret, sz); + VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); + env->me_dpages = ret->mp_next; + return ret; + } + psize -= off = PAGEHDRSZ; + } else { + sz *= num; + off = sz - psize; + } + if ((ret = malloc(sz)) != NULL) { VGMEMP_ALLOC(env, ret, sz); - VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); - env->me_dpages = ret->mp_next; - return ret; + if (!(env->me_flags & MDB_NOMEMINIT)) { + memset((char *)ret + off, 0, psize); + ret->mp_pad = 0; + } + } else { + txn->mt_flags |= MDB_TXN_ERROR; } - psize -= off = PAGEHDRSZ; - } else { - sz *= num; - off = sz - psize; - } - if ((ret = malloc(sz)) != NULL) { - VGMEMP_ALLOC(env, ret, sz); - if (!(env->me_flags & MDB_NOMEMINIT)) { - memset((char *)ret + off, 0, psize); - ret->mp_pad = 0; - } - } else { - txn->mt_flags |= MDB_TXN_ERROR; - } - return ret; + return ret; */ return nil } @@ -77,19 +76,19 @@ func (t *transaction) allocPage(num int) *page { // Find oldest txnid still referenced. Expects txn->mt_txnid > 0. func (t *transaction) oldest() int { /* - int i; - txnid_t mr, oldest = txn->mt_txnid - 1; - if (txn->mt_env->me_txns) { - MDB_reader *r = txn->mt_env->me_txns->mti_readers; - for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { - if (r[i].mr_pid) { - mr = r[i].mr_txnid; - if (oldest > mr) - oldest = mr; + int i; + txnid_t mr, oldest = txn->mt_txnid - 1; + if (txn->mt_env->me_txns) { + MDB_reader *r = txn->mt_env->me_txns->mti_readers; + for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { + if (r[i].mr_pid) { + mr = r[i].mr_txnid; + if (oldest > mr) + oldest = mr; + } } } - } - return oldest; + return oldest; */ return 0 } @@ -97,19 +96,19 @@ func (t *transaction) oldest() int { // Add a page to the txn's dirty list func (t *transaction) dirty(p *page) { /* - MDB_ID2 mid; - int rc, (*insert)(MDB_ID2L, MDB_ID2 *); + MDB_ID2 mid; + int rc, (*insert)(MDB_ID2L, MDB_ID2 *); - if (txn->mt_env->me_flags & MDB_WRITEMAP) { - insert = mdb_mid2l_append; - } else { - insert = mdb_mid2l_insert; - } - mid.mid = mp->mp_pgno; - mid.mptr = mp; - rc = insert(txn->mt_u.dirty_list, &mid); - mdb_tassert(txn, rc == 0); - txn->mt_dirty_room--; + if (txn->mt_env->me_flags & MDB_WRITEMAP) { + insert = mdb_mid2l_append; + } else { + insert = mdb_mid2l_insert; + } + mid.mid = mp->mp_pgno; + mid.mptr = mp; + rc = insert(txn->mt_u.dirty_list, &mid); + mdb_tassert(txn, rc == 0); + txn->mt_dirty_room--; */ } @@ -122,53 +121,53 @@ func (t *transaction) dirty(p *page) { // mp wasn't spilled. func (t *transaction) unspill(p *page) *page { /* - MDB_env *env = txn->mt_env; - const MDB_txn *tx2; - unsigned x; - pgno_t pgno = mp->mp_pgno, pn = pgno << 1; + MDB_env *env = txn->mt_env; + const MDB_txn *tx2; + unsigned x; + pgno_t pgno = mp->mp_pgno, pn = pgno << 1; - for (tx2 = txn; tx2; tx2=tx2->mt_parent) { - if (!tx2->mt_spill_pgs) - continue; - x = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { - MDB_page *np; - int num; - if (txn->mt_dirty_room == 0) - return MDB_TXN_FULL; - if (IS_OVERFLOW(mp)) - num = mp->mp_pages; - else - num = 1; - if (env->me_flags & MDB_WRITEMAP) { - np = mp; - } else { - np = mdb_page_malloc(txn, num); - if (!np) - return ENOMEM; - if (num > 1) - memcpy(np, mp, num * env->me_psize); + for (tx2 = txn; tx2; tx2=tx2->mt_parent) { + if (!tx2->mt_spill_pgs) + continue; + x = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { + MDB_page *np; + int num; + if (txn->mt_dirty_room == 0) + return MDB_TXN_FULL; + if (IS_OVERFLOW(mp)) + num = mp->mp_pages; else - mdb_page_copy(np, mp, env->me_psize); + num = 1; + if (env->me_flags & MDB_WRITEMAP) { + np = mp; + } else { + np = mdb_page_malloc(txn, num); + if (!np) + return ENOMEM; + if (num > 1) + memcpy(np, mp, num * env->me_psize); + else + mdb_page_copy(np, mp, env->me_psize); + } + if (tx2 == txn) { + // If in current txn, this page is no longer spilled. + // If it happens to be the last page, truncate the spill list. + // Otherwise mark it as deleted by setting the LSB. + if (x == txn->mt_spill_pgs[0]) + txn->mt_spill_pgs[0]--; + else + txn->mt_spill_pgs[x] |= 1; + } // otherwise, if belonging to a parent txn, the + // page remains spilled until child commits + + mdb_page_dirty(txn, np); + np->mp_flags |= P_DIRTY; + *ret = np; + break; } - if (tx2 == txn) { - // If in current txn, this page is no longer spilled. - // If it happens to be the last page, truncate the spill list. - // Otherwise mark it as deleted by setting the LSB. - if (x == txn->mt_spill_pgs[0]) - txn->mt_spill_pgs[0]--; - else - txn->mt_spill_pgs[x] |= 1; - } // otherwise, if belonging to a parent txn, the - // page remains spilled until child commits - - mdb_page_dirty(txn, np); - np->mp_flags |= P_DIRTY; - *ret = np; - break; } - } - return MDB_SUCCESS; + return MDB_SUCCESS; */ return nil } @@ -176,37 +175,37 @@ func (t *transaction) unspill(p *page) *page { // Back up parent txn's cursors, then grab the originals for tracking func (t *transaction) shadow(dst *transaction) error { /* - MDB_cursor *mc, *bk; - MDB_xcursor *mx; - size_t size; - int i; + MDB_cursor *mc, *bk; + MDB_xcursor *mx; + size_t size; + int i; - for (i = src->mt_numdbs; --i >= 0; ) { - if ((mc = src->mt_cursors[i]) != NULL) { - size = sizeof(MDB_cursor); - if (mc->mc_xcursor) - size += sizeof(MDB_xcursor); - for (; mc; mc = bk->mc_next) { - bk = malloc(size); - if (!bk) - return ENOMEM; - *bk = *mc; - mc->mc_backup = bk; - mc->mc_db = &dst->mt_dbs[i]; - // Kill pointers into src - and dst to reduce abuse: The - // user may not use mc until dst ends. Otherwise we'd... - mc->mc_txn = NULL; // ...set this to dst - mc->mc_dbflag = NULL; // ...and &dst->mt_dbflags[i] - if ((mx = mc->mc_xcursor) != NULL) { - *(MDB_xcursor *)(bk+1) = *mx; - mx->mx_cursor.mc_txn = NULL; // ...and dst. + for (i = src->mt_numdbs; --i >= 0; ) { + if ((mc = src->mt_cursors[i]) != NULL) { + size = sizeof(MDB_cursor); + if (mc->mc_xcursor) + size += sizeof(MDB_xcursor); + for (; mc; mc = bk->mc_next) { + bk = malloc(size); + if (!bk) + return ENOMEM; + *bk = *mc; + mc->mc_backup = bk; + mc->mc_db = &dst->mt_dbs[i]; + // Kill pointers into src - and dst to reduce abuse: The + // user may not use mc until dst ends. Otherwise we'd... + mc->mc_txn = NULL; // ...set this to dst + mc->mc_dbflag = NULL; // ...and &dst->mt_dbflags[i] + if ((mx = mc->mc_xcursor) != NULL) { + *(MDB_xcursor *)(bk+1) = *mx; + mx->mx_cursor.mc_txn = NULL; // ...and dst. + } + mc->mc_next = dst->mt_cursors[i]; + dst->mt_cursors[i] = mc; } - mc->mc_next = dst->mt_cursors[i]; - dst->mt_cursors[i] = mc; } } - } - return MDB_SUCCESS; + return MDB_SUCCESS; */ return nil } @@ -217,36 +216,36 @@ func (t *transaction) shadow(dst *transaction) error { // @return 0 on success, non-zero on failure. func (t *transaction) closeCursors(merge bool) { /* - MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; - MDB_xcursor *mx; - int i; + MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; + MDB_xcursor *mx; + int i; - for (i = txn->mt_numdbs; --i >= 0; ) { - for (mc = cursors[i]; mc; mc = next) { - next = mc->mc_next; - if ((bk = mc->mc_backup) != NULL) { - if (merge) { - // Commit changes to parent txn - mc->mc_next = bk->mc_next; - mc->mc_backup = bk->mc_backup; - mc->mc_txn = bk->mc_txn; - mc->mc_db = bk->mc_db; - mc->mc_dbflag = bk->mc_dbflag; - if ((mx = mc->mc_xcursor) != NULL) - mx->mx_cursor.mc_txn = bk->mc_txn; - } else { - // Abort nested txn - *mc = *bk; - if ((mx = mc->mc_xcursor) != NULL) - *mx = *(MDB_xcursor *)(bk+1); + for (i = txn->mt_numdbs; --i >= 0; ) { + for (mc = cursors[i]; mc; mc = next) { + next = mc->mc_next; + if ((bk = mc->mc_backup) != NULL) { + if (merge) { + // Commit changes to parent txn + mc->mc_next = bk->mc_next; + mc->mc_backup = bk->mc_backup; + mc->mc_txn = bk->mc_txn; + mc->mc_db = bk->mc_db; + mc->mc_dbflag = bk->mc_dbflag; + if ((mx = mc->mc_xcursor) != NULL) + mx->mx_cursor.mc_txn = bk->mc_txn; + } else { + // Abort nested txn + *mc = *bk; + if ((mx = mc->mc_xcursor) != NULL) + *mx = *(MDB_xcursor *)(bk+1); + } + mc = bk; } - mc = bk; + // Only malloced cursors are permanently tracked. + free(mc); } - // Only malloced cursors are permanently tracked. - free(mc); + cursors[i] = NULL; } - cursors[i] = NULL; - } */ } @@ -255,168 +254,168 @@ func (t *transaction) closeCursors(merge bool) { // @return 0 on success, non-zero on failure. func (t *transaction) renew() error { /* - MDB_env *env = txn->mt_env; - MDB_txninfo *ti = env->me_txns; - MDB_meta *meta; - unsigned int i, nr; - uint16_t x; - int rc, new_notls = 0; + MDB_env *env = txn->mt_env; + MDB_txninfo *ti = env->me_txns; + MDB_meta *meta; + unsigned int i, nr; + uint16_t x; + int rc, new_notls = 0; - // Setup db info - txn->mt_numdbs = env->me_numdbs; - txn->mt_dbxs = env->me_dbxs; // mostly static anyway + // Setup db info + txn->mt_numdbs = env->me_numdbs; + txn->mt_dbxs = env->me_dbxs; // mostly static anyway - if (txn->mt_flags & MDB_TXN_RDONLY) { - if (!ti) { - meta = env->me_metas[ mdb_env_pick_meta(env) ]; - txn->mt_txnid = meta->mm_txnid; - txn->mt_u.reader = NULL; - } else { - MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader : - pthread_getspecific(env->me_txkey); - if (r) { - if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) - return MDB_BAD_RSLOT; + if (txn->mt_flags & MDB_TXN_RDONLY) { + if (!ti) { + meta = env->me_metas[ mdb_env_pick_meta(env) ]; + txn->mt_txnid = meta->mm_txnid; + txn->mt_u.reader = NULL; + } else { + MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader : + pthread_getspecific(env->me_txkey); + if (r) { + if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) + return MDB_BAD_RSLOT; + } else { + MDB_PID_T pid = env->me_pid; + pthread_t tid = pthread_self(); + + if (!(env->me_flags & MDB_LIVE_READER)) { + rc = mdb_reader_pid(env, Pidset, pid); + if (rc) + return rc; + env->me_flags |= MDB_LIVE_READER; + } + + LOCK_MUTEX_R(env); + nr = ti->mti_numreaders; + for (i=0; imti_readers[i].mr_pid == 0) + break; + if (i == env->me_maxreaders) { + UNLOCK_MUTEX_R(env); + return MDB_READERS_FULL; + } + ti->mti_readers[i].mr_pid = pid; + ti->mti_readers[i].mr_tid = tid; + if (i == nr) + ti->mti_numreaders = ++nr; + // Save numreaders for un-mutexed mdb_env_close() + env->me_numreaders = nr; + UNLOCK_MUTEX_R(env); + + r = &ti->mti_readers[i]; + new_notls = (env->me_flags & MDB_NOTLS); + if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { + r->mr_pid = 0; + return rc; + } + } + txn->mt_txnid = r->mr_txnid = ti->mti_txnid; + txn->mt_u.reader = r; + meta = env->me_metas[txn->mt_txnid & 1]; + } } else { - MDB_PID_T pid = env->me_pid; - pthread_t tid = pthread_self(); + if (ti) { + LOCK_MUTEX_W(env); - if (!(env->me_flags & MDB_LIVE_READER)) { - rc = mdb_reader_pid(env, Pidset, pid); - if (rc) - return rc; - env->me_flags |= MDB_LIVE_READER; - } - - LOCK_MUTEX_R(env); - nr = ti->mti_numreaders; - for (i=0; imti_readers[i].mr_pid == 0) - break; - if (i == env->me_maxreaders) { - UNLOCK_MUTEX_R(env); - return MDB_READERS_FULL; - } - ti->mti_readers[i].mr_pid = pid; - ti->mti_readers[i].mr_tid = tid; - if (i == nr) - ti->mti_numreaders = ++nr; - // Save numreaders for un-mutexed mdb_env_close() - env->me_numreaders = nr; - UNLOCK_MUTEX_R(env); - - r = &ti->mti_readers[i]; - new_notls = (env->me_flags & MDB_NOTLS); - if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { - r->mr_pid = 0; - return rc; + txn->mt_txnid = ti->mti_txnid; + meta = env->me_metas[txn->mt_txnid & 1]; + } else { + meta = env->me_metas[ mdb_env_pick_meta(env) ]; + txn->mt_txnid = meta->mm_txnid; } + txn->mt_txnid++; + #if MDB_DEBUG + if (txn->mt_txnid == mdb_debug_start) + mdb_debug = 1; + #endif + txn->mt_dirty_room = MDB_IDL_UM_MAX; + txn->mt_u.dirty_list = env->me_dirty_list; + txn->mt_u.dirty_list[0].mid = 0; + txn->mt_free_pgs = env->me_free_pgs; + txn->mt_free_pgs[0] = 0; + txn->mt_spill_pgs = NULL; + env->me_txn = txn; } - txn->mt_txnid = r->mr_txnid = ti->mti_txnid; - txn->mt_u.reader = r; - meta = env->me_metas[txn->mt_txnid & 1]; - } - } else { - if (ti) { - LOCK_MUTEX_W(env); - txn->mt_txnid = ti->mti_txnid; - meta = env->me_metas[txn->mt_txnid & 1]; - } else { - meta = env->me_metas[ mdb_env_pick_meta(env) ]; - txn->mt_txnid = meta->mm_txnid; - } - txn->mt_txnid++; -#if MDB_DEBUG - if (txn->mt_txnid == mdb_debug_start) - mdb_debug = 1; -#endif - txn->mt_dirty_room = MDB_IDL_UM_MAX; - txn->mt_u.dirty_list = env->me_dirty_list; - txn->mt_u.dirty_list[0].mid = 0; - txn->mt_free_pgs = env->me_free_pgs; - txn->mt_free_pgs[0] = 0; - txn->mt_spill_pgs = NULL; - env->me_txn = txn; - } + // Copy the DB info and flags + memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db)); - // Copy the DB info and flags - memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db)); + // Moved to here to avoid a data race in read TXNs + txn->mt_next_pgno = meta->mm_last_pg+1; - // Moved to here to avoid a data race in read TXNs - txn->mt_next_pgno = meta->mm_last_pg+1; + for (i=2; imt_numdbs; i++) { + x = env->me_dbflags[i]; + txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; + txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0; + } + txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID; - for (i=2; imt_numdbs; i++) { - x = env->me_dbflags[i]; - txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; - txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0; - } - txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID; + if (env->me_maxpg < txn->mt_next_pgno) { + mdb_txn_reset0(txn, "renew0-mapfail"); + if (new_notls) { + txn->mt_u.reader->mr_pid = 0; + txn->mt_u.reader = NULL; + } + return MDB_MAP_RESIZED; + } - if (env->me_maxpg < txn->mt_next_pgno) { - mdb_txn_reset0(txn, "renew0-mapfail"); - if (new_notls) { - txn->mt_u.reader->mr_pid = 0; - txn->mt_u.reader = NULL; - } - return MDB_MAP_RESIZED; - } - - return MDB_SUCCESS; + return MDB_SUCCESS; */ return nil } func (t *transaction) Renew() error { /* - int rc; + int rc; - if (!txn || txn->mt_dbxs) // A reset txn has mt_dbxs==NULL - return EINVAL; + if (!txn || txn->mt_dbxs) // A reset txn has mt_dbxs==NULL + return EINVAL; - if (txn->mt_env->me_flags & MDB_FATAL_ERROR) { - DPUTS("environment had fatal error, must shutdown!"); - return MDB_PANIC; - } + if (txn->mt_env->me_flags & MDB_FATAL_ERROR) { + DPUTS("environment had fatal error, must shutdown!"); + return MDB_PANIC; + } - rc = mdb_txn_renew0(txn); - if (rc == MDB_SUCCESS) { - DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', - (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); - } - return rc; + rc = mdb_txn_renew0(txn); + if (rc == MDB_SUCCESS) { + DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); + } + return rc; */ return nil } -func (t *transaction) DB() DB { +func (t *transaction) DB() *DB { return t.db } // Export or close DBI handles opened in this txn. func (t *transaction) updateBuckets(keep bool) { /* - int i; - MDB_dbi n = txn->mt_numdbs; - MDB_env *env = txn->mt_env; - unsigned char *tdbflags = txn->mt_dbflags; + int i; + MDB_dbi n = txn->mt_numdbs; + MDB_env *env = txn->mt_env; + unsigned char *tdbflags = txn->mt_dbflags; - for (i = n; --i >= 2;) { - if (tdbflags[i] & DB_NEW) { - if (keep) { - env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; - } else { - char *ptr = env->me_dbxs[i].md_name.mv_data; - env->me_dbxs[i].md_name.mv_data = NULL; - env->me_dbxs[i].md_name.mv_size = 0; - env->me_dbflags[i] = 0; - free(ptr); + for (i = n; --i >= 2;) { + if (tdbflags[i] & DB_NEW) { + if (keep) { + env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; + } else { + char *ptr = env->me_dbxs[i].md_name.mv_data; + env->me_dbxs[i].md_name.mv_data = NULL; + env->me_dbxs[i].md_name.mv_size = 0; + env->me_dbflags[i] = 0; + free(ptr); + } } } - } - if (keep && env->me_numdbs < n) - env->me_numdbs = n; + if (keep && env->me_numdbs < n) + env->me_numdbs = n; */ } @@ -426,80 +425,80 @@ func (t *transaction) updateBuckets(keep bool) { // @param[in] act why the transaction is being reset func (t *transaction) reset(act string) { /* - MDB_env *env = txn->mt_env; + MDB_env *env = txn->mt_env; - // Close any DBI handles opened in this txn - mdb_dbis_update(txn, 0); + // Close any DBI handles opened in this txn + mdb_dbis_update(txn, 0); - DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", - act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', - (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); + DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - if (txn->mt_u.reader) { - txn->mt_u.reader->mr_txnid = (txnid_t)-1; - if (!(env->me_flags & MDB_NOTLS)) - txn->mt_u.reader = NULL; // txn does not own reader + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + if (txn->mt_u.reader) { + txn->mt_u.reader->mr_txnid = (txnid_t)-1; + if (!(env->me_flags & MDB_NOTLS)) + txn->mt_u.reader = NULL; // txn does not own reader + } + txn->mt_numdbs = 0; // close nothing if called again + txn->mt_dbxs = NULL; // mark txn as reset + } else { + mdb_cursors_close(txn, 0); + + if (!(env->me_flags & MDB_WRITEMAP)) { + mdb_dlist_free(txn); + } + mdb_midl_free(env->me_pghead); + + if (txn->mt_parent) { + txn->mt_parent->mt_child = NULL; + env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; + mdb_midl_free(txn->mt_free_pgs); + mdb_midl_free(txn->mt_spill_pgs); + free(txn->mt_u.dirty_list); + return; + } + + if (mdb_midl_shrink(&txn->mt_free_pgs)) + env->me_free_pgs = txn->mt_free_pgs; + env->me_pghead = NULL; + env->me_pglast = 0; + + env->me_txn = NULL; + // The writer mutex was locked in mdb_txn_begin. + if (env->me_txns) + UNLOCK_MUTEX_W(env); } - txn->mt_numdbs = 0; // close nothing if called again - txn->mt_dbxs = NULL; // mark txn as reset - } else { - mdb_cursors_close(txn, 0); - - if (!(env->me_flags & MDB_WRITEMAP)) { - mdb_dlist_free(txn); - } - mdb_midl_free(env->me_pghead); - - if (txn->mt_parent) { - txn->mt_parent->mt_child = NULL; - env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; - mdb_midl_free(txn->mt_free_pgs); - mdb_midl_free(txn->mt_spill_pgs); - free(txn->mt_u.dirty_list); - return; - } - - if (mdb_midl_shrink(&txn->mt_free_pgs)) - env->me_free_pgs = txn->mt_free_pgs; - env->me_pghead = NULL; - env->me_pglast = 0; - - env->me_txn = NULL; - // The writer mutex was locked in mdb_txn_begin. - if (env->me_txns) - UNLOCK_MUTEX_W(env); - } */ } func (t *transaction) Reset() { /* - if (txn == NULL) - return; + if (txn == NULL) + return; - // This call is only valid for read-only txns - if (!(txn->mt_flags & MDB_TXN_RDONLY)) - return; + // This call is only valid for read-only txns + if (!(txn->mt_flags & MDB_TXN_RDONLY)) + return; - mdb_txn_reset0(txn, "reset"); + mdb_txn_reset0(txn, "reset"); */ } func (t *transaction) Abort() { /* - if (txn == NULL) - return; + if (txn == NULL) + return; - if (txn->mt_child) - mdb_txn_abort(txn->mt_child); + if (txn->mt_child) + mdb_txn_abort(txn->mt_child); - mdb_txn_reset0(txn, "abort"); - // Free reader slot tied to this txn (if MDB_NOTLS && writable FS) - if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader) - txn->mt_u.reader->mr_pid = 0; + mdb_txn_reset0(txn, "abort"); + // Free reader slot tied to this txn (if MDB_NOTLS && writable FS) + if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader) + txn->mt_u.reader->mr_pid = 0; - free(txn); + free(txn); */ } @@ -507,154 +506,154 @@ func (t *transaction) Abort() { // This changes the freelist. Keep trying until it stabilizes. func (t *transaction) saveFreelist() error { /* - // env->me_pghead[] can grow and shrink during this call. - // env->me_pglast and txn->mt_free_pgs[] can only grow. - // Page numbers cannot disappear from txn->mt_free_pgs[]. - MDB_cursor mc; - MDB_env *env = txn->mt_env; - int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; - txnid_t pglast = 0, head_id = 0; - pgno_t freecnt = 0, *free_pgs, *mop; - ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; + // env->me_pghead[] can grow and shrink during this call. + // env->me_pglast and txn->mt_free_pgs[] can only grow. + // Page numbers cannot disappear from txn->mt_free_pgs[]. + MDB_cursor mc; + MDB_env *env = txn->mt_env; + int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; + txnid_t pglast = 0, head_id = 0; + pgno_t freecnt = 0, *free_pgs, *mop; + ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; - mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); - if (env->me_pghead) { - // Make sure first page of freeDB is touched and on freelist - rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY); - if (rc && rc != MDB_NOTFOUND) - return rc; - } - - // MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) - clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) - ? SSIZE_MAX : maxfree_1pg; - - for (;;) { - // Come back here after each Put() in case freelist changed - MDB_val key, data; - pgno_t *pgs; - ssize_t j; - - // If using records from freeDB which we have not yet - // deleted, delete them and any we reserved for me_pghead. - while (pglast < env->me_pglast) { - rc = mdb_cursor_first(&mc, &key, NULL); - if (rc) - return rc; - pglast = head_id = *(txnid_t *)key.mv_data; - total_room = head_room = 0; - mdb_tassert(txn, pglast <= env->me_pglast); - rc = mdb_cursor_del(&mc, 0); - if (rc) - return rc; - } - - // Save the IDL of pages freed by this txn, to a single record - if (freecnt < txn->mt_free_pgs[0]) { - if (!freecnt) { - // Make sure last page of freeDB is touched and on freelist - rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); + if (env->me_pghead) { + // Make sure first page of freeDB is touched and on freelist + rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY); if (rc && rc != MDB_NOTFOUND) return rc; } - free_pgs = txn->mt_free_pgs; - // Write to last page of freeDB - key.mv_size = sizeof(txn->mt_txnid); - key.mv_data = &txn->mt_txnid; - do { - freecnt = free_pgs[0]; - data.mv_size = MDB_IDL_SIZEOF(free_pgs); + + // MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) + clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) + ? SSIZE_MAX : maxfree_1pg; + + for (;;) { + // Come back here after each Put() in case freelist changed + MDB_val key, data; + pgno_t *pgs; + ssize_t j; + + // If using records from freeDB which we have not yet + // deleted, delete them and any we reserved for me_pghead. + while (pglast < env->me_pglast) { + rc = mdb_cursor_first(&mc, &key, NULL); + if (rc) + return rc; + pglast = head_id = *(txnid_t *)key.mv_data; + total_room = head_room = 0; + mdb_tassert(txn, pglast <= env->me_pglast); + rc = mdb_cursor_del(&mc, 0); + if (rc) + return rc; + } + + // Save the IDL of pages freed by this txn, to a single record + if (freecnt < txn->mt_free_pgs[0]) { + if (!freecnt) { + // Make sure last page of freeDB is touched and on freelist + rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); + if (rc && rc != MDB_NOTFOUND) + return rc; + } + free_pgs = txn->mt_free_pgs; + // Write to last page of freeDB + key.mv_size = sizeof(txn->mt_txnid); + key.mv_data = &txn->mt_txnid; + do { + freecnt = free_pgs[0]; + data.mv_size = MDB_IDL_SIZEOF(free_pgs); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (rc) + return rc; + // Retry if mt_free_pgs[] grew during the Put() + free_pgs = txn->mt_free_pgs; + } while (freecnt < free_pgs[0]); + mdb_midl_sort(free_pgs); + memcpy(data.mv_data, free_pgs, data.mv_size); + #if (MDB_DEBUG) > 1 + { + unsigned int i = free_pgs[0]; + DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u", + txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); + for (; i; i--) + DPRINTF(("IDL %"Z"u", free_pgs[i])); + } + #endif + continue; + } + + mop = env->me_pghead; + mop_len = mop ? mop[0] : 0; + + // Reserve records for me_pghead[]. Split it if multi-page, + // to avoid searching freeDB for a page range. Use keys in + // range [1,me_pglast]: Smaller than txnid of oldest reader. + if (total_room >= mop_len) { + if (total_room == mop_len || --more < 0) + break; + } else if (head_room >= maxfree_1pg && head_id > 1) { + // Keep current record (overflow page), add a new one + head_id--; + head_room = 0; + } + // (Re)write {key = head_id, IDL length = head_room} + total_room -= head_room; + head_room = mop_len - total_room; + if (head_room > maxfree_1pg && head_id > 1) { + // Overflow multi-page for part of me_pghead + head_room /= head_id; // amortize page sizes + head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); + } else if (head_room < 0) { + // Rare case, not bothering to delete this record + head_room = 0; + } + key.mv_size = sizeof(head_id); + key.mv_data = &head_id; + data.mv_size = (head_room + 1) * sizeof(pgno_t); rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); if (rc) return rc; - // Retry if mt_free_pgs[] grew during the Put() - free_pgs = txn->mt_free_pgs; - } while (freecnt < free_pgs[0]); - mdb_midl_sort(free_pgs); - memcpy(data.mv_data, free_pgs, data.mv_size); -#if (MDB_DEBUG) > 1 - { - unsigned int i = free_pgs[0]; - DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u", - txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); - for (; i; i--) - DPRINTF(("IDL %"Z"u", free_pgs[i])); + // IDL is initially empty, zero out at least the length + pgs = (pgno_t *)data.mv_data; + j = head_room > clean_limit ? head_room : 0; + do { + pgs[j] = 0; + } while (--j >= 0); + total_room += head_room; } -#endif - continue; - } - mop = env->me_pghead; - mop_len = mop ? mop[0] : 0; + // Fill in the reserved me_pghead records + rc = MDB_SUCCESS; + if (mop_len) { + MDB_val key, data; - // Reserve records for me_pghead[]. Split it if multi-page, - // to avoid searching freeDB for a page range. Use keys in - // range [1,me_pglast]: Smaller than txnid of oldest reader. - if (total_room >= mop_len) { - if (total_room == mop_len || --more < 0) - break; - } else if (head_room >= maxfree_1pg && head_id > 1) { - // Keep current record (overflow page), add a new one - head_id--; - head_room = 0; - } - // (Re)write {key = head_id, IDL length = head_room} - total_room -= head_room; - head_room = mop_len - total_room; - if (head_room > maxfree_1pg && head_id > 1) { - // Overflow multi-page for part of me_pghead - head_room /= head_id; // amortize page sizes - head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); - } else if (head_room < 0) { - // Rare case, not bothering to delete this record - head_room = 0; - } - key.mv_size = sizeof(head_id); - key.mv_data = &head_id; - data.mv_size = (head_room + 1) * sizeof(pgno_t); - rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); - if (rc) + mop += mop_len; + rc = mdb_cursor_first(&mc, &key, &data); + for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { + unsigned flags = MDB_CURRENT; + txnid_t id = *(txnid_t *)key.mv_data; + ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; + MDB_ID save; + + mdb_tassert(txn, len >= 0 && id <= env->me_pglast); + key.mv_data = &id; + if (len > mop_len) { + len = mop_len; + data.mv_size = (len + 1) * sizeof(MDB_ID); + flags = 0; + } + data.mv_data = mop -= len; + save = mop[0]; + mop[0] = len; + rc = mdb_cursor_put(&mc, &key, &data, flags); + mop[0] = save; + if (rc || !(mop_len -= len)) + break; + } + } return rc; - // IDL is initially empty, zero out at least the length - pgs = (pgno_t *)data.mv_data; - j = head_room > clean_limit ? head_room : 0; - do { - pgs[j] = 0; - } while (--j >= 0); - total_room += head_room; - } - - // Fill in the reserved me_pghead records - rc = MDB_SUCCESS; - if (mop_len) { - MDB_val key, data; - - mop += mop_len; - rc = mdb_cursor_first(&mc, &key, &data); - for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { - unsigned flags = MDB_CURRENT; - txnid_t id = *(txnid_t *)key.mv_data; - ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; - MDB_ID save; - - mdb_tassert(txn, len >= 0 && id <= env->me_pglast); - key.mv_data = &id; - if (len > mop_len) { - len = mop_len; - data.mv_size = (len + 1) * sizeof(MDB_ID); - flags = 0; - } - data.mv_data = mop -= len; - save = mop[0]; - mop[0] = len; - rc = mdb_cursor_put(&mc, &key, &data, flags); - mop[0] = save; - if (rc || !(mop_len -= len)) - break; - } - } - return rc; */ return nil } @@ -663,342 +662,342 @@ func (t *transaction) saveFreelist() error { // @param[in] txn the transaction that's being committed // @param[in] keep number of initial pages in dirty_list to keep dirty. // @return 0 on success, non-zero on failure. -func (t *transaction) flush(keep bool) { +func (t *transaction) flush(keep bool) error { /* - MDB_env *env = txn->mt_env; - MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned psize = env->me_psize, j; - int i, pagecount = dl[0].mid, rc; - size_t size = 0, pos = 0; - pgno_t pgno = 0; - MDB_page *dp = NULL; -#ifdef _WIN32 - OVERLAPPED ov; -#else - struct iovec iov[MDB_COMMIT_PAGES]; - ssize_t wpos = 0, wsize = 0, wres; - size_t next_pos = 1; // impossible pos, so pos != next_pos - int n = 0; -#endif + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned psize = env->me_psize, j; + int i, pagecount = dl[0].mid, rc; + size_t size = 0, pos = 0; + pgno_t pgno = 0; + MDB_page *dp = NULL; + #ifdef _WIN32 + OVERLAPPED ov; + #else + struct iovec iov[MDB_COMMIT_PAGES]; + ssize_t wpos = 0, wsize = 0, wres; + size_t next_pos = 1; // impossible pos, so pos != next_pos + int n = 0; + #endif - j = i = keep; + j = i = keep; - if (env->me_flags & MDB_WRITEMAP) { - // Clear dirty flags - while (++i <= pagecount) { - dp = dl[i].mptr; - // Don't flush this page yet - if (dp->mp_flags & P_KEEP) { - dp->mp_flags ^= P_KEEP; - dl[++j] = dl[i]; - continue; - } - dp->mp_flags &= ~P_DIRTY; - } - goto done; - } - - // Write the pages - for (;;) { - if (++i <= pagecount) { - dp = dl[i].mptr; - // Don't flush this page yet - if (dp->mp_flags & P_KEEP) { - dp->mp_flags ^= P_KEEP; - dl[i].mid = 0; - continue; - } - pgno = dl[i].mid; - // clear dirty flag - dp->mp_flags &= ~P_DIRTY; - pos = pgno * psize; - size = psize; - if (IS_OVERFLOW(dp)) size *= dp->mp_pages; - } -#ifdef _WIN32 - else break; - - // Windows actually supports scatter/gather I/O, but only on - // unbuffered file handles. Since we're relying on the OS page - // cache for all our data, that's self-defeating. So we just - // write pages one at a time. We use the ov structure to set - // the write offset, to at least save the overhead of a Seek - // system call. - DPRINTF(("committing page %"Z"u", pgno)); - memset(&ov, 0, sizeof(ov)); - ov.Offset = pos & 0xffffffff; - ov.OffsetHigh = pos >> 16 >> 16; - if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) { - rc = ErrCode(); - DPRINTF(("WriteFile: %d", rc)); - return rc; - } -#else - // Write up to MDB_COMMIT_PAGES dirty pages at a time. - if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { - if (n) { - // Write previous page(s) -#ifdef MDB_USE_PWRITEV - wres = pwritev(env->me_fd, iov, n, wpos); -#else - if (n == 1) { - wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); - } else { - if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { - rc = ErrCode(); - DPRINTF(("lseek: %s", strerror(rc))); - return rc; + if (env->me_flags & MDB_WRITEMAP) { + // Clear dirty flags + while (++i <= pagecount) { + dp = dl[i].mptr; + // Don't flush this page yet + if (dp->mp_flags & P_KEEP) { + dp->mp_flags ^= P_KEEP; + dl[++j] = dl[i]; + continue; } - wres = writev(env->me_fd, iov, n); + dp->mp_flags &= ~P_DIRTY; } -#endif - if (wres != wsize) { - if (wres < 0) { - rc = ErrCode(); - DPRINTF(("Write error: %s", strerror(rc))); - } else { - rc = EIO; // TODO: Use which error code? - DPUTS("short write, filesystem full?"); + goto done; + } + + // Write the pages + for (;;) { + if (++i <= pagecount) { + dp = dl[i].mptr; + // Don't flush this page yet + if (dp->mp_flags & P_KEEP) { + dp->mp_flags ^= P_KEEP; + dl[i].mid = 0; + continue; } + pgno = dl[i].mid; + // clear dirty flag + dp->mp_flags &= ~P_DIRTY; + pos = pgno * psize; + size = psize; + if (IS_OVERFLOW(dp)) size *= dp->mp_pages; + } + #ifdef _WIN32 + else break; + + // Windows actually supports scatter/gather I/O, but only on + // unbuffered file handles. Since we're relying on the OS page + // cache for all our data, that's self-defeating. So we just + // write pages one at a time. We use the ov structure to set + // the write offset, to at least save the overhead of a Seek + // system call. + DPRINTF(("committing page %"Z"u", pgno)); + memset(&ov, 0, sizeof(ov)); + ov.Offset = pos & 0xffffffff; + ov.OffsetHigh = pos >> 16 >> 16; + if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) { + rc = ErrCode(); + DPRINTF(("WriteFile: %d", rc)); return rc; } - n = 0; - } - if (i > pagecount) - break; - wpos = pos; - wsize = 0; - } - DPRINTF(("committing page %"Z"u", pgno)); - next_pos = pos + size; - iov[n].iov_len = size; - iov[n].iov_base = (char *)dp; - wsize += size; - n++; -#endif // _WIN32 - } - - for (i = keep; ++i <= pagecount; ) { - dp = dl[i].mptr; - // This is a page we skipped above - if (!dl[i].mid) { - dl[++j] = dl[i]; - dl[j].mid = dp->mp_pgno; - continue; - } - mdb_dpage_free(env, dp); - } - -done: - i--; - txn->mt_dirty_room += i - j; - dl[0].mid = j; - return MDB_SUCCESS; -} - -int -mdb_txn_commit(MDB_txn *txn) -{ - int rc; - unsigned int i; - MDB_env *env; - - if (txn == NULL || txn->mt_env == NULL) - return EINVAL; - - if (txn->mt_child) { - rc = mdb_txn_commit(txn->mt_child); - txn->mt_child = NULL; - if (rc) - goto fail; - } - - env = txn->mt_env; - - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - mdb_dbis_update(txn, 1); - txn->mt_numdbs = 2; // so txn_abort() doesn't close any new handles - mdb_txn_abort(txn); - return MDB_SUCCESS; - } - - if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) { - DPUTS("error flag is set, can't commit"); - if (txn->mt_parent) - txn->mt_parent->mt_flags |= MDB_TXN_ERROR; - rc = MDB_BAD_TXN; - goto fail; - } - - if (txn->mt_parent) { - MDB_txn *parent = txn->mt_parent; - MDB_ID2L dst, src; - MDB_IDL pspill; - unsigned x, y, len, ps_len; - - // Append our free list to parent's - rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); - if (rc) - goto fail; - mdb_midl_free(txn->mt_free_pgs); - // Failures after this must either undo the changes - // to the parent or set MDB_TXN_ERROR in the parent. - - parent->mt_next_pgno = txn->mt_next_pgno; - parent->mt_flags = txn->mt_flags; - - // Merge our cursors into parent's and close them - mdb_cursors_close(txn, 1); - - // Update parent's DB table. - memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); - parent->mt_numdbs = txn->mt_numdbs; - parent->mt_dbflags[0] = txn->mt_dbflags[0]; - parent->mt_dbflags[1] = txn->mt_dbflags[1]; - for (i=2; imt_numdbs; i++) { - // preserve parent's DB_NEW status - x = parent->mt_dbflags[i] & DB_NEW; - parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; - } - - dst = parent->mt_u.dirty_list; - src = txn->mt_u.dirty_list; - // Remove anything in our dirty list from parent's spill list - if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { - x = y = ps_len; - pspill[0] = (pgno_t)-1; - // Mark our dirty pages as deleted in parent spill list - for (i=0, len=src[0].mid; ++i <= len; ) { - MDB_ID pn = src[i].mid << 1; - while (pn > pspill[x]) - x--; - if (pn == pspill[x]) { - pspill[x] = 1; - y = --x; + #else + // Write up to MDB_COMMIT_PAGES dirty pages at a time. + if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { + if (n) { + // Write previous page(s) + #ifdef MDB_USE_PWRITEV + wres = pwritev(env->me_fd, iov, n, wpos); + #else + if (n == 1) { + wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); + } else { + if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { + rc = ErrCode(); + DPRINTF(("lseek: %s", strerror(rc))); + return rc; + } + wres = writev(env->me_fd, iov, n); + } + #endif + if (wres != wsize) { + if (wres < 0) { + rc = ErrCode(); + DPRINTF(("Write error: %s", strerror(rc))); + } else { + rc = EIO; // TODO: Use which error code? + DPUTS("short write, filesystem full?"); + } + return rc; + } + n = 0; + } + if (i > pagecount) + break; + wpos = pos; + wsize = 0; } + DPRINTF(("committing page %"Z"u", pgno)); + next_pos = pos + size; + iov[n].iov_len = size; + iov[n].iov_base = (char *)dp; + wsize += size; + n++; + #endif // _WIN32 } - // Squash deleted pagenums if we deleted any - for (x=y; ++x <= ps_len; ) - if (!(pspill[x] & 1)) - pspill[++y] = pspill[x]; - pspill[0] = y; - } - // Find len = length of merging our dirty list with parent's - x = dst[0].mid; - dst[0].mid = 0; // simplify loops - if (parent->mt_parent) { - len = x + src[0].mid; - y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; - for (i = x; y && i; y--) { - pgno_t yp = src[y].mid; - while (yp < dst[i].mid) - i--; - if (yp == dst[i].mid) { - i--; - len--; + for (i = keep; ++i <= pagecount; ) { + dp = dl[i].mptr; + // This is a page we skipped above + if (!dl[i].mid) { + dl[++j] = dl[i]; + dl[j].mid = dp->mp_pgno; + continue; } + mdb_dpage_free(env, dp); } - } else { // Simplify the above for single-ancestor case - len = MDB_IDL_UM_MAX - txn->mt_dirty_room; - } - // Merge our dirty list with parent's - y = src[0].mid; - for (i = len; y; dst[i--] = src[y--]) { - pgno_t yp = src[y].mid; - while (yp < dst[x].mid) - dst[i--] = dst[x--]; - if (yp == dst[x].mid) - free(dst[x--].mptr); - } - mdb_tassert(txn, i == x); - dst[0].mid = len; - free(txn->mt_u.dirty_list); - parent->mt_dirty_room = txn->mt_dirty_room; - if (txn->mt_spill_pgs) { - if (parent->mt_spill_pgs) { - // TODO: Prevent failure here, so parent does not fail - rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); - if (rc) - parent->mt_flags |= MDB_TXN_ERROR; - mdb_midl_free(txn->mt_spill_pgs); - mdb_midl_sort(parent->mt_spill_pgs); - } else { - parent->mt_spill_pgs = txn->mt_spill_pgs; - } + + done: + i--; + txn->mt_dirty_room += i - j; + dl[0].mid = j; + return MDB_SUCCESS; } - parent->mt_child = NULL; - mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); - free(txn); - return rc; - } + int + mdb_txn_commit(MDB_txn *txn) + { + int rc; + unsigned int i; + MDB_env *env; - if (txn != env->me_txn) { - DPUTS("attempt to commit unknown transaction"); - rc = EINVAL; - goto fail; - } + if (txn == NULL || txn->mt_env == NULL) + return EINVAL; - mdb_cursors_close(txn, 0); - - if (!txn->mt_u.dirty_list[0].mid && - !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) - goto done; - - DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u", - txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); - - // Update DB root pointers - if (txn->mt_numdbs > 2) { - MDB_cursor mc; - MDB_dbi i; - MDB_val data; - data.mv_size = sizeof(MDB_db); - - mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); - for (i = 2; i < txn->mt_numdbs; i++) { - if (txn->mt_dbflags[i] & DB_DIRTY) { - data.mv_data = &txn->mt_dbs[i]; - rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0); + if (txn->mt_child) { + rc = mdb_txn_commit(txn->mt_child); + txn->mt_child = NULL; if (rc) goto fail; } - } - } - rc = mdb_freelist_save(txn); - if (rc) - goto fail; + env = txn->mt_env; - mdb_midl_free(env->me_pghead); - env->me_pghead = NULL; - if (mdb_midl_shrink(&txn->mt_free_pgs)) - env->me_free_pgs = txn->mt_free_pgs; + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + mdb_dbis_update(txn, 1); + txn->mt_numdbs = 2; // so txn_abort() doesn't close any new handles + mdb_txn_abort(txn); + return MDB_SUCCESS; + } -#if (MDB_DEBUG) > 2 - mdb_audit(txn); -#endif + if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) { + DPUTS("error flag is set, can't commit"); + if (txn->mt_parent) + txn->mt_parent->mt_flags |= MDB_TXN_ERROR; + rc = MDB_BAD_TXN; + goto fail; + } - if ((rc = mdb_page_flush(txn, 0)) || - (rc = mdb_env_sync(env, 0)) || - (rc = mdb_env_write_meta(txn))) - goto fail; + if (txn->mt_parent) { + MDB_txn *parent = txn->mt_parent; + MDB_ID2L dst, src; + MDB_IDL pspill; + unsigned x, y, len, ps_len; -done: - env->me_pglast = 0; - env->me_txn = NULL; - mdb_dbis_update(txn, 1); + // Append our free list to parent's + rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); + if (rc) + goto fail; + mdb_midl_free(txn->mt_free_pgs); + // Failures after this must either undo the changes + // to the parent or set MDB_TXN_ERROR in the parent. - if (env->me_txns) - UNLOCK_MUTEX_W(env); - free(txn); + parent->mt_next_pgno = txn->mt_next_pgno; + parent->mt_flags = txn->mt_flags; - return MDB_SUCCESS; + // Merge our cursors into parent's and close them + mdb_cursors_close(txn, 1); -fail: - mdb_txn_abort(txn); - return rc; + // Update parent's DB table. + memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + parent->mt_numdbs = txn->mt_numdbs; + parent->mt_dbflags[0] = txn->mt_dbflags[0]; + parent->mt_dbflags[1] = txn->mt_dbflags[1]; + for (i=2; imt_numdbs; i++) { + // preserve parent's DB_NEW status + x = parent->mt_dbflags[i] & DB_NEW; + parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; + } + + dst = parent->mt_u.dirty_list; + src = txn->mt_u.dirty_list; + // Remove anything in our dirty list from parent's spill list + if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { + x = y = ps_len; + pspill[0] = (pgno_t)-1; + // Mark our dirty pages as deleted in parent spill list + for (i=0, len=src[0].mid; ++i <= len; ) { + MDB_ID pn = src[i].mid << 1; + while (pn > pspill[x]) + x--; + if (pn == pspill[x]) { + pspill[x] = 1; + y = --x; + } + } + // Squash deleted pagenums if we deleted any + for (x=y; ++x <= ps_len; ) + if (!(pspill[x] & 1)) + pspill[++y] = pspill[x]; + pspill[0] = y; + } + + // Find len = length of merging our dirty list with parent's + x = dst[0].mid; + dst[0].mid = 0; // simplify loops + if (parent->mt_parent) { + len = x + src[0].mid; + y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; + for (i = x; y && i; y--) { + pgno_t yp = src[y].mid; + while (yp < dst[i].mid) + i--; + if (yp == dst[i].mid) { + i--; + len--; + } + } + } else { // Simplify the above for single-ancestor case + len = MDB_IDL_UM_MAX - txn->mt_dirty_room; + } + // Merge our dirty list with parent's + y = src[0].mid; + for (i = len; y; dst[i--] = src[y--]) { + pgno_t yp = src[y].mid; + while (yp < dst[x].mid) + dst[i--] = dst[x--]; + if (yp == dst[x].mid) + free(dst[x--].mptr); + } + mdb_tassert(txn, i == x); + dst[0].mid = len; + free(txn->mt_u.dirty_list); + parent->mt_dirty_room = txn->mt_dirty_room; + if (txn->mt_spill_pgs) { + if (parent->mt_spill_pgs) { + // TODO: Prevent failure here, so parent does not fail + rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); + if (rc) + parent->mt_flags |= MDB_TXN_ERROR; + mdb_midl_free(txn->mt_spill_pgs); + mdb_midl_sort(parent->mt_spill_pgs); + } else { + parent->mt_spill_pgs = txn->mt_spill_pgs; + } + } + + parent->mt_child = NULL; + mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); + free(txn); + return rc; + } + + if (txn != env->me_txn) { + DPUTS("attempt to commit unknown transaction"); + rc = EINVAL; + goto fail; + } + + mdb_cursors_close(txn, 0); + + if (!txn->mt_u.dirty_list[0].mid && + !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) + goto done; + + DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u", + txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); + + // Update DB root pointers + if (txn->mt_numdbs > 2) { + MDB_cursor mc; + MDB_dbi i; + MDB_val data; + data.mv_size = sizeof(MDB_db); + + mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); + for (i = 2; i < txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + data.mv_data = &txn->mt_dbs[i]; + rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0); + if (rc) + goto fail; + } + } + } + + rc = mdb_freelist_save(txn); + if (rc) + goto fail; + + mdb_midl_free(env->me_pghead); + env->me_pghead = NULL; + if (mdb_midl_shrink(&txn->mt_free_pgs)) + env->me_free_pgs = txn->mt_free_pgs; + + #if (MDB_DEBUG) > 2 + mdb_audit(txn); + #endif + + if ((rc = mdb_page_flush(txn, 0)) || + (rc = mdb_env_sync(env, 0)) || + (rc = mdb_env_write_meta(txn))) + goto fail; + + done: + env->me_pglast = 0; + env->me_txn = NULL; + mdb_dbis_update(txn, 1); + + if (env->me_txns) + UNLOCK_MUTEX_W(env); + free(txn); + + return MDB_SUCCESS; + + fail: + mdb_txn_abort(txn); + return rc; */ return nil } @@ -1008,118 +1007,118 @@ fail: // @return 0 on success, non-zero on failure. func (t *transaction) writeMeta() error { /* - MDB_env *env; - MDB_meta meta, metab, *mp; - off_t off; - int rc, len, toggle; - char *ptr; - HANDLE mfd; -#ifdef _WIN32 - OVERLAPPED ov; -#else - int r2; -#endif + MDB_env *env; + MDB_meta meta, metab, *mp; + off_t off; + int rc, len, toggle; + char *ptr; + HANDLE mfd; + #ifdef _WIN32 + OVERLAPPED ov; + #else + int r2; + #endif - toggle = txn->mt_txnid & 1; - DPRINTF(("writing meta page %d for root page %"Z"u", - toggle, txn->mt_dbs[MAIN_DBI].md_root)); + toggle = txn->mt_txnid & 1; + DPRINTF(("writing meta page %d for root page %"Z"u", + toggle, txn->mt_dbs[MAIN_DBI].md_root)); - env = txn->mt_env; - mp = env->me_metas[toggle]; + env = txn->mt_env; + mp = env->me_metas[toggle]; - if (env->me_flags & MDB_WRITEMAP) { - // Persist any increases of mapsize config - if (env->me_mapsize > mp->mm_mapsize) - mp->mm_mapsize = env->me_mapsize; - mp->mm_dbs[0] = txn->mt_dbs[0]; - mp->mm_dbs[1] = txn->mt_dbs[1]; - mp->mm_last_pg = txn->mt_next_pgno - 1; - mp->mm_txnid = txn->mt_txnid; - if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { - unsigned meta_size = env->me_psize; - rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; - ptr = env->me_map; - if (toggle) { -#ifndef _WIN32 // POSIX msync() requires ptr = start of OS page - if (meta_size < env->me_os_psize) - meta_size += meta_size; - else -#endif - ptr += meta_size; + if (env->me_flags & MDB_WRITEMAP) { + // Persist any increases of mapsize config + if (env->me_mapsize > mp->mm_mapsize) + mp->mm_mapsize = env->me_mapsize; + mp->mm_dbs[0] = txn->mt_dbs[0]; + mp->mm_dbs[1] = txn->mt_dbs[1]; + mp->mm_last_pg = txn->mt_next_pgno - 1; + mp->mm_txnid = txn->mt_txnid; + if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { + unsigned meta_size = env->me_psize; + rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; + ptr = env->me_map; + if (toggle) { + #ifndef _WIN32 // POSIX msync() requires ptr = start of OS page + if (meta_size < env->me_os_psize) + meta_size += meta_size; + else + #endif + ptr += meta_size; + } + if (MDB_MSYNC(ptr, meta_size, rc)) { + rc = ErrCode(); + goto fail; + } + } + goto done; } - if (MDB_MSYNC(ptr, meta_size, rc)) { - rc = ErrCode(); - goto fail; + metab.mm_txnid = env->me_metas[toggle]->mm_txnid; + metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg; + + ptr = (char *)&meta; + if (env->me_mapsize > mp->mm_mapsize) { + // Persist any increases of mapsize config + meta.mm_mapsize = env->me_mapsize; + off = offsetof(MDB_meta, mm_mapsize); + } else { + off = offsetof(MDB_meta, mm_dbs[0].md_depth); } - } - goto done; - } - metab.mm_txnid = env->me_metas[toggle]->mm_txnid; - metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg; + len = sizeof(MDB_meta) - off; - ptr = (char *)&meta; - if (env->me_mapsize > mp->mm_mapsize) { - // Persist any increases of mapsize config - meta.mm_mapsize = env->me_mapsize; - off = offsetof(MDB_meta, mm_mapsize); - } else { - off = offsetof(MDB_meta, mm_dbs[0].md_depth); - } - len = sizeof(MDB_meta) - off; + ptr += off; + meta.mm_dbs[0] = txn->mt_dbs[0]; + meta.mm_dbs[1] = txn->mt_dbs[1]; + meta.mm_last_pg = txn->mt_next_pgno - 1; + meta.mm_txnid = txn->mt_txnid; - ptr += off; - meta.mm_dbs[0] = txn->mt_dbs[0]; - meta.mm_dbs[1] = txn->mt_dbs[1]; - meta.mm_last_pg = txn->mt_next_pgno - 1; - meta.mm_txnid = txn->mt_txnid; + if (toggle) + off += env->me_psize; + off += PAGEHDRSZ; - if (toggle) - off += env->me_psize; - off += PAGEHDRSZ; + // Write to the SYNC fd + mfd = env->me_flags & (MDB_NOSYNC|MDB_NOMETASYNC) ? + env->me_fd : env->me_mfd; + #ifdef _WIN32 + { + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov)) + rc = -1; + } + #else + rc = pwrite(mfd, ptr, len, off); + #endif + if (rc != len) { + rc = rc < 0 ? ErrCode() : EIO; + DPUTS("write failed, disk error?"); + // On a failure, the pagecache still contains the new data. + // Write some old data back, to prevent it from being used. + // Use the non-SYNC fd; we know it will fail anyway. + meta.mm_last_pg = metab.mm_last_pg; + meta.mm_txnid = metab.mm_txnid; + #ifdef _WIN32 + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + WriteFile(env->me_fd, ptr, len, NULL, &ov); + #else + r2 = pwrite(env->me_fd, ptr, len, off); + (void)r2; // Silence warnings. We don't care about pwrite's return value + #endif + fail: + env->me_flags |= MDB_FATAL_ERROR; + return rc; + } + done: + // Memory ordering issues are irrelevant; since the entire writer + // is wrapped by wmutex, all of these changes will become visible + // after the wmutex is unlocked. Since the DB is multi-version, + // readers will get consistent data regardless of how fresh or + // how stale their view of these values is. + if (env->me_txns) + env->me_txns->mti_txnid = txn->mt_txnid; - // Write to the SYNC fd - mfd = env->me_flags & (MDB_NOSYNC|MDB_NOMETASYNC) ? - env->me_fd : env->me_mfd; -#ifdef _WIN32 - { - memset(&ov, 0, sizeof(ov)); - ov.Offset = off; - if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov)) - rc = -1; - } -#else - rc = pwrite(mfd, ptr, len, off); -#endif - if (rc != len) { - rc = rc < 0 ? ErrCode() : EIO; - DPUTS("write failed, disk error?"); - // On a failure, the pagecache still contains the new data. - // Write some old data back, to prevent it from being used. - // Use the non-SYNC fd; we know it will fail anyway. - meta.mm_last_pg = metab.mm_last_pg; - meta.mm_txnid = metab.mm_txnid; -#ifdef _WIN32 - memset(&ov, 0, sizeof(ov)); - ov.Offset = off; - WriteFile(env->me_fd, ptr, len, NULL, &ov); -#else - r2 = pwrite(env->me_fd, ptr, len, off); - (void)r2; // Silence warnings. We don't care about pwrite's return value -#endif -fail: - env->me_flags |= MDB_FATAL_ERROR; - return rc; - } -done: - // Memory ordering issues are irrelevant; since the entire writer - // is wrapped by wmutex, all of these changes will become visible - // after the wmutex is unlocked. Since the DB is multi-version, - // readers will get consistent data regardless of how fresh or - // how stale their view of these values is. - if (env->me_txns) - env->me_txns->mti_txnid = txn->mt_txnid; - - return MDB_SUCCESS; + return MDB_SUCCESS; */ return nil } @@ -1132,53 +1131,53 @@ done: // @return 0 on success, non-zero on failure. func (t *transaction) getPage(id int) (*page, int, error) { /* - MDB_env *env = txn->mt_env; - MDB_page *p = NULL; - int level; + MDB_env *env = txn->mt_env; + MDB_page *p = NULL; + int level; - if (!((txn->mt_flags & MDB_TXN_RDONLY) | (env->me_flags & MDB_WRITEMAP))) { - MDB_txn *tx2 = txn; - level = 1; - do { - MDB_ID2L dl = tx2->mt_u.dirty_list; - unsigned x; - // Spilled pages were dirtied in this txn and flushed - // because the dirty list got full. Bring this page - // back in from the map (but don't unspill it here, - // leave that unless page_touch happens again). - if (tx2->mt_spill_pgs) { - MDB_ID pn = pgno << 1; - x = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { - p = (MDB_page *)(env->me_map + env->me_psize * pgno); - goto done; - } + if (!((txn->mt_flags & MDB_TXN_RDONLY) | (env->me_flags & MDB_WRITEMAP))) { + MDB_txn *tx2 = txn; + level = 1; + do { + MDB_ID2L dl = tx2->mt_u.dirty_list; + unsigned x; + // Spilled pages were dirtied in this txn and flushed + // because the dirty list got full. Bring this page + // back in from the map (but don't unspill it here, + // leave that unless page_touch happens again). + if (tx2->mt_spill_pgs) { + MDB_ID pn = pgno << 1; + x = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { + p = (MDB_page *)(env->me_map + env->me_psize * pgno); + goto done; + } + } + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + p = dl[x].mptr; + goto done; + } + } + level++; + } while ((tx2 = tx2->mt_parent) != NULL); } - if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - p = dl[x].mptr; - goto done; - } + + if (pgno < txn->mt_next_pgno) { + level = 0; + p = (MDB_page *)(env->me_map + env->me_psize * pgno); + } else { + DPRINTF(("page %"Z"u not found", pgno)); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PAGE_NOTFOUND; } - level++; - } while ((tx2 = tx2->mt_parent) != NULL); - } - if (pgno < txn->mt_next_pgno) { - level = 0; - p = (MDB_page *)(env->me_map + env->me_psize * pgno); - } else { - DPRINTF(("page %"Z"u not found", pgno)); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PAGE_NOTFOUND; - } - -done: - *ret = p; - if (lvl) - *lvl = level; - return MDB_SUCCESS; + done: + *ret = p; + if (lvl) + *lvl = level; + return MDB_SUCCESS; */ return nil, 0, nil @@ -1191,349 +1190,354 @@ done: // @return 0 on success, non-zero on failure. func (t *transaction) readNode(leaf *node, data []byte) error { /* - MDB_page *omp; // overflow page - pgno_t pgno; - int rc; + MDB_page *omp; // overflow page + pgno_t pgno; + int rc; - if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { + if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { + data->mv_size = NODEDSZ(leaf); + data->mv_data = NODEDATA(leaf); + return MDB_SUCCESS; + } + + // Read overflow data. data->mv_size = NODEDSZ(leaf); - data->mv_data = NODEDATA(leaf); + memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); + if ((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0) { + DPRINTF(("read overflow page %"Z"u failed", pgno)); + return rc; + } + data->mv_data = METADATA(omp); + return MDB_SUCCESS; - } - - // Read overflow data. - data->mv_size = NODEDSZ(leaf); - memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); - if ((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0) { - DPRINTF(("read overflow page %"Z"u failed", pgno)); - return rc; - } - data->mv_data = METADATA(omp); - - return MDB_SUCCESS; */ return nil } func (t *transaction) Get(bucket Bucket, key []byte) ([]byte, error) { /* - MDB_cursor mc; - MDB_xcursor mx; - int exact = 0; - DKBUF; + MDB_cursor mc; + MDB_xcursor mx; + int exact = 0; + DKBUF; - if (key == NULL || data == NULL) - return EINVAL; + if (key == NULL || data == NULL) + return EINVAL; - DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); + DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) - return EINVAL; + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + return EINVAL; - if (txn->mt_flags & MDB_TXN_ERROR) - return MDB_BAD_TXN; + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; - mdb_cursor_init(&mc, txn, dbi, &mx); - return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); + mdb_cursor_init(&mc, txn, dbi, &mx); + return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); */ return nil, nil } func (t *transaction) Cursor(b Bucket) (Cursor, error) { /* - MDB_cursor *mc; - size_t size = sizeof(MDB_cursor); + MDB_cursor *mc; + size_t size = sizeof(MDB_cursor); - if (txn == NULL || ret == NULL || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) - return EINVAL; + if (txn == NULL || ret == NULL || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + return EINVAL; - if (txn->mt_flags & MDB_TXN_ERROR) - return MDB_BAD_TXN; + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; - // Allow read access to the freelist - if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) - return EINVAL; + // Allow read access to the freelist + if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EINVAL; - if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) - size += sizeof(MDB_xcursor); + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) + size += sizeof(MDB_xcursor); - if ((mc = malloc(size)) != NULL) { - mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); - if (txn->mt_cursors) { - mc->mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = mc; - mc->mc_flags |= C_UNTRACK; + if ((mc = malloc(size)) != NULL) { + mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); + if (txn->mt_cursors) { + mc->mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = mc; + mc->mc_flags |= C_UNTRACK; + } + } else { + return ENOMEM; } - } else { - return ENOMEM; - } - *ret = mc; + *ret = mc; - return MDB_SUCCESS; + return MDB_SUCCESS; + */ + return nil, nil +} + +func (t *transaction) Renew1(c Cursor) error { + /* + if (txn == NULL || mc == NULL || mc->mc_dbi >= txn->mt_numdbs) + return EINVAL; + + if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) + return EINVAL; + + mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); + return MDB_SUCCESS; */ return nil } -func (t *transaction) Renew(c Cursor) error { +func (t *transaction) Delete(b *Bucket, key []byte, data []byte) error { /* - if (txn == NULL || mc == NULL || mc->mc_dbi >= txn->mt_numdbs) - return EINVAL; + MDB_cursor mc; + MDB_xcursor mx; + MDB_cursor_op op; + MDB_val rdata, *xdata; + int rc, exact; + DKBUF; - if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) - return EINVAL; + if (key == NULL) + return EINVAL; - mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); - return MDB_SUCCESS; - */ -} + DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key))); -func (t *transaction) Delete(b *bucket, key []byte, data []byte) error { - /* - MDB_cursor mc; - MDB_xcursor mx; - MDB_cursor_op op; - MDB_val rdata, *xdata; - int rc, exact; - DKBUF; + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + return EINVAL; - if (key == NULL) - return EINVAL; + if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) + return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key))); + mdb_cursor_init(&mc, txn, dbi, &mx); - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) - return EINVAL; - - if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) - return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - - mdb_cursor_init(&mc, txn, dbi, &mx); - - exact = 0; - if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { - // must ignore any data - data = NULL; - } - if (data) { - op = MDB_GET_BOTH; - rdata = *data; - xdata = &rdata; - } else { - op = MDB_SET; - xdata = NULL; - } - rc = mdb_cursor_set(&mc, key, xdata, op, &exact); - if (rc == 0) { - // let mdb_page_split know about this cursor if needed: - // delete will trigger a rebalance; if it needs to move - // a node from one page to another, it will have to - // update the parent's separator key(s). If the new sepkey - // is larger than the current one, the parent page may - // run out of space, triggering a split. We need this - // cursor to be consistent until the end of the rebalance. - mc.mc_flags |= C_UNTRACK; - mc.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &mc; - rc = mdb_cursor_del(&mc, data ? 0 : MDB_NODUPDATA); - txn->mt_cursors[dbi] = mc.mc_next; - } - return rc; + exact = 0; + if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { + // must ignore any data + data = NULL; + } + if (data) { + op = MDB_GET_BOTH; + rdata = *data; + xdata = &rdata; + } else { + op = MDB_SET; + xdata = NULL; + } + rc = mdb_cursor_set(&mc, key, xdata, op, &exact); + if (rc == 0) { + // let mdb_page_split know about this cursor if needed: + // delete will trigger a rebalance; if it needs to move + // a node from one page to another, it will have to + // update the parent's separator key(s). If the new sepkey + // is larger than the current one, the parent page may + // run out of space, triggering a split. We need this + // cursor to be consistent until the end of the rebalance. + mc.mc_flags |= C_UNTRACK; + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; + rc = mdb_cursor_del(&mc, data ? 0 : MDB_NODUPDATA); + txn->mt_cursors[dbi] = mc.mc_next; + } + return rc; */ return nil } func (t *transaction) Put(b Bucket, key []byte, data []byte, flags int) error { /* - MDB_cursor mc; - MDB_xcursor mx; + MDB_cursor mc; + MDB_xcursor mx; - if (key == NULL || data == NULL) - return EINVAL; + if (key == NULL || data == NULL) + return EINVAL; - if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) - return EINVAL; + if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) + return EINVAL; - if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags) - return EINVAL; + if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags) + return EINVAL; - mdb_cursor_init(&mc, txn, dbi, &mx); - return mdb_cursor_put(&mc, key, data, flags); + mdb_cursor_init(&mc, txn, dbi, &mx); + return mdb_cursor_put(&mc, key, data, flags); */ + return nil } -func (t *transaction) Bucket(name string, flags int) (Bucket, error) { +func (t *transaction) Bucket(name string, flags int) (*Bucket, error) { /* - MDB_val key, data; - MDB_dbi i; - MDB_cursor mc; - int rc, dbflag, exact; - unsigned int unused = 0; - size_t len; + MDB_val key, data; + MDB_dbi i; + MDB_cursor mc; + int rc, dbflag, exact; + unsigned int unused = 0; + size_t len; - if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) { - mdb_default_cmp(txn, FREE_DBI); - } + if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) { + mdb_default_cmp(txn, FREE_DBI); + } - if ((flags & VALID_FLAGS) != flags) - return EINVAL; - if (txn->mt_flags & MDB_TXN_ERROR) - return MDB_BAD_TXN; + if ((flags & VALID_FLAGS) != flags) + return EINVAL; + if (txn->mt_flags & MDB_TXN_ERROR) + return MDB_BAD_TXN; - // main DB? - if (!name) { - *dbi = MAIN_DBI; - if (flags & PERSISTENT_FLAGS) { - uint16_t f2 = flags & PERSISTENT_FLAGS; - // make sure flag changes get committed - if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { - txn->mt_dbs[MAIN_DBI].md_flags |= f2; - txn->mt_flags |= MDB_TXN_DIRTY; + // main DB? + if (!name) { + *dbi = MAIN_DBI; + if (flags & PERSISTENT_FLAGS) { + uint16_t f2 = flags & PERSISTENT_FLAGS; + // make sure flag changes get committed + if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { + txn->mt_dbs[MAIN_DBI].md_flags |= f2; + txn->mt_flags |= MDB_TXN_DIRTY; + } } - } - mdb_default_cmp(txn, MAIN_DBI); - return MDB_SUCCESS; - } - - if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { - mdb_default_cmp(txn, MAIN_DBI); - } - - // Is the DB already open? - len = strlen(name); - for (i=2; imt_numdbs; i++) { - if (!txn->mt_dbxs[i].md_name.mv_size) { - // Remember this free slot - if (!unused) unused = i; - continue; - } - if (len == txn->mt_dbxs[i].md_name.mv_size && - !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { - *dbi = i; + mdb_default_cmp(txn, MAIN_DBI); return MDB_SUCCESS; } - } - // If no free slot and max hit, fail - if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs) - return MDB_DBS_FULL; - - // Cannot mix named databases with some mainDB flags - if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY)) - return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; - - // Find the DB info - dbflag = DB_NEW|DB_VALID; - exact = 0; - key.mv_size = len; - key.mv_data = (void *)name; - mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); - rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); - if (rc == MDB_SUCCESS) { - // make sure this is actually a DB - MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); - if (!(node->mn_flags & F_SUBDATA)) - return MDB_INCOMPATIBLE; - } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) { - // Create if requested - MDB_db dummy; - data.mv_size = sizeof(MDB_db); - data.mv_data = &dummy; - memset(&dummy, 0, sizeof(dummy)); - dummy.md_root = P_INVALID; - dummy.md_flags = flags & PERSISTENT_FLAGS; - rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA); - dbflag |= DB_DIRTY; - } - - // OK, got info, add to table - if (rc == MDB_SUCCESS) { - unsigned int slot = unused ? unused : txn->mt_numdbs; - txn->mt_dbxs[slot].md_name.mv_data = strdup(name); - txn->mt_dbxs[slot].md_name.mv_size = len; - txn->mt_dbxs[slot].md_rel = NULL; - txn->mt_dbflags[slot] = dbflag; - memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); - *dbi = slot; - mdb_default_cmp(txn, slot); - if (!unused) { - txn->mt_numdbs++; + if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { + mdb_default_cmp(txn, MAIN_DBI); } - } - return rc; + // Is the DB already open? + len = strlen(name); + for (i=2; imt_numdbs; i++) { + if (!txn->mt_dbxs[i].md_name.mv_size) { + // Remember this free slot + if (!unused) unused = i; + continue; + } + if (len == txn->mt_dbxs[i].md_name.mv_size && + !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { + *dbi = i; + return MDB_SUCCESS; + } + } + + // If no free slot and max hit, fail + if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs) + return MDB_DBS_FULL; + + // Cannot mix named databases with some mainDB flags + if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY)) + return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; + + // Find the DB info + dbflag = DB_NEW|DB_VALID; + exact = 0; + key.mv_size = len; + key.mv_data = (void *)name; + mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); + rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); + if (rc == MDB_SUCCESS) { + // make sure this is actually a DB + MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + if (!(node->mn_flags & F_SUBDATA)) + return MDB_INCOMPATIBLE; + } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) { + // Create if requested + MDB_db dummy; + data.mv_size = sizeof(MDB_db); + data.mv_data = &dummy; + memset(&dummy, 0, sizeof(dummy)); + dummy.md_root = P_INVALID; + dummy.md_flags = flags & PERSISTENT_FLAGS; + rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA); + dbflag |= DB_DIRTY; + } + + // OK, got info, add to table + if (rc == MDB_SUCCESS) { + unsigned int slot = unused ? unused : txn->mt_numdbs; + txn->mt_dbxs[slot].md_name.mv_data = strdup(name); + txn->mt_dbxs[slot].md_name.mv_size = len; + txn->mt_dbxs[slot].md_rel = NULL; + txn->mt_dbflags[slot] = dbflag; + memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); + *dbi = slot; + mdb_default_cmp(txn, slot); + if (!unused) { + txn->mt_numdbs++; + } + } + + return rc; */ return nil, nil } func (t *transaction) Stat(b Bucket) *Stat { - if (txn == NULL || arg == NULL || dbi >= txn->mt_numdbs) - return EINVAL; + /* + if (txn == NULL || arg == NULL || dbi >= txn->mt_numdbs) + return EINVAL; - if (txn->mt_dbflags[dbi] & DB_STALE) { - MDB_cursor mc; - MDB_xcursor mx; - /* Stale, must read the DB's root. cursor_init does it for us. */ - mdb_cursor_init(&mc, txn, dbi, &mx); - } - return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); + if (txn->mt_dbflags[dbi] & DB_STALE) { + MDB_cursor mc; + MDB_xcursor mx; + // Stale, must read the DB's root. cursor_init does it for us. + mdb_cursor_init(&mc, txn, dbi, &mx); + } + return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); + */ + return nil } func (t *transaction) BucketFlags(b Bucket) (int, error) { /* - // We could return the flags for the FREE_DBI too but what's the point? - if (txn == NULL || dbi < MAIN_DBI || dbi >= txn->mt_numdbs) - return EINVAL; - *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; - return MDB_SUCCESS; + // We could return the flags for the FREE_DBI too but what's the point? + if (txn == NULL || dbi < MAIN_DBI || dbi >= txn->mt_numdbs) + return EINVAL; + *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; + return MDB_SUCCESS; */ return 0, nil } -func (t *transaction) Drop(b Bucket int del) error { +func (t *transaction) Drop(b *Bucket, del int) error { /* - MDB_cursor *mc, *m2; - int rc; + MDB_cursor *mc, *m2; + int rc; - if (!txn || !dbi || dbi >= txn->mt_numdbs || (unsigned)del > 1 || !(txn->mt_dbflags[dbi] & DB_VALID)) - return EINVAL; + if (!txn || !dbi || dbi >= txn->mt_numdbs || (unsigned)del > 1 || !(txn->mt_dbflags[dbi] & DB_VALID)) + return EINVAL; - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) - return EACCES; + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + return EACCES; - rc = mdb_cursor_open(txn, dbi, &mc); - if (rc) - return rc; + rc = mdb_cursor_open(txn, dbi, &mc); + if (rc) + return rc; - rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); - // Invalidate the dropped DB's cursors - for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) - m2->mc_flags &= ~(C_INITIALIZED|C_EOF); - if (rc) - goto leave; + rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); + // Invalidate the dropped DB's cursors + for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) + m2->mc_flags &= ~(C_INITIALIZED|C_EOF); + if (rc) + goto leave; - // Can't delete the main DB - if (del && dbi > MAIN_DBI) { - rc = mdb_del(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL); - if (!rc) { - txn->mt_dbflags[dbi] = DB_STALE; - mdb_dbi_close(txn->mt_env, dbi); - } - } else { - // reset the DB record, mark it dirty - txn->mt_dbflags[dbi] |= DB_DIRTY; - txn->mt_dbs[dbi].md_depth = 0; - txn->mt_dbs[dbi].md_branch_pages = 0; - txn->mt_dbs[dbi].md_leaf_pages = 0; - txn->mt_dbs[dbi].md_overflow_pages = 0; - txn->mt_dbs[dbi].md_entries = 0; - txn->mt_dbs[dbi].md_root = P_INVALID; + // Can't delete the main DB + if (del && dbi > MAIN_DBI) { + rc = mdb_del(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL); + if (!rc) { + txn->mt_dbflags[dbi] = DB_STALE; + mdb_dbi_close(txn->mt_env, dbi); + } + } else { + // reset the DB record, mark it dirty + txn->mt_dbflags[dbi] |= DB_DIRTY; + txn->mt_dbs[dbi].md_depth = 0; + txn->mt_dbs[dbi].md_branch_pages = 0; + txn->mt_dbs[dbi].md_leaf_pages = 0; + txn->mt_dbs[dbi].md_overflow_pages = 0; + txn->mt_dbs[dbi].md_entries = 0; + txn->mt_dbs[dbi].md_root = P_INVALID; - txn->mt_flags |= MDB_TXN_DIRTY; - } -leave: - mdb_cursor_close(mc); - return rc; + txn->mt_flags |= MDB_TXN_DIRTY; + } + leave: + mdb_cursor_close(mc); + return rc; */ return nil }