Hash selection for database is incorrect #1
parent
20cd3d83f1
commit
b2298602df
4
NEXT
4
NEXT
|
@ -1,4 +0,0 @@
|
|||
- Panic in verification that 2*eps is less than bucket width.
|
||||
Make sure the "if" equality is correct.
|
||||
|
||||
- Make sure hash contains ":" between bucket numbers. Or the analogy to ":".
|
|
@ -0,0 +1,17 @@
|
|||
# Hashing float vectors in N-dimensions
|
||||
|
||||
This is a working beta version.
|
||||
|
||||
### Algorithm
|
||||
|
||||
https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
|
||||
|
||||
### How to use
|
||||
|
||||
about.go contains a short instruction.
|
||||
|
||||
Please fork if you want to use it long term, as the API is not finalized yet.
|
||||
|
||||
### TODO
|
||||
|
||||
Add a usage example, e.g. one related to the package "images".
|
14
about.go
14
about.go
|
@ -1,4 +1,14 @@
|
|||
package hyper
|
||||
|
||||
// Package hyper discretizes n-dimensional space and generates hashes,
|
||||
// so that fast approximate search of nearest points in n-space is possible.
|
||||
// Package hyper allows fast approximate search of nearest
|
||||
// neighbour vectors in n-dimensional space.
|
||||
// Package functions discretize a vector and generate a set
|
||||
// of fuzzy hashes, as described in the following paper:
|
||||
// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
|
||||
|
||||
// A typical sequence of functions when using the package is:
|
||||
// 1) Params, 2) Hypercubes, 3) FVN1a to get the central hash,
|
||||
// and Hashes64 with FVN1a as the hash argument to get
|
||||
// the full hash set.
|
||||
|
||||
// You can also define your own function for hashing hypercubes.
|
||||
|
|
100
buckets.go
100
buckets.go
|
@ -1,100 +0,0 @@
|
|||
package hyper
|
||||
|
||||
// Params computes the discretization parameters for one axis.
//
// numBuckets is the number of buckets the [min, max] range is split into;
// the ids of those buckets are later used to create hashes.
// min and max are the smallest and largest values the discretized
// variable can take.
// bucketPct is the fraction of a bucket width allowed as an error of the
// discretized variable (a value that close to a bucket border may fall
// into 2 buckets simultaneously). Params panics when bucketPct is 0.5
// or more.
//
// It returns bucketWidth, the width of one bucket, and eps, the error
// allowance bucketPct expressed in units of the discretized variable axis.
func Params(numBuckets int, min, max, bucketPct float64) (bucketWidth, eps float64) {
	if 0.5 <= bucketPct {
		panic("Error: bucketPct must be less than 50%. Recommendation: decrease numBuckets instead.")
	}
	span := max - min
	bucketWidth = span / float64(numBuckets)
	return bucketWidth, bucketWidth * bucketPct
}
|
||||
|
||||
// Buckets generates every combination of bucket ids that the given
// n-dimensional point may belong to after fuzzy discretization.
//
// point holds one value per dimension.
// min and max are the minimum and maximum possible values of the point
// components; they are assumed to be the same for all dimensions.
// bucketWidth and eps are the values returned by Params.
//
// For each component the buckets of val-eps and val+eps are computed.
// When they differ, the set of combinations built so far is duplicated:
// the first half continues with the left bucket, the second half with the
// right bucket. Each returned slice has len(point) elements. A nil tree is
// returned for an empty point.
func Buckets(point []float64, min, max, bucketWidth, eps float64) (tree [][]int) {
	for _, val := range point {
		left := int((val - eps) / bucketWidth)
		right := int((val + eps) / bucketWidth)

		if val-eps < min { // No bucket for smaller than min.
			left = right
		} else if val+eps > max { // No bucket for larger than max.
			right = left
		}

		// First component: start the tree with one or two roots.
		if len(tree) == 0 {
			tree = append(tree, []int{left})
			if left != right {
				tree = append(tree, []int{right})
			}
			continue
		}

		n := len(tree)
		if left != right { // Branching.
			for i := 0; i < n; i++ {
				// Copy the prefix into its own backing array. Sharing the
				// array with tree[i] would let the two appends below write
				// into the same slot whenever tree[i] has spare capacity,
				// silently replacing the left bucket by the right one.
				branch := make([]int, len(tree[i]), len(tree[i])+1)
				copy(branch, tree[i])
				tree = append(tree, append(branch, right))
			}
		}
		for i := 0; i < n; i++ {
			tree[i] = append(tree[i], left)
		}
	}

	// Verification that branching works correctly and no buckets are lost.
	// TODO: Disable once whole package got tested on large image sets.
	for i := range tree {
		if len(tree[i]) != len(point) {
			panic(`Buckets slice length must be equal to len(point).`)
		}
	}

	return tree
}
|
|
@ -1,46 +0,0 @@
|
|||
package hyper
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParams(t *testing.T) {
|
||||
numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25
|
||||
bucketWidth, eps := Params(numBuckets, min, max, bucketPct)
|
||||
wantBucketWidth, wantEps := 25.5, 6.375
|
||||
if bucketWidth != wantBucketWidth {
|
||||
t.Errorf(`Got bucketWidth %v, want %v`, bucketWidth, wantBucketWidth)
|
||||
}
|
||||
if eps != wantEps {
|
||||
t.Errorf(`Got eps %v, want %v`, eps, wantEps)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParamsPanic(t *testing.T) {
|
||||
defer func() { recover() }()
|
||||
// Intentionally forbiden value for bucketPct.
|
||||
numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.51
|
||||
_, _ = Params(numBuckets, min, max, bucketPct)
|
||||
// Never reaches here if Params panics.
|
||||
t.Errorf("Params did not panic on bucketPct > 0.5")
|
||||
}
|
||||
|
||||
func TestBuckets(t *testing.T) {
|
||||
numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25
|
||||
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
|
||||
bucketWidth, eps := Params(numBuckets, min, max, bucketPct)
|
||||
got := Buckets(values, min, max, bucketWidth, eps)
|
||||
want := [][]int{{0, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9},
|
||||
{0, 0, 8, 3, 0, 0, 9}, {1, 0, 8, 3, 0, 0, 9}}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Errorf(`Got %v, want %v. Number of buckets is %v.`, got, want, numBuckets)
|
||||
}
|
||||
|
||||
values = []float64{0.01, bucketWidth * 2 * 0.999, bucketWidth * 2 * 1.001}
|
||||
got = Buckets(values, min, max, bucketWidth, eps)
|
||||
want = [][]int{{0, 1, 1}, {0, 2, 1}, {0, 1, 2}, {0, 2, 2}}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Errorf(`Got %v, want %v. Number of buckets is %v.`, got, want, numBuckets)
|
||||
}
|
||||
}
|
|
@ -9,9 +9,8 @@ import (
|
|||
// Hash64 can be any function of this kind.
|
||||
type Hash64 func(buckets []int) uint64
|
||||
|
||||
// Default is the default Hash64 function for this package.
|
||||
// It returns a FNV-1a hash for a slice of bucket numbers.
|
||||
func Default(buckets []int) uint64 {
|
||||
// FVN1a is the default hash in this package.
|
||||
func FVN1a(buckets []int) uint64 {
|
||||
var b bytes.Buffer
|
||||
gob.NewEncoder(&b).Encode(buckets)
|
||||
hash := fnv.New64a()
|
||||
|
|
|
@ -7,7 +7,7 @@ import (
|
|||
|
||||
func TestDefault(t *testing.T) {
|
||||
buckets := []int{5, 59, 255, 9, 7, 12, 22, 31}
|
||||
hash := Default(buckets)
|
||||
hash := FVN1a(buckets)
|
||||
want := uint64(13992349377752315208)
|
||||
if hash != want {
|
||||
t.Errorf(`Got %v, want %v`, hash, want)
|
||||
|
@ -20,7 +20,7 @@ func TestHashes64(t *testing.T) {
|
|||
{1, 0, 7, 3, 0, 0, 9},
|
||||
{0, 0, 8, 3, 0, 0, 9},
|
||||
{1, 0, 8, 3, 0, 0, 9}}
|
||||
hs := Hashes64(tree, Default)
|
||||
hs := Hashes64(tree, FVN1a)
|
||||
want := []uint64{
|
||||
14647827280143437043,
|
||||
17530493565529410009,
|
||||
|
|
|
@ -0,0 +1,117 @@
|
|||
package hyper
|
||||
|
||||
// Params returns discretization parameters.
//
// numBuckets represents the number of discretization buckets into
// which all values will fall. Ids of those buckets will be used
// to create hashes.
// min and max are the minimum and maximum possible values
// of the discretized variable.
// bucketPct is the fraction of bucketWidth allowed as an error
// of the discretized variable (a specific value of a discretized
// variable may fall into 2 buckets simultaneously). Params panics
// if bucketPct is 0.5 or more.
//
// It returns bucketWidth, the width of the discretization bucket,
// and eps, the width corresponding to bucketPct of bucketWidth
// on the discretized variable axis.
func Params(
	numBuckets int, min, max, bucketPct float64) (bucketWidth, eps float64) {
	if bucketPct >= 0.5 {
		// Keep the message on one line: a raw multi-line literal would
		// embed the newline and the source indentation in the panic text.
		panic("Error: bucketPct must be less than 50%. " +
			"Recommendation: decrease numBuckets instead.")
	}
	bucketWidth = (max - min) / float64(numBuckets)
	eps = bucketPct * bucketWidth
	return bucketWidth, eps
}
|
||||
|
||||
// Hypercubes returns a set of hypercubes, which represent
// fuzzy discretization of one n-dimensional vector, as described in
// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
// One hypercube is defined by bucket numbers in each dimension.
// The function also returns the central hypercube (in which
// the vector end is located).
//
// min and max are minimum and maximum possible values of
// the vector components. The assumption is that min and max
// are the same for all dimensions.
// bucketWidth and eps are defined in the Params function.
//
// For every component lying closer than eps to a bucket border (and
// whose eps-neighbourhood stays within [min, max]) the set built so far
// is duplicated: the first half continues with the central bucket, the
// second half with the neighbouring (side) bucket. Both results are nil
// for an empty vector.
//
// NOTE(review): bucket ids are computed as int(val / bucketWidth), i.e.
// without subtracting min — confirm this is intended for non-zero min.
func Hypercubes(
	vector []float64, min, max, bucketWidth, eps float64) (
	set [][]int, central []int) {

	// For each component of the vector.
	for _, val := range vector {
		bC := int(val / bucketWidth) // Central bucket id.
		central = append(central, bC)

		var bS int // Side bucket id, meaningful only when branching.
		branching := false
		switch {
		// Value is in the lower uncertainty interval.
		case val-float64(bC)*bucketWidth < eps:
			bS = bC - 1
			branching = val-eps >= min
		// Value is in the upper uncertainty interval.
		case float64(bC+1)*bucketWidth-val < eps:
			bS = bC + 1
			branching = val+eps <= max
		}

		// First component: start the set with one or two cubes.
		if len(set) == 0 {
			set = append(set, []int{bC})
			if branching {
				set = append(set, []int{bS})
			}
			continue
		}

		n := len(set)
		if branching {
			for i := 0; i < n; i++ {
				// Give the side branch its own backing array. A shallow
				// copy would share the array with set[i], and whenever
				// set[i] has spare capacity the side bucket would
				// overwrite the central bucket appended below.
				side := make([]int, len(set[i]), len(set[i])+1)
				copy(side, set[i])
				set = append(set, append(side, bS))
			}
		}
		for i := 0; i < n; i++ {
			set[i] = append(set[i], bC)
		}
	}

	// Real use case verification that branching works correctly
	// and no buckets are lost for a very large number of vectors.
	// TODO: Remove once tested.
	for i := range set {
		if len(set[i]) != len(vector) {
			panic(`Number of hypercube coordinates must equal to len(vector).`)
		}
	}

	return set, central
}
|
|
@ -0,0 +1,75 @@
|
|||
package hyper
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParams(t *testing.T) {
|
||||
numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25
|
||||
bucketWidth, eps := Params(numBuckets, min, max, bucketPct)
|
||||
wantBucketWidth, wantEps := 25.5, 6.375
|
||||
if bucketWidth != wantBucketWidth {
|
||||
t.Errorf(`Got bucketWidth %v, want %v`, bucketWidth, wantBucketWidth)
|
||||
}
|
||||
if eps != wantEps {
|
||||
t.Errorf(`Got eps %v, want %v`, eps, wantEps)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParamsPanic(t *testing.T) {
|
||||
defer func() { recover() }()
|
||||
// Intentionally forbiden value for bucketPct.
|
||||
numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.51
|
||||
_, _ = Params(numBuckets, min, max, bucketPct)
|
||||
// Never reaches here if Params panics.
|
||||
t.Errorf("Params did not panic on bucketPct > 0.5")
|
||||
}
|
||||
|
||||
func TestHypercubes(t *testing.T) {
|
||||
numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25
|
||||
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
|
||||
bucketWidth, eps := Params(numBuckets, min, max, bucketPct)
|
||||
gotCubes, gotCentral := Hypercubes(values, min, max, bucketWidth, eps)
|
||||
wantCubes := [][]int{{1, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9},
|
||||
{1, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9}}
|
||||
wantCentral := []int{1, 0, 8, 3, 0, 0, 9}
|
||||
if !reflect.DeepEqual(gotCubes, wantCubes) {
|
||||
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
|
||||
}
|
||||
if !reflect.DeepEqual(gotCentral, wantCentral) {
|
||||
t.Errorf(`Got %v, want %v.`, gotCentral, wantCentral)
|
||||
}
|
||||
if centralIsNotInTheSet(gotCubes, gotCentral) {
|
||||
t.Errorf(`Central %v is not in the set %v.`, gotCentral, gotCubes)
|
||||
}
|
||||
|
||||
values = []float64{0.01, bucketWidth * 2 * 0.999, bucketWidth * 2 * 1.001}
|
||||
gotCubes, gotCentral = Hypercubes(values, min, max, bucketWidth, eps)
|
||||
wantCubes = [][]int{{0, 1, 2}, {0, 2, 2}, {0, 1, 1}, {0, 2, 1}}
|
||||
wantCentral = []int{0, 1, 2}
|
||||
if !reflect.DeepEqual(gotCubes, wantCubes) {
|
||||
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
|
||||
}
|
||||
if !reflect.DeepEqual(gotCentral, wantCentral) {
|
||||
t.Errorf(`Got %v, want %v.`, gotCentral, wantCentral)
|
||||
}
|
||||
if centralIsNotInTheSet(gotCubes, wantCentral) {
|
||||
t.Errorf(`Central %v is not in the set %v.`, gotCentral, gotCubes)
|
||||
}
|
||||
}
|
||||
|
||||
// centralIsNotInTheSet reports whether no cube in set matches central
// on every coordinate of central. It returns false as soon as one cube
// matches, and true when the set is empty or nothing matches.
func centralIsNotInTheSet(set [][]int, central []int) bool {
	for _, cube := range set {
		same := true
		// Every coordinate is visited, even after a mismatch.
		for i := range central {
			if cube[i] != central[i] {
				same = false
			}
		}
		if same {
			return false
		}
	}
	return true
}
|
Loading…
Reference in New Issue