Hash selection for database is incorrect #1

2021-12-15 05:21:55 +01:00 · 2021-12-15 05:21:55 +01:00 · b2298602df
parent 20cd3d83f1
commit b2298602df
9 changed files with 225 additions and 157 deletions
--- a/4
+++ b/4
@ -1,4 +0,0 @@
- Panic in verification that 2*eps is less than bucket width.
-  Make sure the "if" equality is correct.
-
- Make sure hash contains ":" between bucket numbers. Or the analogy to ":".
--- a/README.md
+++ b/README.md
@ -0,0 +1,17 @@
+# Hashing float vectors in N-dimensions
+
+This is a working beta version.
+
+### Algorithm
+
+https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
+
+### How to use
+
+about.go contains short usage instructions.
+
+Please fork if you want to use it long term, as the API is not finalized yet.
+
+### TODO
+
+Add a usage example, e.g. one related to the package "images".
--- a/about.go
+++ b/about.go
@ -1,4 +1,14 @@
 package hyper

-// Package hyper discretizes n-dimensional space and generates hashes,
-// so that fast approximate search of nearest points in n-space is possible.
+// Package hyper allows fast approximate search of nearest
+// neighbour vectors in n-dimensional space.
+// Package functions discretize a vector and generate a set
+// of fuzzy hashes, as described in the following paper:
+// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
+
+// A typical sequence of functions when using the package is:
+// 1) Params, 2) Hypercubes, 3) FVN1a to get the central hash,
+// and Hashes64 with FVN1a as the hash argument to get
+// the full hash set.
+
+// You can also define your own function for hashing hypercubes.
--- a/buckets.go
+++ b/buckets.go
@ -1,100 +0,0 @@
-package hyper
-
-// Params returns discretization parameters.
-// numBuckets represents number of discretization buckets into which all values
-// will fall. Ids of those buckets will be used to create hashes.
-// min and max are minimum and maximum possible values of discretized variable.
-// bucketWidth is width of the discretization bucket.
-// bucketPct is percentage of bucketWidth to allow for an error of discretized
-// variable (a specific value of a discretized variable may fall into 2 buckets
-// simultaneosly).
-// eps is actual width corresponding to the bucketWidth bucketPct on the discretized
-// variable axis.
-func Params(numBuckets int, min, max, bucketPct float64) (bucketWidth, eps float64) {
-	if bucketPct >= 0.5 {
-		panic("Error: bucketPct must be less than 50%. Recommendation: decrease numBuckets instead.")
-	}
-	bucketWidth = (max - min) / float64(numBuckets)
-	eps = bucketPct * bucketWidth
-	return bucketWidth, eps
-}
-
-// Buckets generates a set of slices of all possible bucket ids
-// as permutations based on n-dimensional space discretization.
-// point are values for each of those n dimensions.
-// min and max are minimum and maximum possible values of discretized
-// point components. The assumption is that min and max are the same for all
-// dimensions (in the context of the Buckets function).
-// bucketWidth and eps are defined in the Params function.
-func Buckets(point []float64, min, max, bucketWidth, eps float64) (tree [][]int) {
-
-	// Bucket ids. Default bucket is b.
-	var (
-		val      float64 // Sample value (one axis of n-space).
-		bL, bR   int     // Left and right bucket ids.
-		treeCopy [][]int // Bucket tree copy.
-		length   int
-	)
-
-	// For each component of the point.
-	for k := 0; k < len(point); k++ {
-		val = point[k]
-
-		bL = int((val - eps) / bucketWidth)
-		bR = int((val + eps) / bucketWidth)
-
-		if val-eps < min { // No bucket for smaller than min.
-			bL = bR
-		} else if val+eps > max { // No bucket for larger than max.
-			bR = bL
-		}
-
-		if bL == bR { // No branching.
-			if len(tree) == 0 {
-				tree = append(tree, []int{bL})
-			} else {
-				length = len(tree)
-				for i := 0; i < length; i++ {
-					// Constructing buckets set.
-					tree[i] = append(tree[i], bL)
-				}
-			}
-
-		} else { // Branching.
-			treeCopy = make([][]int, len(tree))
-			copy(treeCopy, tree)
-
-			if len(tree) == 0 {
-				tree = append(tree, []int{bL})
-			} else {
-				length = len(tree)
-				for i := 0; i < length; i++ {
-					tree[i] = append(tree[i], bL)
-				}
-			}
-
-			if len(treeCopy) == 0 {
-				treeCopy = append(treeCopy, []int{bR})
-			} else {
-				length = len(treeCopy)
-				for i := 0; i < length; i++ {
-					treeCopy[i] = append(treeCopy[i], bR)
-				}
-			}
-
-			tree = append(tree, treeCopy...)
-		}
-
-	}
-
-	// Verification that branching works correctly and no buckets are lost.
-	// TODO: Disable once whole package got tested on large image sets.
-	length = len(point)
-	for i := 0; i < len(tree); i++ {
-		if len(tree[i]) != length {
-			panic(`Buckets slice length must be equal to len(point).`)
-		}
-	}
-
-	return tree
-}
--- a/buckets_test.go
+++ b/buckets_test.go
@ -1,46 +0,0 @@
-package hyper
-
-import (
-	"reflect"
-	"testing"
-)
-
-func TestParams(t *testing.T) {
-	numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25
-	bucketWidth, eps := Params(numBuckets, min, max, bucketPct)
-	wantBucketWidth, wantEps := 25.5, 6.375
-	if bucketWidth != wantBucketWidth {
-		t.Errorf(`Got bucketWidth %v, want %v`, bucketWidth, wantBucketWidth)
-	}
-	if eps != wantEps {
-		t.Errorf(`Got eps %v, want %v`, eps, wantEps)
-	}
-}
-
-func TestParamsPanic(t *testing.T) {
-	defer func() { recover() }()
-	// Intentionally forbiden value for bucketPct.
-	numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.51
-	_, _ = Params(numBuckets, min, max, bucketPct)
-	// Never reaches here if Params panics.
-	t.Errorf("Params did not panic on bucketPct > 0.5")
-}
-
-func TestBuckets(t *testing.T) {
-	numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25
-	values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
-	bucketWidth, eps := Params(numBuckets, min, max, bucketPct)
-	got := Buckets(values, min, max, bucketWidth, eps)
-	want := [][]int{{0, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9},
-		{0, 0, 8, 3, 0, 0, 9}, {1, 0, 8, 3, 0, 0, 9}}
-	if !reflect.DeepEqual(got, want) {
-		t.Errorf(`Got %v, want %v. Number of buckets is %v.`, got, want, numBuckets)
-	}
-
-	values = []float64{0.01, bucketWidth * 2 * 0.999, bucketWidth * 2 * 1.001}
-	got = Buckets(values, min, max, bucketWidth, eps)
-	want = [][]int{{0, 1, 1}, {0, 2, 1}, {0, 1, 2}, {0, 2, 2}}
-	if !reflect.DeepEqual(got, want) {
-		t.Errorf(`Got %v, want %v. Number of buckets is %v.`, got, want, numBuckets)
-	}
-}
--- a/hashes.go
+++ b/hashes.go
@ -9,9 +9,8 @@ import (
 // Hash64 can be any function of this kind.
 type Hash64 func(buckets []int) uint64

-// Default is the default Hash64 function for this package.
-// It returns a FVN-1a hash for a slice of bucket numbers.
-func Default(buckets []int) uint64 {
+// FVN1a is the default hash function in this package (64-bit FNV-1a).
+func FVN1a(buckets []int) uint64 {
 	var b bytes.Buffer
 	gob.NewEncoder(&b).Encode(buckets)
 	hash := fnv.New64a()
--- a/hashes_test.go
+++ b/hashes_test.go
@ -7,7 +7,7 @@ import (

 func TestDefault(t *testing.T) {
 	buckets := []int{5, 59, 255, 9, 7, 12, 22, 31}
-	hash := Default(buckets)
+	hash := FVN1a(buckets)
 	want := uint64(13992349377752315208)
 	if hash != want {
 		t.Errorf(`Got %v, want %v`, hash, want)
@ -20,7 +20,7 @@ func TestHashes64(t *testing.T) {
 		{1, 0, 7, 3, 0, 0, 9},
 		{0, 0, 8, 3, 0, 0, 9},
 		{1, 0, 8, 3, 0, 0, 9}}
-	hs := Hashes64(tree, Default)
+	hs := Hashes64(tree, FVN1a)
 	want := []uint64{
 		14647827280143437043,
 		17530493565529410009,
--- a/hypercubes.go
+++ b/hypercubes.go
@ -0,0 +1,117 @@
+package hyper
+
+// Params returns discretization parameters.
+// numBuckets represents number of discretization buckets into
+// which all values will fall. Ids of those buckets will be used
+// to create hashes.
+// min and max are minimum and maximum possible values
+// of discretized variable.
+// bucketWidth is width of the discretization bucket.
+// bucketPct is percentage of bucketWidth to allow for an error
+// of discretized variable (a specific value of a discretized
+// variable may fall into 2 buckets simultaneously).
+// eps is the actual width, equal to bucketPct times bucketWidth,
+// on the discretized variable axis.
+func Params(
+	numBuckets int, min, max, bucketPct float64) (bucketWidth, eps float64) {
+	if bucketPct >= 0.5 {
+		panic(`Error: bucketPct must be less than 50%.
+			Recommendation: decrease numBuckets instead.`)
+	}
+	bucketWidth = (max - min) / float64(numBuckets)
+	eps = bucketPct * bucketWidth
+	return bucketWidth, eps
+}
+
+// Hypercubes returns a set of hypercubes, which represent
+// fuzzy discretization of one n-dimensional vector, as described in
+// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
+// One hypercube is defined by bucket numbers in each dimension.
+// The function also returns the central hypercube (in which
+// the vector end is located).
+// min and max are minimum and maximum possible values of
+// the vector components. The assumption is that min and max
+// are the same for all dimensions.
+// bucketWidth and eps are defined in the Params function.
+func Hypercubes(
+	vector []float64, min, max, bucketWidth, eps float64) (
+	set [][]int, central []int) {
+
+	var (
+		bC, bS    int     // Central and side bucket ids.
+		setCopy   [][]int // Set copy.
+		length    int
+		branching bool // Branching flag.
+	)
+
+	// For each component of the vector.
+	for _, val := range vector {
+
+		bC = int(val / bucketWidth)
+		central = append(central, bC)
+		branching = false
+
+		// Value is in the lower uncertainty interval.
+		if val-float64(bC)*bucketWidth < eps {
+			bS = bC - 1
+			if val-eps >= min {
+				branching = true
+			}
+
+			// Value is in the upper uncertainty interval.
+		} else if float64(bC+1)*bucketWidth-val < eps {
+			bS = bC + 1
+			if val+eps <= max {
+				branching = true
+			}
+		}
+
+		if branching {
+			setCopy = make([][]int, len(set))
+			copy(setCopy, set)
+
+			if len(set) == 0 {
+				set = append(set, []int{bC})
+			} else {
+				length = len(set)
+				for i := 0; i < length; i++ {
+					set[i] = append(set[i], bC)
+				}
+			}
+
+			if len(setCopy) == 0 {
+				setCopy = append(setCopy, []int{bS})
+			} else {
+				length = len(setCopy)
+				for i := 0; i < length; i++ {
+					setCopy[i] = append(setCopy[i], bS)
+				}
+			}
+
+			set = append(set, setCopy...)
+
+		} else {
+
+			if len(set) == 0 {
+				set = append(set, []int{bC})
+			} else {
+				length = len(set)
+				for i := 0; i < length; i++ {
+					set[i] = append(set[i], bC)
+				}
+			}
+		}
+	}
+
+	// Real use case verification that branching works correctly
+	// and no buckets are lost for a very large number of vectors.
+	// TODO: Remove once tested.
+	length = len(vector)
+	for i := 0; i < len(set); i++ {
+		if len(set[i]) != length {
+			panic(`Number of hypercube coordinates must equal to len(vector).`)
+		}
+	}
+
+	return set, central
+}
--- a/hypercubes_test.go
+++ b/hypercubes_test.go
@ -0,0 +1,75 @@
+package hyper
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestParams(t *testing.T) {
+	numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25
+	bucketWidth, eps := Params(numBuckets, min, max, bucketPct)
+	wantBucketWidth, wantEps := 25.5, 6.375
+	if bucketWidth != wantBucketWidth {
+		t.Errorf(`Got bucketWidth %v, want %v`, bucketWidth, wantBucketWidth)
+	}
+	if eps != wantEps {
+		t.Errorf(`Got eps %v, want %v`, eps, wantEps)
+	}
+}
+
+func TestParamsPanic(t *testing.T) {
+	defer func() { recover() }()
+	// Intentionally forbidden value for bucketPct.
+	numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.51
+	_, _ = Params(numBuckets, min, max, bucketPct)
+	// Never reaches here if Params panics.
+	t.Errorf("Params did not panic on bucketPct > 0.5")
+}
+
+func TestHypercubes(t *testing.T) {
+	numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25
+	values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
+	bucketWidth, eps := Params(numBuckets, min, max, bucketPct)
+	gotCubes, gotCentral := Hypercubes(values, min, max, bucketWidth, eps)
+	wantCubes := [][]int{{1, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9},
+		{1, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9}}
+	wantCentral := []int{1, 0, 8, 3, 0, 0, 9}
+	if !reflect.DeepEqual(gotCubes, wantCubes) {
+		t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
+	}
+	if !reflect.DeepEqual(gotCentral, wantCentral) {
+		t.Errorf(`Got %v, want %v.`, gotCentral, wantCentral)
+	}
+	if centralIsNotInTheSet(gotCubes, gotCentral) {
+		t.Errorf(`Central %v is not in the set %v.`, gotCentral, gotCubes)
+	}
+
+	values = []float64{0.01, bucketWidth * 2 * 0.999, bucketWidth * 2 * 1.001}
+	gotCubes, gotCentral = Hypercubes(values, min, max, bucketWidth, eps)
+	wantCubes = [][]int{{0, 1, 2}, {0, 2, 2}, {0, 1, 1}, {0, 2, 1}}
+	wantCentral = []int{0, 1, 2}
+	if !reflect.DeepEqual(gotCubes, wantCubes) {
+		t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
+	}
+	if !reflect.DeepEqual(gotCentral, wantCentral) {
+		t.Errorf(`Got %v, want %v.`, gotCentral, wantCentral)
+	}
+	if centralIsNotInTheSet(gotCubes, wantCentral) {
+		t.Errorf(`Central %v is not in the set %v.`, gotCentral, gotCubes)
+	}
+}
+
+func centralIsNotInTheSet(set [][]int, central []int) bool {
+	for _, cube := range set {
+		counter := 0
+		for i, c := range central {
+			if cube[i] == c {
+				counter++
+			}
+		}
+		if counter == len(central) {
+			return false
+		}
+	}
+	return true
+}