From 79a4252757190d6d4804dee478147a50c67df150 Mon Sep 17 00:00:00 2001
From: Vitali Fedulov <fedulov.vitali@gmail.com>
Date: Mon, 20 Dec 2021 21:43:47 +0100
Subject: [PATCH] major changes and fixes

---
 README.md          |   2 +-
 about.go           |   4 +-
 hypercubes.go      | 131 +++++++++++++++++++++++++++++----------------
 hypercubes_test.go |  57 ++++++++++----------
 4 files changed, 119 insertions(+), 75 deletions(-)

diff --git a/README.md b/README.md
index da23a46..4ee5a14 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Hashing float vectors in N-dimensions
 
-CODE IS UNDER CONSTRUCTION: still there are some bugs. Should be ready within a few days.
+UNDER CONSTRUCTION: there are still some bugs. It should be ready within a few days.
 
 ### Algorithm
 
diff --git a/about.go b/about.go
index 710acad..a849e3b 100644
--- a/about.go
+++ b/about.go
@@ -7,9 +7,9 @@ package hyper
 // https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
 
 // To use the package follow the sequence:
-// 1) Params, 2) CubeSet or CentralCube, depending which one
+// 1) CubeSet or CentralCube, depending which one
 // is used for a database record and which one for a query,
-// 3) HashSet and Decimal to get corresponding hash set
+// 2) HashSet and Decimal to get corresponding hash set
 // and central hash from results of (2). If Decimal hash
 // is not suitable because of very large number of buckets
 // or dimensions,  use FNV1a to get both the hash set and
diff --git a/hypercubes.go b/hypercubes.go
index ceb1a51..4407a7a 100644
--- a/hypercubes.go
+++ b/hypercubes.go
@@ -1,21 +1,15 @@
 package hyper
 
-// Params helps with discretization parameters.
-// numBuckets is number of buckets per dimension.
-// min and max are value limits per dimension.
-// epsPercent is the uncertainty interval expressed as fraction
-// of bucketWidth.
-// eps is the absolute value of the uncertainty interval epsilon.
-func Params(
-	numBuckets int, min, max, epsPercent float64) (
-	bucketWidth, eps float64) {
-	if epsPercent >= 0.5 {
-		panic(`Error: epsPercent must be less than 50%.
-			Recommendation: decrease numBuckets instead.`)
+// rescale is a helper function to offset and rescale all values
+// to [0, numBuckets] range.
+func rescale(vector []float64, numBuckets int, min, max float64) []float64 {
+	rescaled := make([]float64, len(vector))
+	amp := max - min
+	for i := range vector {
+		// Offset to zero and rescale to [0, numBuckets] range.
+		rescaled[i] = (vector[i] - min) * float64(numBuckets) / amp
 	}
-	bucketWidth = (max - min) / float64(numBuckets)
-	eps = epsPercent * bucketWidth
-	return bucketWidth, eps
+	return rescaled
 }
 
 // CubeSet returns a set of hypercubes, which represent
@@ -26,39 +20,71 @@ func Params(
 // min and max are minimum and maximum possible values of
 // the vector components. The assumption is that min and max
 // are the same for all dimensions.
-// bucketWidth and eps are defined in the Params function.
-func CubeSet(
-	vector []float64, min, max, bucketWidth, eps float64) (
-	set [][]int) {
+// numBuckets is number of buckets per dimension.
+// min and max are value limits per dimension.
+// epsPercent is the uncertainty interval expressed as
+// a fraction of bucketWidth (for example 0.25 for eps = 1/4
+// of bucketWidth). It must be less than 0.5, else CubeSet panics.
+func CubeSet(vector []float64, min, max, epsPercent float64,
+	numBuckets int) (set [][]int) {
+
+	if epsPercent >= 0.5 {
+		panic(`Error: epsPercent must be less than 0.5.`)
+	}
 
 	var (
-		bC, bS    int     // Central and side bucket ids.
+		bC, bS    int     // Central and side bucket number.
+		bL, bR    int     // Left and right bucket number.
 		setCopy   [][]int // Set copy.
 		length    int
 		branching bool // Branching flag.
 	)
 
-	// For each component of the vector.
-	for _, val := range vector {
+	// Rescaling vector to avoid potential mistakes with
+	// divisions and offsets later on.
+	rescaled := rescale(vector, numBuckets, min, max)
+	// After rescaling, the value range of the vector is
+	// [0, numBuckets], and not [min, max].
+
+	// min = 0.0 from now on.
+	max = float64(numBuckets)
+
+	for _, val := range rescaled {
 
-		bC = int(val / bucketWidth)
 		branching = false
 
-		// Value is in the lower uncertainty interval.
-		if val-float64(bC)*bucketWidth < eps {
-			bS = bC - 1
-			if val-eps > min {
-				branching = true
-			}
+		bL = int(val - epsPercent)
+		bR = int(val + epsPercent)
 
-			// Value is in the upper uncertainty interval.
-		} else if float64(bC+1)*bucketWidth-val < eps {
-			bS = bC + 1
-			if val+eps < max {
-				branching = true
-			}
+		// Get extreme values out of the way.
+		if val-epsPercent <= 0.0 { // Near the lower limit (val >= 0 is assumed).
+			bC = bR
+			goto branchingCheck // No branching.
 		}
 
+		// Get extreme values out of the way.
+		if val+epsPercent >= max { // Near the upper limit (val <= max is assumed).
+			// Above max = numBuckets.
+			bC = bL
+			goto branchingCheck // No branching.
+		}
+
+		if bL == bR {
+			bC = bL
+			goto branchingCheck // No branching.
+
+		} else { // Meaning bL != bR and not any condition above.
+			bC = int(val)
+			if bL == bC {
+				bS = bR // So we have bC, have not lost bL, and get bR.
+			} else { // That is when bL != bC
+				bS = bL // So we have bC and bL; since only 2 buckets
+				// are possible here, bC is our bR (bR is not lost).
+			}
+			branching = true
+		}
+
+	branchingCheck:
 		if branching {
 			setCopy = make([][]int, len(set))
 			copy(setCopy, set)
@@ -84,7 +110,6 @@ func CubeSet(
 			set = append(set, setCopy...)
 
 		} else {
-
 			if len(set) == 0 {
 				set = append(set, []int{bC})
 			} else {
@@ -112,17 +137,33 @@ func CubeSet(
 
 // CentralCube returns the hypercube containing the vector end.
 // Arguments are the same as for the CubeSet function.
-func CentralCube(
-	vector []float64, min, max, bucketWidth, eps float64) (
-	central []int) {
+func CentralCube(vector []float64, min, max, epsPercent float64,
+	numBuckets int) (central []int) {
 
-	var bC int // Central bucket ids.
-
-	// For each component of the vector.
-	for _, val := range vector {
-		bC = int(val / bucketWidth)
-		central = append(central, bC)
+	if epsPercent >= 0.5 {
+		panic(`Error: epsPercent must be less than 0.5.`)
 	}
 
+	var bC int // Central bucket numbers.
+
+	// Rescaling vector to avoid potential mistakes with
+	// divisions and offsets later on.
+	rescaled := rescale(vector, numBuckets, min, max)
+	// After rescaling, the value range of the vector is
+	// [0, numBuckets], and not [min, max].
+
+	// min = 0.0 from now on.
+	max = float64(numBuckets)
+
+	for _, val := range rescaled {
+		bC = int(val)
+		if val-epsPercent <= 0.0 { // Near the lower limit (val >= 0 is assumed).
+			bC = int(val + epsPercent)
+		}
+		if val+epsPercent >= max { // Near the upper limit (val <= max is assumed).
+			bC = int(val - epsPercent)
+		}
+		central = append(central, bC)
+	}
 	return central
 }
diff --git a/hypercubes_test.go b/hypercubes_test.go
index 43f265c..f11626c 100644
--- a/hypercubes_test.go
+++ b/hypercubes_test.go
@@ -5,33 +5,35 @@ import (
 	"testing"
 )
 
-func TestParams(t *testing.T) {
-	numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.25
-	bucketWidth, eps := Params(numBuckets, min, max, epsPercent)
-	wantBucketWidth, wantEps := 25.5, 6.375
-	if bucketWidth != wantBucketWidth {
-		t.Errorf(`Got bucketWidth %v, want %v.`, bucketWidth, wantBucketWidth)
-	}
-	if eps != wantEps {
-		t.Errorf(`Got eps %v, want %v.`, eps, wantEps)
+func TestRescale(t *testing.T) { // Testing rescaling to [0, numBuckets].
+	numBuckets, min, max, _ := 10, 0.0, 255.0, 0.25
+	vector := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 255.0}
+	rescaled := rescale(vector, numBuckets, min, max)
+	got := rescaled
+	want := []float64{
+		1, 0.0003921568627450981, 8.24705882352941,
+		3.6823529411764704, 0.25882352941176473,
+		0.3568627450980392, 10}
+	if !reflect.DeepEqual(got, want) {
+		t.Errorf(`Got %v, want %v.`, got, want)
 	}
 }
 
-func TestParamsPanic(t *testing.T) {
+func TestCubeSet1(t *testing.T) { // Testing panic.
 	defer func() { recover() }()
 	// Intentionally forbiden value for epsPercent.
-	numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.51
-	_, _ = Params(numBuckets, min, max, epsPercent)
+	values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
+	min, max, epsPercent, numBuckets := 0.0, 255.0, 0.51, 10
+	_ = CubeSet(values, min, max, epsPercent, numBuckets)
 	// Never reaches here if Params panics.
 	t.Errorf("Params did not panic on epsPercent > 0.5")
 }
 
-func TestHypercubes1(t *testing.T) {
+func TestCubeSet2(t *testing.T) {
 	numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.25
 	values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
-	bucketWidth, eps := Params(numBuckets, min, max, epsPercent)
-	gotCubes := CubeSet(values, min, max, bucketWidth, eps)
-	gotCentral := CentralCube(values, min, max, bucketWidth, eps)
+	gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
+	gotCentral := CentralCube(values, min, max, epsPercent, numBuckets)
 	wantCubes := [][]int{{1, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9},
 		{1, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9}}
 	wantCentral := []int{1, 0, 8, 3, 0, 0, 9}
@@ -44,12 +46,16 @@ func TestHypercubes1(t *testing.T) {
 	if centralIsNotInTheSet(gotCubes, gotCentral) {
 		t.Errorf(`Central %v is not in the set %v.`, gotCentral, gotCubes)
 	}
+}
 
-	values = []float64{0.01, bucketWidth * 2 * 0.999, bucketWidth * 2 * 1.001}
-	gotCubes = CubeSet(values, min, max, bucketWidth, eps)
-	gotCentral = CentralCube(values, min, max, bucketWidth, eps)
-	wantCubes = [][]int{{0, 1, 2}, {0, 2, 2}, {0, 1, 1}, {0, 2, 1}}
-	wantCentral = []int{0, 1, 2}
+// Testing bucket borders.
+func TestCubeSet3(t *testing.T) {
+	numBuckets, min, max, epsPercent := 4, 0.0, 4.0, 0.25
+	values := []float64{0.01, 2 * 0.999, 2 * 1.001}
+	gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
+	gotCentral := CentralCube(values, min, max, epsPercent, numBuckets)
+	wantCubes := [][]int{{0, 1, 2}, {0, 2, 2}, {0, 1, 1}, {0, 2, 1}}
+	wantCentral := []int{0, 1, 2}
 	if !reflect.DeepEqual(gotCubes, wantCubes) {
 		t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
 	}
@@ -62,14 +68,11 @@ func TestHypercubes1(t *testing.T) {
 }
 
 // Testing extreme buckets.
-func TestHypercubes2(t *testing.T) {
+func TestCubeSet4(t *testing.T) {
 	values := []float64{255.0, 0.0, 255.0, 0.0, 255.0, 0.0, 255.0}
 	numBuckets, min, max, epsPercent := 4, 0.0, 255.0, 0.25
-	bucketWidth, eps := Params(numBuckets, min, max, epsPercent)
-	gotCubes := CubeSet(values, min, max, bucketWidth, eps)
-	wantCubes := [][]int{{1, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9},
-		{1, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9}}
-	t.Error(bucketWidth, eps)
+	gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
+	wantCubes := [][]int{{3, 0, 3, 0, 3, 0, 3}}
 	if !reflect.DeepEqual(gotCubes, wantCubes) {
 		t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
 	}