From 79a4252757190d6d4804dee478147a50c67df150 Mon Sep 17 00:00:00 2001 From: Vitali Fedulov Date: Mon, 20 Dec 2021 21:43:47 +0100 Subject: [PATCH] major changes and fixes --- README.md | 2 +- about.go | 4 +- hypercubes.go | 131 +++++++++++++++++++++++++++++---------------- hypercubes_test.go | 57 ++++++++++---------- 4 files changed, 119 insertions(+), 75 deletions(-) diff --git a/README.md b/README.md index da23a46..4ee5a14 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Hashing float vectors in N-dimensions -CODE IS UNDER CONSTRUCTION: still there are some bugs. Should be ready within a few days. +UNDER CONSTRUCTION: there are still some bugs. It should be ready within a few days. ### Algorithm diff --git a/about.go b/about.go index 710acad..a849e3b 100644 --- a/about.go +++ b/about.go @@ -7,9 +7,9 @@ package hyper // https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html // To use the package follow the sequence: -// 1) Params, 2) CubeSet or CentralCube, depending which one +// 1) CubeSet or CentralCube, depending on which one // is used for a database record and which one for a query, -// 3) HashSet and Decimal to get corresponding hash set +// 2) HashSet and Decimal to get corresponding hash set // and central hash from results of (2). If Decimal hash // is not suitable because of very large number of buckets // or dimensions, use FNV1a to get both the hash set and diff --git a/hypercubes.go b/hypercubes.go index ceb1a51..4407a7a 100644 --- a/hypercubes.go +++ b/hypercubes.go @@ -1,21 +1,15 @@ package hyper -// Params helps with discretization parameters. -// numBuckets is number of buckets per dimension. -// min and max are value limits per dimension. -// epsPercent is the uncertainty interval expressed as fraction -// of bucketWidth. -// eps is the absolute value of the uncertainty interval epsilon.
-func Params( - numBuckets int, min, max, epsPercent float64) ( - bucketWidth, eps float64) { - if epsPercent >= 0.5 { - panic(`Error: epsPercent must be less than 50%. - Recommendation: decrease numBuckets instead.`) +// rescale is a helper function to offset and rescale all values +// to [0, numBuckets] range. +func rescale(vector []float64, numBuckets int, min, max float64) []float64 { + rescaled := make([]float64, len(vector)) + amp := max - min + for i := range vector { + // Offset to zero and rescale to [0, numBuckets] range. + rescaled[i] = (vector[i] - min) * float64(numBuckets) / amp } - bucketWidth = (max - min) / float64(numBuckets) - eps = epsPercent * bucketWidth - return bucketWidth, eps + return rescaled } // CubeSet returns a set of hypercubes, which represent @@ -26,39 +20,71 @@ func Params( // min and max are minimum and maximum possible values of // the vector components. The assumption is that min and max // are the same for all dimensions. -// bucketWidth and eps are defined in the Params function. -func CubeSet( - vector []float64, min, max, bucketWidth, eps float64) ( - set [][]int) { +// numBuckets is number of buckets per dimension. +// min and max are value limits per dimension. +// epsPercent is the uncertainty interval expressed as +// a fraction of bucketWidth (for example 0.25 for eps = 1/4 +// of bucketWidth). +func CubeSet(vector []float64, min, max, epsPercent float64, + numBuckets int) (set [][]int) { + + if epsPercent >= 0.5 { + panic(`Error: epsPercent must be less than 0.5.`) + } var ( - bC, bS int // Central and side bucket ids. + bC, bS int // Central and side bucket number. + bL, bR int // Left and right bucket number. setCopy [][]int // Set copy. length int branching bool // Branching flag. ) - // For each component of the vector. - for _, val := range vector { + // Rescaling vector to avoid potential mistakes with + // divisions and offsets later on. 
+ rescaled := rescale(vector, numBuckets, min, max) + // After rescaling, the value range of the vector is + // [0, numBuckets], and not [min, max]. + + // min = 0.0 from now on. + max = float64(numBuckets) + + for _, val := range rescaled { - bC = int(val / bucketWidth) branching = false - // Value is in the lower uncertainty interval. - if val-float64(bC)*bucketWidth < eps { - bS = bC - 1 - if val-eps > min { - branching = true - } + bL = int(val - epsPercent) + bR = int(val + epsPercent) - // Value is in the upper uncertainty interval. - } else if float64(bC+1)*bucketWidth-val < eps { - bS = bC + 1 - if val+eps < max { - branching = true - } + // Get extreme values out of the way. + if val-epsPercent <= 0.0 { // Value is within epsPercent of the lower limit 0. + bC = bR + goto branchingCheck // No branching. } + // Get extreme values out of the way. + if val+epsPercent >= max { // Value is within epsPercent of the upper limit. + // Here max = numBuckets. + bC = bL + goto branchingCheck // No branching. + } + + if bL == bR { + bC = bL + goto branchingCheck // No branching. + + } else { // Meaning bL != bR and not any condition above. + bC = int(val) + if bL == bC { + bS = bR // So we have bC, have not lost bL, and get bR. + } else { // That is when bL != bC + bS = bL // So we have bC, have bL, and since only + // 2 buckets are possible, bC is our bR (bR not lost). + } + branching = true + } + + branchingCheck: if branching { setCopy = make([][]int, len(set)) copy(setCopy, set) @@ -84,7 +110,6 @@ func CubeSet( set = append(set, setCopy...) } else { - if len(set) == 0 { set = append(set, []int{bC}) } else { @@ -112,17 +137,33 @@ func CubeSet( // CentralCube returns the hypercube containing the vector end. // Arguments are the same as for the CubeSet function. -func CentralCube( - vector []float64, min, max, bucketWidth, eps float64) ( - central []int) { +func CentralCube(vector []float64, min, max, epsPercent float64, + numBuckets int) (central []int) { - var bC int // Central bucket ids.
- - // For each component of the vector. - for _, val := range vector { - bC = int(val / bucketWidth) - central = append(central, bC) + if epsPercent >= 0.5 { + panic(`Error: epsPercent must be less than 0.5.`) } + var bC int // Central bucket numbers. + + // Rescaling vector to avoid potential mistakes with + // divisions and offsets later on. + rescaled := rescale(vector, numBuckets, min, max) + // After rescaling, the value range of the vector is + // [0, numBuckets], and not [min, max]. + + // min = 0.0 from now on. + max = float64(numBuckets) + + for _, val := range rescaled { + bC = int(val) + if val-epsPercent <= 0.0 { // Value is within epsPercent of the lower limit 0. + bC = int(val + epsPercent) + } + if val+epsPercent >= max { // Value is within epsPercent of the upper limit max. + bC = int(val - epsPercent) + } + central = append(central, bC) + } return central } diff --git a/hypercubes_test.go b/hypercubes_test.go index 43f265c..f11626c 100644 --- a/hypercubes_test.go +++ b/hypercubes_test.go @@ -5,33 +5,35 @@ import ( "testing" ) -func TestParams(t *testing.T) { - numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.25 - bucketWidth, eps := Params(numBuckets, min, max, epsPercent) - wantBucketWidth, wantEps := 25.5, 6.375 - if bucketWidth != wantBucketWidth { - t.Errorf(`Got bucketWidth %v, want %v.`, bucketWidth, wantBucketWidth) - } - if eps != wantEps { - t.Errorf(`Got eps %v, want %v.`, eps, wantEps) +func TestRescale(t *testing.T) { // Testing rescaling to the [0, numBuckets] range. + numBuckets, min, max, _ := 10, 0.0, 255.0, 0.25 + vector := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 255.0} + rescaled := rescale(vector, numBuckets, min, max) + got := rescaled + want := []float64{ + 1, 0.0003921568627450981, 8.24705882352941, + 3.6823529411764704, 0.25882352941176473, + 0.3568627450980392, 10} + if !reflect.DeepEqual(got, want) { + t.Errorf(`Got %v, want %v.`, got, want) } } -func TestParamsPanic(t *testing.T) { +func TestCubeSet1(t *testing.T) { // Testing panic.
defer func() { recover() }() // Intentionally forbiden value for epsPercent. - numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.51 - _, _ = Params(numBuckets, min, max, epsPercent) + values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9} + min, max, epsPercent, numBuckets := 0.0, 255.0, 0.51, 10 + _ = CubeSet(values, min, max, epsPercent, numBuckets) // Never reaches here if Params panics. t.Errorf("Params did not panic on epsPercent > 0.5") } -func TestHypercubes1(t *testing.T) { +func TestCubeSet2(t *testing.T) { numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.25 values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9} - bucketWidth, eps := Params(numBuckets, min, max, epsPercent) - gotCubes := CubeSet(values, min, max, bucketWidth, eps) - gotCentral := CentralCube(values, min, max, bucketWidth, eps) + gotCubes := CubeSet(values, min, max, epsPercent, numBuckets) + gotCentral := CentralCube(values, min, max, epsPercent, numBuckets) wantCubes := [][]int{{1, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9}} wantCentral := []int{1, 0, 8, 3, 0, 0, 9} @@ -44,12 +46,16 @@ func TestHypercubes1(t *testing.T) { if centralIsNotInTheSet(gotCubes, gotCentral) { t.Errorf(`Central %v is not in the set %v.`, gotCentral, gotCubes) } +} - values = []float64{0.01, bucketWidth * 2 * 0.999, bucketWidth * 2 * 1.001} - gotCubes = CubeSet(values, min, max, bucketWidth, eps) - gotCentral = CentralCube(values, min, max, bucketWidth, eps) - wantCubes = [][]int{{0, 1, 2}, {0, 2, 2}, {0, 1, 1}, {0, 2, 1}} - wantCentral = []int{0, 1, 2} +// Testing bucket borders. 
+func TestCubeSet3(t *testing.T) { + numBuckets, min, max, epsPercent := 4, 0.0, 4.0, 0.25 + values := []float64{0.01, 2 * 0.999, 2 * 1.001} + gotCubes := CubeSet(values, min, max, epsPercent, numBuckets) + gotCentral := CentralCube(values, min, max, epsPercent, numBuckets) + wantCubes := [][]int{{0, 1, 2}, {0, 2, 2}, {0, 1, 1}, {0, 2, 1}} + wantCentral := []int{0, 1, 2} if !reflect.DeepEqual(gotCubes, wantCubes) { t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes) } @@ -62,14 +68,11 @@ func TestHypercubes1(t *testing.T) { } // Testing extreme buckets. -func TestHypercubes2(t *testing.T) { +func TestCubeSet4(t *testing.T) { values := []float64{255.0, 0.0, 255.0, 0.0, 255.0, 0.0, 255.0} numBuckets, min, max, epsPercent := 4, 0.0, 255.0, 0.25 - bucketWidth, eps := Params(numBuckets, min, max, epsPercent) - gotCubes := CubeSet(values, min, max, bucketWidth, eps) - wantCubes := [][]int{{1, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9}, - {1, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9}} - t.Error(bucketWidth, eps) + gotCubes := CubeSet(values, min, max, epsPercent, numBuckets) + wantCubes := [][]int{{3, 0, 3, 0, 3, 0, 3}} if !reflect.DeepEqual(gotCubes, wantCubes) { t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes) }