major changes and fixes

master
Vitali Fedulov 2021-12-20 21:43:47 +01:00
parent 90e66c0408
commit 79a4252757
4 changed files with 119 additions and 75 deletions

View File

@ -1,6 +1,6 @@
# Hashing float vectors in N-dimensions
CODE IS UNDER CONSTRUCTION: there are still some bugs. It should be ready within a few days.
UNDER CONSTRUCTION: there are still some bugs. It should be ready within a few days.
### Algorithm

View File

@ -7,9 +7,9 @@ package hyper
// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
// To use the package follow the sequence:
// 1) Params, 2) CubeSet or CentralCube, depending which one
// 1) CubeSet or CentralCube, depending on which one
// is used for a database record and which one for a query,
// 3) HashSet and Decimal to get corresponding hash set
// 2) HashSet and Decimal to get corresponding hash set
// and central hash from results of (1). If Decimal hash
// is not suitable because of very large number of buckets
// or dimensions, use FNV1a to get both the hash set and

View File

@ -1,21 +1,15 @@
package hyper
// Params helps with discretization parameters.
// numBuckets is number of buckets per dimension.
// min and max are value limits per dimension.
// epsPercent is the uncertainty interval expressed as fraction
// of bucketWidth.
// eps is the absolute value of the uncertainty interval epsilon.
func Params(
numBuckets int, min, max, epsPercent float64) (
bucketWidth, eps float64) {
if epsPercent >= 0.5 {
panic(`Error: epsPercent must be less than 50%.
Recommendation: decrease numBuckets instead.`)
// rescale is a helper function to offset and rescale all values
// to [0, numBuckets] range.
func rescale(vector []float64, numBuckets int, min, max float64) []float64 {
rescaled := make([]float64, len(vector))
amp := max - min
for i := range vector {
// Offset to zero and rescale to [0, numBuckets] range.
rescaled[i] = (vector[i] - min) * float64(numBuckets) / amp
}
bucketWidth = (max - min) / float64(numBuckets)
eps = epsPercent * bucketWidth
return bucketWidth, eps
return rescaled
}
// CubeSet returns a set of hypercubes, which represent
@ -26,39 +20,71 @@ func Params(
// min and max are minimum and maximum possible values of
// the vector components. The assumption is that min and max
// are the same for all dimensions.
// bucketWidth and eps are defined in the Params function.
func CubeSet(
vector []float64, min, max, bucketWidth, eps float64) (
set [][]int) {
// numBuckets is number of buckets per dimension.
// min and max are value limits per dimension.
// epsPercent is the uncertainty interval expressed as
// a fraction of bucketWidth (for example 0.25 for eps = 1/4
// of bucketWidth).
func CubeSet(vector []float64, min, max, epsPercent float64,
numBuckets int) (set [][]int) {
if epsPercent >= 0.5 {
panic(`Error: epsPercent must be less than 0.5.`)
}
var (
bC, bS int // Central and side bucket ids.
bC, bS int // Central and side bucket number.
bL, bR int // Left and right bucket number.
setCopy [][]int // Set copy.
length int
branching bool // Branching flag.
)
// For each component of the vector.
for _, val := range vector {
// Rescaling vector to avoid potential mistakes with
// divisions and offsets later on.
rescaled := rescale(vector, numBuckets, min, max)
// After rescaling, the vector values lie in the range
// [0, numBuckets] instead of [min, max].
// min = 0.0 from now on.
max = float64(numBuckets)
for _, val := range rescaled {
bC = int(val / bucketWidth)
branching = false
// Value is in the lower uncertainty interval.
if val-float64(bC)*bucketWidth < eps {
bS = bC - 1
if val-eps > min {
branching = true
}
bL = int(val - epsPercent)
bR = int(val + epsPercent)
// Value is in the upper uncertainty interval.
} else if float64(bC+1)*bucketWidth-val < eps {
bS = bC + 1
if val+eps < max {
branching = true
}
// Get extreme values out of the way.
if val-epsPercent <= 0.0 { // This means that val >= 0.
bC = bR
goto branchingCheck // No branching.
}
// Get extreme values out of the way.
if val+epsPercent >= max { // This means that val <= max.
// Above max = numBuckets.
bC = bL
goto branchingCheck // No branching.
}
if bL == bR {
bC = bL
goto branchingCheck // No branching.
} else { // Meaning bL != bR and not any condition above.
bC = int(val)
if bL == bC {
bS = bR // So we have bC, have not lost bL, and get bR.
} else { // That is when bL != bC
bS = bL // So we have bC, have bL, and since can only have
// 2 buckets possible, bC is our bR (bR not lost).
}
branching = true
}
branchingCheck:
if branching {
setCopy = make([][]int, len(set))
copy(setCopy, set)
@ -84,7 +110,6 @@ func CubeSet(
set = append(set, setCopy...)
} else {
if len(set) == 0 {
set = append(set, []int{bC})
} else {
@ -112,17 +137,33 @@ func CubeSet(
// CentralCube returns the hypercube containing the vector end.
// Arguments are the same as for the CubeSet function.
func CentralCube(
vector []float64, min, max, bucketWidth, eps float64) (
central []int) {
func CentralCube(vector []float64, min, max, epsPercent float64,
numBuckets int) (central []int) {
var bC int // Central bucket ids.
// For each component of the vector.
for _, val := range vector {
bC = int(val / bucketWidth)
central = append(central, bC)
if epsPercent >= 0.5 {
panic(`Error: epsPercent must be less than 0.5.`)
}
var bC int // Central bucket numbers.
// Rescaling vector to avoid potential mistakes with
// divisions and offsets later on.
rescaled := rescale(vector, numBuckets, min, max)
// After rescaling, the vector values lie in the range
// [0, numBuckets] instead of [min, max].
// min = 0.0 from now on.
max = float64(numBuckets)
for _, val := range rescaled {
bC = int(val)
if val-epsPercent <= 0.0 { // This means that val >= 0.
bC = int(val + epsPercent)
}
if val+epsPercent >= max { // Meaning val <= max.
bC = int(val - epsPercent)
}
central = append(central, bC)
}
return central
}

View File

@ -5,33 +5,35 @@ import (
"testing"
)
func TestParams(t *testing.T) {
numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.25
bucketWidth, eps := Params(numBuckets, min, max, epsPercent)
wantBucketWidth, wantEps := 25.5, 6.375
if bucketWidth != wantBucketWidth {
t.Errorf(`Got bucketWidth %v, want %v.`, bucketWidth, wantBucketWidth)
}
if eps != wantEps {
t.Errorf(`Got eps %v, want %v.`, eps, wantEps)
func TestRescale(t *testing.T) { // Testing rescaled values.
numBuckets, min, max, _ := 10, 0.0, 255.0, 0.25
vector := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 255.0}
rescaled := rescale(vector, numBuckets, min, max)
got := rescaled
want := []float64{
1, 0.0003921568627450981, 8.24705882352941,
3.6823529411764704, 0.25882352941176473,
0.3568627450980392, 10}
if !reflect.DeepEqual(got, want) {
t.Errorf(`Got %v, want %v.`, got, want)
}
}
func TestParamsPanic(t *testing.T) {
func TestCubeSet1(t *testing.T) { // Testing panic.
defer func() { recover() }()
// Intentionally forbidden value for epsPercent.
numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.51
_, _ = Params(numBuckets, min, max, epsPercent)
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
min, max, epsPercent, numBuckets := 0.0, 255.0, 0.51, 10
_ = CubeSet(values, min, max, epsPercent, numBuckets)
// Never reaches here if CubeSet panics.
t.Errorf("Params did not panic on epsPercent > 0.5")
}
func TestHypercubes1(t *testing.T) {
func TestCubeSet2(t *testing.T) {
numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.25
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
bucketWidth, eps := Params(numBuckets, min, max, epsPercent)
gotCubes := CubeSet(values, min, max, bucketWidth, eps)
gotCentral := CentralCube(values, min, max, bucketWidth, eps)
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
gotCentral := CentralCube(values, min, max, epsPercent, numBuckets)
wantCubes := [][]int{{1, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9},
{1, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9}}
wantCentral := []int{1, 0, 8, 3, 0, 0, 9}
@ -44,12 +46,16 @@ func TestHypercubes1(t *testing.T) {
if centralIsNotInTheSet(gotCubes, gotCentral) {
t.Errorf(`Central %v is not in the set %v.`, gotCentral, gotCubes)
}
}
values = []float64{0.01, bucketWidth * 2 * 0.999, bucketWidth * 2 * 1.001}
gotCubes = CubeSet(values, min, max, bucketWidth, eps)
gotCentral = CentralCube(values, min, max, bucketWidth, eps)
wantCubes = [][]int{{0, 1, 2}, {0, 2, 2}, {0, 1, 1}, {0, 2, 1}}
wantCentral = []int{0, 1, 2}
// Testing bucket borders.
func TestCubeSet3(t *testing.T) {
numBuckets, min, max, epsPercent := 4, 0.0, 4.0, 0.25
values := []float64{0.01, 2 * 0.999, 2 * 1.001}
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
gotCentral := CentralCube(values, min, max, epsPercent, numBuckets)
wantCubes := [][]int{{0, 1, 2}, {0, 2, 2}, {0, 1, 1}, {0, 2, 1}}
wantCentral := []int{0, 1, 2}
if !reflect.DeepEqual(gotCubes, wantCubes) {
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
}
@ -62,14 +68,11 @@ func TestHypercubes1(t *testing.T) {
}
// Testing extreme buckets.
func TestHypercubes2(t *testing.T) {
func TestCubeSet4(t *testing.T) {
values := []float64{255.0, 0.0, 255.0, 0.0, 255.0, 0.0, 255.0}
numBuckets, min, max, epsPercent := 4, 0.0, 255.0, 0.25
bucketWidth, eps := Params(numBuckets, min, max, epsPercent)
gotCubes := CubeSet(values, min, max, bucketWidth, eps)
wantCubes := [][]int{{1, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9},
{1, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9}}
t.Error(bucketWidth, eps)
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
wantCubes := [][]int{{3, 0, 3, 0, 3, 0, 3}}
if !reflect.DeepEqual(gotCubes, wantCubes) {
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
}