major changes and fixes
parent
90e66c0408
commit
79a4252757
|
@ -1,6 +1,6 @@
|
|||
# Hashing float vectors in N-dimensions
|
||||
|
||||
CODE IS UNDER CONSTRUCTION: still there are some bugs. Should be ready within a few days.
|
||||
UNDER CONSTRUCTION: there are still some bugs. It should be ready within a few days.
|
||||
|
||||
### Algorithm
|
||||
|
||||
|
|
4
about.go
4
about.go
|
@ -7,9 +7,9 @@ package hyper
|
|||
// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
|
||||
|
||||
// To use the package follow the sequence:
|
||||
// 1) Params, 2) CubeSet or CentralCube, depending which one
|
||||
// 1) CubeSet or CentralCube, depending on which one
|
||||
// is used for a database record and which one for a query,
|
||||
// 3) HashSet and Decimal to get corresponding hash set
|
||||
// 2) HashSet and Decimal to get corresponding hash set
|
||||
// and central hash from results of (1). If Decimal hash
|
||||
// is not suitable because of very large number of buckets
|
||||
// or dimensions, use FNV1a to get both the hash set and
|
||||
|
|
131
hypercubes.go
131
hypercubes.go
|
@ -1,21 +1,15 @@
|
|||
package hyper
|
||||
|
||||
// Params helps with discretization parameters.
|
||||
// numBuckets is number of buckets per dimension.
|
||||
// min and max are value limits per dimension.
|
||||
// epsPercent is the uncertainty interval expressed as fraction
|
||||
// of bucketWidth.
|
||||
// eps is the absolute value of the uncertainty interval epsilon.
|
||||
func Params(
|
||||
numBuckets int, min, max, epsPercent float64) (
|
||||
bucketWidth, eps float64) {
|
||||
if epsPercent >= 0.5 {
|
||||
panic(`Error: epsPercent must be less than 50%.
|
||||
Recommendation: decrease numBuckets instead.`)
|
||||
// rescale is a helper function to offset and rescale all values
|
||||
// to [0, numBuckets] range.
|
||||
func rescale(vector []float64, numBuckets int, min, max float64) []float64 {
|
||||
rescaled := make([]float64, len(vector))
|
||||
amp := max - min
|
||||
for i := range vector {
|
||||
// Offset to zero and rescale to [0, numBuckets] range.
|
||||
rescaled[i] = (vector[i] - min) * float64(numBuckets) / amp
|
||||
}
|
||||
bucketWidth = (max - min) / float64(numBuckets)
|
||||
eps = epsPercent * bucketWidth
|
||||
return bucketWidth, eps
|
||||
return rescaled
|
||||
}
|
||||
|
||||
// CubeSet returns a set of hypercubes, which represent
|
||||
|
@ -26,39 +20,71 @@ func Params(
|
|||
// min and max are minimum and maximum possible values of
|
||||
// the vector components. The assumption is that min and max
|
||||
// are the same for all dimensions.
|
||||
// bucketWidth and eps are defined in the Params function.
|
||||
func CubeSet(
|
||||
vector []float64, min, max, bucketWidth, eps float64) (
|
||||
set [][]int) {
|
||||
// numBuckets is number of buckets per dimension.
|
||||
// min and max are value limits per dimension.
|
||||
// epsPercent is the uncertainty interval expressed as
|
||||
// a fraction of bucketWidth (for example 0.25 for eps = 1/4
|
||||
// of bucketWidth).
|
||||
func CubeSet(vector []float64, min, max, epsPercent float64,
|
||||
numBuckets int) (set [][]int) {
|
||||
|
||||
if epsPercent >= 0.5 {
|
||||
panic(`Error: epsPercent must be less than 0.5.`)
|
||||
}
|
||||
|
||||
var (
|
||||
bC, bS int // Central and side bucket ids.
|
||||
bC, bS int // Central and side bucket number.
|
||||
bL, bR int // Left and right bucket number.
|
||||
setCopy [][]int // Set copy.
|
||||
length int
|
||||
branching bool // Branching flag.
|
||||
)
|
||||
|
||||
// For each component of the vector.
|
||||
for _, val := range vector {
|
||||
// Rescaling vector to avoid potential mistakes with
|
||||
// divisions and offsets later on.
|
||||
rescaled := rescale(vector, numBuckets, min, max)
|
||||
// After rescaling, the value range of the vector is
|
||||
// [0, numBuckets], and not [min, max].
|
||||
|
||||
// min = 0.0 from now on.
|
||||
max = float64(numBuckets)
|
||||
|
||||
for _, val := range rescaled {
|
||||
|
||||
bC = int(val / bucketWidth)
|
||||
branching = false
|
||||
|
||||
// Value is in the lower uncertainty interval.
|
||||
if val-float64(bC)*bucketWidth < eps {
|
||||
bS = bC - 1
|
||||
if val-eps > min {
|
||||
branching = true
|
||||
}
|
||||
bL = int(val - epsPercent)
|
||||
bR = int(val + epsPercent)
|
||||
|
||||
// Value is in the upper uncertainty interval.
|
||||
} else if float64(bC+1)*bucketWidth-val < eps {
|
||||
bS = bC + 1
|
||||
if val+eps < max {
|
||||
branching = true
|
||||
}
|
||||
// Get extreme values out of the way.
|
||||
if val-epsPercent <= 0.0 { // This means that val >= 0.
|
||||
bC = bR
|
||||
goto branchingCheck // No branching.
|
||||
}
|
||||
|
||||
// Get extreme values out of the way.
|
||||
if val+epsPercent >= max { // This means that val =< max.
|
||||
// Above max = numBuckets.
|
||||
bC = bL
|
||||
goto branchingCheck // No branching.
|
||||
}
|
||||
|
||||
if bL == bR {
|
||||
bC = bL
|
||||
goto branchingCheck // No branching.
|
||||
|
||||
} else { // Meaning bL != bR and not any condition above.
|
||||
bC = int(val)
|
||||
if bL == bC {
|
||||
bS = bR // So we have bC, have not lost bL, and get bR.
|
||||
} else { // That is when bL != bC
|
||||
bS = bL // So we have bC, have bL, and since can only have
|
||||
// 2 buckets possible, bC is our bR (bR not lost).
|
||||
}
|
||||
branching = true
|
||||
}
|
||||
|
||||
branchingCheck:
|
||||
if branching {
|
||||
setCopy = make([][]int, len(set))
|
||||
copy(setCopy, set)
|
||||
|
@ -84,7 +110,6 @@ func CubeSet(
|
|||
set = append(set, setCopy...)
|
||||
|
||||
} else {
|
||||
|
||||
if len(set) == 0 {
|
||||
set = append(set, []int{bC})
|
||||
} else {
|
||||
|
@ -112,17 +137,33 @@ func CubeSet(
|
|||
|
||||
// CentralCube returns the hypercube containing the vector end.
|
||||
// Arguments are the same as for the CubeSet function.
|
||||
func CentralCube(
|
||||
vector []float64, min, max, bucketWidth, eps float64) (
|
||||
central []int) {
|
||||
func CentralCube(vector []float64, min, max, epsPercent float64,
|
||||
numBuckets int) (central []int) {
|
||||
|
||||
var bC int // Central bucket ids.
|
||||
|
||||
// For each component of the vector.
|
||||
for _, val := range vector {
|
||||
bC = int(val / bucketWidth)
|
||||
central = append(central, bC)
|
||||
if epsPercent >= 0.5 {
|
||||
panic(`Error: epsPercent must be less than 0.5.`)
|
||||
}
|
||||
|
||||
var bC int // Central bucket numbers.
|
||||
|
||||
// Rescaling vector to avoid potential mistakes with
|
||||
// divisions and offsets later on.
|
||||
rescaled := rescale(vector, numBuckets, min, max)
|
||||
// After rescaling, the value range of the vector is
|
||||
// [0, numBuckets], and not [min, max].
|
||||
|
||||
// min = 0.0 from now on.
|
||||
max = float64(numBuckets)
|
||||
|
||||
for _, val := range rescaled {
|
||||
bC = int(val)
|
||||
if val-epsPercent <= 0.0 { // This means that val >= 0.
|
||||
bC = int(val + epsPercent)
|
||||
}
|
||||
if val+epsPercent >= max { // Meaning val =< max.
|
||||
bC = int(val - epsPercent)
|
||||
}
|
||||
central = append(central, bC)
|
||||
}
|
||||
return central
|
||||
}
|
||||
|
|
|
@ -5,33 +5,35 @@ import (
|
|||
"testing"
|
||||
)
|
||||
|
||||
func TestParams(t *testing.T) {
|
||||
numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.25
|
||||
bucketWidth, eps := Params(numBuckets, min, max, epsPercent)
|
||||
wantBucketWidth, wantEps := 25.5, 6.375
|
||||
if bucketWidth != wantBucketWidth {
|
||||
t.Errorf(`Got bucketWidth %v, want %v.`, bucketWidth, wantBucketWidth)
|
||||
}
|
||||
if eps != wantEps {
|
||||
t.Errorf(`Got eps %v, want %v.`, eps, wantEps)
|
||||
func TestRescale(t *testing.T) { // Testing panic.
|
||||
numBuckets, min, max, _ := 10, 0.0, 255.0, 0.25
|
||||
vector := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 255.0}
|
||||
rescaled := rescale(vector, numBuckets, min, max)
|
||||
got := rescaled
|
||||
want := []float64{
|
||||
1, 0.0003921568627450981, 8.24705882352941,
|
||||
3.6823529411764704, 0.25882352941176473,
|
||||
0.3568627450980392, 10}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Errorf(`Got %v, want %v.`, got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParamsPanic(t *testing.T) {
|
||||
func TestCubeSet1(t *testing.T) { // Testing panic.
|
||||
defer func() { recover() }()
|
||||
// Intentionally forbidden value for epsPercent.
|
||||
numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.51
|
||||
_, _ = Params(numBuckets, min, max, epsPercent)
|
||||
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
|
||||
min, max, epsPercent, numBuckets := 0.0, 255.0, 0.51, 10
|
||||
_ = CubeSet(values, min, max, epsPercent, numBuckets)
|
||||
// Never reaches here if Params panics.
|
||||
t.Errorf("Params did not panic on epsPercent > 0.5")
|
||||
}
|
||||
|
||||
func TestHypercubes1(t *testing.T) {
|
||||
func TestCubeSet2(t *testing.T) {
|
||||
numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.25
|
||||
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
|
||||
bucketWidth, eps := Params(numBuckets, min, max, epsPercent)
|
||||
gotCubes := CubeSet(values, min, max, bucketWidth, eps)
|
||||
gotCentral := CentralCube(values, min, max, bucketWidth, eps)
|
||||
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
|
||||
gotCentral := CentralCube(values, min, max, epsPercent, numBuckets)
|
||||
wantCubes := [][]int{{1, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9},
|
||||
{1, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9}}
|
||||
wantCentral := []int{1, 0, 8, 3, 0, 0, 9}
|
||||
|
@ -44,12 +46,16 @@ func TestHypercubes1(t *testing.T) {
|
|||
if centralIsNotInTheSet(gotCubes, gotCentral) {
|
||||
t.Errorf(`Central %v is not in the set %v.`, gotCentral, gotCubes)
|
||||
}
|
||||
}
|
||||
|
||||
values = []float64{0.01, bucketWidth * 2 * 0.999, bucketWidth * 2 * 1.001}
|
||||
gotCubes = CubeSet(values, min, max, bucketWidth, eps)
|
||||
gotCentral = CentralCube(values, min, max, bucketWidth, eps)
|
||||
wantCubes = [][]int{{0, 1, 2}, {0, 2, 2}, {0, 1, 1}, {0, 2, 1}}
|
||||
wantCentral = []int{0, 1, 2}
|
||||
// Testing bucket borders.
|
||||
func TestCubeSet3(t *testing.T) {
|
||||
numBuckets, min, max, epsPercent := 4, 0.0, 4.0, 0.25
|
||||
values := []float64{0.01, 2 * 0.999, 2 * 1.001}
|
||||
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
|
||||
gotCentral := CentralCube(values, min, max, epsPercent, numBuckets)
|
||||
wantCubes := [][]int{{0, 1, 2}, {0, 2, 2}, {0, 1, 1}, {0, 2, 1}}
|
||||
wantCentral := []int{0, 1, 2}
|
||||
if !reflect.DeepEqual(gotCubes, wantCubes) {
|
||||
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
|
||||
}
|
||||
|
@ -62,14 +68,11 @@ func TestHypercubes1(t *testing.T) {
|
|||
}
|
||||
|
||||
// Testing extreme buckets.
|
||||
func TestHypercubes2(t *testing.T) {
|
||||
func TestCubeSet4(t *testing.T) {
|
||||
values := []float64{255.0, 0.0, 255.0, 0.0, 255.0, 0.0, 255.0}
|
||||
numBuckets, min, max, epsPercent := 4, 0.0, 255.0, 0.25
|
||||
bucketWidth, eps := Params(numBuckets, min, max, epsPercent)
|
||||
gotCubes := CubeSet(values, min, max, bucketWidth, eps)
|
||||
wantCubes := [][]int{{1, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9},
|
||||
{1, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9}}
|
||||
t.Error(bucketWidth, eps)
|
||||
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
|
||||
wantCubes := [][]int{{3, 0, 3, 0, 3, 0, 3}}
|
||||
if !reflect.DeepEqual(gotCubes, wantCubes) {
|
||||
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue