switch to methods

master v1.0.0
Vitali Fedulov 2022-01-19 09:08:56 +01:00
parent 0d4c3c110c
commit c42552770d
6 changed files with 104 additions and 111 deletions

View File

@ -1,11 +1,11 @@
# Hashing float vectors in N-dimensions # Hashing float vectors in N-dimensions
This is an early beta version. Package hyper allows fast approximate search of nearest neighbour vectors in n-dimensional space.
### Algorithm **This is an early beta version**. Description below will be improved (TODO). See tests for examples.
https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html Package functions discretize a vector and generate a set of hashes, as described in the following document: https://similar.pictures/algorithm-for-hashing-high-dimensional-float-vectors.html
### How to use To use the package follow the sequence of functions/methods:
1) CubeSet or CentralCube, depending on which one is used for a database record and which one for a query.
about.go contains a short instruction. 2) HashSet and DecimalHash to get the corresponding hash set and central hash from the results of (1). If DecimalHash is not suitable because of a very large number of buckets or dimensions, use FNV1aHash to get both the hash set and the central hash.

View File

@ -1,16 +0,0 @@
package hyper
// Package hyper allows fast approximate search of nearest
// neighbour vectors in n-dimensional space.
// Package functions discretize a vector and generate a set
// of fuzzy hashes, as described in the following document:
// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
// To use the package follow the sequence:
// 1) CubeSet or CentralCube, depending on which one
// is used for a database record and which one for a query.
// 2) HashSet and Decimal to get the corresponding hash set
// and central hash from the results of (1). If the Decimal hash
// is not suitable because of a very large number of buckets
// or dimensions, use FNV1a to get both the hash set and
// the central hash.

View File

@ -1,24 +1,18 @@
package hyper package hyper
// rescale is a helper function to offset and rescale all values // Hypercube is represented by a slice of its coordinates.
// to [0, numBuckets] range. type Cube []int
func rescale(vector []float64, numBuckets int, min, max float64) []float64 { type Cubes []Cube
rescaled := make([]float64, len(vector))
amp := max - min
for i := range vector {
// Offset to zero and rescale to [0, numBuckets] range.
rescaled[i] = (vector[i] - min) * float64(numBuckets) / amp
}
return rescaled
}
// clone makes a totally independent copy of a 2D slice. // Parameters of space discretization.
func clone(src [][]int) (dst [][]int) { type Params struct {
dst = make([][]int, len(src)) // Value limits per dimension. For example 0, 255 for pixel values.
for i := range src { Min, Max float64
dst[i] = append([]int{}, src[i]...) // Uncertainty interval expressed as a fraction of bucketWidth
} // (for example 0.25 for eps = 1/4 of bucketWidth).
return dst EpsPercent float64
// Number of buckets per dimension.
NumBuckets int
} }
// CubeSet returns a set of hypercubes, which represent // CubeSet returns a set of hypercubes, which represent
@ -29,49 +23,43 @@ func clone(src [][]int) (dst [][]int) {
// min and max are minimum and maximum possible values of // min and max are minimum and maximum possible values of
// the vector components. The assumption is that min and max // the vector components. The assumption is that min and max
// are the same for all dimensions. // are the same for all dimensions.
// numBuckets is number of buckets per dimension. func CubeSet(vector []float64, params Params) (set Cubes) {
// min and max are value limits per dimension.
// epsPercent is the uncertainty interval expressed as
// a fraction of bucketWidth (for example 0.25 for eps = 1/4
// of bucketWidth).
func CubeSet(vector []float64, min, max, epsPercent float64,
numBuckets int) (set [][]int) {
if epsPercent >= 0.5 { if params.EpsPercent >= 0.5 {
panic(`Error: epsPercent must be less than 0.5.`) panic(`Error: EpsPercent must be less than 0.5.`)
} }
var ( var (
bC int // Central bucket number. bC int // Central bucket number.
bL, bR int // Left and right bucket number. bL, bR int // Left and right bucket number.
setL, setR [][]int // Set copies. setL, setR Cubes // Set clones (for Left and Right).
branching bool // Branching flag. branching bool // Branching flag.
) )
// Rescaling vector to avoid potential mistakes with // Rescaling vector to avoid potential mistakes with
// divisions and offsets later on. // divisions and offsets later on.
rescaled := rescale(vector, numBuckets, min, max) rescaled := rescale(vector, params)
// After the rescale, the value range of the vector is // After the rescale, the value range of the vector is
// [0, numBuckets], and not [min, max]. // [0, numBuckets], and not [min, max].
// min = 0.0 from now on. // min = 0.0 from now on.
max = float64(numBuckets) max := float64(params.NumBuckets)
for _, val := range rescaled { for _, val := range rescaled {
branching = false branching = false
bL = int(val - epsPercent) bL = int(val - params.EpsPercent)
bR = int(val + epsPercent) bR = int(val + params.EpsPercent)
// Get extreme values out of the way. // Get extreme values out of the way.
if val-epsPercent <= 0.0 { // This means that val >= 0. if val-params.EpsPercent <= 0.0 { // This means that val >= 0.
bC = bR bC = bR
goto branchingCheck // No branching. goto branchingCheck // No branching.
} }
// Get extreme values out of the way. // Get extreme values out of the way.
if val+epsPercent >= max { // This means that val =< max. if val+params.EpsPercent >= max { // This means that val =< max.
// Above max = numBuckets. // Above max = numBuckets.
bC = bL bC = bL
goto branchingCheck // No branching. goto branchingCheck // No branching.
@ -135,33 +123,54 @@ func CubeSet(vector []float64, min, max, epsPercent float64,
// CentralCube returns the hypercube containing the vector end. // CentralCube returns the hypercube containing the vector end.
// Arguments are the same as for the CubeSet function. // Arguments are the same as for the CubeSet function.
func CentralCube(vector []float64, min, max, epsPercent float64, func CentralCube(vector []float64, params Params) (central Cube) {
numBuckets int) (central []int) {
if epsPercent >= 0.5 { if params.EpsPercent >= 0.5 {
panic(`Error: epsPercent must be less than 0.5.`) panic(`Error: EpsPercent must be less than 0.5.`)
} }
var bC int // Central bucket numbers. var bC int // Central bucket numbers.
// Rescaling vector to avoid potential mistakes with // Rescaling vector to avoid potential mistakes with
// divisions and offsets later on. // divisions and offsets later on.
rescaled := rescale(vector, numBuckets, min, max) rescaled := rescale(vector, params)
// After the rescale, the value range of the vector is // After the rescale, the value range of the vector is
// [0, numBuckets], and not [min, max]. // [0, numBuckets], and not [min, max].
// min = 0.0 from now on. // min = 0.0 from now on.
max = float64(numBuckets) max := float64(params.NumBuckets)
for _, val := range rescaled { for _, val := range rescaled {
bC = int(val) bC = int(val)
if val-epsPercent <= 0.0 { // This means that val >= 0. if val-params.EpsPercent <= 0.0 { // This means that val >= 0.
bC = int(val + epsPercent) bC = int(val + params.EpsPercent)
} }
if val+epsPercent >= max { // Meaning val =< max. if val+params.EpsPercent >= max { // Meaning val =< max.
bC = int(val - epsPercent) bC = int(val - params.EpsPercent)
} }
central = append(central, bC) central = append(central, bC)
} }
return central return central
} }
// rescale linearly maps every component of vector from the
// [params.Min, params.Max] interval onto [0, params.NumBuckets].
// The input slice is left untouched; a newly allocated slice of
// the same length is returned.
func rescale(vector []float64, params Params) []float64 {
	span := params.Max - params.Min
	buckets := float64(params.NumBuckets)
	out := make([]float64, 0, len(vector))
	for _, v := range vector {
		// Shift so that Min lands on zero, then scale so that
		// Max lands on the bucket count.
		out = append(out, (v-params.Min)*buckets/span)
	}
	return out
}
// clone returns a deep copy of src: every cube is duplicated into
// its own backing array, so later writes to the copy do not show
// up in the original and vice versa.
func clone(src Cubes) (dst Cubes) {
	dst = make(Cubes, 0, len(src))
	for _, cube := range src {
		// Appending to a fresh empty Cube forces a new allocation.
		dst = append(dst, append(Cube{}, cube...))
	}
	return dst
}

View File

@ -5,7 +5,7 @@ import (
"testing" "testing"
) )
func centralIsNotInTheSet(set [][]int, central []int) bool { func centralIsNotInTheSet(set Cubes, central Cube) bool {
for _, cube := range set { for _, cube := range set {
counter := 0 counter := 0
for i, c := range central { for i, c := range central {
@ -21,9 +21,9 @@ func centralIsNotInTheSet(set [][]int, central []int) bool {
} }
func TestRescale(t *testing.T) { // Testing panic. func TestRescale(t *testing.T) { // Testing panic.
numBuckets, min, max, _ := 10, 0.0, 255.0, 0.25
vector := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 255.0} vector := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 255.0}
rescaled := rescale(vector, numBuckets, min, max) params := Params{0.0, 255.0, 0.25, 10}
rescaled := rescale(vector, params)
got := rescaled got := rescaled
want := []float64{ want := []float64{
1, 0.0003921568627450981, 8.24705882352941, 1, 0.0003921568627450981, 8.24705882352941,
@ -38,20 +38,20 @@ func TestCubeSet1(t *testing.T) { // Testing panic.
defer func() { recover() }() defer func() { recover() }()
// Intentionally forbidden value for epsPercent. // Intentionally forbidden value for epsPercent.
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9} values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
min, max, epsPercent, numBuckets := 0.0, 255.0, 0.51, 10 params := Params{0.0, 255.0, 0.51, 10}
_ = CubeSet(values, min, max, epsPercent, numBuckets) _ = CubeSet(values, params)
// Never reaches here if Params panics. // Never reaches here if Params panics.
t.Errorf("Params did not panic on epsPercent > 0.5") t.Errorf("Params did not panic on epsPercent > 0.5")
} }
func TestCubeSet2(t *testing.T) { func TestCubeSet2(t *testing.T) {
numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.25 params := Params{0.0, 255.0, 0.25, 10}
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9} values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets) gotCubes := CubeSet(values, params)
gotCentral := CentralCube(values, min, max, epsPercent, numBuckets) gotCentral := CentralCube(values, params)
wantCubes := [][]int{{0, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9}, wantCubes := Cubes{{0, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9},
{0, 0, 8, 3, 0, 0, 9}, {1, 0, 8, 3, 0, 0, 9}} {0, 0, 8, 3, 0, 0, 9}, {1, 0, 8, 3, 0, 0, 9}}
wantCentral := []int{1, 0, 8, 3, 0, 0, 9} wantCentral := Cube{1, 0, 8, 3, 0, 0, 9}
if !reflect.DeepEqual(gotCubes, wantCubes) { if !reflect.DeepEqual(gotCubes, wantCubes) {
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes) t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
} }
@ -65,12 +65,12 @@ func TestCubeSet2(t *testing.T) {
// Testing bucket borders. // Testing bucket borders.
func TestCubeSet3(t *testing.T) { func TestCubeSet3(t *testing.T) {
numBuckets, min, max, epsPercent := 4, 0.0, 4.0, 0.25 params := Params{0.0, 4.0, 0.25, 4}
values := []float64{0.01, 2 * 0.999, 2 * 1.001} values := []float64{0.01, 2 * 0.999, 2 * 1.001}
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets) gotCubes := CubeSet(values, params)
gotCentral := CentralCube(values, min, max, epsPercent, numBuckets) gotCentral := CentralCube(values, params)
wantCubes := [][]int{{0, 1, 1}, {0, 2, 1}, {0, 1, 2}, {0, 2, 2}} wantCubes := Cubes{{0, 1, 1}, {0, 2, 1}, {0, 1, 2}, {0, 2, 2}}
wantCentral := []int{0, 1, 2} wantCentral := Cube{0, 1, 2}
if !reflect.DeepEqual(gotCubes, wantCubes) { if !reflect.DeepEqual(gotCubes, wantCubes) {
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes) t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
} }
@ -85,9 +85,9 @@ func TestCubeSet3(t *testing.T) {
// Testing extreme buckets. // Testing extreme buckets.
func TestCubeSet4(t *testing.T) { func TestCubeSet4(t *testing.T) {
values := []float64{255.0, 0.0, 255.0, 0.0, 255.0, 0.0, 255.0} values := []float64{255.0, 0.0, 255.0, 0.0, 255.0, 0.0, 255.0}
numBuckets, min, max, epsPercent := 4, 0.0, 255.0, 0.25 params := Params{0.0, 255.0, 0.25, 4}
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets) gotCubes := CubeSet(values, params)
wantCubes := [][]int{{3, 0, 3, 0, 3, 0, 3}} wantCubes := Cubes{{3, 0, 3, 0, 3, 0, 3}}
if !reflect.DeepEqual(gotCubes, wantCubes) { if !reflect.DeepEqual(gotCubes, wantCubes) {
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes) t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
} }
@ -97,9 +97,9 @@ var vector = []float64{
0, 183, 148, 21, 47, 16, 69, 45, 151, 64, 181} 0, 183, 148, 21, 47, 16, 69, 45, 151, 64, 181}
func TestCubeSet5(t *testing.T) { func TestCubeSet5(t *testing.T) {
numBuckets, min, max, epsPercent := 4, 0.0, 255.0, 0.25 params := Params{0.0, 255.0, 0.25, 4}
gotCubes := CubeSet(vector, min, max, epsPercent, numBuckets) gotCubes := CubeSet(vector, params)
wantCubes := [][]int{ wantCubes := Cubes{
{0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2}, {0, 3, 2, 0, 0, 0, 0, 0, 2, 0, 2}, {0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2}, {0, 3, 2, 0, 0, 0, 0, 0, 2, 0, 2},
{0, 2, 2, 0, 0, 0, 1, 0, 2, 0, 2}, {0, 3, 2, 0, 0, 0, 1, 0, 2, 0, 2}, {0, 2, 2, 0, 0, 0, 1, 0, 2, 0, 2}, {0, 3, 2, 0, 0, 0, 1, 0, 2, 0, 2},
{0, 2, 2, 0, 0, 0, 0, 0, 2, 1, 2}, {0, 3, 2, 0, 0, 0, 0, 0, 2, 1, 2}, {0, 2, 2, 0, 0, 0, 0, 0, 2, 1, 2}, {0, 3, 2, 0, 0, 0, 0, 0, 2, 1, 2},

View File

@ -6,22 +6,22 @@ import (
"hash/fnv" "hash/fnv"
) )
// Decimal hashes hypercubes without collisions. IMPORTANT: // DecimalHash hashes hypercubes without collisions. IMPORTANT:
// To work correctly, the number of buckets must be // To work correctly, the number of buckets must be
// less than 11 and the number of dimensions less than 20. // less than 11 and the number of dimensions less than 20.
// Else at certain unexpected moment you might get a hash // Else at certain unexpected moment you might get a hash
// value overflow. // value overflow.
func Decimal(cube []int) (h uint64) { func (cube Cube) DecimalHash() (h uint64) {
for _, v := range cube { for _, v := range cube {
h = h*10 + uint64(v) h = h*10 + uint64(v)
} }
return h return h
} }
// FNV1a hashes hypercubes with rare collisions, // FNV1aHash hashes hypercubes with rare collisions,
// and should be used when Decimal cannot be used // and should be used when Decimal cannot be used
// because of very large number of buckets or dimensions. // because of very large number of buckets or dimensions.
func FNV1a(cube []int) uint64 { func (cube Cube) FNV1aHash() uint64 {
var b bytes.Buffer var b bytes.Buffer
gob.NewEncoder(&b).Encode(cube) gob.NewEncoder(&b).Encode(cube)
hash := fnv.New64a() hash := fnv.New64a()
@ -30,11 +30,11 @@ func FNV1a(cube []int) uint64 {
} }
// HashFunc can be any function (also user-defined). // HashFunc can be any function (also user-defined).
type HashFunc func(hypercube []int) uint64 type HashFunc func(cube Cube) uint64
// HashSet returns a set of hashes for a hypercube set // HashSet returns a set of hashes for a hypercube set
// and a concrete hash function. // and a concrete hash function.
func HashSet(cubeSet [][]int, hashFunc HashFunc) ( func (cubeSet Cubes) HashSet(hashFunc HashFunc) (
hs []uint64) { hs []uint64) {
for i := 0; i < len(cubeSet); i++ { for i := 0; i < len(cubeSet); i++ {
hs = append(hs, hashFunc(cubeSet[i])) hs = append(hs, hashFunc(cubeSet[i]))

View File

@ -5,37 +5,37 @@ import (
"testing" "testing"
) )
func TestDecimal(t *testing.T) { func TestDecimalHash(t *testing.T) {
hypercube := []int{3, 2, 0, 1, 1, 4, 1, 0} cube := Cube{3, 2, 0, 1, 1, 4, 1, 0}
hash := Decimal(hypercube) hash := cube.DecimalHash()
want := uint64(32011410) want := uint64(32011410)
if hash != want { if hash != want {
t.Errorf(`Got %v, want %v.`, hash, want) t.Errorf(`Got %v, want %v.`, hash, want)
} }
} }
func TestFNV1a(t *testing.T) { func TestFNV1aHash(t *testing.T) {
buckets := []int{5, 59, 255, 9, 7, 12, 22, 31} cube := Cube{5, 59, 255, 9, 7, 12, 22, 31}
hash := FNV1a(buckets) hash := cube.FNV1aHash()
want := uint64(13992349377752315208) want := uint64(1659788114117494335)
if hash != want { if hash != want {
t.Errorf(`Got %v, want %v.`, hash, want) t.Errorf(`Got %v, want %v.`, hash, want)
} }
} }
func TestHashSet(t *testing.T) { func TestHashSet(t *testing.T) {
tree := [][]int{ cubes := Cubes{
{0, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9},
{1, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9},
{0, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9},
{1, 0, 8, 3, 0, 0, 9}} {1, 0, 8, 3, 0, 0, 9}}
hs := HashSet(tree, FNV1a) hashSet := cubes.HashSet((Cube).FNV1aHash)
want := []uint64{ want := []uint64{
14647827280143437043, 6172277127052188606,
17530493565529410009, 3265650857171344968,
7065940388079601005, 13730239218993256724,
13953051952027146823} 6843127655045710906}
if !reflect.DeepEqual(hs, want) { if !reflect.DeepEqual(hashSet, want) {
t.Errorf(`Got %v, want %v.`, hs, want) t.Errorf(`Got %v, want %v.`, hashSet, want)
} }
} }