parent
0d4c3c110c
commit
c42552770d
12
README.md
12
README.md
|
@ -1,11 +1,11 @@
|
|||
# Hashing float vectors in N-dimensions
|
||||
|
||||
This is an early beta version.
|
||||
Package hyper allows fast approximate search of nearest neighbour vectors in n-dimensional space.
|
||||
|
||||
### Algorithm
|
||||
**This is an early beta version**. Description below will be improved (TODO). See tests for examples.
|
||||
|
||||
https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
|
||||
Package functions discretize a vector and generate a set of hashes, as described in the following document: https://similar.pictures/algorithm-for-hashing-high-dimensional-float-vectors.html
|
||||
|
||||
### How to use
|
||||
|
||||
about.go contains a short instruction.
|
||||
To use the package follow the sequence of functions/methods:
|
||||
1) CubeSet or CentralCube, depending on which one is used for a database record and which one for a query.
|
||||
2) HashSet and DecimalHash to get the corresponding hash set and central hash from the results of (1). If DecimalHash is not suitable because of a very large number of buckets or dimensions, use FNV1aHash to get both the hash set and the central hash.
|
||||
|
|
16
about.go
16
about.go
|
@ -1,16 +0,0 @@
|
|||
package hyper
|
||||
|
||||
// Package hyper allows fast approximate search of nearest
|
||||
// neighbour vectors in n-dimensional space.
|
||||
// Package functions discretize a vector and generate a set
|
||||
// of fuzzy hashes, as described in the following document:
|
||||
// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
|
||||
|
||||
// To use the package follow the sequence:
|
||||
// 1) CubeSet or CentralCube, depending which one
|
||||
// is used for a database record and which one for a query,
|
||||
// 2) HashSet and Decimal to get corresponding hash set
|
||||
// and central hash from results of (2). If Decimal hash
|
||||
// is not suitable because of very large number of buckets
|
||||
// or dimensions, use FNV1a to get both the hash set and
|
||||
// the central hash).
|
|
@ -1,24 +1,18 @@
|
|||
package hyper
|
||||
|
||||
// rescale linearly maps every component of vector so that the value
// min lands on 0 and the value max lands on numBuckets. The input is
// left untouched; a new slice of the same length is returned.
// There is no guard against max == min (division by zero).
func rescale(vector []float64, numBuckets int, min, max float64) []float64 {
	span := max - min
	buckets := float64(numBuckets)
	out := make([]float64, len(vector))
	for i, v := range vector {
		// Shift so that min becomes zero, then stretch to the bucket count.
		out[i] = (v - min) * buckets / span
	}
	return out
}
|
||||
// Cube is a hypercube represented by a slice of its coordinates.
|
||||
type Cube []int
|
||||
type Cubes []Cube
|
||||
|
||||
// clone makes a totally independent copy of a 2D slice.
|
||||
func clone(src [][]int) (dst [][]int) {
|
||||
dst = make([][]int, len(src))
|
||||
for i := range src {
|
||||
dst[i] = append([]int{}, src[i]...)
|
||||
}
|
||||
return dst
|
||||
// Params groups the parameters of space discretization.
type Params struct {
	// Min is the lower value limit per dimension
	// (for example 0 for pixel values).
	Min float64
	// Max is the upper value limit per dimension
	// (for example 255 for pixel values).
	Max float64
	// EpsPercent is the uncertainty interval expressed as a fraction
	// of bucketWidth (for example 0.25 for eps = 1/4 of bucketWidth).
	EpsPercent float64
	// NumBuckets is the number of buckets per dimension.
	NumBuckets int
}
|
||||
|
||||
// CubeSet returns a set of hypercubes, which represent
|
||||
|
@ -29,49 +23,43 @@ func clone(src [][]int) (dst [][]int) {
|
|||
// min and max are minimum and maximum possible values of
|
||||
// the vector components. The assumption is that min and max
|
||||
// are the same for all dimensions.
|
||||
// numBuckets is number of buckets per dimension.
|
||||
// min and max are value limits per dimension.
|
||||
// epsPercent is the uncertainty interval expressed as
|
||||
// a fraction of bucketWidth (for example 0.25 for eps = 1/4
|
||||
// of bucketWidth).
|
||||
func CubeSet(vector []float64, min, max, epsPercent float64,
|
||||
numBuckets int) (set [][]int) {
|
||||
func CubeSet(vector []float64, params Params) (set Cubes) {
|
||||
|
||||
if epsPercent >= 0.5 {
|
||||
panic(`Error: epsPercent must be less than 0.5.`)
|
||||
if params.EpsPercent >= 0.5 {
|
||||
panic(`Error: EpsPercent must be less than 0.5.`)
|
||||
}
|
||||
|
||||
var (
|
||||
bC int // Central bucket number.
|
||||
bL, bR int // Left and right bucket number.
|
||||
setL, setR [][]int // Set copies.
|
||||
branching bool // Branching flag.
|
||||
bC int // Central bucket number.
|
||||
bL, bR int // Left and right bucket number.
|
||||
setL, setR Cubes // Set clones (for Left and Right).
|
||||
branching bool // Branching flag.
|
||||
)
|
||||
|
||||
// Rescaling vector to avoid potential mistakes with
|
||||
// divisions and offsets later on.
|
||||
rescaled := rescale(vector, numBuckets, min, max)
|
||||
rescaled := rescale(vector, params)
|
||||
// After the rescale, the value range of the vector is
|
||||
// [0, numBuckets], and not [min, max].
|
||||
|
||||
// min = 0.0 from now on.
|
||||
max = float64(numBuckets)
|
||||
max := float64(params.NumBuckets)
|
||||
|
||||
for _, val := range rescaled {
|
||||
|
||||
branching = false
|
||||
|
||||
bL = int(val - epsPercent)
|
||||
bR = int(val + epsPercent)
|
||||
bL = int(val - params.EpsPercent)
|
||||
bR = int(val + params.EpsPercent)
|
||||
|
||||
// Get extreme values out of the way.
|
||||
if val-epsPercent <= 0.0 { // This means that val >= 0.
|
||||
if val-params.EpsPercent <= 0.0 { // This means that val >= 0.
|
||||
bC = bR
|
||||
goto branchingCheck // No branching.
|
||||
}
|
||||
|
||||
// Get extreme values out of the way.
|
||||
if val+epsPercent >= max { // This means that val =< max.
|
||||
if val+params.EpsPercent >= max { // This means that val =< max.
|
||||
// Above max = numBuckets.
|
||||
bC = bL
|
||||
goto branchingCheck // No branching.
|
||||
|
@ -135,33 +123,54 @@ func CubeSet(vector []float64, min, max, epsPercent float64,
|
|||
|
||||
// CentralCube returns the hypercube containing the vector end.
|
||||
// Arguments are the same as for the CubeSet function.
|
||||
func CentralCube(vector []float64, min, max, epsPercent float64,
|
||||
numBuckets int) (central []int) {
|
||||
func CentralCube(vector []float64, params Params) (central Cube) {
|
||||
|
||||
if epsPercent >= 0.5 {
|
||||
panic(`Error: epsPercent must be less than 0.5.`)
|
||||
if params.EpsPercent >= 0.5 {
|
||||
panic(`Error: EpsPercent must be less than 0.5.`)
|
||||
}
|
||||
|
||||
var bC int // Central bucket numbers.
|
||||
|
||||
// Rescaling vector to avoid potential mistakes with
|
||||
// divisions and offsets later on.
|
||||
rescaled := rescale(vector, numBuckets, min, max)
|
||||
rescaled := rescale(vector, params)
|
||||
// After the rescale, the value range of the vector is
|
||||
// [0, numBuckets], and not [min, max].
|
||||
|
||||
// min = 0.0 from now on.
|
||||
max = float64(numBuckets)
|
||||
max := float64(params.NumBuckets)
|
||||
|
||||
for _, val := range rescaled {
|
||||
bC = int(val)
|
||||
if val-epsPercent <= 0.0 { // This means that val >= 0.
|
||||
bC = int(val + epsPercent)
|
||||
if val-params.EpsPercent <= 0.0 { // This means that val >= 0.
|
||||
bC = int(val + params.EpsPercent)
|
||||
}
|
||||
if val+epsPercent >= max { // Meaning val =< max.
|
||||
bC = int(val - epsPercent)
|
||||
if val+params.EpsPercent >= max { // Meaning val =< max.
|
||||
bC = int(val - params.EpsPercent)
|
||||
}
|
||||
central = append(central, bC)
|
||||
}
|
||||
return central
|
||||
}
|
||||
|
||||
// rescale is a helper function to offset and rescale all values
|
||||
// to [0, numBuckets] range.
|
||||
func rescale(vector []float64, params Params) []float64 {
|
||||
rescaled := make([]float64, len(vector))
|
||||
amp := params.Max - params.Min
|
||||
for i := range vector {
|
||||
// Offset to zero and rescale to [0, numBuckets] range.
|
||||
rescaled[i] =
|
||||
(vector[i] - params.Min) * float64(params.NumBuckets) / amp
|
||||
}
|
||||
return rescaled
|
||||
}
|
||||
|
||||
// clone makes an unlinked copy of a 2D slice.
|
||||
func clone(src Cubes) (dst Cubes) {
|
||||
dst = make(Cubes, len(src))
|
||||
for i := range src {
|
||||
dst[i] = append(Cube{}, src[i]...)
|
||||
}
|
||||
return dst
|
||||
}
|
|
@ -5,7 +5,7 @@ import (
|
|||
"testing"
|
||||
)
|
||||
|
||||
func centralIsNotInTheSet(set [][]int, central []int) bool {
|
||||
func centralIsNotInTheSet(set Cubes, central Cube) bool {
|
||||
for _, cube := range set {
|
||||
counter := 0
|
||||
for i, c := range central {
|
||||
|
@ -21,9 +21,9 @@ func centralIsNotInTheSet(set [][]int, central []int) bool {
|
|||
}
|
||||
|
||||
func TestRescale(t *testing.T) { // Testing panic.
|
||||
numBuckets, min, max, _ := 10, 0.0, 255.0, 0.25
|
||||
vector := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 255.0}
|
||||
rescaled := rescale(vector, numBuckets, min, max)
|
||||
params := Params{0.0, 255.0, 0.25, 10}
|
||||
rescaled := rescale(vector, params)
|
||||
got := rescaled
|
||||
want := []float64{
|
||||
1, 0.0003921568627450981, 8.24705882352941,
|
||||
|
@ -38,20 +38,20 @@ func TestCubeSet1(t *testing.T) { // Testing panic.
|
|||
defer func() { recover() }()
|
||||
// Intentionally forbidden value for epsPercent.
|
||||
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
|
||||
min, max, epsPercent, numBuckets := 0.0, 255.0, 0.51, 10
|
||||
_ = CubeSet(values, min, max, epsPercent, numBuckets)
|
||||
params := Params{0.0, 255.0, 0.51, 10}
|
||||
_ = CubeSet(values, params)
|
||||
// Never reaches here if CubeSet panics.
|
||||
t.Errorf("Params did not panic on epsPercent > 0.5")
|
||||
}
|
||||
|
||||
func TestCubeSet2(t *testing.T) {
|
||||
numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.25
|
||||
params := Params{0.0, 255.0, 0.25, 10}
|
||||
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
|
||||
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
|
||||
gotCentral := CentralCube(values, min, max, epsPercent, numBuckets)
|
||||
wantCubes := [][]int{{0, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9},
|
||||
gotCubes := CubeSet(values, params)
|
||||
gotCentral := CentralCube(values, params)
|
||||
wantCubes := Cubes{{0, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9},
|
||||
{0, 0, 8, 3, 0, 0, 9}, {1, 0, 8, 3, 0, 0, 9}}
|
||||
wantCentral := []int{1, 0, 8, 3, 0, 0, 9}
|
||||
wantCentral := Cube{1, 0, 8, 3, 0, 0, 9}
|
||||
if !reflect.DeepEqual(gotCubes, wantCubes) {
|
||||
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
|
||||
}
|
||||
|
@ -65,12 +65,12 @@ func TestCubeSet2(t *testing.T) {
|
|||
|
||||
// Testing bucket borders.
|
||||
func TestCubeSet3(t *testing.T) {
|
||||
numBuckets, min, max, epsPercent := 4, 0.0, 4.0, 0.25
|
||||
params := Params{0.0, 4.0, 0.25, 4}
|
||||
values := []float64{0.01, 2 * 0.999, 2 * 1.001}
|
||||
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
|
||||
gotCentral := CentralCube(values, min, max, epsPercent, numBuckets)
|
||||
wantCubes := [][]int{{0, 1, 1}, {0, 2, 1}, {0, 1, 2}, {0, 2, 2}}
|
||||
wantCentral := []int{0, 1, 2}
|
||||
gotCubes := CubeSet(values, params)
|
||||
gotCentral := CentralCube(values, params)
|
||||
wantCubes := Cubes{{0, 1, 1}, {0, 2, 1}, {0, 1, 2}, {0, 2, 2}}
|
||||
wantCentral := Cube{0, 1, 2}
|
||||
if !reflect.DeepEqual(gotCubes, wantCubes) {
|
||||
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
|
||||
}
|
||||
|
@ -85,9 +85,9 @@ func TestCubeSet3(t *testing.T) {
|
|||
// Testing extreme buckets.
|
||||
func TestCubeSet4(t *testing.T) {
|
||||
values := []float64{255.0, 0.0, 255.0, 0.0, 255.0, 0.0, 255.0}
|
||||
numBuckets, min, max, epsPercent := 4, 0.0, 255.0, 0.25
|
||||
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
|
||||
wantCubes := [][]int{{3, 0, 3, 0, 3, 0, 3}}
|
||||
params := Params{0.0, 255.0, 0.25, 4}
|
||||
gotCubes := CubeSet(values, params)
|
||||
wantCubes := Cubes{{3, 0, 3, 0, 3, 0, 3}}
|
||||
if !reflect.DeepEqual(gotCubes, wantCubes) {
|
||||
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
|
||||
}
|
||||
|
@ -97,9 +97,9 @@ var vector = []float64{
|
|||
0, 183, 148, 21, 47, 16, 69, 45, 151, 64, 181}
|
||||
|
||||
func TestCubeSet5(t *testing.T) {
|
||||
numBuckets, min, max, epsPercent := 4, 0.0, 255.0, 0.25
|
||||
gotCubes := CubeSet(vector, min, max, epsPercent, numBuckets)
|
||||
wantCubes := [][]int{
|
||||
params := Params{0.0, 255.0, 0.25, 4}
|
||||
gotCubes := CubeSet(vector, params)
|
||||
wantCubes := Cubes{
|
||||
{0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2}, {0, 3, 2, 0, 0, 0, 0, 0, 2, 0, 2},
|
||||
{0, 2, 2, 0, 0, 0, 1, 0, 2, 0, 2}, {0, 3, 2, 0, 0, 0, 1, 0, 2, 0, 2},
|
||||
{0, 2, 2, 0, 0, 0, 0, 0, 2, 1, 2}, {0, 3, 2, 0, 0, 0, 0, 0, 2, 1, 2},
|
12
hashes.go
12
hashes.go
|
@ -6,22 +6,22 @@ import (
|
|||
"hash/fnv"
|
||||
)
|
||||
|
||||
// Decimal hashes hypercubes without collisions. IMPORTANT:
|
||||
// DecimalHash hashes hypercubes without collisions. IMPORTANT:
|
||||
// To work correctly, the number of buckets must be
|
||||
// less than 11 and the number of dimensions less than 20.
|
||||
// Else at certain unexpected moment you might get a hash
|
||||
// value overflow.
|
||||
func Decimal(cube []int) (h uint64) {
|
||||
func (cube Cube) DecimalHash() (h uint64) {
|
||||
for _, v := range cube {
|
||||
h = h*10 + uint64(v)
|
||||
}
|
||||
return h
|
||||
}
|
||||
|
||||
// FNV1a hashes hypercubes with rare collisions,
|
||||
// FNV1aHash hashes hypercubes with rare collisions,
|
||||
// and should be used when DecimalHash cannot be used
|
||||
// because of very large number of buckets or dimensions.
|
||||
func FNV1a(cube []int) uint64 {
|
||||
func (cube Cube) FNV1aHash() uint64 {
|
||||
var b bytes.Buffer
|
||||
gob.NewEncoder(&b).Encode(cube)
|
||||
hash := fnv.New64a()
|
||||
|
@ -30,11 +30,11 @@ func FNV1a(cube []int) uint64 {
|
|||
}
|
||||
|
||||
// HashFunc can be any function (also user-defined).
|
||||
type HashFunc func(hypercube []int) uint64
|
||||
type HashFunc func(cube Cube) uint64
|
||||
|
||||
// HashSet returns a set of hashes for a hypercube set
|
||||
// and a concrete hash function.
|
||||
func HashSet(cubeSet [][]int, hashFunc HashFunc) (
|
||||
func (cubeSet Cubes) HashSet(hashFunc HashFunc) (
|
||||
hs []uint64) {
|
||||
for i := 0; i < len(cubeSet); i++ {
|
||||
hs = append(hs, hashFunc(cubeSet[i]))
|
||||
|
|
|
@ -5,37 +5,37 @@ import (
|
|||
"testing"
|
||||
)
|
||||
|
||||
func TestDecimal(t *testing.T) {
|
||||
hypercube := []int{3, 2, 0, 1, 1, 4, 1, 0}
|
||||
hash := Decimal(hypercube)
|
||||
func TestDecimalHash(t *testing.T) {
|
||||
cube := Cube{3, 2, 0, 1, 1, 4, 1, 0}
|
||||
hash := cube.DecimalHash()
|
||||
want := uint64(32011410)
|
||||
if hash != want {
|
||||
t.Errorf(`Got %v, want %v.`, hash, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFNV1a(t *testing.T) {
|
||||
buckets := []int{5, 59, 255, 9, 7, 12, 22, 31}
|
||||
hash := FNV1a(buckets)
|
||||
want := uint64(13992349377752315208)
|
||||
func TestFNV1aHash(t *testing.T) {
|
||||
cube := Cube{5, 59, 255, 9, 7, 12, 22, 31}
|
||||
hash := cube.FNV1aHash()
|
||||
want := uint64(1659788114117494335)
|
||||
if hash != want {
|
||||
t.Errorf(`Got %v, want %v.`, hash, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHashSet(t *testing.T) {
|
||||
tree := [][]int{
|
||||
cubes := Cubes{
|
||||
{0, 0, 7, 3, 0, 0, 9},
|
||||
{1, 0, 7, 3, 0, 0, 9},
|
||||
{0, 0, 8, 3, 0, 0, 9},
|
||||
{1, 0, 8, 3, 0, 0, 9}}
|
||||
hs := HashSet(tree, FNV1a)
|
||||
hashSet := cubes.HashSet((Cube).FNV1aHash)
|
||||
want := []uint64{
|
||||
14647827280143437043,
|
||||
17530493565529410009,
|
||||
7065940388079601005,
|
||||
13953051952027146823}
|
||||
if !reflect.DeepEqual(hs, want) {
|
||||
t.Errorf(`Got %v, want %v.`, hs, want)
|
||||
6172277127052188606,
|
||||
3265650857171344968,
|
||||
13730239218993256724,
|
||||
6843127655045710906}
|
||||
if !reflect.DeepEqual(hashSet, want) {
|
||||
t.Errorf(`Got %v, want %v.`, hashSet, want)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue