switch to methods

master v1.0.0
Vitali Fedulov 2022-01-19 09:08:56 +01:00
parent 0d4c3c110c
commit c42552770d
6 changed files with 104 additions and 111 deletions

View File

@ -1,11 +1,11 @@
# Hashing float vectors in N-dimensions
This is an early beta version.
Package hyper allows fast approximate search of nearest neighbour vectors in n-dimensional space.
### Algorithm
**This is an early beta version**. Description below will be improved (TODO). See tests for examples.
https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
Package functions discretize a vector and generate a set of hashes, as described in the following document: https://similar.pictures/algorithm-for-hashing-high-dimensional-float-vectors.html
### How to use
about.go contains a short instruction.
To use the package follow the sequence of functions/methods:
1) CubeSet or CentralCube, depending on which one is used for a database record and which one for a query.
2) HashSet and DecimalHash to get the corresponding hash set and central hash from the results of (1). If DecimalHash is not suitable because of a very large number of buckets or dimensions, use FNV1aHash to get both the hash set and the central hash.

View File

@ -1,16 +0,0 @@
package hyper
// Package hyper allows fast approximate search of nearest
// neighbour vectors in n-dimensional space.
// Package functions discretize a vector and generate a set
// of fuzzy hashes, as described in the following document:
// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
// To use the package follow the sequence:
// 1) CubeSet or CentralCube, depending on which one
// is used for a database record and which one for a query,
// 2) HashSet and Decimal to get the corresponding hash set
// and central hash from the results of (1). If the Decimal hash
// is not suitable because of a very large number of buckets
// or dimensions, use FNV1a to get both the hash set and
// the central hash.

View File

@ -1,24 +1,18 @@
package hyper
// rescale shifts every component of vector by min and stretches it,
// so that values from [min, max] land in the [0, numBuckets] range.
// The input slice is not modified; a new slice of the same length
// is returned.
func rescale(vector []float64, numBuckets int, min, max float64) []float64 {
	span := max - min
	buckets := float64(numBuckets)
	out := make([]float64, len(vector))
	for i, v := range vector {
		// Offset to zero first, then scale to the bucket range.
		out[i] = (v - min) * buckets / span
	}
	return out
}
// Cube is a hypercube, represented by a slice of its coordinates.
// Cubes is a collection of such hypercubes.
type (
	Cube  []int
	Cubes []Cube
)
// clone makes a totally independent copy of a 2D slice.
func clone(src [][]int) (dst [][]int) {
dst = make([][]int, len(src))
for i := range src {
dst[i] = append([]int{}, src[i]...)
}
return dst
// Params holds the parameters of space discretization.
type Params struct {
	// Min is the lower value limit per dimension
	// (for example 0 for pixel values).
	Min float64
	// Max is the upper value limit per dimension
	// (for example 255 for pixel values).
	Max float64
	// EpsPercent is the uncertainty interval expressed as a
	// fraction of bucketWidth (for example 0.25 for
	// eps = 1/4 of bucketWidth).
	EpsPercent float64
	// NumBuckets is the number of buckets per dimension.
	NumBuckets int
}
// CubeSet returns a set of hypercubes, which represent
@ -29,49 +23,43 @@ func clone(src [][]int) (dst [][]int) {
// min and max are minimum and maximum possible values of
// the vector components. The assumption is that min and max
// are the same for all dimensions.
// numBuckets is number of buckets per dimension.
// min and max are value limits per dimension.
// epsPercent is the uncertainty interval expressed as
// a fraction of bucketWidth (for example 0.25 for eps = 1/4
// of bucketWidth).
func CubeSet(vector []float64, min, max, epsPercent float64,
numBuckets int) (set [][]int) {
func CubeSet(vector []float64, params Params) (set Cubes) {
if epsPercent >= 0.5 {
panic(`Error: epsPercent must be less than 0.5.`)
if params.EpsPercent >= 0.5 {
panic(`Error: EpsPercent must be less than 0.5.`)
}
var (
bC int // Central bucket number.
bL, bR int // Left and right bucket number.
setL, setR [][]int // Set copies.
branching bool // Branching flag.
bC int // Central bucket number.
bL, bR int // Left and right bucket number.
setL, setR Cubes // Set clones (for Left and Right).
branching bool // Branching flag.
)
// Rescaling vector to avoid potential mistakes with
// divisions and offsets later on.
rescaled := rescale(vector, numBuckets, min, max)
rescaled := rescale(vector, params)
// After the rescale, the value range of the vector is
// [0, numBuckets], and not [min, max].
// min = 0.0 from now on.
max = float64(numBuckets)
max := float64(params.NumBuckets)
for _, val := range rescaled {
branching = false
bL = int(val - epsPercent)
bR = int(val + epsPercent)
bL = int(val - params.EpsPercent)
bR = int(val + params.EpsPercent)
// Get extreme values out of the way.
if val-epsPercent <= 0.0 { // This means that val >= 0.
if val-params.EpsPercent <= 0.0 { // This means that val >= 0.
bC = bR
goto branchingCheck // No branching.
}
// Get extreme values out of the way.
if val+epsPercent >= max { // This means that val =< max.
if val+params.EpsPercent >= max { // This means that val =< max.
// Above max = numBuckets.
bC = bL
goto branchingCheck // No branching.
@ -135,33 +123,54 @@ func CubeSet(vector []float64, min, max, epsPercent float64,
// CentralCube returns the hypercube containing the vector end.
// Arguments are the same as for the CubeSet function.
func CentralCube(vector []float64, min, max, epsPercent float64,
numBuckets int) (central []int) {
func CentralCube(vector []float64, params Params) (central Cube) {
if epsPercent >= 0.5 {
panic(`Error: epsPercent must be less than 0.5.`)
if params.EpsPercent >= 0.5 {
panic(`Error: EpsPercent must be less than 0.5.`)
}
var bC int // Central bucket numbers.
// Rescaling vector to avoid potential mistakes with
// divisions and offsets later on.
rescaled := rescale(vector, numBuckets, min, max)
rescaled := rescale(vector, params)
// After the rescale, the value range of the vector is
// [0, numBuckets], and not [min, max].
// min = 0.0 from now on.
max = float64(numBuckets)
max := float64(params.NumBuckets)
for _, val := range rescaled {
bC = int(val)
if val-epsPercent <= 0.0 { // This means that val >= 0.
bC = int(val + epsPercent)
if val-params.EpsPercent <= 0.0 { // This means that val >= 0.
bC = int(val + params.EpsPercent)
}
if val+epsPercent >= max { // Meaning val =< max.
bC = int(val - epsPercent)
if val+params.EpsPercent >= max { // Meaning val =< max.
bC = int(val - params.EpsPercent)
}
central = append(central, bC)
}
return central
}
// rescale shifts every component of vector by params.Min and
// stretches it, so that values from [params.Min, params.Max] land
// in the [0, params.NumBuckets] range. The input slice is not
// modified; a new slice of the same length is returned.
func rescale(vector []float64, params Params) []float64 {
	span := params.Max - params.Min
	buckets := float64(params.NumBuckets)
	out := make([]float64, len(vector))
	for i, v := range vector {
		// Offset to zero first, then scale to the bucket range.
		out[i] = (v - params.Min) * buckets / span
	}
	return out
}
// clone returns a deep copy of src: the outer slice and every
// cube inside it get their own backing storage, so changes to
// the copy never show up in the source.
func clone(src Cubes) (dst Cubes) {
	dst = make(Cubes, 0, len(src))
	for _, cube := range src {
		// Appending to a fresh empty Cube forces a new backing array.
		dst = append(dst, append(Cube{}, cube...))
	}
	return dst
}

View File

@ -5,7 +5,7 @@ import (
"testing"
)
func centralIsNotInTheSet(set [][]int, central []int) bool {
func centralIsNotInTheSet(set Cubes, central Cube) bool {
for _, cube := range set {
counter := 0
for i, c := range central {
@ -21,9 +21,9 @@ func centralIsNotInTheSet(set [][]int, central []int) bool {
}
func TestRescale(t *testing.T) { // Testing panic.
numBuckets, min, max, _ := 10, 0.0, 255.0, 0.25
vector := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 255.0}
rescaled := rescale(vector, numBuckets, min, max)
params := Params{0.0, 255.0, 0.25, 10}
rescaled := rescale(vector, params)
got := rescaled
want := []float64{
1, 0.0003921568627450981, 8.24705882352941,
@ -38,20 +38,20 @@ func TestCubeSet1(t *testing.T) { // Testing panic.
defer func() { recover() }()
// Intentionally forbidden value for epsPercent.
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
min, max, epsPercent, numBuckets := 0.0, 255.0, 0.51, 10
_ = CubeSet(values, min, max, epsPercent, numBuckets)
params := Params{0.0, 255.0, 0.51, 10}
_ = CubeSet(values, params)
// Never reaches here if Params panics.
t.Errorf("Params did not panic on epsPercent > 0.5")
}
func TestCubeSet2(t *testing.T) {
numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.25
params := Params{0.0, 255.0, 0.25, 10}
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
gotCentral := CentralCube(values, min, max, epsPercent, numBuckets)
wantCubes := [][]int{{0, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9},
gotCubes := CubeSet(values, params)
gotCentral := CentralCube(values, params)
wantCubes := Cubes{{0, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9},
{0, 0, 8, 3, 0, 0, 9}, {1, 0, 8, 3, 0, 0, 9}}
wantCentral := []int{1, 0, 8, 3, 0, 0, 9}
wantCentral := Cube{1, 0, 8, 3, 0, 0, 9}
if !reflect.DeepEqual(gotCubes, wantCubes) {
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
}
@ -65,12 +65,12 @@ func TestCubeSet2(t *testing.T) {
// Testing bucket borders.
func TestCubeSet3(t *testing.T) {
numBuckets, min, max, epsPercent := 4, 0.0, 4.0, 0.25
params := Params{0.0, 4.0, 0.25, 4}
values := []float64{0.01, 2 * 0.999, 2 * 1.001}
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
gotCentral := CentralCube(values, min, max, epsPercent, numBuckets)
wantCubes := [][]int{{0, 1, 1}, {0, 2, 1}, {0, 1, 2}, {0, 2, 2}}
wantCentral := []int{0, 1, 2}
gotCubes := CubeSet(values, params)
gotCentral := CentralCube(values, params)
wantCubes := Cubes{{0, 1, 1}, {0, 2, 1}, {0, 1, 2}, {0, 2, 2}}
wantCentral := Cube{0, 1, 2}
if !reflect.DeepEqual(gotCubes, wantCubes) {
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
}
@ -85,9 +85,9 @@ func TestCubeSet3(t *testing.T) {
// Testing extreme buckets.
func TestCubeSet4(t *testing.T) {
values := []float64{255.0, 0.0, 255.0, 0.0, 255.0, 0.0, 255.0}
numBuckets, min, max, epsPercent := 4, 0.0, 255.0, 0.25
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
wantCubes := [][]int{{3, 0, 3, 0, 3, 0, 3}}
params := Params{0.0, 255.0, 0.25, 4}
gotCubes := CubeSet(values, params)
wantCubes := Cubes{{3, 0, 3, 0, 3, 0, 3}}
if !reflect.DeepEqual(gotCubes, wantCubes) {
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
}
@ -97,9 +97,9 @@ var vector = []float64{
0, 183, 148, 21, 47, 16, 69, 45, 151, 64, 181}
func TestCubeSet5(t *testing.T) {
numBuckets, min, max, epsPercent := 4, 0.0, 255.0, 0.25
gotCubes := CubeSet(vector, min, max, epsPercent, numBuckets)
wantCubes := [][]int{
params := Params{0.0, 255.0, 0.25, 4}
gotCubes := CubeSet(vector, params)
wantCubes := Cubes{
{0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2}, {0, 3, 2, 0, 0, 0, 0, 0, 2, 0, 2},
{0, 2, 2, 0, 0, 0, 1, 0, 2, 0, 2}, {0, 3, 2, 0, 0, 0, 1, 0, 2, 0, 2},
{0, 2, 2, 0, 0, 0, 0, 0, 2, 1, 2}, {0, 3, 2, 0, 0, 0, 0, 0, 2, 1, 2},

View File

@ -6,22 +6,22 @@ import (
"hash/fnv"
)
// Decimal hashes hypercubes without collisions. IMPORTANT:
// DecimalHash hashes hypercubes without collisions. IMPORTANT:
// To work correctly, the number of buckets must be
// less than 11 and the number of dimensions less than 20.
// Otherwise, at some unexpected moment you might get a hash
// value overflow.
func Decimal(cube []int) (h uint64) {
func (cube Cube) DecimalHash() (h uint64) {
	// Each coordinate is appended as the next decimal digit,
	// the first coordinate ending up most significant.
	for i := 0; i < len(cube); i++ {
		h *= 10
		h += uint64(cube[i])
	}
	return h
}
// FNV1a hashes hypercubes with rare collisions,
// FNV1aHash hashes hypercubes with rare collisions,
// and should be used when Decimal cannot be used
// because of very large number of buckets or dimensions.
func FNV1a(cube []int) uint64 {
func (cube Cube) FNV1aHash() uint64 {
var b bytes.Buffer
gob.NewEncoder(&b).Encode(cube)
hash := fnv.New64a()
@ -30,11 +30,11 @@ func FNV1a(cube []int) uint64 {
}
// HashFunc can be any function (also user-defined).
type HashFunc func(hypercube []int) uint64
type HashFunc func(cube Cube) uint64
// HashSet returns a set of hashes for a hypercube set
// and a concrete hash function.
func HashSet(cubeSet [][]int, hashFunc HashFunc) (
func (cubeSet Cubes) HashSet(hashFunc HashFunc) (
hs []uint64) {
for i := 0; i < len(cubeSet); i++ {
hs = append(hs, hashFunc(cubeSet[i]))

View File

@ -5,37 +5,37 @@ import (
"testing"
)
func TestDecimal(t *testing.T) {
hypercube := []int{3, 2, 0, 1, 1, 4, 1, 0}
hash := Decimal(hypercube)
// TestDecimalHash checks that the coordinates of a cube are
// concatenated as decimal digits into the expected hash value.
func TestDecimalHash(t *testing.T) {
	const want uint64 = 32011410
	got := Cube{3, 2, 0, 1, 1, 4, 1, 0}.DecimalHash()
	if got == want {
		return
	}
	t.Errorf(`Got %v, want %v.`, got, want)
}
func TestFNV1a(t *testing.T) {
buckets := []int{5, 59, 255, 9, 7, 12, 22, 31}
hash := FNV1a(buckets)
want := uint64(13992349377752315208)
// TestFNV1aHash pins the FNV-1a hash of a fixed cube to a known value.
func TestFNV1aHash(t *testing.T) {
	const want uint64 = 1659788114117494335
	got := Cube{5, 59, 255, 9, 7, 12, 22, 31}.FNV1aHash()
	if got == want {
		return
	}
	t.Errorf(`Got %v, want %v.`, got, want)
}
func TestHashSet(t *testing.T) {
tree := [][]int{
cubes := Cubes{
{0, 0, 7, 3, 0, 0, 9},
{1, 0, 7, 3, 0, 0, 9},
{0, 0, 8, 3, 0, 0, 9},
{1, 0, 8, 3, 0, 0, 9}}
hs := HashSet(tree, FNV1a)
hashSet := cubes.HashSet((Cube).FNV1aHash)
want := []uint64{
14647827280143437043,
17530493565529410009,
7065940388079601005,
13953051952027146823}
if !reflect.DeepEqual(hs, want) {
t.Errorf(`Got %v, want %v.`, hs, want)
6172277127052188606,
3265650857171344968,
13730239218993256724,
6843127655045710906}
if !reflect.DeepEqual(hashSet, want) {
t.Errorf(`Got %v, want %v.`, hashSet, want)
}
}