API update. Fast decimal hash added.
parent
fdb7af71e7
commit
fa46a571ec
10
about.go
10
about.go
|
@ -7,8 +7,10 @@ package hyper
|
||||||
// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
|
// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
|
||||||
|
|
||||||
// A typical sequence of functions when using the package is:
|
// A typical sequence of functions when using the package is:
|
||||||
// 1) Params, 2) Hypercubes, 3) FVN1a to get the central hash,
|
// 1) Params, 2) CubeSet or CentralCube, depending on which one
|
||||||
// and Hashes64 with FVN1a as the hash argument to get
|
// is used for a database record and which one for a query,
|
||||||
// the full hash set.
|
// 3) HashSet or CentralHash to get corresponding hashes
|
||||||
|
// from results of (2).
|
||||||
|
|
||||||
// You can also define own function for hashing hypercubes.
|
// It is possible to define your own hashing function instead of
|
||||||
|
// using the default one.
|
||||||
|
|
43
hashes.go
43
hashes.go
|
@ -6,22 +6,47 @@ import (
|
||||||
"hash/fnv"
|
"hash/fnv"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Hash64 can be any function (user defined, for example).
|
// Decimal hashes hypercubes without collisions. For that
|
||||||
type Hash64 func(buckets []int) uint64
|
// it assumes that the number of buckets is 10 or less
|
||||||
|
// and the number of dimensions is 19 or less.
|
||||||
|
func Decimal(cube []int, numBuckets int) (h uint64) {
|
||||||
|
if numBuckets > 10 {
|
||||||
|
panic(`Decimal hash can only be used if
|
||||||
|
numBuckets <= 10. FVN1a can be used instead.`)
|
||||||
|
}
|
||||||
|
// Max uint64 equals 18446744073709551615,
|
||||||
|
// therefore 20 or more dimensions (decimal digits) could overflow.
|
||||||
|
if len(cube) > 19 {
|
||||||
|
panic(`Decimal hash can only be used if
|
||||||
|
number of dimensions is less than 20.
|
||||||
|
FVN1a hash can be used instead.`)
|
||||||
|
}
|
||||||
|
for _, v := range cube {
|
||||||
|
h = h*10 + uint64(v)
|
||||||
|
}
|
||||||
|
return h
|
||||||
|
}
|
||||||
|
|
||||||
// FVN1a is the default hash in this package.
|
// FVN1a hashes hypercubes with rare collisions,
|
||||||
func FVN1a(buckets []int) uint64 {
|
// and should be used when Decimal cannot be used
|
||||||
|
// because there are more than 10 buckets or more than 19 dimensions.
|
||||||
|
func FVN1a(cube []int) uint64 {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
gob.NewEncoder(&b).Encode(buckets)
|
gob.NewEncoder(&b).Encode(cube)
|
||||||
hash := fnv.New64a()
|
hash := fnv.New64a()
|
||||||
hash.Write(b.Bytes())
|
hash.Write(b.Bytes())
|
||||||
return hash.Sum64()
|
return hash.Sum64()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Hashes64 returns a set of hashes for a tree of bucket ids.
|
// HashFunc can be any function (also user-defined).
|
||||||
func Hashes64(tree [][]int, hash Hash64) (hs []uint64) {
|
type HashFunc func(hypercube []int) uint64
|
||||||
for i := 0; i < len(tree); i++ {
|
|
||||||
hs = append(hs, hash(tree[i]))
|
// HashSet returns a set of hashes for a hypercube set
|
||||||
|
// and a concrete hash function.
|
||||||
|
func HashSet(cubeSet [][]int, hashFunc HashFunc) (
|
||||||
|
hs []uint64) {
|
||||||
|
for i := 0; i < len(cubeSet); i++ {
|
||||||
|
hs = append(hs, hashFunc(cubeSet[i]))
|
||||||
}
|
}
|
||||||
return hs
|
return hs
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,7 +5,17 @@ import (
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestDefault(t *testing.T) {
|
func TestDecimal(t *testing.T) {
|
||||||
|
numBuckets := 5
|
||||||
|
hypercube := []int{3, 2, 0, 1, 1, 4, 1, 0}
|
||||||
|
hash := Decimal(hypercube, numBuckets)
|
||||||
|
want := uint64(32011410)
|
||||||
|
if hash != want {
|
||||||
|
t.Errorf(`Got %v, want %v.`, hash, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFVN1a(t *testing.T) {
|
||||||
buckets := []int{5, 59, 255, 9, 7, 12, 22, 31}
|
buckets := []int{5, 59, 255, 9, 7, 12, 22, 31}
|
||||||
hash := FVN1a(buckets)
|
hash := FVN1a(buckets)
|
||||||
want := uint64(13992349377752315208)
|
want := uint64(13992349377752315208)
|
||||||
|
@ -14,13 +24,13 @@ func TestDefault(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestHashes64(t *testing.T) {
|
func TestHashSet(t *testing.T) {
|
||||||
tree := [][]int{
|
tree := [][]int{
|
||||||
{0, 0, 7, 3, 0, 0, 9},
|
{0, 0, 7, 3, 0, 0, 9},
|
||||||
{1, 0, 7, 3, 0, 0, 9},
|
{1, 0, 7, 3, 0, 0, 9},
|
||||||
{0, 0, 8, 3, 0, 0, 9},
|
{0, 0, 8, 3, 0, 0, 9},
|
||||||
{1, 0, 8, 3, 0, 0, 9}}
|
{1, 0, 8, 3, 0, 0, 9}}
|
||||||
hs := Hashes64(tree, FVN1a)
|
hs := HashSet(tree, FVN1a)
|
||||||
want := []uint64{
|
want := []uint64{
|
||||||
14647827280143437043,
|
14647827280143437043,
|
||||||
17530493565529410009,
|
17530493565529410009,
|
||||||
|
|
|
@ -7,7 +7,8 @@ package hyper
|
||||||
// of bucketWidth.
|
// of bucketWidth.
|
||||||
// eps is the absolute value of the uncertainty interval epsilon.
|
// eps is the absolute value of the uncertainty interval epsilon.
|
||||||
func Params(
|
func Params(
|
||||||
numBuckets int, min, max, epsPercent float64) (bucketWidth, eps float64) {
|
numBuckets int, min, max, epsPercent float64) (
|
||||||
|
bucketWidth, eps float64) {
|
||||||
if epsPercent >= 0.5 {
|
if epsPercent >= 0.5 {
|
||||||
panic(`Error: epsPercent must be less than 50%.
|
panic(`Error: epsPercent must be less than 50%.
|
||||||
Recommendation: decrease numBuckets instead.`)
|
Recommendation: decrease numBuckets instead.`)
|
||||||
|
@ -17,19 +18,18 @@ func Params(
|
||||||
return bucketWidth, eps
|
return bucketWidth, eps
|
||||||
}
|
}
|
||||||
|
|
||||||
// Hypercubes returns a set of hypercubes, which represent
|
// CubeSet returns a set of hypercubes, which represent
|
||||||
// fuzzy discretization of one n-dimensional vector, as described in
|
// fuzzy discretization of one n-dimensional vector,
|
||||||
|
// as described in
|
||||||
// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
|
// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
|
||||||
// One hypercube is defined by bucket numbers in each dimension.
|
// One hypercube is defined by bucket numbers in each dimension.
|
||||||
// The function also returns the central hypercube (in which
|
|
||||||
// the vector end is located).
|
|
||||||
// min and max are minimum and maximum possible values of
|
// min and max are minimum and maximum possible values of
|
||||||
// the vector components. The assumption is that min and max
|
// the vector components. The assumption is that min and max
|
||||||
// are the same for all dimensions.
|
// are the same for all dimensions.
|
||||||
// bucketWidth and eps are defined in the Params function.
|
// bucketWidth and eps are defined in the Params function.
|
||||||
func Hypercubes(
|
func CubeSet(
|
||||||
vector []float64, min, max, bucketWidth, eps float64) (
|
vector []float64, min, max, bucketWidth, eps float64) (
|
||||||
set [][]int, central []int) {
|
set [][]int) {
|
||||||
|
|
||||||
var (
|
var (
|
||||||
bC, bS int // Central and side bucket ids.
|
bC, bS int // Central and side bucket ids.
|
||||||
|
@ -42,7 +42,6 @@ func Hypercubes(
|
||||||
for _, val := range vector {
|
for _, val := range vector {
|
||||||
|
|
||||||
bC = int(val / bucketWidth)
|
bC = int(val / bucketWidth)
|
||||||
central = append(central, bC)
|
|
||||||
branching = false
|
branching = false
|
||||||
|
|
||||||
// Value is in the lower uncertainty interval.
|
// Value is in the lower uncertainty interval.
|
||||||
|
@ -103,9 +102,27 @@ func Hypercubes(
|
||||||
length = len(vector)
|
length = len(vector)
|
||||||
for i := 0; i < len(set); i++ {
|
for i := 0; i < len(set); i++ {
|
||||||
if len(set[i]) != length {
|
if len(set[i]) != length {
|
||||||
panic(`Number of hypercube coordinates must equal to len(vector).`)
|
panic(`Number of hypercube coordinates must equal
|
||||||
|
to len(vector).`)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return set, central
|
return set
|
||||||
|
}
|
||||||
|
|
||||||
|
// CentralCube returns the hypercube containing the vector end.
|
||||||
|
// Arguments are the same as for the CubeSet function.
|
||||||
|
func CentralCube(
|
||||||
|
vector []float64, min, max, bucketWidth, eps float64) (
|
||||||
|
central []int) {
|
||||||
|
|
||||||
|
var bC int // Central bucket ids.
|
||||||
|
|
||||||
|
// For each component of the vector.
|
||||||
|
for _, val := range vector {
|
||||||
|
bC = int(val / bucketWidth)
|
||||||
|
central = append(central, bC)
|
||||||
|
}
|
||||||
|
|
||||||
|
return central
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,7 +30,8 @@ func TestHypercubes(t *testing.T) {
|
||||||
numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25
|
numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25
|
||||||
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
|
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
|
||||||
bucketWidth, eps := Params(numBuckets, min, max, bucketPct)
|
bucketWidth, eps := Params(numBuckets, min, max, bucketPct)
|
||||||
gotCubes, gotCentral := Hypercubes(values, min, max, bucketWidth, eps)
|
gotCubes := CubeSet(values, min, max, bucketWidth, eps)
|
||||||
|
gotCentral := CentralCube(values, min, max, bucketWidth, eps)
|
||||||
wantCubes := [][]int{{1, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9},
|
wantCubes := [][]int{{1, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9},
|
||||||
{1, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9}}
|
{1, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9}}
|
||||||
wantCentral := []int{1, 0, 8, 3, 0, 0, 9}
|
wantCentral := []int{1, 0, 8, 3, 0, 0, 9}
|
||||||
|
@ -45,7 +46,8 @@ func TestHypercubes(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
values = []float64{0.01, bucketWidth * 2 * 0.999, bucketWidth * 2 * 1.001}
|
values = []float64{0.01, bucketWidth * 2 * 0.999, bucketWidth * 2 * 1.001}
|
||||||
gotCubes, gotCentral = Hypercubes(values, min, max, bucketWidth, eps)
|
gotCubes = CubeSet(values, min, max, bucketWidth, eps)
|
||||||
|
gotCentral = CentralCube(values, min, max, bucketWidth, eps)
|
||||||
wantCubes = [][]int{{0, 1, 2}, {0, 2, 2}, {0, 1, 1}, {0, 2, 1}}
|
wantCubes = [][]int{{0, 1, 2}, {0, 2, 2}, {0, 1, 1}, {0, 2, 1}}
|
||||||
wantCentral = []int{0, 1, 2}
|
wantCentral = []int{0, 1, 2}
|
||||||
if !reflect.DeepEqual(gotCubes, wantCubes) {
|
if !reflect.DeepEqual(gotCubes, wantCubes) {
|
||||||
|
|
Loading…
Reference in New Issue