API update. Fast decimal hash added.

2021-12-19 04:53:51 +01:00 · 2021-12-19 04:53:51 +01:00 · fa46a571ec
parent fdb7af71e7
commit fa46a571ec
5 changed files with 84 additions and 28 deletions
--- a/about.go
+++ b/about.go
@ -7,8 +7,10 @@ package hyper
 // https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html

 // A typical sequence of functions when using the package is:
-// 1) Params, 2) Hypercubes, 3) FVN1a to get the central hash,
-// and Hashes64 with FVN1a as the hash argument to get
-// the full hash set.
+// 1) Params, 2) CubeSet or CentralCube, depending on which one
+// is used for a database record and which one for a query,
+// 3) HashSet or CentralHash to get corresponding hashes
+// from results of (2).

-// You can also define own function for hashing hypercubes.
+// It is possible to define your own hashing function instead of
+// using the default one.
--- a/hashes.go
+++ b/hashes.go
@ -6,22 +6,47 @@ import (
 	"hash/fnv"
 )

-// Hash64 can be any function (user defined, for example).
-type Hash64 func(buckets []int) uint64
+// Decimal hashes hypercubes without collisions. For that
+// it assumes that the number of buckets is 10 or less
+// and the number of dimensions is 19 or less.
+func Decimal(cube []int, numBuckets int) (h uint64) {
+	if numBuckets > 10 {
+		panic(`Decimal hash can only be used if
+		numBuckets <= 10. FVN1a can be used instead.`)
+	}
+	// Max uint64 equals 18446744073709551615,
+	// therefore a larger number of dimensions could overflow.
+	if len(cube) > 19 {
+		panic(`Decimal hash can only be used if
+		number of dimensions is less than 20.
+		FVN1a hash can be used instead.`)
+	}
+	for _, v := range cube {
+		h = h*10 + uint64(v)
+	}
+	return h
+}

-// FVN1a is the default hash in this package.
-func FVN1a(buckets []int) uint64 {
+// FVN1a hashes hypercubes with rare collisions,
+// and should be used when Decimal cannot be used
+// because of a very large number of buckets or dimensions.
+func FVN1a(cube []int) uint64 {
 	var b bytes.Buffer
-	gob.NewEncoder(&b).Encode(buckets)
+	gob.NewEncoder(&b).Encode(cube)
 	hash := fnv.New64a()
 	hash.Write(b.Bytes())
 	return hash.Sum64()
 }

-// Hashes64 returns a set of hashes for a tree of bucket ids.
-func Hashes64(tree [][]int, hash Hash64) (hs []uint64) {
-	for i := 0; i < len(tree); i++ {
-		hs = append(hs, hash(tree[i]))
+// HashFunc can be any function (also user-defined).
+type HashFunc func(hypercube []int) uint64
+
+// HashSet returns a set of hashes for a hypercube set
+// and a concrete hash function.
+func HashSet(cubeSet [][]int, hashFunc HashFunc) (
+	hs []uint64) {
+	for i := 0; i < len(cubeSet); i++ {
+		hs = append(hs, hashFunc(cubeSet[i]))
 	}
 	return hs
 }
--- a/hashes_test.go
+++ b/hashes_test.go
@ -5,7 +5,17 @@ import (
 	"testing"
 )

-func TestDefault(t *testing.T) {
+func TestDecimal(t *testing.T) {
+	numBuckets := 5
+	hypercube := []int{3, 2, 0, 1, 1, 4, 1, 0}
+	hash := Decimal(hypercube, numBuckets)
+	want := uint64(32011410)
+	if hash != want {
+		t.Errorf(`Got %v, want %v.`, hash, want)
+	}
+}
+
+func TestFVN1a(t *testing.T) {
 	buckets := []int{5, 59, 255, 9, 7, 12, 22, 31}
 	hash := FVN1a(buckets)
 	want := uint64(13992349377752315208)
@ -14,13 +24,13 @@ func TestDefault(t *testing.T) {
 	}
 }

-func TestHashes64(t *testing.T) {
+func TestHashSet(t *testing.T) {
 	tree := [][]int{
 		{0, 0, 7, 3, 0, 0, 9},
 		{1, 0, 7, 3, 0, 0, 9},
 		{0, 0, 8, 3, 0, 0, 9},
 		{1, 0, 8, 3, 0, 0, 9}}
-	hs := Hashes64(tree, FVN1a)
+	hs := HashSet(tree, FVN1a)
 	want := []uint64{
 		14647827280143437043,
 		17530493565529410009,
--- a/hypercubes.go
+++ b/hypercubes.go
@ -7,7 +7,8 @@ package hyper
 // of bucketWidth.
 // eps is the absolute value of the uncertainty interval epsilon.
 func Params(
-	numBuckets int, min, max, epsPercent float64) (bucketWidth, eps float64) {
+	numBuckets int, min, max, epsPercent float64) (
+	bucketWidth, eps float64) {
 	if epsPercent >= 0.5 {
 		panic(`Error: epsPercent must be less than 50%.
 			Recommendation: decrease numBuckets instead.`)
@ -17,19 +18,18 @@ func Params(
 	return bucketWidth, eps
 }

-// Hypercubes returns a set of hypercubes, which represent
-// fuzzy discretization of one n-dimensional vector, as described in
+// CubeSet returns a set of hypercubes, which represent
+// fuzzy discretization of one n-dimensional vector,
+// as described in
 // https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
 // One hypercube is defined by bucket numbers in each dimension.
-// The function also returns the central hypercube (in which
-// the vector end is located).
 // min and max are minimum and maximum possible values of
 // the vector components. The assumption is that min and max
 // are the same for all dimensions.
 // bucketWidth and eps are defined in the Params function.
-func Hypercubes(
+func CubeSet(
 	vector []float64, min, max, bucketWidth, eps float64) (
-	set [][]int, central []int) {
+	set [][]int) {

 	var (
 		bC, bS    int     // Central and side bucket ids.
@ -42,7 +42,6 @@ func Hypercubes(
 	for _, val := range vector {

 		bC = int(val / bucketWidth)
-		central = append(central, bC)
 		branching = false

 		// Value is in the lower uncertainty interval.
@ -103,9 +102,27 @@ func Hypercubes(
 	length = len(vector)
 	for i := 0; i < len(set); i++ {
 		if len(set[i]) != length {
-			panic(`Number of hypercube coordinates must equal to len(vector).`)
+			panic(`Number of hypercube coordinates must equal
+			to len(vector).`)
 		}
 	}

-	return set, central
+	return set
+}
+
+// CentralCube returns the hypercube containing the vector end.
+// Arguments are the same as for the CubeSet function.
+func CentralCube(
+	vector []float64, min, max, bucketWidth, eps float64) (
+	central []int) {
+
+	var bC int // Central bucket ids.
+
+	// For each component of the vector.
+	for _, val := range vector {
+		bC = int(val / bucketWidth)
+		central = append(central, bC)
+	}
+
+	return central
 }
--- a/hypercubes_test.go
+++ b/hypercubes_test.go
@ -30,7 +30,8 @@ func TestHypercubes(t *testing.T) {
 	numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25
 	values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
 	bucketWidth, eps := Params(numBuckets, min, max, bucketPct)
-	gotCubes, gotCentral := Hypercubes(values, min, max, bucketWidth, eps)
+	gotCubes := CubeSet(values, min, max, bucketWidth, eps)
+	gotCentral := CentralCube(values, min, max, bucketWidth, eps)
 	wantCubes := [][]int{{1, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9},
 		{1, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9}}
 	wantCentral := []int{1, 0, 8, 3, 0, 0, 9}
@ -45,7 +46,8 @@ func TestHypercubes(t *testing.T) {
 	}

 	values = []float64{0.01, bucketWidth * 2 * 0.999, bucketWidth * 2 * 1.001}
-	gotCubes, gotCentral = Hypercubes(values, min, max, bucketWidth, eps)
+	gotCubes = CubeSet(values, min, max, bucketWidth, eps)
+	gotCentral = CentralCube(values, min, max, bucketWidth, eps)
 	wantCubes = [][]int{{0, 1, 2}, {0, 2, 2}, {0, 1, 1}, {0, 2, 1}}
 	wantCentral = []int{0, 1, 2}
 	if !reflect.DeepEqual(gotCubes, wantCubes) {