diff --git a/about.go b/about.go index 3a663a7..7469a7a 100644 --- a/about.go +++ b/about.go @@ -7,8 +7,10 @@ package hyper // https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html // A typical sequence of functions when using the package is: -// 1) Params, 2) Hypercubes, 3) FVN1a to get the central hash, -// and Hashes64 with FVN1a as the hash argument to get -// the full hash set. +// 1) Params, 2) CubeSet or CentralCube, depending on which one +// is used for a database record and which one for a query, +// 3) HashSet or CentralHash to get corresponding hashes +// from results of (2). -// You can also define own function for hashing hypercubes. +// It is possible to define your own hashing function instead of +// using the default one. diff --git a/hashes.go b/hashes.go index 3054c4e..31cf378 100644 --- a/hashes.go +++ b/hashes.go @@ -6,22 +6,47 @@ import ( "hash/fnv" ) -// Hash64 can be any function (user defined, for example). -type Hash64 func(buckets []int) uint64 +// Decimal hashes hypercubes without collisions. For that +// it assumes that the number of buckets is 10 or less +// and the number of dimensions is 19 or less. +func Decimal(cube []int, numBuckets int) (h uint64) { + if numBuckets > 10 { + panic(`Decimal hash can only be used if + numBuckets <= 10. FVN1a can be used instead.`) + } + // Max uint64 equals 18446744073709551615, + // therefore a larger number of dimensions will overflow. + if len(cube) > 19 { + panic(`Decimal hash can only be used if + number of dimensions is less than 20. + FVN1a hash can be used instead.`) + } + for _, v := range cube { + h = h*10 + uint64(v) + } + return h +} -// FVN1a is the default hash in this package. -func FVN1a(buckets []int) uint64 { +// FVN1a hashes hypercubes with rare collisions, +// and should be used when Decimal cannot be used +// because of a very large number of buckets or dimensions. 
+func FVN1a(cube []int) uint64 { var b bytes.Buffer - gob.NewEncoder(&b).Encode(buckets) + gob.NewEncoder(&b).Encode(cube) hash := fnv.New64a() hash.Write(b.Bytes()) return hash.Sum64() } -// Hashes64 returns a set of hashes for a tree of bucket ids. -func Hashes64(tree [][]int, hash Hash64) (hs []uint64) { - for i := 0; i < len(tree); i++ { - hs = append(hs, hash(tree[i])) +// HashFunc can be any function (also user-defined). +type HashFunc func(hypercube []int) uint64 + +// HashSet returns a set of hashes for a hypercube set +// and a concrete hash function. +func HashSet(cubeSet [][]int, hashFunc HashFunc) ( + hs []uint64) { + for i := 0; i < len(cubeSet); i++ { + hs = append(hs, hashFunc(cubeSet[i])) } return hs } diff --git a/hashes_test.go b/hashes_test.go index 6e78550..7f1c9af 100644 --- a/hashes_test.go +++ b/hashes_test.go @@ -5,7 +5,17 @@ import ( "testing" ) -func TestDefault(t *testing.T) { +func TestDecimal(t *testing.T) { + numBuckets := 5 + hypercube := []int{3, 2, 0, 1, 1, 4, 1, 0} + hash := Decimal(hypercube, numBuckets) + want := uint64(32011410) + if hash != want { + t.Errorf(`Got %v, want %v.`, hash, want) + } +} + +func TestFVN1a(t *testing.T) { buckets := []int{5, 59, 255, 9, 7, 12, 22, 31} hash := FVN1a(buckets) want := uint64(13992349377752315208) @@ -14,13 +24,13 @@ func TestDefault(t *testing.T) { } } -func TestHashes64(t *testing.T) { +func TestHashSet(t *testing.T) { tree := [][]int{ {0, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9}, {1, 0, 8, 3, 0, 0, 9}} - hs := Hashes64(tree, FVN1a) + hs := HashSet(tree, FVN1a) want := []uint64{ 14647827280143437043, 17530493565529410009, diff --git a/hypercubes.go b/hypercubes.go index 1268e4d..a9624a1 100644 --- a/hypercubes.go +++ b/hypercubes.go @@ -7,7 +7,8 @@ package hyper // of bucketWidth. // eps is the absolute value of the uncertainty interval epsilon. 
func Params( - numBuckets int, min, max, epsPercent float64) (bucketWidth, eps float64) { + numBuckets int, min, max, epsPercent float64) ( + bucketWidth, eps float64) { if epsPercent >= 0.5 { panic(`Error: epsPercent must be less than 50%. Recommendation: decrease numBuckets instead.`) @@ -17,19 +18,18 @@ func Params( return bucketWidth, eps } -// Hypercubes returns a set of hypercubes, which represent -// fuzzy discretization of one n-dimensional vector, as described in +// CubeSet returns a set of hypercubes, which represent +// fuzzy discretization of one n-dimensional vector, +// as described in // https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html // One hupercube is defined by bucket numbers in each dimension. -// The function also returns the central hypercube (in which -// the vector end is located). // min and max are minimum and maximum possible values of // the vector components. The assumption is that min and max // are the same for all dimensions. // bucketWidth and eps are defined in the Params function. -func Hypercubes( +func CubeSet( vector []float64, min, max, bucketWidth, eps float64) ( - set [][]int, central []int) { + set [][]int) { var ( bC, bS int // Central and side bucket ids. @@ -42,7 +42,6 @@ func Hypercubes( for _, val := range vector { bC = int(val / bucketWidth) - central = append(central, bC) branching = false // Value is in the lower uncertainty interval. @@ -103,9 +102,27 @@ func Hypercubes( length = len(vector) for i := 0; i < len(set); i++ { if len(set[i]) != length { - panic(`Number of hypercube coordinates must equal to len(vector).`) + panic(`Number of hypercube coordinates must equal + to len(vector).`) } } - return set, central + return set +} + +// CentralCube returns the hypercube containing the vector end. +// Arguments are the same as for the CubeSet function. 
+func CentralCube( + vector []float64, min, max, bucketWidth, eps float64) ( + central []int) { + + var bC int // Central bucket ids. + + // For each component of the vector. + for _, val := range vector { + bC = int(val / bucketWidth) + central = append(central, bC) + } + + return central } diff --git a/hypercubes_test.go b/hypercubes_test.go index 559d45e..7ce3b17 100644 --- a/hypercubes_test.go +++ b/hypercubes_test.go @@ -30,7 +30,8 @@ func TestHypercubes(t *testing.T) { numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25 values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9} bucketWidth, eps := Params(numBuckets, min, max, bucketPct) - gotCubes, gotCentral := Hypercubes(values, min, max, bucketWidth, eps) + gotCubes := CubeSet(values, min, max, bucketWidth, eps) + gotCentral := CentralCube(values, min, max, bucketWidth, eps) wantCubes := [][]int{{1, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9}} wantCentral := []int{1, 0, 8, 3, 0, 0, 9} @@ -45,7 +46,8 @@ func TestHypercubes(t *testing.T) { } values = []float64{0.01, bucketWidth * 2 * 0.999, bucketWidth * 2 * 1.001} - gotCubes, gotCentral = Hypercubes(values, min, max, bucketWidth, eps) + gotCubes = CubeSet(values, min, max, bucketWidth, eps) + gotCentral = CentralCube(values, min, max, bucketWidth, eps) wantCubes = [][]int{{0, 1, 2}, {0, 2, 2}, {0, 1, 1}, {0, 2, 1}} wantCentral = []int{0, 1, 2} if !reflect.DeepEqual(gotCubes, wantCubes) {