// Params derives the discretization parameters used by Buckets.
//
// numBuckets is the number of discretization buckets into which all values
// fall; the ids of those buckets are later used to create hashes.
// min and max are the minimum and maximum possible values of the
// discretized variable.
// bucketPct is the fraction of a bucket width tolerated as an error of the
// discretized variable (a value close to a bucket border may fall into
// 2 buckets simultaneously). It must be below 0.5, otherwise Params panics.
//
// Params returns bucketWidth, computed as (max-min)/numBuckets, and eps,
// computed as bucketPct*bucketWidth, i.e. the tolerance expressed on the
// axis of the discretized variable.
func Params(numBuckets int, min, max, bucketPct float64) (bucketWidth, eps float64) {
	if 0.5 <= bucketPct {
		panic("Error: bucketPct must be less than 50%. Recommendation: decrease numBuckets instead.")
	}

	span := max - min
	width := span / float64(numBuckets)

	return width, bucketPct * width
}
// Buckets returns every combination of bucket ids that the n-dimensional
// point may fall into once each of its coordinates is discretized.
//
// For each coordinate val two candidate ids are computed:
// int((val-eps)/bucketWidth) and int((val+eps)/bucketWidth). When they are
// equal, that id is appended to every combination built so far. When they
// differ, the coordinate lies within eps of a bucket border, so every
// combination is duplicated: the existing ones receive the left id and
// their copies receive the right id. The copies are placed after the
// existing combinations, which fixes the order of the result.
//
// min and max are the minimum and maximum possible coordinate values and
// are assumed to be the same for all dimensions. They are only used to
// suppress the second bucket when val-eps < min or val+eps > max; the ids
// themselves are derived from val/bucketWidth, not from val-min.
// bucketWidth and eps are the values returned by Params.
//
// Every returned slice has len(point) elements and its own backing array.
// An empty point yields a nil result.
func Buckets(point []float64, min, max, bucketWidth, eps float64) (tree [][]int) {
	dims := len(point)

	for _, val := range point {
		bL := int((val - eps) / bucketWidth) // Left bucket id.
		bR := int((val + eps) / bucketWidth) // Right bucket id.

		switch {
		case val-eps < min: // No bucket for smaller than min.
			bL = bR
		case val+eps > max: // No bucket for larger than max.
			bR = bL
		}

		// First coordinate: seed the tree with one or two branches.
		if len(tree) == 0 {
			left := make([]int, 1, dims)
			left[0] = bL
			tree = append(tree, left)
			if bL != bR {
				right := make([]int, 1, dims)
				right[0] = bR
				tree = append(tree, right)
			}
			continue
		}

		// No branching: extend every existing combination in place.
		if bL == bR {
			for i := range tree {
				tree[i] = append(tree[i], bL)
			}
			continue
		}

		// Branching. Each right-hand branch must get its own backing array:
		// a shallow copy of tree would share storage with the left-hand
		// branch, and whenever that storage has spare capacity the second
		// append would overwrite the id written by the first.
		rights := make([][]int, len(tree))
		for i, branch := range tree {
			twin := make([]int, len(branch), dims)
			copy(twin, branch)
			rights[i] = append(twin, bR)
			tree[i] = append(branch, bL)
		}
		tree = append(tree, rights...)
	}

	// Verification that branching works correctly and no buckets are lost.
	for _, branch := range tree {
		if len(branch) != dims {
			panic(`Buckets slice length must be equal to len(point).`)
		}
	}

	return tree
}
Number of buckets is %v.`, got, want, numBuckets) + } +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..97b44e2 --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/vitali-fedulov/hyper + +go 1.16 diff --git a/hashes.go b/hashes.go new file mode 100644 index 0000000..8f72209 --- /dev/null +++ b/hashes.go @@ -0,0 +1,28 @@ +package hyper + +import ( + "bytes" + "encoding/gob" + "hash/fnv" +) + +// For a specific hashing function to be (re)defined. +type Hash func(buckets []int) uint64 + +// Fnva64 is a specific hash implementation, which returns +// a FVN-1a hash for a slice of bucket numbers. +func Fnva64(buckets []int) uint64 { + var b bytes.Buffer + gob.NewEncoder(&b).Encode(buckets) + hash := fnv.New64a() + hash.Write(b.Bytes()) + return hash.Sum64() +} + +// HashSet returns a slice of hashes for a tree of bucket ids. +func HashSet(tree [][]int, hash Hash) (hs []uint64) { + for i := 0; i < len(tree); i++ { + hs = append(hs, hash(tree[i])) + } + return hs +} diff --git a/hashes_test.go b/hashes_test.go new file mode 100644 index 0000000..e9a0311 --- /dev/null +++ b/hashes_test.go @@ -0,0 +1,32 @@ +package hyper + +import ( + "reflect" + "testing" +) + +func TestFnva64(t *testing.T) { + buckets := []int{5, 59, 255, 9, 7, 12, 22, 31} + hash := Fnva64(buckets) + want := uint64(13992349377752315208) + if hash != want { + t.Errorf(`Got %v, want %v`, hash, want) + } +} + +func TestHashSet(t *testing.T) { + tree := [][]int{ + {0, 0, 7, 3, 0, 0, 9}, + {1, 0, 7, 3, 0, 0, 9}, + {0, 0, 8, 3, 0, 0, 9}, + {1, 0, 8, 3, 0, 0, 9}} + hs := HashSet(tree, Fnva64) + want := []uint64{ + 14647827280143437043, + 17530493565529410009, + 7065940388079601005, + 13953051952027146823} + if !reflect.DeepEqual(hs, want) { + t.Errorf(`Got %v, want %v`, hs, want) + } +}