diff --git a/NEXT b/NEXT deleted file mode 100644 index 0d5e890..0000000 --- a/NEXT +++ /dev/null @@ -1,4 +0,0 @@ -- Panic in verification that 2*eps is less than bucket width. - Make sure the "if" equality is correct. - -- Make sure hash contains ":" between bucket numbers. Or the analogy to ":". \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..cca3119 --- /dev/null +++ b/README.md @@ -0,0 +1,17 @@ +# Hashing float vectors in N-dimensions + +This is a working beta version. + +### Algorithm + +https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html + +### How to use + +about.go contains a short instruction. + +Please fork if you want to use it long term, as the API is not finalized yet. + +### TODO + +Add an example, for example related to package "images". \ No newline at end of file diff --git a/about.go b/about.go index fdd2c59..129eb3c 100644 --- a/about.go +++ b/about.go @@ -1,4 +1,14 @@ package hyper -// Package hyper discretizes n-dimensional space and generates hashes, -// so that fast approximate search of nearest points in n-space is possible. +// Package hyper allows fast approximate search of nearest +// neighbour vectors in n-dimensional space. +// Package functions discretize a vector and generate a set +// of fuzzy hashes, as described in the following paper: +// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html + +// A typical sequence of functions when using the package is: +// 1) Params, 2) Hypercubes, 3) FVN1a to get the central hash, +// and Hashes64 with FVN1a as the hash argument to get +// the full hash set. + +// You can also define your own function for hashing hypercubes. diff --git a/buckets.go b/buckets.go deleted file mode 100644 index 6229181..0000000 --- a/buckets.go +++ /dev/null @@ -1,100 +0,0 @@ -package hyper - -// Params returns discretization parameters. 
-// numBuckets represents number of discretization buckets into which all values -// will fall. Ids of those buckets will be used to create hashes. -// min and max are minimum and maximum possible values of discretized variable. -// bucketWidth is width of the discretization bucket. -// bucketPct is percentage of bucketWidth to allow for an error of discretized -// variable (a specific value of a discretized variable may fall into 2 buckets -// simultaneosly). -// eps is actual width corresponding to the bucketWidth bucketPct on the discretized -// variable axis. -func Params(numBuckets int, min, max, bucketPct float64) (bucketWidth, eps float64) { - if bucketPct >= 0.5 { - panic("Error: bucketPct must be less than 50%. Recommendation: decrease numBuckets instead.") - } - bucketWidth = (max - min) / float64(numBuckets) - eps = bucketPct * bucketWidth - return bucketWidth, eps -} - -// Buckets generates a set of slices of all possible bucket ids -// as permutations based on n-dimensional space discretization. -// point are values for each of those n dimensions. -// min and max are minimum and maximum possible values of discretized -// point components. The assumption is that min and max are the same for all -// dimensions (in the context of the Buckets function). -// bucketWidth and eps are defined in the Params function. -func Buckets(point []float64, min, max, bucketWidth, eps float64) (tree [][]int) { - - // Bucket ids. Default bucket is b. - var ( - val float64 // Sample value (one axis of n-space). - bL, bR int // Left and right bucket ids. - treeCopy [][]int // Bucket tree copy. - length int - ) - - // For each component of the point. - for k := 0; k < len(point); k++ { - val = point[k] - - bL = int((val - eps) / bucketWidth) - bR = int((val + eps) / bucketWidth) - - if val-eps < min { // No bucket for smaller than min. - bL = bR - } else if val+eps > max { // No bucket for larger than max. - bR = bL - } - - if bL == bR { // No branching. 
- if len(tree) == 0 { - tree = append(tree, []int{bL}) - } else { - length = len(tree) - for i := 0; i < length; i++ { - // Constructing buckets set. - tree[i] = append(tree[i], bL) - } - } - - } else { // Branching. - treeCopy = make([][]int, len(tree)) - copy(treeCopy, tree) - - if len(tree) == 0 { - tree = append(tree, []int{bL}) - } else { - length = len(tree) - for i := 0; i < length; i++ { - tree[i] = append(tree[i], bL) - } - } - - if len(treeCopy) == 0 { - treeCopy = append(treeCopy, []int{bR}) - } else { - length = len(treeCopy) - for i := 0; i < length; i++ { - treeCopy[i] = append(treeCopy[i], bR) - } - } - - tree = append(tree, treeCopy...) - } - - } - - // Verification that branching works correctly and no buckets are lost. - // TODO: Disable once whole package got tested on large image sets. - length = len(point) - for i := 0; i < len(tree); i++ { - if len(tree[i]) != length { - panic(`Buckets slice length must be equal to len(point).`) - } - } - - return tree -} diff --git a/buckets_test.go b/buckets_test.go deleted file mode 100644 index 571ab27..0000000 --- a/buckets_test.go +++ /dev/null @@ -1,46 +0,0 @@ -package hyper - -import ( - "reflect" - "testing" -) - -func TestParams(t *testing.T) { - numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25 - bucketWidth, eps := Params(numBuckets, min, max, bucketPct) - wantBucketWidth, wantEps := 25.5, 6.375 - if bucketWidth != wantBucketWidth { - t.Errorf(`Got bucketWidth %v, want %v`, bucketWidth, wantBucketWidth) - } - if eps != wantEps { - t.Errorf(`Got eps %v, want %v`, eps, wantEps) - } -} - -func TestParamsPanic(t *testing.T) { - defer func() { recover() }() - // Intentionally forbiden value for bucketPct. - numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.51 - _, _ = Params(numBuckets, min, max, bucketPct) - // Never reaches here if Params panics. 
- t.Errorf("Params did not panic on bucketPct > 0.5") -} - -func TestBuckets(t *testing.T) { - numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25 - values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9} - bucketWidth, eps := Params(numBuckets, min, max, bucketPct) - got := Buckets(values, min, max, bucketWidth, eps) - want := [][]int{{0, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9}, - {0, 0, 8, 3, 0, 0, 9}, {1, 0, 8, 3, 0, 0, 9}} - if !reflect.DeepEqual(got, want) { - t.Errorf(`Got %v, want %v. Number of buckets is %v.`, got, want, numBuckets) - } - - values = []float64{0.01, bucketWidth * 2 * 0.999, bucketWidth * 2 * 1.001} - got = Buckets(values, min, max, bucketWidth, eps) - want = [][]int{{0, 1, 1}, {0, 2, 1}, {0, 1, 2}, {0, 2, 2}} - if !reflect.DeepEqual(got, want) { - t.Errorf(`Got %v, want %v. Number of buckets is %v.`, got, want, numBuckets) - } -} diff --git a/hashes.go b/hashes.go index a313277..f811be3 100644 --- a/hashes.go +++ b/hashes.go @@ -9,9 +9,8 @@ import ( // Hash64 can be any function of this kind. type Hash64 func(buckets []int) uint64 -// Default is the default Hash64 function for this package. -// It returns a FVN-1a hash for a slice of bucket numbers. -func Default(buckets []int) uint64 { +// FVN1a is the default hash in this package. 
+func FVN1a(buckets []int) uint64 { var b bytes.Buffer gob.NewEncoder(&b).Encode(buckets) hash := fnv.New64a() diff --git a/hashes_test.go b/hashes_test.go index ce89755..854fc80 100644 --- a/hashes_test.go +++ b/hashes_test.go @@ -7,7 +7,7 @@ import ( func TestDefault(t *testing.T) { buckets := []int{5, 59, 255, 9, 7, 12, 22, 31} - hash := Default(buckets) + hash := FVN1a(buckets) want := uint64(13992349377752315208) if hash != want { t.Errorf(`Got %v, want %v`, hash, want) @@ -20,7 +20,7 @@ func TestHashes64(t *testing.T) { {1, 0, 7, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9}, {1, 0, 8, 3, 0, 0, 9}} - hs := Hashes64(tree, Default) + hs := Hashes64(tree, FVN1a) want := []uint64{ 14647827280143437043, 17530493565529410009, diff --git a/hypercubes.go b/hypercubes.go new file mode 100644 index 0000000..e5808fc --- /dev/null +++ b/hypercubes.go @@ -0,0 +1,117 @@ +package hyper + +// Params returns discretization parameters. +// numBuckets represents number of discretization buckets into +// which all values will fall. Ids of those buckets will be used +// to create hashes. +// min and max are minimum and maximum possible values +// of discretized variable. +// bucketWidth is width of the discretization bucket. +// bucketPct is percentage of bucketWidth to allow for an error +// of discretized variable (a specific value of a discretized +// variable may fall into 2 buckets simultaneously). +// eps is actual width corresponding to the bucketWidth bucketPct +// on the discretized variable axis. +func Params( + numBuckets int, min, max, bucketPct float64) (bucketWidth, eps float64) { + if bucketPct >= 0.5 { + panic(`Error: bucketPct must be less than 50%. 
+ Recommendation: decrease numBuckets instead.`) + } + bucketWidth = (max - min) / float64(numBuckets) + eps = bucketPct * bucketWidth + return bucketWidth, eps +} + +// Hypercubes returns a set of hypercubes, which represent +// fuzzy discretization of one n-dimensional vector, as described in +// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html +// One hypercube is defined by bucket numbers in each dimension. +// The function also returns the central hypercube (in which +// the vector end is located). +// min and max are minimum and maximum possible values of +// the vector components. The assumption is that min and max +// are the same for all dimensions. +// bucketWidth and eps are defined in the Params function. +func Hypercubes( + vector []float64, min, max, bucketWidth, eps float64) ( + set [][]int, central []int) { + + var ( + bC, bS int // Central and side bucket ids. + setCopy [][]int // Set copy. + length int + branching bool // Branching flag. + ) + + // For each component of the vector. + for _, val := range vector { + + bC = int(val / bucketWidth) + central = append(central, bC) + branching = false + + // Value is in the lower uncertainty interval. + if val-float64(bC)*bucketWidth < eps { + bS = bC - 1 + if val-eps >= min { + branching = true + } + + // Value is in the upper uncertainty interval. + } else if float64(bC+1)*bucketWidth-val < eps { + bS = bC + 1 + if val+eps <= max { + branching = true + } + } + + if branching { + setCopy = make([][]int, len(set)) + copy(setCopy, set) + + if len(set) == 0 { + set = append(set, []int{bC}) + } else { + length = len(set) + for i := 0; i < length; i++ { + set[i] = append(set[i], bC) + } + } + + if len(setCopy) == 0 { + setCopy = append(setCopy, []int{bS}) + } else { + length = len(setCopy) + for i := 0; i < length; i++ { + setCopy[i] = append(setCopy[i], bS) + } + } + + set = append(set, setCopy...) 
+ + } else { + + if len(set) == 0 { + set = append(set, []int{bC}) + } else { + length = len(set) + for i := 0; i < length; i++ { + set[i] = append(set[i], bC) + } + } + } + } + + // Real use case verification that branching works correctly + // and no buckets are lost for a very large number of vectors. + // TODO: Remove once tested. + length = len(vector) + for i := 0; i < len(set); i++ { + if len(set[i]) != length { + panic(`Number of hypercube coordinates must equal to len(vector).`) + } + } + + return set, central +} diff --git a/hypercubes_test.go b/hypercubes_test.go new file mode 100644 index 0000000..089724c --- /dev/null +++ b/hypercubes_test.go @@ -0,0 +1,75 @@ +package hyper + +import ( + "reflect" + "testing" +) + +func TestParams(t *testing.T) { + numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25 + bucketWidth, eps := Params(numBuckets, min, max, bucketPct) + wantBucketWidth, wantEps := 25.5, 6.375 + if bucketWidth != wantBucketWidth { + t.Errorf(`Got bucketWidth %v, want %v`, bucketWidth, wantBucketWidth) + } + if eps != wantEps { + t.Errorf(`Got eps %v, want %v`, eps, wantEps) + } +} + +func TestParamsPanic(t *testing.T) { + defer func() { recover() }() + // Intentionally forbidden value for bucketPct. + numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.51 + _, _ = Params(numBuckets, min, max, bucketPct) + // Never reaches here if Params panics. 
+ t.Errorf("Params did not panic on bucketPct > 0.5") +} + +func TestHypercubes(t *testing.T) { + numBuckets, min, max, bucketPct := 10, 0.0, 255.0, 0.25 + values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9} + bucketWidth, eps := Params(numBuckets, min, max, bucketPct) + gotCubes, gotCentral := Hypercubes(values, min, max, bucketWidth, eps) + wantCubes := [][]int{{1, 0, 8, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9}, + {1, 0, 7, 3, 0, 0, 9}, {0, 0, 7, 3, 0, 0, 9}} + wantCentral := []int{1, 0, 8, 3, 0, 0, 9} + if !reflect.DeepEqual(gotCubes, wantCubes) { + t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes) + } + if !reflect.DeepEqual(gotCentral, wantCentral) { + t.Errorf(`Got %v, want %v.`, gotCentral, wantCentral) + } + if centralIsNotInTheSet(gotCubes, gotCentral) { + t.Errorf(`Central %v is not in the set %v.`, gotCentral, gotCubes) + } + + values = []float64{0.01, bucketWidth * 2 * 0.999, bucketWidth * 2 * 1.001} + gotCubes, gotCentral = Hypercubes(values, min, max, bucketWidth, eps) + wantCubes = [][]int{{0, 1, 2}, {0, 2, 2}, {0, 1, 1}, {0, 2, 1}} + wantCentral = []int{0, 1, 2} + if !reflect.DeepEqual(gotCubes, wantCubes) { + t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes) + } + if !reflect.DeepEqual(gotCentral, wantCentral) { + t.Errorf(`Got %v, want %v.`, gotCentral, wantCentral) + } + if centralIsNotInTheSet(gotCubes, wantCentral) { + t.Errorf(`Central %v is not in the set %v.`, gotCentral, gotCubes) + } +} + +func centralIsNotInTheSet(set [][]int, central []int) bool { + for _, cube := range set { + counter := 0 + for i, c := range central { + if cube[i] == c { + counter++ + } + } + if counter == len(central) { + return false + } + } + return true +}