118 lines
3.2 KiB
Go
118 lines
3.2 KiB
Go
package hyper
|
|
|
|
// Params returns discretization parameters.
|
|
// numBuckets represents number of discretization buckets into
|
|
// which all values will fall. Ids of those buckets will be used
|
|
// to create hashes.
|
|
// min and max are minimum and maximum possible values
|
|
// of discretized variable.
|
|
// bucketWidth is width of the discretization bucket.
|
|
// bucketPct is percentage of bucketWidth to allow for an error
|
|
// of discretized variable (a specific value of a discretized
|
|
// variable may fall into 2 buckets simultaneosly).
|
|
// eps is actual width corresponding to the bucketWidth bucketPct
|
|
// on the discretized variable axis.
|
|
func Params(
|
|
numBuckets int, min, max, bucketPct float64) (bucketWidth, eps float64) {
|
|
if bucketPct >= 0.5 {
|
|
panic(`Error: bucketPct must be less than 50%.
|
|
Recommendation: decrease numBuckets instead.`)
|
|
}
|
|
bucketWidth = (max - min) / float64(numBuckets)
|
|
eps = bucketPct * bucketWidth
|
|
return bucketWidth, eps
|
|
}
|
|
|
|
// Hypercubes returns a set of hypercubes, which represent
|
|
// fuzzy discretization of one n-dimensional vector, as described in
|
|
// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
|
|
// One hupercube is defined by bucket numbers in each dimension.
|
|
// The function also returns the central hypercube (in which
|
|
// the vector end is located).
|
|
// min and max are minimum and maximum possible values of
|
|
// the vector components. The assumption is that min and max
|
|
// are the same for all dimensions.
|
|
// bucketWidth and eps are defined in the Params function.
|
|
func Hypercubes(
|
|
vector []float64, min, max, bucketWidth, eps float64) (
|
|
set [][]int, central []int) {
|
|
|
|
var (
|
|
bC, bS int // Central and side bucket ids.
|
|
setCopy [][]int // Set copy.
|
|
length int
|
|
branching bool // Branching flag.
|
|
)
|
|
|
|
// For each component of the vector.
|
|
for _, val := range vector {
|
|
|
|
bC = int(val / bucketWidth)
|
|
central = append(central, bC)
|
|
branching = false
|
|
|
|
// Value is in the lower uncertainty interval.
|
|
if val-float64(bC)*bucketWidth < eps {
|
|
bS = bC - 1
|
|
if val-eps >= min {
|
|
branching = true
|
|
}
|
|
|
|
// Value is in the upper uncertainty interval.
|
|
} else if float64(bC+1)*bucketWidth-val < eps {
|
|
bS = bC + 1
|
|
if val+eps <= max {
|
|
branching = true
|
|
}
|
|
}
|
|
|
|
if branching {
|
|
setCopy = make([][]int, len(set))
|
|
copy(setCopy, set)
|
|
|
|
if len(set) == 0 {
|
|
set = append(set, []int{bC})
|
|
} else {
|
|
length = len(set)
|
|
for i := 0; i < length; i++ {
|
|
set[i] = append(set[i], bC)
|
|
}
|
|
}
|
|
|
|
if len(setCopy) == 0 {
|
|
setCopy = append(setCopy, []int{bS})
|
|
} else {
|
|
length = len(setCopy)
|
|
for i := 0; i < length; i++ {
|
|
setCopy[i] = append(setCopy[i], bS)
|
|
}
|
|
}
|
|
|
|
set = append(set, setCopy...)
|
|
|
|
} else {
|
|
|
|
if len(set) == 0 {
|
|
set = append(set, []int{bC})
|
|
} else {
|
|
length = len(set)
|
|
for i := 0; i < length; i++ {
|
|
set[i] = append(set[i], bC)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Real use case verification that branching works correctly
|
|
// and no buckets are lost for a very large number of vectors.
|
|
// TODO: Remove once tested.
|
|
length = len(vector)
|
|
for i := 0; i < len(set); i++ {
|
|
if len(set[i]) != length {
|
|
panic(`Number of hypercube coordinates must equal to len(vector).`)
|
|
}
|
|
}
|
|
|
|
return set, central
|
|
}
|