-
Notifications
You must be signed in to change notification settings - Fork 251
/
Copy pathbincount.jl
279 lines (240 loc) · 8.55 KB
/
bincount.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
# Optimization of bin counts for histograms, heatmaps, hexbin plots, etc.
#
# I'm using the penalized maximum-likelihood method proposed in
# Birge, L, and Rozenholc, Y. (2006) How many bins should be put in a regular
# histogram?
#
# There has been quite a bit written on this problem, but there are a number of
# methods that all seem to give good results with little difference. Birge's
# method is simple (it's just AIC with an extra logarithmic term), has a decent
# theoretical justification, and is general enough to apply to multidimensional
# and non-regular bin selecetion problems. Though, the penalty they use was
# optimized for regular histograms, so may need to be tweaked.
#
# The Birge penalty is
# penalty(D) = D - 1 + log(D)^2.5
# where D is the number of bins. The 2.5 constant was arrived at emperically by
# optimizing over samples from example density functions.
#
# Penalized log-likelihood function for a histogram with d regular bins.
#
# Args:
# d: Number of bins in the histogram.
# n: Number of sample (which should equal sum(bincounts[1:d])).
# bincounts: An array giving the number occurrences in each bin.
# binwidth: Width of each bin in the histogram.
#
# Returns:
# Log-likelihood with Birge's penalty applied.
#
function bincount_pll(d::Int, n::Int, bincounts::Vector{Int}, binwidth::AbstractFloat)
ll = 0
for i in 1:d
if bincounts[i] > 0
ll += bincounts[i] * log(d * bincounts[i] / (n * binwidth))
end
end
ll - (d - 1 + log(d)^2.5)
end
# Estimate proportion of values that are distinct
#
# Sample enough values to decide whether we're effectively
# continuous (defined as >90% of the sampled values are unique)
# By looking at a subset we ensure this isn't a bottleneck
function estimate_distinct_proportion{T}(values::AbstractArray{T})
uvalues = Set{T}()
n_sampled = n_tried = 0
while n_sampled < 15 && n_tried < length(values)
v = values[rand(1:length(values))]
n_tried += 1
Gadfly.isconcrete(v) || continue
n_sampled += 1
push!(uvalues, v)
end
return length(uvalues) / n_sampled
end
# Tally elements in xs into an array of counts.
#
# Args:
# bincounts: count per bin
# xs: values being tallied
# x_min: minimum value in xs
# binwidth: width of each bin
# numbins: number of bins
#
function bin!(bincounts::Vector, xs, x_min, binwidth, numbins)
bincounts[1:numbins] = 0
for x in xs
if !isconcrete(x)
continue
end
idx = 1 + @compat floor(Int, (x - x_min) / binwidth)
bincounts[min(numbins, idx)] += 1
end
end
# Optimize the number of bins for a regular one dimensional histogram.
#
# Args:
# xs: A sample.
# d_min: Minimum number of bins to consider.
# d_max: Maximum number of bins to consider.
#
# Returns:
# A tuple of the form (d, bincounts, x_max), where d gives the optimal number of
# bins, and bincounts is an array giving the number of occurances in each
# bin, and x_max is the end point of the final bin.
#
function choose_bin_count_1d(xs::AbstractVector, d_min=1, d_max=150)
n = length(xs)
if n == 0
return 1, Int[0], 0
elseif n == 1
return 1, Int[1], xs[1]
end
x_min, x_max = Gadfly.concrete_minimum(xs), Gadfly.concrete_maximum(xs)
span = x_max - x_min
bincounts = zeros(Int, d_max)
d_best = d_min
pll_best = -Inf
# Brute force optimization: since the number of bins has to be reasonably
# small to plot, this is pretty quick and very simple.
d_best = d_min
if d_min < d_max
for d in d_min:d_max
binwidth = span / d
bin!(bincounts, xs, x_min, binwidth, d)
pll = bincount_pll(d, n, bincounts, binwidth)
if pll > pll_best
d_best = d
pll_best = pll
end
end
end
binwidth = span / d_best
bin!(bincounts, xs, x_min, binwidth, d_best)
return d_best, bincounts, x_max
end
# Choose a reasonable number of bins when the data is considered discrete, which
# this case means integers, or low-resolution real number measurements.
#
# This works similarly to choose_bin_count_1d, except restricts the binwidths to
# be a multiple of the smallest distance between two adjacent unique values.
# E.g. for integer data, this means bin width are integers as well. This usually
# leads to better results.
#
# Args:
# xs: A sample.
# xs_set: An of the unique values in xs in sorted order.
# d_min: Minimum number of allowed bins.
# d_max: Maximum number of allowed bins.
#
# Returns
# A tuple of the form (d, bincounts, x_max), where d gives the optimal number of
# bins, and bincounts is an array giving the number of occurances in each
# bin, and x_max is the end point of the final bin.
#
function choose_bin_count_1d_discrete(xs::AbstractArray, xs_set::AbstractArray,
d_min=1, d_max=150)
n = length(xs_set)
T = eltype(xs)
if n == 0
return 1, Int[0], 0
elseif n == 1
return 1, Int[length(xs)], xs[1] + convert(T, one(T))
end
# minimum distance between two values
T = eltype(xs)
mingap = convert(T, zero(T))
for (i, j) in zip(1:length(xs_set)-1, 2:length(xs_set))
gap = xs_set[j] - xs_set[i]
if isconcrete(gap) && gap > zero(T)
if mingap == convert(T, zero(T))
mingap = gap
else
mingap = min(gap, mingap)
end
end
end
x_min, x_max = Gadfly.concrete_minimum(xs), Gadfly.concrete_maximum(xs)
span = x_max - x_min
d_best = d_min
pll_best = -Inf
bincounts = zeros(Int, d_max)
for d in d_min:d_max
binwidth = span / d
binwidth = ceil(binwidth / mingap) * mingap # round to a multiple of mingap
# don't bother with binning that stretches past the end of the data
if binwidth == mingap && x_min + binwidth * (d - 1) > x_max
break
end
bin!(bincounts, xs, x_min, binwidth, d)
pll = bincount_pll(d, n, bincounts, binwidth)
if pll > pll_best
d_best = d
pll_best = pll
end
end
d = d_best
binwidth = ceil(span / d / mingap) * mingap
x_max = x_min + binwidth * d
bin!(bincounts, xs, x_min, binwidth, d_best)
return d_best, bincounts, x_max
end
# Optimize the number of bins for regular two dimensional histograms.
#
# Args:
# xs: Dimension one data.
# ys: Dimension two data.
#
# Returns:
# A tuple of the form (dx, dy, bincounts), where dx, dy gives the number of
# bins in each respective dimension and bincounts is a dx by dy matrix giving
# the count in each bin.
#
function choose_bin_count_2d(xs::AbstractVector, ys::AbstractVector,
xminbincount::Int, xmaxbincount::Int,
yminbincount::Int, ymaxbincount::Int)
# For two demensions, I'm just going to optimize the marginal bin counts.
# This might not be optimal, but its simple and fast.
x_min, x_max = Gadfly.concrete_minimum(xs), Gadfly.concrete_maximum(xs)
y_min, y_max = Gadfly.concrete_minimum(ys), Gadfly.concrete_maximum(ys)
if estimate_distinct_proportion(xs) < 0.9
value_set = sort!(collect(Set(xs[Bool[Gadfly.isconcrete(v) for v in xs]])))
dx, _ = choose_bin_count_1d_discrete(xs, value_set, xminbincount, xmaxbincount)
else
dx, _ = choose_bin_count_1d(xs, xminbincount, xmaxbincount)
end
if estimate_distinct_proportion(ys) < 0.9
value_set = sort!(collect(Set(ys[Bool[Gadfly.isconcrete(v) for v in ys]])))
dy, _ = choose_bin_count_1d_discrete(ys, value_set, yminbincount, ymaxbincount)
else
dy, _ = choose_bin_count_1d(ys, yminbincount, ymaxbincount)
end
# bin widths
wx = (x_max - x_min) / dx
wy = (y_max - y_min) / dy
bincounts = zeros(Int, (dy, dx))
for (x, y) in zip(xs, ys)
if !Gadfly.isconcrete(x) || !Gadfly.isconcrete(y)
continue
end
i = max(1, min(dx, 1 + (@compat floor(Int, (x - x_min) / wx))))
j = max(1, min(dy, 1 + (@compat floor(Int, (y - y_min) / wy))))
bincounts[j, i] += 1
end
(dy, dx, bincounts)
end
# Optimize the number of bins for hexagonal 2d histograms.
#
# Args:
# xs: Dimension one data.
# ys: Dimension two data.
#
# Returns:
# A tuple of the form (size, bincounts), where 'size' is the hexagon size and
# bincounts is an arrays storing the count for each hexagon, using axial
# coordinates.
#
function choose_hex_bin_count(xs::AbstractVector, ys::AbstractVector)
# TODO: this should probably be a thing
end