Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Th228 fits: pvalue, fwhm uncertainty, iterative fits (optional) #29

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6"
Expand Down Expand Up @@ -51,6 +52,7 @@ Distributions = "0.24, 0.25"
FillArrays = "0.7,0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 1"
Formatting = "0.4"
ForwardDiff = "0.10"
Interpolations = "v0.15.1"
IntervalSets = "0.7"
InverseFunctions = "0.1"
IrrationalConstants = "0.1, 0.2"
Expand Down
2 changes: 1 addition & 1 deletion src/LegendSpecFits.jl
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ include("aoe_calibration.jl")
include("specfit_combined.jl")
include("ctc.jl")
include("qc.jl")

include("gof.jl")
include("precompile.jl")

end # module
126 changes: 126 additions & 0 deletions src/gof.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# This file is a part of LegendSpecFits.jl, licensed under the MIT License (MIT).
"""
`gof.jl`
several functions to calculate goodness-of-fit (gof) for fits (-> `specfits.jl`):
"""

"""
aux. function to convert histogram data into bin edges, bin width and bin counts
"""
function prepare_data(h::Histogram{<:Real,1})
# get bin center, width and counts from histogrammed data
bin_edges = first(h.edges)
counts = h.weights
bin_centers = (bin_edges[begin:end-1] .+ bin_edges[begin+1:end]) ./ 2
bin_widths = bin_edges[begin+1:end] .- bin_edges[begin:end-1]
return counts, bin_widths, bin_centers
end
export prepare_data

"""
aux. function to get modelled peakshape based on histogram binning and best-fit parameter
"""
function get_model_counts(f_fit::Base.Callable,v_ml::NamedTuple,bin_centers::StepRangeLen,bin_widths::StepRangeLen)
model_func = Base.Fix2(f_fit, v_ml) # fix the fit parameters to ML best-estimate
model_counts = bin_widths.*map(energy->model_func(energy), bin_centers) # evaluate model at bin center (= binned measured energies)
return model_counts
end
export get_model_counts


"""
`p_value(f_fit, h, v_ml)` : calculate p-value based on least-squares
baseline method to get goodness-of-fit (gof)
input:
* f_fit --> function handle of fit function (peakshape)
* h --> histogram of data
* v_ml --> best-fit parameters
output:
* pval --> p-value of chi2 test
* chi2 --> chi2 value
* dof --> degrees of freedom
"""
function p_value(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
# prepare data
counts, bin_widths, bin_centers = prepare_data(h)

# get peakshape of best-fit
model_counts = get_model_counts(f_fit, v_ml, bin_centers,bin_widths)

# calculate chi2
chi2 = sum((model_counts[model_counts.>0]-counts[model_counts.>0]).^2 ./ model_counts[model_counts.>0])
npar = length(v_ml)
dof = length(counts[model_counts.>0])-npar
pval = ccdf(Chisq(dof),chi2)
if any(model_counts.<=5)
@warn "WARNING: bin with <=$(round(minimum(model_counts),digits=0)) counts - chi2 test might be not valid"
else
@debug "p-value = $(round(pval,digits=2))"
end
return pval, chi2, dof
end
export p_value


""" alternative p-value via loglikelihood ratio"""
function p_value_LogLikeRatio(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
# prepare data
counts, bin_widths, bin_centers = prepare_data(h)

# get peakshape of best-fit
model_counts =get_model_counts(f_fit, v_ml, bin_centers,bin_widths)

# calculate chi2
chi2 = sum((model_counts[model_counts.>0]-counts[model_counts.>0]).^2 ./ model_counts[model_counts.>0])
npar = length(v_ml)
dof = length(counts[model_counts.>0])-npar
pval = ccdf(Chisq(dof),chi2)
if any(model_counts.<=5)
@warn "WARNING: bin with <=$(minimum(model_counts)) counts - chi2 test might be not valid"
else
@debug "p-value = $(round(pval,digits=2))"
end
chi2 = 2*sum(model_counts.*log.(model_counts./counts)+model_counts-counts)
pval = ccdf(Chisq(dof),chi2)
return pval, chi2, dof
end
export p_value_LogLikeRatio

""" alternative p-value calculation via Monte Carlo sampling. Warning: computational more expensive than p_vaule() and p_value_LogLikeRatio()
* Create n_samples randomized histograms. For each bin, samples are drawn from a Poisson distribution with λ = model peak shape (best-fit parameter)
* Each sample histogram is fit using the model function `f_fit`
* For each sample fit, the max. loglikelihood fit is calculated
% p value --> comparison of sample max. loglikelihood and max. loglikelihood of best-fit
"""
function p_value_MC(f_fit::Base.Callable, h::Histogram{<:Real,1},ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background)},v_ml::NamedTuple,;n_samples::Int64=1000)
counts, bin_widths, bin_centers = prepare_data(h) # get data

# get peakshape of best-fit and maximum likelihood value
model_func = Base.Fix2(f_fit, v_ml) # fix the fit parameters to ML best-estimate
model_counts = bin_widths.*map(energy->model_func(energy), bin_centers) # evaluate model at bin center (= binned measured energies)
loglike_bf = -hist_loglike(model_func,h)

# draw sample for each bin
dists = Poisson.(model_counts) # create poisson distribution for each bin
counts_mc_vec = rand.(dists,n_samples) # randomized histogram counts
counts_mc = [ [] for _ in 1:n_samples ] #re-structure data_samples_vec to array of arrays, there is probably a better way to do this...
for i = 1:n_samples
counts_mc[i] = map(x -> x[i],counts_mc_vec)
end

# fit every sample histogram and calculate max. loglikelihood
loglike_bf_mc = NaN.*ones(n_samples)
h_mc = h # make copy of data histogram
for i=1:n_samples
h_mc.weights = counts_mc[i] # overwrite counts with MC values
result_fit_mc, report = fit_single_peak_th228(h_mc, ps ; uncertainty=false) # fit MC histogram
fit_par_mc = result_fit_mc[(:μ, :σ, :n, :step_amplitude, :skew_fraction, :skew_width, :background)]
model_func_sample = Base.Fix2(f_fit, fit_par_mc) # fix the fit parameters to ML best-estimate
loglike_bf_mc[i] = -hist_loglike(model_func_sample,h_mc) # loglikelihood for best-fit
end

# calculate p-value
pval= sum(loglike_bf_mc.<=loglike_bf)./n_samples # preliminary. could be improved e.g. with interpolation
return pval
end
export p_value_MC
64 changes: 42 additions & 22 deletions src/specfit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ th228_fit_functions = (
f_bck = (x, v) -> background_peakshape(x, v.μ, v.σ, v.step_amplitude, v.background),
f_sigWithTail = (x, v) -> signal_peakshape(x, v.μ, v.σ, v.n, v.skew_fraction) + lowEtail_peakshape(x, v.μ, v.σ, v.n, v.skew_fraction, v.skew_width)
)

export th228_fit_functions
"""
estimate_single_peak_stats(h::Histogram, calib_type::Symbol=:th228)

Expand Down Expand Up @@ -38,7 +38,6 @@ function estimate_single_peak_stats(h::Histogram,; calib_type::Symbol=:th228)
end
export estimate_single_peak_stats


function estimate_single_peak_stats_th228(h::Histogram{T}) where T<:Real
W = h.weights
E = first(h.edges)
Expand Down Expand Up @@ -88,16 +87,16 @@ Perform a fit of the peakshape to the data in `peakhists` using the initial valu
* `peak_fit_plots`: array of plots of the peak fits
* `return_vals`: dictionary of the fit results
"""
function fit_peaks(peakhists::Array, peakstats::StructArray, th228_lines::Array,; calib_type::Symbol=:th228, uncertainty::Bool=true, low_e_tail::Bool=true)
function fit_peaks(peakhists::Array, peakstats::StructArray, th228_lines::Array,; calib_type::Symbol=:th228, uncertainty::Bool=true, low_e_tail::Bool=true,iterative_fit::Bool=false)
if calib_type == :th228
return fit_peaks_th228(peakhists, peakstats, th228_lines,; uncertainty=uncertainty, low_e_tail=low_e_tail)
return fit_peaks_th228(peakhists, peakstats, th228_lines,; uncertainty=uncertainty, low_e_tail=low_e_tail,iterative_fit=iterative_fit)
else
error("Calibration type not supported")
end
end
export fit_peaks

function fit_peaks_th228(peakhists::Array, peakstats::StructArray, th228_lines::Array{T},; uncertainty::Bool=true, low_e_tail::Bool=true) where T<:Any
function fit_peaks_th228(peakhists::Array, peakstats::StructArray, th228_lines::Array{T},; uncertainty::Bool=true, low_e_tail::Bool=true, iterative_fit::Bool=false) where T<:Any
# create return and result dicts
result = Dict{T, NamedTuple}()
report = Dict{T, NamedTuple}()
Expand All @@ -108,6 +107,17 @@ function fit_peaks_th228(peakhists::Array, peakstats::StructArray, th228_lines::
ps = peakstats[i]
# fit peak
result_peak, report_peak = fit_single_peak_th228(h, ps, ; uncertainty=uncertainty, low_e_tail=low_e_tail)

# check covariance matrix for being semi positive definite (no negative uncertainties)
if uncertainty
if iterative_fit && !isposdef(result_peak.covmat)
@warn "Covariance matrix not positive definite for peak $peak - repeat fit without low energy tail"
pval_save = result_peak.pval
result_peak, report_peak = fit_single_peak_th228(h, ps, ; uncertainty=uncertainty, low_e_tail=false)
@info "New covariance matrix is positive definite: $(isposdef(result_peak.covmat))"
@info "p-val with low-energy tail p=$(round(pval_save,digits=5)) , without low-energy tail: p=$(round((result_peak.pval),digits=5))"
end
end
# save results
result[peak] = result_peak
report[peak] = report_peak
Expand All @@ -124,7 +134,9 @@ Also, FWHM is calculated from the fitted peakshape with MC error propagation. Th
* `result`: NamedTuple of the fit results containing values and errors
* `report`: NamedTuple of the fit report which can be plotted
"""
function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background), NTuple{5, T}}; uncertainty::Bool=true, low_e_tail::Bool=true, fixed_position::Bool=false, pseudo_prior::NamedTupleDist=NamedTupleDist(empty = true)) where T<:Real
function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background), NTuple{5, T}};
uncertainty::Bool=true, low_e_tail::Bool=true, fixed_position::Bool=false, pseudo_prior::NamedTupleDist=NamedTupleDist(empty = true),
fit_fun::Symbol=:f_fit) where T<:Real
# create standard pseudo priors
standard_pseudo_prior = NamedTupleDist(
μ = ifelse(fixed_position, ConstValueDist(ps.peak_pos), Uniform(ps.peak_pos-10, ps.peak_pos+10)),
Expand Down Expand Up @@ -152,8 +164,8 @@ function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fw
# start values for MLE
v_init = mean(pseudo_prior)

# create loglikehood function
f_loglike = let f_fit=th228_fit_functions.f_fit, h=h
# create loglikehood function: f_loglike(v) that can be evaluated for any set of v (fit parameter)
f_loglike = let f_fit=th228_fit_functions[fit_fun], h=h
v -> hist_loglike(Base.Fix2(f_fit, v), h)
end

Expand All @@ -163,37 +175,41 @@ function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fw
# best fit results
v_ml = inverse(f_trafo)(Optim.minimizer(opt_r))

f_loglike_array = let f_fit=gamma_peakshape, h=h
v -> - hist_loglike(x -> f_fit(x, v...), h)
f_loglike_array = let f_fit=th228_fit_functions[fit_fun], h=h, v_keys = keys(standard_pseudo_prior) #same loglikelihood function as f_loglike, but has array as input instead of NamedTuple
v -> - hist_loglike( x -> f_fit(x,NamedTuple{v_keys}(v)), h)
end

if uncertainty
# Calculate the Hessian matrix using ForwardDiff
H = ForwardDiff.hessian(f_loglike_array, tuple_to_array(v_ml))

# Calculate the parameter covariance matrix
param_covariance = inv(H)

param_covariance_raw = inv(H)
param_covariance = nearestSPD(param_covariance_raw)

# Extract the parameter uncertainties
v_ml_err = array_to_tuple(sqrt.(abs.(diag(param_covariance))), v_ml)

# calculate p-value
pval, chi2, dof = p_value(th228_fit_functions.f_fit, h, v_ml)

# get fwhm of peak
fwhm, fwhm_err = get_peak_fwhm_th228(v_ml, v_ml_err)
fwhm, fwhm_err = get_peak_fwhm_th228(v_ml, param_covariance)

@debug "Best Fit values"
@debug "μ: $(v_ml.μ) ± $(v_ml_err.μ)"
@debug "σ: $(v_ml.σ) ± $(v_ml_err.σ)"
@debug "n: $(v_ml.n) ± $(v_ml_err.n)"
@debug "p: $pval , chi2 = $(chi2) with $(dof) dof"
@debug "FWHM: $(fwhm) ± $(fwhm_err)"

result = merge(v_ml, (fwhm = fwhm, err = merge(v_ml_err, (fwhm = fwhm_err,))))
result = merge(v_ml, (pval = pval, chi2 = chi2, dof = dof, fwhm = fwhm,covmat = param_covariance, covmat_raw = param_covariance_raw,),(err = merge(v_ml_err, (fwhm = fwhm_err,)),))
report = (
v = v_ml,
h = h,
f_fit = x -> Base.Fix2(th228_fit_functions.f_fit, v_ml)(x),
f_sig = x -> Base.Fix2(th228_fit_functions.f_sig, v_ml)(x),
f_lowEtail = x -> Base.Fix2(th228_fit_functions.f_lowEtail, v_ml)(x),
f_bck = x -> Base.Fix2(th228_fit_functions.f_bck, v_ml)(x)
f_bck = x -> Base.Fix2(th228_fit_functions.f_bck, v_ml)(x),
)
else
# get fwhm of peak
Expand Down Expand Up @@ -240,8 +256,7 @@ function estimate_fwhm(v::NamedTuple)
return NaN
end
end


export estimate_fwhm
"""
get_peak_fwhm_th228(v_ml::NamedTuple, v_ml_err::NamedTuple)
Get the FWHM of a peak from the fit parameters while performing a MC error propagation.
Expand All @@ -250,19 +265,24 @@ Get the FWHM of a peak from the fit parameters while performing a MC error propa
* `fwhm`: the FWHM of the peak
* `fwhm_err`: the uncertainty of the FWHM of the peak
"""
function get_peak_fwhm_th228(v_ml::NamedTuple, v_ml_err::NamedTuple, uncertainty::Bool=true)
function get_peak_fwhm_th228(v_ml::NamedTuple, v_ml_err::Union{Matrix,NamedTuple},uncertainty::Bool=true)
# get fwhm for peak fit
fwhm = estimate_fwhm(v_ml)
if !uncertainty
return fwhm, NaN
end

# get MC for FWHM err
v_mc = get_mc_value_shapes(v_ml, v_ml_err, 1000)
if isa(v_ml_err,Matrix)# use correlated fit parameter uncertainties
v_mc = get_mc_value_shapes(v_ml, v_ml_err, 10000)
elseif isa(v_ml_err,NamedTuple) # use uncorrelated fit parameter uncertainties
v_mc = get_mc_value_shapes(v_ml, v_ml_err, 1000)
end
fwhm_mc = estimate_fwhm.(v_mc)
fwhm_err = std(fwhm_mc[isfinite.(fwhm_mc)])
return fwhm, fwhm_err
end

export get_peak_fwhm_th228

"""
fitCalibration
Expand Down
61 changes: 61 additions & 0 deletions src/specfit_testdata.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# This file is a part of LegendSpecFits.jl, licensed under the MIT License (MIT).
"""
Sample Legend200 calibration data based on "Inverse Transform Sampling" method:
- pdf of th228 calibration calibration peak is estimated from fit model function f_fit from LegendSpecFits
- calculate the cumulative distribution function F(x)
- generate a random number u from a uniform distribution between 0 and 1.
- find the value x such that F(x) = u by solving for x . --> done by interpolation of the inverse cdf
- repeat for many u --> energy samples
"""
function generate_mc_spectrum(n_tot::Int=200000,; f_fit::Base.Callable=th228_fit_functions.f_fit)

th228_lines = [583.191, 727.330, 860.564, 1592.53, 1620.50, 2103.53, 2614.51]
v = [
(μ = 2103.53, σ = 2.11501, n = 385.123, step_amplitude = 1e-242, skew_fraction = 0.005, skew_width = 0.1, background = 55),
(μ = 860.564, σ = 0.817838, n = 355.84, step_amplitude = 1.2, skew_fraction = 0.005, skew_width = 0.099, background = 35),
(μ = 727.33, σ = 0.721594, n = 452.914, step_amplitude = 5.4, skew_fraction = 0.005, skew_width = 0.1, background = 28),
(μ = 1620.5, σ = 1.24034, n = 130.256, step_amplitude = 1e-20, skew_fraction = 0.005, skew_width = 0.1, background = 16),
(μ = 583.191, σ = 0.701544, n = 1865.52, step_amplitude = 17.9, skew_fraction = 0.1, skew_width = 0.1, background = 16),
(μ = 1592.53, σ = 2.09123, n = 206.827, step_amplitude = 1e-21, skew_fraction = 0.005, skew_width = 0.1, background = 17),
(μ = 2614.51, σ = 1.51289, n = 3130.43, step_amplitude = 1e-101, skew_fraction = 0.1, skew_width = 0.003, background = 1)
]

# calculate pdf and cdf functions
bin_centers_all = Array{StepRangeLen,1}(undef, length(th228_lines))
model_counts_all = Array{Array{Float64,1},1}(undef, length(th228_lines))
model_cdf_all = Array{Array{Float64,1},1}(undef, length(th228_lines))
energy_mc_all = Array{Array{Float64,1},1}(undef, length(th228_lines))
PeakMax = zeros(length(th228_lines))

for i=1:length(th228_lines) # get fine binned model function to estimate pdf
n_step = 5000 # fine binning
bin_centers_all[i] = range(v[i].µ-30, stop=v[i].µ+30, length=n_step)
bw = bin_centers_all[i][2]-bin_centers_all[i][1]
bin_widths = range(bw,bw, length=n_step)

# save as intermediate result
model_counts_all[i] = get_model_counts(f_fit, v[i], bin_centers_all[i], bin_widths)
plot(bin_centers_all[i],model_counts_all[i],marker=:dot)
PeakMax[i] = maximum(model_counts_all[i])

# create CDF
model_cdf_all[i] = cumsum(model_counts_all[i])
model_cdf_all[i] = model_cdf_all[i]./maximum(model_cdf_all[i])
end

# weights each peak with amplitude
PeakMaxRel = PeakMax./sum(PeakMax)
n_i = round.(Int,PeakMaxRel.*n_tot)

# do the sampling: drawn from uniform distribution
for i=1:length(th228_lines)
bandwidth = maximum(model_cdf_all[i])-minimum(model_cdf_all[i])
rand_i = minimum(model_cdf_all[i]).+bandwidth.*rand(n_i[i]); # make sure sample is within model range
interp_cdf_inv = LinearInterpolation(model_cdf_all[i],bin_centers_all[i]) # inverse cdf
energy_mc_all[i] = interp_cdf_inv.(rand_i)
end

energy_mc = fast_flatten(energy_mc_all)
return energy_mc, th228_lines
end
export generate_mc_spectrum
Loading
Loading