diff --git a/Project.toml b/Project.toml index b18c21f8..a5fa1819 100644 --- a/Project.toml +++ b/Project.toml @@ -12,6 +12,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6" @@ -51,6 +52,7 @@ Distributions = "0.24, 0.25" FillArrays = "0.7,0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 1" Formatting = "0.4" ForwardDiff = "0.10" +Interpolations = "v0.15.1" IntervalSets = "0.7" InverseFunctions = "0.1" IrrationalConstants = "0.1, 0.2" diff --git a/src/LegendSpecFits.jl b/src/LegendSpecFits.jl index db382a9b..2ee6a3af 100644 --- a/src/LegendSpecFits.jl +++ b/src/LegendSpecFits.jl @@ -51,7 +51,7 @@ include("aoe_calibration.jl") include("specfit_combined.jl") include("ctc.jl") include("qc.jl") - +include("gof.jl") include("precompile.jl") end # module diff --git a/src/gof.jl b/src/gof.jl new file mode 100644 index 00000000..98feb0d4 --- /dev/null +++ b/src/gof.jl @@ -0,0 +1,126 @@ +# This file is a part of LegendSpecFits.jl, licensed under the MIT License (MIT). +""" +`gof.jl` +several functions to calculate goodness-of-fit (gof) for fits (-> `specfits.jl`): +""" + +""" +aux. function to convert histogram data into bin edges, bin width and bin counts +""" +function prepare_data(h::Histogram{<:Real,1}) + # get bin center, width and counts from histogrammed data + bin_edges = first(h.edges) + counts = h.weights + bin_centers = (bin_edges[begin:end-1] .+ bin_edges[begin+1:end]) ./ 2 + bin_widths = bin_edges[begin+1:end] .- bin_edges[begin:end-1] + return counts, bin_widths, bin_centers +end +export prepare_data + +""" +aux. function to get modelled peakshape based on histogram binning and best-fit parameter +""" +function get_model_counts(f_fit::Base.Callable,v_ml::NamedTuple,bin_centers::StepRangeLen,bin_widths::StepRangeLen) + model_func = Base.Fix2(f_fit, v_ml) # fix the fit parameters to ML best-estimate + model_counts = bin_widths.*map(energy->model_func(energy), bin_centers) # evaluate model at bin center (= binned measured energies) + return model_counts +end +export get_model_counts + + +""" +`p_value(f_fit, h, v_ml)` : calculate p-value based on least-squares +baseline method to get goodness-of-fit (gof) + input: + * f_fit --> function handle of fit function (peakshape) + * h --> histogram of data + * v_ml --> best-fit parameters + output: + * pval --> p-value of chi2 test + * chi2 --> chi2 value + * dof --> degrees of freedom +""" +function p_value(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple) + # prepare data + counts, bin_widths, bin_centers = prepare_data(h) + + # get peakshape of best-fit + model_counts = get_model_counts(f_fit, v_ml, bin_centers,bin_widths) + + # calculate chi2 + chi2 = sum((model_counts[model_counts.>0]-counts[model_counts.>0]).^2 ./ model_counts[model_counts.>0]) + npar = length(v_ml) + dof = length(counts[model_counts.>0])-npar + pval = ccdf(Chisq(dof),chi2) + if any(model_counts.<=5) + @warn "WARNING: bin with <=$(round(minimum(model_counts),digits=0)) counts - chi2 test might be not valid" + else + @debug "p-value = $(round(pval,digits=2))" + end + return pval, chi2, dof +end +export p_value + + +""" alternative p-value via loglikelihood ratio""" +function p_value_LogLikeRatio(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple) + # prepare data + counts, bin_widths, bin_centers = prepare_data(h) + + # get peakshape of best-fit + model_counts =get_model_counts(f_fit, v_ml, bin_centers,bin_widths) + + # calculate chi2 + chi2 = sum((model_counts[model_counts.>0]-counts[model_counts.>0]).^2 ./ model_counts[model_counts.>0]) + npar = length(v_ml) + dof = length(counts[model_counts.>0])-npar + pval = ccdf(Chisq(dof),chi2) + if any(model_counts.<=5) + @warn "WARNING: bin with <=$(minimum(model_counts)) counts - chi2 test might be not valid" + else + @debug "p-value = $(round(pval,digits=2))" + end + chi2 = 2*sum(model_counts.*log.(model_counts./counts)+model_counts-counts) + pval = ccdf(Chisq(dof),chi2) +return pval, chi2, dof +end +export p_value_LogLikeRatio + +""" alternative p-value calculation via Monte Carlo sampling. Warning: computational more expensive than p_vaule() and p_value_LogLikeRatio() +* Create n_samples randomized histograms. For each bin, samples are drawn from a Poisson distribution with λ = model peak shape (best-fit parameter) +* Each sample histogram is fit using the model function `f_fit` +* For each sample fit, the max. loglikelihood fit is calculated +% p value --> comparison of sample max. loglikelihood and max. loglikelihood of best-fit +""" +function p_value_MC(f_fit::Base.Callable, h::Histogram{<:Real,1},ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background)},v_ml::NamedTuple,;n_samples::Int64=1000) + counts, bin_widths, bin_centers = prepare_data(h) # get data + + # get peakshape of best-fit and maximum likelihood value + model_func = Base.Fix2(f_fit, v_ml) # fix the fit parameters to ML best-estimate + model_counts = bin_widths.*map(energy->model_func(energy), bin_centers) # evaluate model at bin center (= binned measured energies) + loglike_bf = -hist_loglike(model_func,h) + + # draw sample for each bin + dists = Poisson.(model_counts) # create poisson distribution for each bin + counts_mc_vec = rand.(dists,n_samples) # randomized histogram counts + counts_mc = [ [] for _ in 1:n_samples ] #re-structure data_samples_vec to array of arrays, there is probably a better way to do this... + for i = 1:n_samples + counts_mc[i] = map(x -> x[i],counts_mc_vec) + end + + # fit every sample histogram and calculate max. loglikelihood + loglike_bf_mc = NaN.*ones(n_samples) + h_mc = h # make copy of data histogram + for i=1:n_samples + h_mc.weights = counts_mc[i] # overwrite counts with MC values + result_fit_mc, report = fit_single_peak_th228(h_mc, ps ; uncertainty=false) # fit MC histogram + fit_par_mc = result_fit_mc[(:μ, :σ, :n, :step_amplitude, :skew_fraction, :skew_width, :background)] + model_func_sample = Base.Fix2(f_fit, fit_par_mc) # fix the fit parameters to ML best-estimate + loglike_bf_mc[i] = -hist_loglike(model_func_sample,h_mc) # loglikelihood for best-fit + end + + # calculate p-value + pval= sum(loglike_bf_mc.<=loglike_bf)./n_samples # preliminary. could be improved e.g. with interpolation + return pval +end +export p_value_MC \ No newline at end of file diff --git a/src/specfit.jl b/src/specfit.jl index ce25c53c..bae347e3 100644 --- a/src/specfit.jl +++ b/src/specfit.jl @@ -8,7 +8,7 @@ th228_fit_functions = ( f_bck = (x, v) -> background_peakshape(x, v.μ, v.σ, v.step_amplitude, v.background), f_sigWithTail = (x, v) -> signal_peakshape(x, v.μ, v.σ, v.n, v.skew_fraction) + lowEtail_peakshape(x, v.μ, v.σ, v.n, v.skew_fraction, v.skew_width) ) - +export th228_fit_functions """ estimate_single_peak_stats(h::Histogram, calib_type::Symbol=:th228) @@ -38,7 +38,6 @@ function estimate_single_peak_stats(h::Histogram,; calib_type::Symbol=:th228) end export estimate_single_peak_stats - function estimate_single_peak_stats_th228(h::Histogram{T}) where T<:Real W = h.weights E = first(h.edges) @@ -88,16 +87,16 @@ Perform a fit of the peakshape to the data in `peakhists` using the initial valu * `peak_fit_plots`: array of plots of the peak fits * `return_vals`: dictionary of the fit results """ -function fit_peaks(peakhists::Array, peakstats::StructArray, th228_lines::Array,; calib_type::Symbol=:th228, uncertainty::Bool=true, low_e_tail::Bool=true) +function fit_peaks(peakhists::Array, peakstats::StructArray, th228_lines::Array,; calib_type::Symbol=:th228, uncertainty::Bool=true, low_e_tail::Bool=true,iterative_fit::Bool=false) if calib_type == :th228 - return fit_peaks_th228(peakhists, peakstats, th228_lines,; uncertainty=uncertainty, low_e_tail=low_e_tail) + return fit_peaks_th228(peakhists, peakstats, th228_lines,; uncertainty=uncertainty, low_e_tail=low_e_tail,iterative_fit=iterative_fit) else error("Calibration type not supported") end end export fit_peaks -function fit_peaks_th228(peakhists::Array, peakstats::StructArray, th228_lines::Array{T},; uncertainty::Bool=true, low_e_tail::Bool=true) where T<:Any +function fit_peaks_th228(peakhists::Array, peakstats::StructArray, th228_lines::Array{T},; uncertainty::Bool=true, low_e_tail::Bool=true, iterative_fit::Bool=false) where T<:Any # create return and result dicts result = Dict{T, NamedTuple}() report = Dict{T, NamedTuple}() @@ -108,6 +107,17 @@ function fit_peaks_th228(peakhists::Array, peakstats::StructArray, th228_lines:: ps = peakstats[i] # fit peak result_peak, report_peak = fit_single_peak_th228(h, ps, ; uncertainty=uncertainty, low_e_tail=low_e_tail) + + # check covariance matrix for being semi positive definite (no negative uncertainties) + if uncertainty + if iterative_fit && !isposdef(result_peak.covmat) + @warn "Covariance matrix not positive definite for peak $peak - repeat fit without low energy tail" + pval_save = result_peak.pval + result_peak, report_peak = fit_single_peak_th228(h, ps, ; uncertainty=uncertainty, low_e_tail=false) + @info "New covariance matrix is positive definite: $(isposdef(result_peak.covmat))" + @info "p-val with low-energy tail p=$(round(pval_save,digits=5)) , without low-energy tail: p=$(round((result_peak.pval),digits=5))" + end + end # save results result[peak] = result_peak report[peak] = report_peak @@ -124,7 +134,9 @@ Also, FWHM is calculated from the fitted peakshape with MC error propagation. Th * `result`: NamedTuple of the fit results containing values and errors * `report`: NamedTuple of the fit report which can be plotted """ -function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background), NTuple{5, T}}; uncertainty::Bool=true, low_e_tail::Bool=true, fixed_position::Bool=false, pseudo_prior::NamedTupleDist=NamedTupleDist(empty = true)) where T<:Real +function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background), NTuple{5, T}}; + uncertainty::Bool=true, low_e_tail::Bool=true, fixed_position::Bool=false, pseudo_prior::NamedTupleDist=NamedTupleDist(empty = true), + fit_fun::Symbol=:f_fit) where T<:Real # create standard pseudo priors standard_pseudo_prior = NamedTupleDist( μ = ifelse(fixed_position, ConstValueDist(ps.peak_pos), Uniform(ps.peak_pos-10, ps.peak_pos+10)), @@ -152,8 +164,8 @@ function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fw # start values for MLE v_init = mean(pseudo_prior) - # create loglikehood function - f_loglike = let f_fit=th228_fit_functions.f_fit, h=h + # create loglikehood function: f_loglike(v) that can be evaluated for any set of v (fit parameter) + f_loglike = let f_fit=th228_fit_functions[fit_fun], h=h v -> hist_loglike(Base.Fix2(f_fit, v), h) end @@ -163,8 +175,8 @@ function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fw # best fit results v_ml = inverse(f_trafo)(Optim.minimizer(opt_r)) - f_loglike_array = let f_fit=gamma_peakshape, h=h - v -> - hist_loglike(x -> f_fit(x, v...), h) + f_loglike_array = let f_fit=th228_fit_functions[fit_fun], h=h, v_keys = keys(standard_pseudo_prior) #same loglikelihood function as f_loglike, but has array as input instead of NamedTuple + v -> - hist_loglike( x -> f_fit(x,NamedTuple{v_keys}(v)), h) end if uncertainty @@ -172,28 +184,32 @@ function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fw H = ForwardDiff.hessian(f_loglike_array, tuple_to_array(v_ml)) # Calculate the parameter covariance matrix - param_covariance = inv(H) - + param_covariance_raw = inv(H) + param_covariance = nearestSPD(param_covariance_raw) + # Extract the parameter uncertainties v_ml_err = array_to_tuple(sqrt.(abs.(diag(param_covariance))), v_ml) - + # calculate p-value + pval, chi2, dof = p_value(th228_fit_functions.f_fit, h, v_ml) + # get fwhm of peak - fwhm, fwhm_err = get_peak_fwhm_th228(v_ml, v_ml_err) + fwhm, fwhm_err = get_peak_fwhm_th228(v_ml, param_covariance) @debug "Best Fit values" @debug "μ: $(v_ml.μ) ± $(v_ml_err.μ)" @debug "σ: $(v_ml.σ) ± $(v_ml_err.σ)" @debug "n: $(v_ml.n) ± $(v_ml_err.n)" + @debug "p: $pval , chi2 = $(chi2) with $(dof) dof" @debug "FWHM: $(fwhm) ± $(fwhm_err)" - - result = merge(v_ml, (fwhm = fwhm, err = merge(v_ml_err, (fwhm = fwhm_err,)))) + + result = merge(v_ml, (pval = pval, chi2 = chi2, dof = dof, fwhm = fwhm,covmat = param_covariance, covmat_raw = param_covariance_raw,),(err = merge(v_ml_err, (fwhm = fwhm_err,)),)) report = ( v = v_ml, h = h, f_fit = x -> Base.Fix2(th228_fit_functions.f_fit, v_ml)(x), f_sig = x -> Base.Fix2(th228_fit_functions.f_sig, v_ml)(x), f_lowEtail = x -> Base.Fix2(th228_fit_functions.f_lowEtail, v_ml)(x), - f_bck = x -> Base.Fix2(th228_fit_functions.f_bck, v_ml)(x) + f_bck = x -> Base.Fix2(th228_fit_functions.f_bck, v_ml)(x), ) else # get fwhm of peak @@ -240,8 +256,7 @@ function estimate_fwhm(v::NamedTuple) return NaN end end - - +export estimate_fwhm """ get_peak_fwhm_th228(v_ml::NamedTuple, v_ml_err::NamedTuple) Get the FWHM of a peak from the fit parameters while performing a MC error propagation. @@ -250,19 +265,24 @@ Get the FWHM of a peak from the fit parameters while performing a MC error propa * `fwhm`: the FWHM of the peak * `fwhm_err`: the uncertainty of the FWHM of the peak """ -function get_peak_fwhm_th228(v_ml::NamedTuple, v_ml_err::NamedTuple, uncertainty::Bool=true) +function get_peak_fwhm_th228(v_ml::NamedTuple, v_ml_err::Union{Matrix,NamedTuple},uncertainty::Bool=true) # get fwhm for peak fit fwhm = estimate_fwhm(v_ml) if !uncertainty return fwhm, NaN end + # get MC for FWHM err - v_mc = get_mc_value_shapes(v_ml, v_ml_err, 1000) + if isa(v_ml_err,Matrix)# use correlated fit parameter uncertainties + v_mc = get_mc_value_shapes(v_ml, v_ml_err, 10000) + elseif isa(v_ml_err,NamedTuple) # use uncorrelated fit parameter uncertainties + v_mc = get_mc_value_shapes(v_ml, v_ml_err, 1000) + end fwhm_mc = estimate_fwhm.(v_mc) fwhm_err = std(fwhm_mc[isfinite.(fwhm_mc)]) return fwhm, fwhm_err end - +export get_peak_fwhm_th228 """ fitCalibration diff --git a/src/specfit_testdata.jl b/src/specfit_testdata.jl new file mode 100644 index 00000000..8fbb08d6 --- /dev/null +++ b/src/specfit_testdata.jl @@ -0,0 +1,61 @@ +# This file is a part of LegendSpecFits.jl, licensed under the MIT License (MIT). +""" +Sample Legend200 calibration data based on "Inverse Transform Sampling" method: +- pdf of th228 calibration calibration peak is estimated from fit model function f_fit from LegendSpecFits +- calculate the cumulative distribution function F(x) +- generate a random number u from a uniform distribution between 0 and 1. +- find the value x such that F(x) = u by solving for x . --> done by interpolation of the inverse cdf +- repeat for many u --> energy samples +""" +function generate_mc_spectrum(n_tot::Int=200000,; f_fit::Base.Callable=th228_fit_functions.f_fit) + + th228_lines = [583.191, 727.330, 860.564, 1592.53, 1620.50, 2103.53, 2614.51] + v = [ + (μ = 2103.53, σ = 2.11501, n = 385.123, step_amplitude = 1e-242, skew_fraction = 0.005, skew_width = 0.1, background = 55), + (μ = 860.564, σ = 0.817838, n = 355.84, step_amplitude = 1.2, skew_fraction = 0.005, skew_width = 0.099, background = 35), + (μ = 727.33, σ = 0.721594, n = 452.914, step_amplitude = 5.4, skew_fraction = 0.005, skew_width = 0.1, background = 28), + (μ = 1620.5, σ = 1.24034, n = 130.256, step_amplitude = 1e-20, skew_fraction = 0.005, skew_width = 0.1, background = 16), + (μ = 583.191, σ = 0.701544, n = 1865.52, step_amplitude = 17.9, skew_fraction = 0.1, skew_width = 0.1, background = 16), + (μ = 1592.53, σ = 2.09123, n = 206.827, step_amplitude = 1e-21, skew_fraction = 0.005, skew_width = 0.1, background = 17), + (μ = 2614.51, σ = 1.51289, n = 3130.43, step_amplitude = 1e-101, skew_fraction = 0.1, skew_width = 0.003, background = 1) + ] + + # calculate pdf and cdf functions + bin_centers_all = Array{StepRangeLen,1}(undef, length(th228_lines)) + model_counts_all = Array{Array{Float64,1},1}(undef, length(th228_lines)) + model_cdf_all = Array{Array{Float64,1},1}(undef, length(th228_lines)) + energy_mc_all = Array{Array{Float64,1},1}(undef, length(th228_lines)) + PeakMax = zeros(length(th228_lines)) + + for i=1:length(th228_lines) # get fine binned model function to estimate pdf + n_step = 5000 # fine binning + bin_centers_all[i] = range(v[i].µ-30, stop=v[i].µ+30, length=n_step) + bw = bin_centers_all[i][2]-bin_centers_all[i][1] + bin_widths = range(bw,bw, length=n_step) + + # save as intermediate result + model_counts_all[i] = get_model_counts(f_fit, v[i], bin_centers_all[i], bin_widths) + plot(bin_centers_all[i],model_counts_all[i],marker=:dot) + PeakMax[i] = maximum(model_counts_all[i]) + + # create CDF + model_cdf_all[i] = cumsum(model_counts_all[i]) + model_cdf_all[i] = model_cdf_all[i]./maximum(model_cdf_all[i]) + end + + # weights each peak with amplitude + PeakMaxRel = PeakMax./sum(PeakMax) + n_i = round.(Int,PeakMaxRel.*n_tot) + + # do the sampling: drawn from uniform distribution + for i=1:length(th228_lines) + bandwidth = maximum(model_cdf_all[i])-minimum(model_cdf_all[i]) + rand_i = minimum(model_cdf_all[i]).+bandwidth.*rand(n_i[i]); # make sure sample is within model range + interp_cdf_inv = LinearInterpolation(model_cdf_all[i],bin_centers_all[i]) # inverse cdf + energy_mc_all[i] = interp_cdf_inv.(rand_i) + end + + energy_mc = fast_flatten(energy_mc_all) + return energy_mc, th228_lines +end +export generate_mc_spectrum \ No newline at end of file diff --git a/src/utils.jl b/src/utils.jl index acd987d2..b8cd6faf 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -74,7 +74,25 @@ function get_mc_value_shapes(v::NamedTuple, v_err::NamedTuple, n::Int64) vs = BAT.distprod(map(Normal, v, v_err)) NamedTuple.(rand(vs, n)) end - +function get_mc_value_shapes(v::NamedTuple, v_err::Matrix, n::Int64) + if !isposdef(v_err) + v_err = nearestSPD(v_err) + @debug "Covariance matrix not positive definite. Using nearestSPD" + end + v_err = v_err[1:6,1:6] #remove background, keep only relevant for sampling + v_fitpar = v[keys(v)[1:size(v_err,1)]] # only fit parameter + dist = MvNormal([v_fitpar...], v_err) # multivariate distribution using covariance matrix) + v_mc = rand(dist, n) # Draw samples + + # constain fit_par_samples to physical values. warning hardcoded. tbd + Idx_keep = findall((v_mc[3,:].>0) .* # positive amplitude + (v_mc[5,:].<0.25).* # skew fraction + (v_mc[5,:].>0) .* #skew fraction + (v_mc[6,:].>0)) # positive skew width + v_mc = v_mc[:,Idx_keep]; + n = size(v_mc,2) + v_mc = [NamedTuple{keys(v)[1:size(v_err,1)]}(v_mc[:,i]) for i=1:n] # convert back to NamedTuple +end """ get_friedman_diaconis_bin_width(x::AbstractArray) @@ -107,4 +125,18 @@ function get_number_of_bins(x::AbstractArray,; method::Symbol=:sqrt) else @assert false "Method not implemented" end -end \ No newline at end of file +end + +""" +nearestSPD(A) returns the nearest positive definite matrix to A +calculation is based on matrix factorization techniques described in https://www.sciencedirect.com/science/article/pii/0024379588902236 +""" +function nearestSPD(A) + B = (A + A') / 2 # make sure matrix is symmetric + _, s, V = svd(B) # singular value decomposition (SVD), s = singular values (~eigenvalues), V = right singular vector (~eigenvector) + H = V * diagm(0 => max.(s, 0)) * V' # symmetric polar factor of B + B = (B + H) / 2 # calculate nearest positive definite matrix + B = (B + B') / 2 # make sure matrix is symmetric + return B +end +export nearestSPD \ No newline at end of file diff --git a/test/test_specfit.jl b/test/test_specfit.jl index ea408c28..a4ffc347 100644 --- a/test/test_specfit.jl +++ b/test/test_specfit.jl @@ -1,8 +1,15 @@ # This file is a part of LegendSpecFits.jl, licensed under the MIT License (MIT). - using LegendSpecFits using Test - @testset "specfit" begin -end + # load data, simple calibration + energy_test, th228_lines = generate_mc_spectrum(200000) + + # simple calibration fit + window_sizes = vcat([(25.0,25.0) for _ in 1:6], (30.0,30.0)) + result_simple, report_simple = simple_calibration(energy_test, th228_lines, window_sizes, n_bins=10000,; calib_type=:th228, quantile_perc=0.995) + + # fit + result, report = fit_peaks(result_simple.peakhists, result_simple.peakstats, th228_lines,; uncertainty=true); +end \ No newline at end of file