legend-exp · LisaSchlueter · Jan 29, 2024 · Jan 30, 2024 · Jan 30, 2024 · Jan 30, 2024
diff --git a/Project.toml b/Project.toml
@@ -12,6 +12,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
 InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
 IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6"
@@ -51,6 +52,7 @@ Distributions = "0.24, 0.25"
 FillArrays = "0.7,0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 1"
 Formatting = "0.4"
 ForwardDiff = "0.10"
+Interpolations =  "v0.15.1"
 IntervalSets = "0.7"
 InverseFunctions = "0.1"
 IrrationalConstants = "0.1, 0.2"

diff --git a/src/LegendSpecFits.jl b/src/LegendSpecFits.jl
@@ -51,7 +51,7 @@ include("aoe_calibration.jl")
 include("specfit_combined.jl")
 include("ctc.jl")
 include("qc.jl")
-
+include("gof.jl")
 include("precompile.jl")
 
 end # module
diff --git a/src/gof.jl b/src/gof.jl
@@ -0,0 +1,126 @@
+# This file is a part of LegendSpecFits.jl, licensed under the MIT License (MIT).
+"""
+`gof.jl`
+several functions to calculate goodness-of-fit (gof) for fits (-> `specfits.jl`):
+"""
+
+"""
+aux. function to convert histogram data into bin edges, bin width and bin counts
+"""
+function prepare_data(h::Histogram{<:Real,1})
+    # get bin center, width and counts from histogrammed data
+    bin_edges = first(h.edges)
+    counts = h.weights
+    bin_centers = (bin_edges[begin:end-1] .+ bin_edges[begin+1:end]) ./ 2
+    bin_widths = bin_edges[begin+1:end] .- bin_edges[begin:end-1]
+    return counts, bin_widths, bin_centers
+end
+export prepare_data
+
+"""
+aux. function to get modelled peakshape based on  histogram binning and best-fit parameter
+"""
+function get_model_counts(f_fit::Base.Callable,v_ml::NamedTuple,bin_centers::StepRangeLen,bin_widths::StepRangeLen)
+    model_func  = Base.Fix2(f_fit, v_ml) # fix the fit parameters to ML best-estimate
+    model_counts = bin_widths.*map(energy->model_func(energy), bin_centers) # evaluate model at bin center (= binned measured energies)
+    return model_counts
+end
+export get_model_counts
+
+
+""" 
+`p_value(f_fit, h, v_ml)` : calculate p-value based on least-squares
+baseline method to get goodness-of-fit (gof)
+ input:
+ * f_fit --> function handle of fit function (peakshape)
+ * h --> histogram of data
+ * v_ml --> best-fit parameters
+ output:
+ * pval --> p-value of chi2 test
+ * chi2 --> chi2 value
+ * dof --> degrees of freedom
+"""
+function p_value(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
+    # prepare data
+    counts, bin_widths, bin_centers = prepare_data(h)
+
+    # get peakshape of best-fit 
+    model_counts = get_model_counts(f_fit, v_ml, bin_centers,bin_widths)
+
+    # calculate chi2
+    chi2    = sum((model_counts[model_counts.>0]-counts[model_counts.>0]).^2 ./ model_counts[model_counts.>0])
+    npar    = length(v_ml)
+    dof    = length(counts[model_counts.>0])-npar
+    pval    = ccdf(Chisq(dof),chi2)
+    if any(model_counts.<=5)
+              @warn "WARNING: bin with <=$(round(minimum(model_counts),digits=0)) counts -  chi2 test might be not valid"
+    else  
+         @debug "p-value = $(round(pval,digits=2))"
+    end
+    return pval, chi2, dof
+end
+export p_value
+
+
+""" alternative p-value via loglikelihood ratio"""
+function p_value_LogLikeRatio(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
+    # prepare data
+    counts, bin_widths, bin_centers = prepare_data(h)
+
+    # get peakshape of best-fit 
+    model_counts =get_model_counts(f_fit, v_ml, bin_centers,bin_widths)
+
+    # calculate chi2
+    chi2    = sum((model_counts[model_counts.>0]-counts[model_counts.>0]).^2 ./ model_counts[model_counts.>0])
+    npar    = length(v_ml)
+    dof    = length(counts[model_counts.>0])-npar
+    pval    = ccdf(Chisq(dof),chi2)
+    if any(model_counts.<=5)
+              @warn "WARNING: bin with <=$(minimum(model_counts)) counts -  chi2 test might be not valid"
+    else  
+         @debug "p-value = $(round(pval,digits=2))"
+    end
+    chi2   = 2*sum(model_counts.*log.(model_counts./counts)+model_counts-counts)
+    pval   = ccdf(Chisq(dof),chi2)
+return pval, chi2, dof
+end
+export p_value_LogLikeRatio
+
+""" alternative p-value calculation via Monte Carlo sampling. Warning: computational more expensive than p_vaule() and p_value_LogLikeRatio()
+* Create n_samples randomized histograms. For each bin, samples are drawn from a Poisson distribution with λ = model peak shape (best-fit parameter)
+* Each sample histogram is fit using the model function `f_fit`
+* For each sample fit, the max. loglikelihood fit is calculated
+% p value --> comparison of sample max. loglikelihood and max. loglikelihood of best-fit
+"""
+function p_value_MC(f_fit::Base.Callable, h::Histogram{<:Real,1},ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background)},v_ml::NamedTuple,;n_samples::Int64=1000)
+    counts, bin_widths, bin_centers = prepare_data(h) # get data 
+
+    # get peakshape of best-fit and maximum likelihood value
+    model_func  = Base.Fix2(f_fit, v_ml) # fix the fit parameters to ML best-estimate
+    model_counts = bin_widths.*map(energy->model_func(energy), bin_centers) # evaluate model at bin center (= binned measured energies)
+    loglike_bf = -hist_loglike(model_func,h) 
+
+    # draw sample for each bin
+    dists = Poisson.(model_counts) # create poisson distribution for each bin
+    counts_mc_vec = rand.(dists,n_samples) # randomized histogram counts
+    counts_mc = [ [] for _ in 1:n_samples ] #re-structure data_samples_vec to array of arrays, there is probably a better way to do this...
+    for i = 1:n_samples
+        counts_mc[i] = map(x -> x[i],counts_mc_vec)
+    end
+
+    # fit every sample histogram and calculate max. loglikelihood
+    loglike_bf_mc = NaN.*ones(n_samples)
+    h_mc = h # make copy of data histogram
+    for i=1:n_samples
+        h_mc.weights = counts_mc[i] # overwrite counts with MC values
+        result_fit_mc, report = fit_single_peak_th228(h_mc, ps ; uncertainty=false) # fit MC histogram
+        fit_par_mc   = result_fit_mc[(:μ, :σ, :n, :step_amplitude, :skew_fraction, :skew_width, :background)]
+        model_func_sample  = Base.Fix2(f_fit, fit_par_mc) # fix the fit parameters to ML best-estimate
+        loglike_bf_mc[i] = -hist_loglike(model_func_sample,h_mc) # loglikelihood for best-fit
+    end
+
+    # calculate p-value
+    pval= sum(loglike_bf_mc.<=loglike_bf)./n_samples # preliminary. could be improved e.g. with interpolation
+    return pval 
+end
+export p_value_MC
diff --git a/src/specfit.jl b/src/specfit.jl
@@ -8,7 +8,7 @@ th228_fit_functions = (
     f_bck = (x, v) -> background_peakshape(x, v.μ, v.σ, v.step_amplitude, v.background),
     f_sigWithTail = (x, v) -> signal_peakshape(x, v.μ, v.σ, v.n, v.skew_fraction) + lowEtail_peakshape(x, v.μ, v.σ, v.n, v.skew_fraction, v.skew_width)
 )
-
+export th228_fit_functions
 """
     estimate_single_peak_stats(h::Histogram, calib_type::Symbol=:th228)
 
@@ -38,7 +38,6 @@ function estimate_single_peak_stats(h::Histogram,; calib_type::Symbol=:th228)
 end
 export estimate_single_peak_stats
 
-
 function estimate_single_peak_stats_th228(h::Histogram{T}) where T<:Real
     W = h.weights
     E = first(h.edges)
@@ -88,16 +87,16 @@ Perform a fit of the peakshape to the data in `peakhists` using the initial valu
     * `peak_fit_plots`: array of plots of the peak fits
     * `return_vals`: dictionary of the fit results
 """
-function fit_peaks(peakhists::Array, peakstats::StructArray, th228_lines::Array,; calib_type::Symbol=:th228, uncertainty::Bool=true, low_e_tail::Bool=true)
+function fit_peaks(peakhists::Array, peakstats::StructArray, th228_lines::Array,; calib_type::Symbol=:th228, uncertainty::Bool=true, low_e_tail::Bool=true,iterative_fit::Bool=false)
     if calib_type == :th228
-        return fit_peaks_th228(peakhists, peakstats, th228_lines,; uncertainty=uncertainty, low_e_tail=low_e_tail)
+        return fit_peaks_th228(peakhists, peakstats, th228_lines,; uncertainty=uncertainty, low_e_tail=low_e_tail,iterative_fit=iterative_fit)
     else
         error("Calibration type not supported")
     end
 end
 export fit_peaks
 
-function fit_peaks_th228(peakhists::Array, peakstats::StructArray, th228_lines::Array{T},; uncertainty::Bool=true, low_e_tail::Bool=true) where T<:Any
+function fit_peaks_th228(peakhists::Array, peakstats::StructArray, th228_lines::Array{T},; uncertainty::Bool=true, low_e_tail::Bool=true, iterative_fit::Bool=false) where T<:Any
     # create return and result dicts
     result = Dict{T, NamedTuple}()
     report = Dict{T, NamedTuple}()
@@ -108,6 +107,17 @@ function fit_peaks_th228(peakhists::Array, peakstats::StructArray, th228_lines::
         ps = peakstats[i]
         # fit peak
         result_peak, report_peak = fit_single_peak_th228(h, ps, ; uncertainty=uncertainty, low_e_tail=low_e_tail)
+
+        # check covariance matrix for being semi positive definite (no negative uncertainties)
+        if uncertainty
+             if iterative_fit && !isposdef(result_peak.covmat)
+                @warn "Covariance matrix not positive definite for peak $peak - repeat fit without low energy tail"
+                pval_save = result_peak.pval
+                result_peak, report_peak = fit_single_peak_th228(h, ps, ; uncertainty=uncertainty, low_e_tail=false)
+                @info "New covariance matrix is positive definite: $(isposdef(result_peak.covmat))"
+                @info "p-val with low-energy tail  p=$(round(pval_save,digits=5)) , without low-energy tail: p=$(round((result_peak.pval),digits=5))"
+                end
+        end
         # save results
         result[peak] = result_peak
         report[peak] = report_peak
@@ -124,7 +134,9 @@ Also, FWHM is calculated from the fitted peakshape with MC error propagation. Th
     * `result`: NamedTuple of the fit results containing values and errors
     * `report`: NamedTuple of the fit report which can be plotted
 """
-function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background), NTuple{5, T}}; uncertainty::Bool=true, low_e_tail::Bool=true, fixed_position::Bool=false, pseudo_prior::NamedTupleDist=NamedTupleDist(empty = true)) where T<:Real
+function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background), NTuple{5, T}}; 
+    uncertainty::Bool=true, low_e_tail::Bool=true, fixed_position::Bool=false, pseudo_prior::NamedTupleDist=NamedTupleDist(empty = true),
+    fit_fun::Symbol=:f_fit) where T<:Real
     # create standard pseudo priors
     standard_pseudo_prior = NamedTupleDist(
         μ = ifelse(fixed_position, ConstValueDist(ps.peak_pos), Uniform(ps.peak_pos-10, ps.peak_pos+10)),
@@ -152,8 +164,8 @@ function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fw
     # start values for MLE
     v_init = mean(pseudo_prior)
 
-    # create loglikehood function
-    f_loglike = let f_fit=th228_fit_functions.f_fit, h=h
+    # create loglikehood function: f_loglike(v) that can be evaluated for any set of v (fit parameter)
+    f_loglike = let f_fit=th228_fit_functions[fit_fun], h=h
         v -> hist_loglike(Base.Fix2(f_fit, v), h)
     end
 
@@ -163,37 +175,41 @@ function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fw
     # best fit results
     v_ml = inverse(f_trafo)(Optim.minimizer(opt_r))
 
-    f_loglike_array = let f_fit=gamma_peakshape, h=h
-        v -> - hist_loglike(x -> f_fit(x, v...), h)
+    f_loglike_array = let f_fit=th228_fit_functions[fit_fun], h=h, v_keys = keys(standard_pseudo_prior) #same loglikelihood function as f_loglike, but has array as input instead of NamedTuple
+        v ->  - hist_loglike(    x -> f_fit(x,NamedTuple{v_keys}(v)), h) 
     end
 
     if uncertainty
         # Calculate the Hessian matrix using ForwardDiff
         H = ForwardDiff.hessian(f_loglike_array, tuple_to_array(v_ml))
 
         # Calculate the parameter covariance matrix
-        param_covariance = inv(H)
-
+        param_covariance_raw = inv(H)
+        param_covariance = nearestSPD(param_covariance_raw)
+
         # Extract the parameter uncertainties
         v_ml_err = array_to_tuple(sqrt.(abs.(diag(param_covariance))), v_ml)
-
+        # calculate p-value
+        pval, chi2, dof = p_value(th228_fit_functions.f_fit, h, v_ml)
+
         # get fwhm of peak
-        fwhm, fwhm_err = get_peak_fwhm_th228(v_ml, v_ml_err)
+        fwhm, fwhm_err = get_peak_fwhm_th228(v_ml, param_covariance)
 
         @debug "Best Fit values"
         @debug "μ: $(v_ml.μ) ± $(v_ml_err.μ)"
         @debug "σ: $(v_ml.σ) ± $(v_ml_err.σ)"
         @debug "n: $(v_ml.n) ± $(v_ml_err.n)"
+        @debug "p: $pval , chi2 = $(chi2) with $(dof) dof"
         @debug "FWHM: $(fwhm) ± $(fwhm_err)"
-
-        result = merge(v_ml, (fwhm = fwhm, err = merge(v_ml_err, (fwhm = fwhm_err,))))
+       
+        result = merge(v_ml, (pval = pval, chi2 = chi2, dof = dof, fwhm = fwhm,covmat = param_covariance, covmat_raw = param_covariance_raw,),(err = merge(v_ml_err, (fwhm = fwhm_err,)),))
         report = (
             v = v_ml,
             h = h,
             f_fit = x -> Base.Fix2(th228_fit_functions.f_fit, v_ml)(x),
             f_sig = x -> Base.Fix2(th228_fit_functions.f_sig, v_ml)(x),
             f_lowEtail = x -> Base.Fix2(th228_fit_functions.f_lowEtail, v_ml)(x),
-            f_bck = x -> Base.Fix2(th228_fit_functions.f_bck, v_ml)(x)
+            f_bck = x -> Base.Fix2(th228_fit_functions.f_bck, v_ml)(x),
         )
     else
         # get fwhm of peak
@@ -240,8 +256,7 @@ function estimate_fwhm(v::NamedTuple)
         return NaN
     end
 end
-
-
+export estimate_fwhm
 """
     get_peak_fwhm_th228(v_ml::NamedTuple, v_ml_err::NamedTuple)
 Get the FWHM of a peak from the fit parameters while performing a MC error propagation.
@@ -250,19 +265,24 @@ Get the FWHM of a peak from the fit parameters while performing a MC error propa
     * `fwhm`: the FWHM of the peak
     * `fwhm_err`: the uncertainty of the FWHM of the peak
 """
-function get_peak_fwhm_th228(v_ml::NamedTuple, v_ml_err::NamedTuple, uncertainty::Bool=true)
+function get_peak_fwhm_th228(v_ml::NamedTuple, v_ml_err::Union{Matrix,NamedTuple},uncertainty::Bool=true)
     # get fwhm for peak fit
     fwhm = estimate_fwhm(v_ml)
     if !uncertainty
         return fwhm, NaN
     end
+
     # get MC for FWHM err
-    v_mc = get_mc_value_shapes(v_ml, v_ml_err, 1000)
+    if isa(v_ml_err,Matrix)# use correlated fit parameter uncertainties 
+        v_mc = get_mc_value_shapes(v_ml, v_ml_err, 10000)
+    elseif isa(v_ml_err,NamedTuple) # use uncorrelated fit parameter uncertainties 
+        v_mc = get_mc_value_shapes(v_ml, v_ml_err, 1000)
+    end
     fwhm_mc = estimate_fwhm.(v_mc)
     fwhm_err = std(fwhm_mc[isfinite.(fwhm_mc)])
     return fwhm, fwhm_err
 end
-
+export get_peak_fwhm_th228
 
 """
     fitCalibration

diff --git a/src/specfit_testdata.jl b/src/specfit_testdata.jl
@@ -0,0 +1,61 @@
+# This file is a part of LegendSpecFits.jl, licensed under the MIT License (MIT).
+"""
+Sample Legend200 calibration data based on "Inverse Transform Sampling" method: 
+- pdf of th228 calibration calibration peak is estimated from fit model function f_fit from LegendSpecFits
+- calculate the cumulative distribution function F(x)
+- generate a random number u from a uniform distribution between 0 and 1.
+- find the value x such that  F(x) = u  by solving for  x . --> done by interpolation of the inverse cdf
+- repeat for many u  --> energy samples 
+"""
+function generate_mc_spectrum(n_tot::Int=200000,; f_fit::Base.Callable=th228_fit_functions.f_fit)
+
+    th228_lines =  [583.191,  727.330,  860.564,  1592.53,    1620.50,    2103.53,    2614.51]
+    v = [ 
+            (μ = 2103.53,   σ = 2.11501,    n = 385.123,    step_amplitude = 1e-242,    skew_fraction = 0.005,  skew_width = 0.1,   background = 55),
+            (μ = 860.564,   σ = 0.817838,   n = 355.84,     step_amplitude = 1.2,       skew_fraction = 0.005,  skew_width = 0.099, background = 35),
+            (μ = 727.33,    σ = 0.721594,   n = 452.914,    step_amplitude = 5.4,       skew_fraction = 0.005,  skew_width = 0.1,   background = 28),
+            (μ = 1620.5,    σ = 1.24034,    n = 130.256,    step_amplitude = 1e-20,     skew_fraction = 0.005,  skew_width = 0.1,   background = 16),
+            (μ = 583.191,   σ = 0.701544,   n = 1865.52,    step_amplitude = 17.9,      skew_fraction = 0.1,    skew_width = 0.1,   background = 16),
+            (μ = 1592.53,   σ = 2.09123,    n = 206.827,    step_amplitude = 1e-21,     skew_fraction = 0.005,  skew_width = 0.1,   background = 17),
+            (μ = 2614.51,   σ = 1.51289,    n = 3130.43,    step_amplitude = 1e-101,    skew_fraction = 0.1,    skew_width = 0.003, background = 1)
+    ]
+
+    # calculate pdf and cdf functions 
+    bin_centers_all     =  Array{StepRangeLen,1}(undef, length(th228_lines))
+    model_counts_all    =  Array{Array{Float64,1},1}(undef, length(th228_lines))
+    model_cdf_all       =  Array{Array{Float64,1},1}(undef, length(th228_lines))
+    energy_mc_all       =  Array{Array{Float64,1},1}(undef, length(th228_lines))
+    PeakMax             = zeros(length(th228_lines))
+
+    for i=1:length(th228_lines)  # get fine binned model function to estimate pdf 
+        n_step = 5000 # fine binning 
+        bin_centers_all[i] = range(v[i].µ-30, stop=v[i].µ+30, length=n_step) 
+        bw = bin_centers_all[i][2]-bin_centers_all[i][1]
+        bin_widths = range(bw,bw, length=n_step) 
+
+        # save as intermediate result 
+        model_counts_all[i] = get_model_counts(f_fit, v[i], bin_centers_all[i], bin_widths)
+        plot(bin_centers_all[i],model_counts_all[i],marker=:dot)
+        PeakMax[i] = maximum(model_counts_all[i])
+
+        # create CDF
+        model_cdf_all[i] = cumsum(model_counts_all[i])
+        model_cdf_all[i] = model_cdf_all[i]./maximum(model_cdf_all[i])
+    end
+
+    # weights each peak with amplitude 
+    PeakMaxRel = PeakMax./sum(PeakMax)
+    n_i = round.(Int,PeakMaxRel.*n_tot)
+
+    # do the sampling: drawn from uniform distribution 
+    for i=1:length(th228_lines)
+        bandwidth = maximum(model_cdf_all[i])-minimum(model_cdf_all[i])
+        rand_i = minimum(model_cdf_all[i]).+bandwidth.*rand(n_i[i]); # make sure sample is within model range 
+        interp_cdf_inv = LinearInterpolation(model_cdf_all[i],bin_centers_all[i]) # inverse cdf
+        energy_mc_all[i] = interp_cdf_inv.(rand_i) 
+    end
+
+    energy_mc = fast_flatten(energy_mc_all)
+    return energy_mc, th228_lines
+end
+export generate_mc_spectrum