#!/usr/bin/env python
# coding: utf-8
# fit_response_distributions.py

"""
Analyses the output of CoffeaJERCProcessor_L5 and fits the response distributions.

Author(s): Andris Potrebko (RTU)
"""

# import sys
# coffea_path = '/afs/cern.ch/user/a/anpotreb/top/JERC/coffea/'
# if coffea_path not in sys.path:
#     sys.path.insert(0,coffea_path)

# ak_path = '/afs/cern.ch/user/a/anpotreb/top/JERC/local-packages/'

# if ak_path not in sys.path:
#     sys.path.insert(0,ak_path)

from coffea import util
import numpy as np

# import inspect
import matplotlib.pyplot as plt
import hist
import warnings

from plotters.pltStyle import pltStyle
### Global plotting set-up: apply the shared 'hep' style (optionally with font_frac=1.40),
### then scale the left subplot margin by 1.4 and set the figure resolution.
pltStyle(style='hep')
plt.rcParams['figure.subplot.left'] *= 1.4
# plt.rcParams['font.size'] = plt.rcParams['font.size']/0.98
plt.rcParams['figure.dpi'] = 150
import os

### import subpackages
import helpers as h
import plotters.plot_makers as plot_makers
# from common_binning import JERC_Constants
from fileNames.available_datasets import dataset_dictionary, dataset_labels
from plotters.plot_cutflow import plot_cutflow

def _bin_indices_to_fit(bins, values, option_name):
    ''' Translate the user selection `values` into the indices of `bins` that should be fit.

    `values` is either None (all `bins.nbins` bins), a one-element sequence (only the bin index
    returned by `bins.get_bin_idx` for that value) or a two-element sequence (the inclusive range
    between the bin indices of the two values).
    Raises a `ValueError` for any other length; `option_name` is only used in the error message.
    '''
    if values is None:
        return range(bins.nbins)
    if len(values) == 1:
        return [bins.get_bin_idx(values[0])]
    if len(values) == 2:
        return range(bins.get_bin_idx(values[0]), bins.get_bin_idx(values[1])+1)
    raise ValueError(f"`{option_name}` must be None or contain one or two values, got {values!r}")


def fit_response_distributions(data_tag='Pythia-TTBAR', config=None):
    ''' The script fits the response histograms (or calculates the medians) and creates the `txt` files with the fit results
    (one file for each `Mean`, `MeanStd`, `Median`, `MedianStd`, `MeanRecoPt`), a json file with all the results
    and the cutflow plots.

    Parameters
    ----------
    data_tag : name of the dataset; used to build the name of the input `.coffea` file, to look up
        the cross sections in `dataset_dictionary` and to name the outputs.
    config : dictionary with the switches of the run; missing keys take the defaults given here:
        `test_run` (False), `load_fit_res` (False), `saveplots` (False), `combine_antiflavour` (False),
        `eta_binning` ('HCalPart'), `pt_binning` ('MC_truth'), `sum_neg_pos_eta_bool` (True),
        `tag_Lx` ('_L5'), `add_tag` (''), `fit_tag` (''),
        `flavors` (['b', 'ud', 'all', 'g', 'c', 's', 'q', 'u', 'd', 'unmatched']),
        `pt_to_fit` (None) and `eta_to_fit` (None): None to fit all the bins, one value for a single bin
        or two values for a range of bins.

    Raises
    ------
    ValueError if `pt_to_fit` or `eta_to_fit` is given but does not contain one or two values.
    '''
    # Get the directory of the current script
    script_dir = os.path.dirname(os.path.realpath(__file__))

    if config is None:
        config = {}

    # Use the dictionary values, or the defaults if they are not in the dictionary
    test_run = config.get('test_run', False)
    load_fit_res = config.get('load_fit_res', False)
    saveplots = config.get('saveplots', False)
    combine_antiflavour = config.get('combine_antiflavour', False)
    eta_binning = config.get('eta_binning', 'HCalPart')
    pt_binning = config.get('pt_binning', 'MC_truth')
    sum_neg_pos_eta_bool = config.get('sum_neg_pos_eta_bool', True)
    tag_Lx = config.get('tag_Lx', '_L5')
    add_tag = config.get('add_tag', '')
    fit_tag = config.get('fit_tag', '')
    flavors = config.get('flavors', ['b', 'ud', 'all', 'g', 'c', 's', 'q', 'u', 'd', 'unmatched'])
    pt_to_fit = config.get('pt_to_fit', None)
    eta_to_fit = config.get('eta_to_fit', None)

    ################ End of the parameters of the run and switches  #########################
    # ### Do some logic with the input parameters and the rest of parameters of the run

    tag_full = tag_Lx+'_'+data_tag+add_tag
    if test_run:
        tag_full = tag_full+'_test'
    outname = os.path.join(script_dir, 'out', 'CoffeaJERCOutputs'+tag_full+'.coffea')

    ### The tag of the fit results only mentions the settings that differ from the defaults
    tag_fit_res = tag_full
    if eta_binning != "HCalPart":
        tag_fit_res = tag_full+'_'+eta_binning
    if pt_binning != "MC_truth":
        tag_fit_res = tag_fit_res+'_pt-'+pt_binning
    combine_antiflavour_txt = '_split_antiflav' if not combine_antiflavour else ''
    tag_fit_res += combine_antiflavour_txt+fit_tag

    ### `makedirs(..., exist_ok=True)` also creates missing parents and does not fail on existing directories
    fig_path = os.path.join(script_dir, 'fig')
    os.makedirs(fig_path, exist_ok=True)

    if test_run:
        os.makedirs(script_dir+"/test/fig", exist_ok=True)

    out_txt_path = script_dir+"/out_txt" if not test_run else script_dir+"/test/out_txt"
    os.makedirs(out_txt_path, exist_ok=True)

    # ### End of do some logic with the input parameters and the rest of parameters of the run
    ################ Load the histograms and scale them according to their cross-sections  #########################
    output = util.load(outname)
    print("Loaded histograms from: ", outname)
    xsec_dict, legend_label = h.get_xsec_dict(data_tag, dataset_dictionary)

    ### The sample names; kept under its own name because they are needed again for the cutflow plots
    sample_keys = list(output.keys())
    try: ## in older files cutflow was not split into cutflow for jets and events. Should be removed in the future
        Nev = {key: output[key]['cutflow_events']['all_events'].value for key in sample_keys}
    except KeyError:
        Nev = {key: output[key]['cutflow']['all_events'].value for key in sample_keys}
    scale_factors = h.hist_div(xsec_dict, Nev)
    all_histo_keys = output[next(iter(output.keys()))].keys()
    hists_merged = {histo_key:h.sum_subhist(output, histo_key, scale_factors) for histo_key in all_histo_keys }

    # ### Fit responses

    # Define some global variables for the fit
    from JetEtaBins import JetEtaBins, PtBins

    jeteta_bins = JetEtaBins(eta_binning)
    pt_bins = PtBins(pt_binning)
    fiteta_bins = JetEtaBins(eta_binning, absolute=True) if sum_neg_pos_eta_bool else jeteta_bins

    pt_bins_to_fit = _bin_indices_to_fit(pt_bins, pt_to_fit, 'pt_to_fit')
    eta_bins_to_fit = _bin_indices_to_fit(fiteta_bins, eta_to_fit, 'eta_to_fit')

    result_keys = ["Mean", "MeanStd", "Median", "MedianStd", "MeanRecoPt"]

    def prepare_hist(histo):
        ''' Rebin `histo` to the eta and pt binning of the fit and, if requested, sum the negative and positive eta bins. '''
        histo = h.rebin_hist(histo, 'jeteta' , jeteta_bins.edges)
        histo = h.rebin_hist(histo, 'pt_gen' , pt_bins.edges)
        if sum_neg_pos_eta_bool:
            histo = h.sum_neg_pos_eta(histo)
        return histo

    def fit_responses(hists, flavor='b', saveplots = None, scaled_hist=None):
        ''' Extract the jet flavor `flavor` from the histogram dictionary `hists` and fit in all the eta and pt bins.
        Add `scaled_hist` if to produce the response distributions with all the samples stacked up.
        If `saveplots` is None, the plots are saved only for a non-test run with the "HCalPart" eta binning.
        Return a dictionary of ["Mean", "MeanStd", "Median", "MedianStd", "MeanRecoPt"] values,
        each an array of the shape (number of pt bins, number of eta bins); bins that are not fit stay 0.
        '''
        if saveplots is None:
            saveplots = not (test_run or eta_binning != "HCalPart")

        results = {key:np.zeros((pt_bins.nbins, fiteta_bins.nbins)) for key in result_keys}

        N_converged = 0
        N_little_ev = 0
        N_failed = 0

        ### `catch_warnings` restores the previous warning filters on exit, also if a fit raises
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')  ### filter out the many fit warnings

            response_hists = {}
            if scaled_hist is not None:
                for sample in scaled_hist:
                    sample_response, _ = h.add_flavors(scaled_hist[sample], flavor, combine_antiflavour)
                    ### rebinned also in pt, because the histograms are indexed with the `pt_bins` indices below
                    response_hists[sample] = prepare_hist(sample_response)

            response_hist, recopt_hist = h.add_flavors(hists, flavor, combine_antiflavour)
            response_hist = prepare_hist(response_hist)
            recopt_hist = prepare_hist(recopt_hist)

            FitFigDir1 = fig_path+'/responses/responses'+tag_full
            FitFigDir = FitFigDir1+'/response_pt_eta_'+flavor+tag_full
            if saveplots:
                os.makedirs(FitFigDir, exist_ok=True)
                print("Response fit plots will be saved under ", FitFigDir)
            else:
                print("Response fit plots won't be saved")

            for i in pt_bins_to_fit:
                for k in eta_bins_to_fit:
                    if scaled_hist is not None:
                        ### the first 10 characters of the sample name are dropped for the legend
                        histos2plot = {sample[10:]: response_hists[sample][i, :, k] for sample in response_hists}
                        h_stack = hist.Stack.from_dict(histos2plot)

                    histo = response_hist[i, :, k]
                    histopt = recopt_hist[i, k]
                    hist_sum = histo.sum()
                    try:
                        Neff = hist_sum.value**2/(hist_sum.variance)
                    except ZeroDivisionError:
                        Neff = hist_sum.value**2/(hist_sum.variance+1e-20)

                    median, medianstd = h.get_median(histo, Neff) #, x_range=[0, 2]

                    ##################### Mean of the pt_reco  ######################
                    ### (The mean includes events that potentially had ptresponse in the second peak at low pt)
                    ### No way to distinguish it if only x*weights are saved instead of the whole histogram.
                    mean_reco_pt = histopt.value/np.sum(histo.values())

                    ####################### Fitting ############################
                    p2, cov, chi2, Ndof, status, fitlims = h.fit_response(histo, Neff, Nfit=3, sigma_fit_window=1.5)
                    if status == 1:
                        N_converged += 1
                    elif status == -1:
                        N_little_ev += 1
                    else:
                        N_failed += 1

                    ### abs() so that a negative diagonal element of the covariance does not give a nan
                    mean_std = np.sqrt(np.abs(cov[1,1]))
                    width_std = np.sqrt(np.abs(cov[2,2]))

                    ####################### Store the results ############################
                    results["Mean"][i,k] = p2[1]
                    results["MeanStd"][i,k] = mean_std
                    results["Median"][i,k] = median
                    results["MedianStd"][i,k] = medianstd
                    results["MeanRecoPt"][i,k] = mean_reco_pt

                    ####################### Plotting ############################
                    if saveplots:
                        figName = FitFigDir+'/ptResponse'+pt_bins.idx2str(i)+fiteta_bins.idx2str(k)
                        hep_txt = pt_bins.idx2plot_str(i)+'\n'+fiteta_bins.idx2plot_str(k)+'\n'+f'{flavor} jet'

                        txt2print = ('\n'+r'Mean = {0:0.3f}$\pm${1:0.3f}'.format(p2[1], mean_std)
                                         + '\n'+r'Width = {0:0.3f}$\pm${1:0.3f}'.format(np.abs(p2[2]), width_std)
                                         + '\n'+r'Median = {0:0.3f}$\pm${1:0.3f}'.format(median, medianstd)
                                         + '\n'+r'$\chi^2/ndof$ = {0:0.2g}/{1:0.0f}'.format(chi2, Ndof)
                                         + '\n'+r'Neff = {0:0.3g}'.format(Neff))
                        plot_makers.plot_response_dist(histo, p2, fitlims,
                                           figName, dataset_name=legend_label, hep_txt=hep_txt, txt2print=txt2print, print_txt=True)
                        if scaled_hist is not None:
                            plot_makers.plot_response_dist_stack(h_stack, p2, fitlims,
                                                     figName+'stack', hep_txt=hep_txt, print_txt=False )

        print("fit summary: ")
        print(f"N bins converged = {N_converged}; N bins not fit because of too little data = {N_little_ev}; N bins not converged = {N_failed}")

        return results

    # ### Run fitting for each sample

    if not combine_antiflavour:
        flavors = np.concatenate([[flav, flav+'bar'] if flav in h.barable_flavors else [flav] for flav in flavors ])
    print('-'*25)
    print('-'*25)
    print(f'Starting to fit each flavor in: {flavors}')
    result_each_flav = {}
    for flav in flavors:
        print('-'*25)
        print('-'*25)
        print('Fitting flavor: ', flav)
        if load_fit_res:
            result = {key: h.read_data(key, flav, tag_fit_res, out_txt_path) for key in result_keys}
        else:
            result = fit_responses(hists_merged, flav, saveplots=saveplots) #scaled_hist
            result_each_flav[flav] = result
            for key in result:
                h.save_data(result[key], key, flav, tag_fit_res, pt_bins.centres, fiteta_bins.edges, out_txt_path)

        if eta_binning=="onebin": #or fine_etabins:
            plot_makers.plot_corrections_eta(result["Median"], result["MedianStd"], pt_bins, fiteta_bins.centres, tag_fit_res, flav, plotptvals=[20, 35, 150, 400])
        else:
            plot_makers.plot_corrections(result, pt_bins.centres, fiteta_bins, tag_fit_res, flav, plotetavals=[0, 1.305, 2.5, 3.139], plotmean=True)

    ### NOTE(review): with `load_fit_res=True`, `result_each_flav` is empty here, so an empty set of
    ### results is passed to `save_json`; confirm that this is intended.
    from save_json import save_json
    save_json(result_each_flav, pt_bins, fiteta_bins, out_txt_path+'/response_fit_results'+tag_fit_res+'.json')


    print('-'*25)
    print('-'*25)
    print('Saving cutflow')
    cutflow_keys = ['cutflow_events', 'cutflow_jets']

    def normalize_cutflow(hists_cutflow):
        ''' Set the variances of the cutflow histograms in `hists_cutflow` equal to their values (in place) and return `hists_cutflow`. '''
        for key in cutflow_keys:
            hists_cutflow[key].variances()[:] = hists_cutflow[key].values()
        return hists_cutflow

    def plot_all_cutflows(cutflows, name, label):
        ''' Plot the event cutflow, the jet cutflow and the number of jets per event of `cutflows`. '''
        jets_per_ev = cutflows['cutflow_jets']/cutflows['cutflow_events']['all_events'].value
        for cutflow, ylab, fig_name in [(cutflows['cutflow_events'], 'N events', 'cutflow_Nevents'),
                                        (cutflows['cutflow_jets'], 'N jets', 'cutflow_Njets'),
                                        (jets_per_ev, 'N jets/N events', 'cutflow_Njets_per_ev')]:
            plot_cutflow(cutflow, name, ylab=ylab, fig_name=fig_name, title_name=label, figdir=fig_path+'/cutflow/'+tag_cutflow)

    ### strip the leading `tag_Lx`+'_' from the full tag
    tag_cutflow = tag_full[len(tag_Lx)+1:]

    rc_bottom_def = plt.rcParams['figure.subplot.bottom']
    plt.rcParams['figure.subplot.bottom'] = 0.39
    try:
        ### the cutflows of all the samples summed up without the cross-section scaling
        scale_cutflow = {key: 1 for key in output}
        sum_cutflow = {histo_key:h.sum_subhist(output, histo_key, scale_cutflow) for histo_key in cutflow_keys }
        sum_cutflow = normalize_cutflow(sum_cutflow)

        dataset_label = dataset_labels[tag_cutflow] if tag_cutflow in dataset_labels else tag_cutflow
        print("dataset label: ", dataset_label)
        plot_all_cutflows(sum_cutflow, tag_cutflow, dataset_label)

        ### the cutflow of each sample separately
        if len(sample_keys)>1:
            for key in sample_keys:
                cutflow = normalize_cutflow(output[key])
                dataset_label = dataset_labels[key] if key in dataset_labels else key
                dataset_label = dataset_label.replace('_', ' ')
                plot_all_cutflows(cutflow, key, dataset_label)
    finally:
        ### restore the bottom margin also if the plotting fails
        plt.rcParams['figure.subplot.bottom'] = rc_bottom_def
    print('-----'*10)
    print("All done. Congrats!")
  
if __name__ == "__main__":
    ### Datasets to run over; the fits are repeated for each tag with the same `config`.
    data_tags = ['Pythia-TTBAR', 'Herwig-TTBAR', 'QCD-MG-Py', 'QCD-MG-Her', 'QCD-Py', 'DY-MG-Py', 'DY-MG-Her']
    ### Alternative selections (uncomment one to use it):
    # data_tags = ['Pythia-TTBAR_iso_dr_0p8','Pythia-TTBAR_iso_dr_1p2', 'Pythia-TTBAR_iso_dr_1p5'] #Pythia-semilep-TTBAR
    # data_tags = ['Herwig-TTBAR'] #, 'scaled_pion', 'not_scaled_pion'] #Pythia-semilep-TTBAR
    # data_tags = ['scaled_times2_pion', 'scaled_times5_pion', 'scaled_times10_pion', 'scaled_pion', 'not_scaled_pion'] #Pythia-semilep-TTBAR
    # data_tags = ['QCD-Py_noiso'] # , 'Pythia-TTBAR_100files_noiso', 'DY-MG-Py_noiso', 'QCD-MG-Py_noiso'] # 'Pythia-non-semilep-TTBAR', 'DY-MG-Py', 'QCD-MG-Py' Pythia-semilep-TTBAR
    # data_tags = ['QCD-Py' ] #Pythia-semilep-TTBAR

    ################ Parameters of the run and switches  #########################
    config = dict(
        ### True: run on a file that was created by a processor with `test_run=True`
        ### (maybe obsolete because this can be specified just in the data_tag)
        test_run=False,
        ### True: only replot the fit results without redoing the histogram fits
        ### (also kind of obsolete because plotting scripts exist in `plotters`)
        load_fit_res=False,
        ### True: save all the response distributions. There are many eta/pt bins so it takes time and space
        saveplots=False,
        ### True: combine the flavor and anti-flavor jets into one histogram
        combine_antiflavour=True,

        ### Eta binning for the response fits.
        ### HCalPart: bin in HCal sectors, CaloTowers: the standard JERC binning,
        ### CoarseCalo: like 'CaloTowers' but many bins united; onebin: combine all eta bins.
        ### Preprocessing is always done in CaloTowers. For the response distributions, the bins can be merged.
        ### Options: HCalPart, CoarseCalo, JERC, CaloTowers, Summer20Flavor, onebin
        eta_binning="Summer20Flavor",
        ### Options: MC_truth, Uncert, Coarse, onebin
        pt_binning="MC_truth",
        ### True: combine the positive and negative eta bins
        sum_neg_pos_eta_bool=True,
        ### L5 or L23, but L23 not supported since ages.
        tag_Lx='_L5',

        ### The dataset itself is chosen through `data_tags` above: each `data_tag` should be available
        ### in `dataset_dictionary` and is used to name the output figures and histograms,
        ### e.g. 'Herwig-TTBAR', 'QCD-MG-Her', 'DY-FxFx'.

        ### Name of the specific run, if the parameters changed; used for saving figures and output histograms
        add_tag='',
        ### If the fit strategy changed and the results need to be stored with a different name,
        ### e.g. _remove_bad_eta_bin
        fit_tag='',

        ### The flavors to be fit. Other choices:
        # ['b', 'ud', 'all', 'g', 'c', 's', 'q', 'u', 'd', 'unmatched']
        # ['b_gluon_splitting', "b_prompt", 'c_gluon_splitting', "c_prompt", 'b', 'c']
        flavors=['b_gluon_splitting', "b_prompt", 'ud', 'all', 'g', 'c_gluon_splitting', "c_prompt", 'b', 'c', 's', 'q', 'u', 'd', 'unmatched'],

        ### Optional `pt_to_fit` / `eta_to_fit`: None (or left out) if all the bins should be fit, otherwise a list
        ### of two numbers for the range of bins to fit, or just one number for a single bin
        # pt_to_fit=None,
        # pt_to_fit=[30],
        # eta_to_fit=[0],
    )

    for data_tag in data_tags:
        fit_response_distributions(data_tag, config)