add snowstorm_dataset and IceCubehosted class
sevmag committed Jan 27, 2025
1 parent 79d7baf commit c31582f
Showing 5 changed files with 310 additions and 1 deletion.
6 changes: 5 additions & 1 deletion src/graphnet/data/__init__.py
@@ -9,4 +9,8 @@
from .pre_configured import I3ToParquetConverter
from .pre_configured import I3ToSQLiteConverter
from .datamodule import GraphNeTDataModule
from .curated_datamodule import CuratedDataset, ERDAHostedDataset
from .curated_datamodule import (
CuratedDataset,
ERDAHostedDataset,
IceCubeHostedDataset,
)
72 changes: 72 additions & 0 deletions src/graphnet/data/constants.py
@@ -29,6 +29,27 @@ class FEATURES:
"sensor_pos_z",
"t",
]
SNOWSTORM = [
"dom_x",
"dom_y",
"dom_z",
"charge",
"dom_time",
"width",
"pmt_area",
"rde",
"is_bright_dom",
"is_bad_dom",
"is_saturated_dom",
"is_errata_dom",
"event_time",
"hlc",
"awtd",
"string",
"pmt_number",
"dom_number",
"dom_type",
]
KAGGLE = ["x", "y", "z", "time", "charge", "auxiliary"]
LIQUIDO = ["sipm_x", "sipm_y", "sipm_z", "t"]

@@ -84,6 +105,57 @@ class TRUTH:
"primary_hadron_1_energy",
"total_energy",
]
SNOWSTORM = [
"energy",
"position_x",
"position_y",
"position_z",
"azimuth",
"zenith",
"pid",
"event_time",
"interaction_type",
"elasticity",
"RunID",
"SubrunID",
"EventID",
"SubEventID",
"dbang_decay_length",
"track_length",
"stopped_muon",
"energy_track",
"energy_cascade",
"inelasticity",
"DeepCoreFilter_13",
"CascadeFilter_13",
"MuonFilter_13",
"OnlineL2Filter_17",
"L3_oscNext_bool",
"L4_oscNext_bool",
"L5_oscNext_bool",
"L6_oscNext_bool",
"L7_oscNext_bool",
"Homogenized_QTot",
"MCLabelClassification",
"MCLabelCoincidentMuons",
"MCLabelBgMuonMCPE",
"MCLabelBgMuonMCPECharge",
"GNLabelTrackEnergyDeposited",
"GNLabelTrackEnergyOnEntrance",
"GNLabelTrackEnergyOnEntrancePrimary",
"GNLabelTrackEnergyDepositedPrimary",
"GNLabelEnergyPrimary",
"GNLabelCascadeEnergyDepositedPrimary",
"GNLabelCascadeEnergyDeposited",
"GNLabelEnergyDepositedTotal",
"GNLabelEnergyDepositedPrimary",
"GNLabelHighestEInIceParticleIsChild",
"GNLabelHighestEInIceParticleDistance",
"GNLabelHighestEInIceParticleEFraction",
"GNLabelHighestEInIceParticleEOnEntrance",
"GNLabelHighestEDaughterDistance",
"GNLabelHighestEDaughterEFraction",
]
KAGGLE = ["zenith", "azimuth"]
LIQUIDO = [
"vertex_x",
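For reference, a minimal sketch of how the new constants might be consumed downstream (the variable names are illustrative and not part of this commit):

from graphnet.data.constants import FEATURES, TRUTH

# Select the SnowStorm-specific pulse features and event-level truth labels.
features = FEATURES.SNOWSTORM
truth = TRUTH.SNOWSTORM
print(f"{len(features)} features, {len(truth)} truth labels")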
73 changes: 73 additions & 0 deletions src/graphnet/data/curated_datamodule.py
@@ -8,6 +8,7 @@
from typing import Dict, Any, Optional, List, Tuple, Union
from abc import abstractmethod
import os
from glob import glob

from .datamodule import GraphNeTDataModule
from graphnet.models.graphs import GraphDefinition
@@ -280,3 +281,75 @@ def prepare_data(self) -> None:
os.system(f"wget -O {file_path} {self._mirror}/{file_hash}")
os.system(f"tar -xf {file_path} -C {self.dataset_dir}")
os.system(f"rm {file_path}")


class IceCubeHostedDataset(CuratedDataset):
"""A base class for dataset/datamodule hosted on the IceCube cluster.
Inheriting subclasses will need to do:
- fill out the `_zipped_files` attribute, which
should be a list of paths to files that are compressed using `tar` with
extension ".tar.gz" and are stored on the IceCube Cluster in "/data/".
- implement the `_get_dir_name` method, which should return the
directory name where the files resulting from the unzipping of a
compressed file should end up.
"""

_mirror = "https://convey.icecube.wisc.edu"

def prepare_data(self) -> None:
"""Prepare the dataset for training."""
assert hasattr(self, "_zipped_files") and (len(self._zipped_files) > 0)

# Check which files still need to be downloaded
files_to_dl = self._resolve_downloads()
        if not files_to_dl:
return

# Download files
USER = input("Username: ")
source_file_paths = " ".join(
[f"{self._mirror}{f}" for f in files_to_dl]
)
os.system(
f"wget -P {self.dataset_dir} --user={USER} "
+ f"--ask-password {source_file_paths}"
)

# unzip files
for file in glob(os.path.join(self.dataset_dir, "*.tar.gz")):
tmp_dir = os.path.join(self.dataset_dir, "tmp")
os.mkdir(tmp_dir)
os.system(f"tar -xzf {file} -C {tmp_dir}")
unzip_dir = self._get_dir_name(file)
os.makedirs(unzip_dir)
for db_file in glob(
os.path.join(tmp_dir, "**/*.db"), recursive=True
):
os.system(f"mv {db_file} {unzip_dir}")

os.system(f"rm {file}")
os.system(f"rm -r {tmp_dir}")

@abstractmethod
def _get_dir_name(self, source_file_path: str) -> str:
"""Get directory name from source file path.
E.g. if `source_file_path` is "/data/set/file.tar.gz",
return os.path.join(self.dataset_dir, source_file_path.split("/")[-2])
to have 'set' as the directory name where all files resulting from the
unzipping of `source_file_path` end up. If no substrucutre is desired,
just return `self.dataset_dir`
"""
raise NotImplementedError

def _resolve_downloads(self) -> List[str]:
"""Resolve which files still need to be downloaded."""
if not os.path.exists(self.dataset_dir):
return self._zipped_files
dir_names = [self._get_dir_name(f) for f in self._zipped_files]
ret = []
for i, dir in enumerate(dir_names):
if not os.path.exists(dir):
ret.append(self._zipped_files[i])
return ret
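
For illustration, a minimal sketch of the subclass contract described in the class docstring above; the class name and file paths are hypothetical, and a concrete subclass must also satisfy the rest of the `CuratedDataset` interface (e.g. `_prepare_args`):

import os
from typing import List

from graphnet.data.curated_datamodule import IceCubeHostedDataset


class ExampleHostedDataset(IceCubeHostedDataset):
    """Hypothetical subclass; the paths below are illustrative only."""

    # Tarballs stored under /data/ on the IceCube cluster.
    _zipped_files: List[str] = [
        "/data/ana/example/21001.tar.gz",
        "/data/ana/example/21002.tar.gz",
    ]

    def _get_dir_name(self, source_file_path: str) -> str:
        # Use the tarball's base name (e.g. "21001") as a subdirectory
        # of the download directory.
        file_name = os.path.basename(source_file_path).split(".")[0]
        return os.path.join(self.dataset_dir, file_name)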
1 change: 1 addition & 0 deletions src/graphnet/datasets/__init__.py
@@ -2,3 +2,4 @@

from .test_dataset import TestDataset
from .prometheus_datasets import TRIDENTSmall, BaikalGVDSmall, PONESmall
from .snowstorm_dataset import SnowStormDataset
159 changes: 159 additions & 0 deletions src/graphnet/datasets/snowstorm_dataset.py
@@ -0,0 +1,159 @@
"""Snowstorm dataset module hosted on the IceCube Collaboration servers."""

import pandas as pd
import re
import os
from typing import Dict, Any, Optional, List, Tuple, Union
from glob import glob
from sklearn.model_selection import train_test_split

from graphnet.data.constants import FEATURES, TRUTH
from graphnet.data.curated_datamodule import IceCubeHostedDataset
from graphnet.data.utilities import query_database
from graphnet.models.graphs import GraphDefinition


class SnowStormDataset(IceCubeHostedDataset):
"""IceCube SnowStorm simulation dataset.
More information can be found at
https://wiki.icecube.wisc.edu/index.php/SnowStorm_MC#File_Locations
This is a IceCube Collaboration simulation dataset.
Requires a username and password.
"""

_experiment = "IceCube SnowStorm dataset"
_creator = "Severin Magel"
_citation = "arXiv:1909.01530"
_available_backends = ["sqlite"]

_pulsemaps = ["SRTInIcePulses"]
_truth_table = "truth"
_pulse_truth = None
_features = FEATURES.SNOWSTORM
_event_truth = TRUTH.SNOWSTORM
_data_root_dir = "/data/ana/graphnet/Snowstorm_l2"

def __init__(
self,
run_ids: List[int],
graph_definition: GraphDefinition,
download_dir: str,
truth: Optional[List[str]] = None,
features: Optional[List[str]] = None,
train_dataloader_kwargs: Optional[Dict[str, Any]] = None,
validation_dataloader_kwargs: Optional[Dict[str, Any]] = None,
test_dataloader_kwargs: Optional[Dict[str, Any]] = None,
):
"""Initialize SnowStorm dataset."""
self._run_ids = run_ids
self._zipped_files = [
os.path.join(self._data_root_dir, f"{s}.tar.gz") for s in run_ids
]

super().__init__(
graph_definition=graph_definition,
download_dir=download_dir,
truth=truth,
features=features,
backend="sqlite",
train_dataloader_kwargs=train_dataloader_kwargs,
validation_dataloader_kwargs=validation_dataloader_kwargs,
test_dataloader_kwargs=test_dataloader_kwargs,
)

def _prepare_args(
self, backend: str, features: List[str], truth: List[str]
) -> Tuple[Dict[str, Any], Union[List[int], None], Union[List[int], None]]:
"""Prepare arguments for dataset."""
assert backend == "sqlite"
dataset_paths = []
for rid in self._run_ids:
dataset_paths += glob(
os.path.join(self.dataset_dir, str(rid), "**/*.db"),
recursive=True,
)

# get event numbers from all datasets
event_no = []

        # get RunID
        pattern = rf"{re.escape(self.dataset_dir)}/(\d+)/.*"
        event_counts: Dict[str, int] = {}
        for path in dataset_paths:
# Extract the ID
match = re.search(pattern, path)
assert match
run_id = match.group(1)

query_df = query_database(
database=path,
query=f"SELECT event_no FROM {self._truth_table}",
)
query_df["path"] = path
event_no.append(query_df)

# save event count for description
if run_id in event_counts:
event_counts[run_id] += query_df.shape[0]
else:
event_counts[run_id] = query_df.shape[0]

event_no = pd.concat(event_no, axis=0)

# split the non-unique event numbers into train/val and test
train_val, test = train_test_split(
event_no,
test_size=0.10,
random_state=42,
shuffle=True,
)

train_val = train_val.groupby("path")
test = test.groupby("path")

        # parse into the format expected by CuratedDataset; this assumes
        # every database contributes events to both splits
        train_val_selection = []
        test_selection = []
        for path in dataset_paths:
            train_val_selection.append(
                train_val["event_no"].get_group(path).tolist()
            )
            test_selection.append(test["event_no"].get_group(path).tolist())

dataset_args = {
"truth_table": self._truth_table,
"pulsemaps": self._pulsemaps,
"path": dataset_paths,
"graph_definition": self._graph_definition,
"features": features,
"truth": truth,
}

self._create_comment(event_counts)

return dataset_args, train_val_selection, test_selection

@classmethod
def _create_comment(cls, event_counts: Dict[str, int] = {}) -> None:
"""Print the number of events in each RunID."""
fixed_string = (
" Simulation produced by the IceCube Collaboration, "
+ "https://wiki.icecube.wisc.edu/index.php/SnowStorm_MC#File_Locations" # noqa: E501
)
tot = 0
runid_string = ""
for k, v in event_counts.items():
runid_string += f"RunID {k} contains {v:10d} events\n"
tot += v
cls._comments = (
f"Contains ~{tot/1e6:.1f} million events:\n"
+ runid_string
+ fixed_string
)

    def _get_dir_name(self, source_file_path: str) -> str:
        """Return the extraction directory for `source_file_path`."""
        file_name = os.path.basename(source_file_path).split(".")[0]
        return str(os.path.join(self.dataset_dir, file_name))
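
A usage sketch, assuming valid IceCube credentials; the run IDs, graph definition, and download path below are illustrative and not taken from this commit:

from graphnet.datasets import SnowStormDataset
from graphnet.models.detector.icecube import IceCube86
from graphnet.models.graphs import KNNGraph

# Illustrative run IDs; the corresponding tarballs must exist under
# /data/ana/graphnet/Snowstorm_l2 on the IceCube cluster.
datamodule = SnowStormDataset(
    run_ids=[22010, 22011],
    graph_definition=KNNGraph(detector=IceCube86()),
    download_dir="./snowstorm_data",
    train_dataloader_kwargs={"batch_size": 16, "num_workers": 4},
)
datamodule.prepare_data()  # prompts for username/password, then downloads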
