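Update get_split.py: replace per-class sampling with a ratio-based train/val/test split (Amazon dataset and GLNN examples updated accordingly)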

UNAOUN · UNAOUN · commit ec6d99d26409 · 2024-05-31T16:07:52.000+08:00
diff --git a/examples/glnn/readme.md b/examples/glnn/readme.md
@@ -35,7 +35,7 @@ TL_BACKEND="mindspore" python train_student.py --dataset cora --teacher SAGE
 | Cora      | 80.54±1.35 | 80.94±0.31 | 80.84±0.30 | 80.90±0.21 | 81.04±0.30 |
 | Citeseer  | 71.77±2.01 | 70.74±0.87 | 71.34±0.55 | 71.18±1.20 | 70.58±1.14 |
 | Pubmed    | 75.42±2.31 | 77.90±0.07 | 77.88±0.23 | 77.78±0.19 | 77.78±0.13 |
-| Computers | 83.03±1.87 | 81.51±0.60 | 81.73±0.48 | 81.46±0.72 | 81.24±1.27 |
-| Photo     | 92.11±1.08 | 92.05±0.56 | 91.92±0.53 | 92.00±0.55 | 91.77±0.91 |
+| Computers | 83.03±1.87 | 83.45±0.61 | 82.78±0.47 | 83.03±0.14 | 83.40±0.45 |
+| Photo     | 92.11±1.08 | 91.93±0.16 | 91.91±0.24 | 91.89±0.27 | 91.88±0.21 |
 
 - The model performance is the average of 5 tests
diff --git a/examples/glnn/train_student.py b/examples/glnn/train_student.py
@@ -72,8 +72,10 @@ def train_student(args):
         raise ValueError('Unknown dataset: {}'.format(args.dataset))
     if args.dataset in ['cora', 'pubmed', 'citeseer']:
         dataset = Planetoid(args.dataset_path, args.dataset)
-    elif args.dataset in ['computers', 'photo']:
-        dataset = Amazon(args.dataset_path, args.dataset, train_per_class=20, val_per_class=30)
+    elif args.dataset == 'computers':
+        dataset = Amazon(args.dataset_path, args.dataset, train_ratio=200/13752, val_ratio=(200/13752)*1.5)
+    elif args.dataset == 'photo':
+        dataset = Amazon(args.dataset_path, args.dataset, train_ratio=160/7650, val_ratio=(160/7650)*1.5)
     graph = dataset[0]
 
     # load teacher_logits from .npy file
diff --git a/examples/glnn/train_teacher.py b/examples/glnn/train_teacher.py
@@ -65,8 +65,10 @@ def train_teacher(args):
         raise ValueError('Unknown dataset: {}'.format(args.dataset))
     if args.dataset in ['cora', 'pubmed', 'citeseer']:
         dataset = Planetoid(args.dataset_path, args.dataset)
-    elif args.dataset in ['computers', 'photo']:
-        dataset = Amazon(args.dataset_path, args.dataset, train_per_class=20, val_per_class=30)
+    elif args.dataset == 'computers':
+        dataset = Amazon(args.dataset_path, args.dataset, train_ratio=200/13752, val_ratio=(200/13752)*1.5)
+    elif args.dataset == 'photo':
+        dataset = Amazon(args.dataset_path, args.dataset, train_ratio=160/7650, val_ratio=(160/7650)*1.5)
     graph = dataset[0]
     edge_index = graph.edge_index
     edge_weight = tlx.convert_to_tensor(calc_gcn_norm(edge_index, graph.num_nodes))
diff --git a/gammagl/datasets/amazon.py b/gammagl/datasets/amazon.py
@@ -35,12 +35,12 @@ class Amazon(InMemoryDataset):
     force_reload : bool, optional
         Whether to re-process the dataset.
         (default: :obj:`False`)
-    train_per_class : int, optional
-        Number of training samples per class.
-        (default: :obj:`20`)
-    val_per_class : int, optional
-        Number of validation samples per class.
-        (default: :obj:`20`)
+    train_ratio : float, optional
+        Ratio of training samples.
+        (default: :obj:`0.1`)
+    val_ratio : float, optional
+        Ratio of validation samples.
+        (default: :obj:`0.15`)
 
     Stats:
         .. list-table::
@@ -70,15 +70,15 @@ def __init__(self, root: str = None, name: str = 'computers',
                  transform: Optional[Callable] = None,
                  pre_transform: Optional[Callable] = None,
                  force_reload: bool = False,
-                 train_per_class: int = 20,
-                 val_per_class: int = 20):
+                 train_ratio: float = 0.1,
+                 val_ratio: float = 0.15):
         self.name = name.lower()
         assert self.name in ['computers', 'photo']
         super().__init__(root, transform, pre_transform, force_reload = force_reload)
         self.data, self.slices = self.load_data(self.processed_paths[0])
 
         data = self.get(0)
-        data.train_mask, data.val_mask, data.test_mask = get_train_val_test_split(self.data, train_per_class, val_per_class, self.num_classes)
+        data.train_mask, data.val_mask, data.test_mask = get_train_val_test_split(self.data, train_ratio, val_ratio)
         self.data, self.slices = self.collate([data])
 
     @property
diff --git a/gammagl/utils/get_split.py b/gammagl/utils/get_split.py
@@ -1,64 +1,57 @@
 import tensorlayerx as tlx
 import numpy as np
+from sklearn.model_selection import train_test_split
 
 
-def get_train_val_test_split(graph, train_per_class, val_per_class, num_classes):
-    """Split the dataset into train, validation, and test sets.
+def get_train_val_test_split(graph, train_ratio, val_ratio):
+    """
+    Split the dataset into train, validation, and test sets.
 
     Parameters
     ----------
     graph :
         The graph to split.
-    train_per_class : int
-        The number of training examples per class.
-    val_per_class : int
-        The number of validation examples per class.
-    num_classes : int
-        The number of classes in the dataset.
+    train_ratio : float
+        The proportion of the dataset to include in the train split.
+    val_ratio : float
+        The proportion of the dataset to include in the validation split.
 
     Returns
     -------
     :class:`tuple` of :class:`tensor`
-    
     """
-    random_state = np.random.RandomState(0)
-    labels = tlx.nn.OneHot(depth=num_classes)(graph.y).numpy()
-    num_samples, num_classes = graph.num_nodes, num_classes
-    remaining_indices = set(range(num_samples))
-    forbidden_indices = set()
 
-    train_indices = sample_per_class(random_state, num_samples, num_classes, labels, train_per_class, forbidden_indices=forbidden_indices)
-    forbidden_indices.update(train_indices)
-    val_indices = sample_per_class(random_state, num_samples, num_classes, labels, val_per_class, forbidden_indices=forbidden_indices)
-    forbidden_indices.update(val_indices)
-    test_indices = np.array(list(remaining_indices - forbidden_indices))
+    random_state = np.random.RandomState(0)
+    num_samples = graph.num_nodes
+    all_indices = np.arange(num_samples)
 
-    return generate_masks(graph.num_nodes, train_indices, val_indices, test_indices)
+    # split into train and (val + test)
+    train_indices, val_test_indices = train_test_split(
+        all_indices, train_size=train_ratio, random_state=random_state
+    )
 
+    # calculate the ratio of validation and test splits in the remaining data
+    test_ratio = 1.0 - train_ratio - val_ratio
+    val_size_ratio = val_ratio / (val_ratio + test_ratio)
 
-def sample_per_class(random_state, num_samples, num_classes, labels, num_examples_per_class, forbidden_indices=None):
-    sample_indices_per_class = {index: [] for index in range(num_classes)}
-    forbidden_set = set(forbidden_indices) if forbidden_indices is not None else set()
+    # split val + test into validation and test sets
+    val_indices, test_indices = train_test_split(
+        val_test_indices, train_size=val_size_ratio, random_state=random_state
+    )
 
-    for class_index in range(num_classes):
-        for sample_index in range(num_samples):
-            if labels[sample_index, class_index] > 0.0 and sample_index not in forbidden_set:
-                sample_indices_per_class[class_index].append(sample_index)
+    return generate_masks(num_samples, train_indices, val_indices, test_indices)
 
-    return np.concatenate(
-        [random_state.choice(sample_indices_per_class[class_index], num_examples_per_class, replace=False)
-         for class_index in range(num_classes)
-         ])
 
+def generate_masks(num_nodes, train_indices, val_indices, test_indices):
+    np_train_mask = np.zeros(num_nodes, dtype=bool)
+    np_train_mask[train_indices] = 1
+    np_val_mask = np.zeros(num_nodes, dtype=bool)
+    np_val_mask[val_indices] = 1
+    np_test_mask = np.zeros(num_nodes, dtype=bool)
+    np_test_mask[test_indices] = 1
 
-def generate_masks(num_nodes, train_indices, val_indices, test_indices): 
-    np_train_mask = np.zeros(num_nodes) 
-    np_train_mask[train_indices] = 1 
-    np_val_mask = np.zeros(num_nodes) 
-    np_val_mask[val_indices] = 1 
-    np_test_mask = np.zeros(num_nodes) 
-    np_test_mask[test_indices] = 1 
-    train_mask = tlx.ops.convert_to_tensor(np_train_mask, dtype=tlx.bool) 
+    train_mask = tlx.ops.convert_to_tensor(np_train_mask, dtype=tlx.bool)
     val_mask = tlx.ops.convert_to_tensor(np_val_mask, dtype=tlx.bool)
-    test_mask = tlx.ops.convert_to_tensor(np_test_mask, dtype=tlx.bool) 
+    test_mask = tlx.ops.convert_to_tensor(np_test_mask, dtype=tlx.bool)
+
     return train_mask, val_mask, test_mask