
Commit bbb3fb0

Merge pull request #3196 from PrincetonUniversity/fix-softmax
Fix softmax
2 parents c777ff0 + eba49a9 commit bbb3fb0

File tree: 3 files changed (+157 −37 lines)

psyneulink/core/components/functions/nonstateful/transferfunctions.py (+53 −23)
@@ -2828,9 +2828,10 @@ class SoftMax(TransferFunction):
     <SoftMax.gain>` parametrically based on the `variable <SoftMax.variable>`:
 
     - *mask_threshold* -- setting the **mask_threshold** argument to a scalar value causes the `variable
-      <SoftMax.variable>` to be thresholded by that value before applying the SoftMax function; any elements of
-      `variable <SoftMax.variable>` with an absolute value below the threshold are set to 0; all others are scaled
-      by the specified `gain <SoftMax.gain>` and then passed through the SoftMax function. This only applies if the
+      <SoftMax.variable>` to be thresholded by that value before applying the SoftMax function; each element of
+      `variable <SoftMax.variable>` is first scaled by `gain <SoftMax.gain>`. Then, any elements with an absolute
+      value below *mask_threshold* are set to negative infinity (``-inf``), effectively masking them since
+      ``exp(-inf) = 0``. The remaining values are then passed through the SoftMax function. This only applies if the
       **gain** argument is specified as a scalar; if it is specified as *ADAPTIVE*, then the **mask_threshold**
       argument is ignored.
 
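To make the new behavior concrete, the masking described above boils down to the following computation. This is a minimal NumPy sketch with made-up gain, threshold, and input values; it is not code from this commit:

```python
import numpy as np

# Hypothetical values, chosen only for illustration
gain, mask_threshold = 2.0, 1.0
x = np.array([0.1, 0.8, -0.9, 0.05])

v = gain * x                                           # scale by gain first
v = np.where(np.abs(v) > mask_threshold, v, -np.inf)   # mask small magnitudes with -inf
v = v - np.max(v[v != -np.inf])                        # shift by the unmasked max for numerical stability
e = np.exp(v)                                          # exp(-inf) == 0, so masked entries drop out
softmax = e / e.sum()                                  # only the unmasked entries share the probability mass
```

Masked elements therefore receive exactly zero probability, and the surviving elements are renormalized among themselves.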
@@ -2920,10 +2921,11 @@ class SoftMax(TransferFunction):
 
     mask_threshold : scalar or None
         determines whether the `variable <SoftMax.variable>` is thresholded before applying the SoftMax function;
-        if it is a scalar, only elements of `variable <SoftMax.variable>` with an absolute value greater than that
-        value are considered when applying the SoftMax function (which are then scaled by the `gain <SoftMax.gain>`
-        parameter; all other elements are assigned 0. This only applies if `gain <SoftMax.gain>` is specified as a
-        scalar; otherwise it is ignored (see `Thresholding and Adaptive Gain <SoftMax_AdaptGain>` for details).
+        if it is a scalar, each element of `variable <SoftMax.variable>` is first scaled by `gain <SoftMax.gain>`.
+        Then, only elements with an absolute value greater than *mask_threshold* are considered when applying the
+        SoftMax function, while all other elements are set to ``-inf``, effectively masking them since
+        ``exp(-inf) = 0``. This only applies if `gain <SoftMax.gain>` is specified as a scalar; otherwise it is
+        ignored (see `Thresholding and Adaptive Gain <SoftMax_AdaptGain>` for details).
 
     adapt_scale : scalar
         determines the *scale* parameter using by the `adapt_gain <SoftMax.adapt_gain>` method (see method for details).
@@ -3149,22 +3151,31 @@ def _validate_variable(self, variable, context=None):
         return np.asarray(variable)
 
     def apply_softmax(self, input_value, gain, mask_threshold, output_type):
-
         # Modulate input_value by gain
         v = gain * input_value
-        # Shift by max to avoid extreme values:
-        v = v - np.max(v)
+
+        # Mask threshold
+        if mask_threshold is not None:
+            if np.any(v < 0):
+                warnings.warn(f"SoftMax function: mask_threshold is set "
+                              f"to {mask_threshold} but input_value contains negative values. "
+                              f"Masking will be applied to the magnitude of the input.")
+
+            v = np.where(np.abs(v) > mask_threshold, v, -np.inf)
+
+        # Make numerically stable by shifting by max value
+        if np.any(v != -np.inf):
+            v = v - np.max(v)
+
         # Exponentiate
         v = np.exp(v)
-        # Threshold if specified:
-        if mask_threshold:
-            v = v * np.where(input_value > mask_threshold, v, 0)
+
         # Normalize (to sum to 1)
-        if not any(v):
+        if not np.any(v):
             # If v is all zeros, avoid divide by zero in normalize and return all zeros for softmax
             sm = v
         else:
-            sm = v / np.sum(v, axis=0)
+            sm = v / np.sum(v)
 
         # Generate one-hot encoding based on selected output_type
         if output_type in {ARG_MAX, ARG_MAX_INDICATOR, MAX_VAL, MAX_INDICATOR}:
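A quick way to sanity-check the NumPy path above is to compare a `SoftMax` instance against the same computation done by hand. This is a hedged sketch rather than one of the repository's tests: the gain, threshold, and input values are arbitrary, and it assumes the default output type returns the full distribution (as the `SOFT_MAX MASK_THRESHOLD ALL` test case added below does):

```python
import numpy as np
import psyneulink as pnl

x = np.array([0.1, 0.8, -0.9, 0.05])
sm = pnl.SoftMax(gain=2.0, mask_threshold=1.0, per_item=False)

# Manual computation mirroring apply_softmax with a scalar gain and mask_threshold
v = 2.0 * x
v = np.where(np.abs(v) > 1.0, v, -np.inf)
e = np.exp(v - np.max(v[v != -np.inf]))
expected = e / e.sum()

np.testing.assert_allclose(np.squeeze(sm(x)), expected)
```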
@@ -3472,15 +3483,34 @@ def _gen_pytorch_fct(self, device, context=None):
         if isinstance(gain, str) and gain == ADAPTIVE:
             return lambda x: (torch.softmax(self._gen_pytorch_adapt_gain_fct(device, context)(x) * x, -1))
 
-        elif mask_threshold:
+        elif mask_threshold is not None:
             def pytorch_thresholded_softmax(_input: torch.Tensor) -> torch.Tensor:
-                # Mask elements of input below threshold
-                _mask = (torch.abs(_input) > mask_threshold)
-                # Subtract off the max value in the input to eliminate extreme values, exponentiate, and apply mask
-                masked_exp = _mask * torch.exp(gain * (_input - torch.max(_input, -1, keepdim=True)[0]))
-                if (masked_exp == 0).all():
-                    return masked_exp
-                return masked_exp / torch.sum(masked_exp, -1, keepdim=True)
+                v = gain * _input
+
+                # Apply threshold-based masking
+                if mask_threshold is not None:
+                    if torch.any(_input < 0):
+                        warnings.warn(f"Softmax function: mask_threshold is set to {mask_threshold}, "
+                                      f"but input contains negative values. "
+                                      f"Masking will be applied to the magnitude of the input.")
+
+                    # Create a mask where values below threshold are set to -inf
+                    mask = torch.abs(v) > mask_threshold
+                    v = v.masked_fill(~mask, float('-inf'))  # More stable than torch.where()
+
+                    # Handle case where all values are masked (return tensor with gradient support)
+                    if torch.all(~mask):
+                        return torch.full_like(v, 0.0, requires_grad=True)
+
+                    # Make numerically stable by shifting max value
+                    max_v = torch.max(v[mask])  # Avoid computing max over -inf
+                    v = v - max_v
+
+                # Compute softmax (PyTorch handles -inf correctly)
+                exp_v = torch.exp(v)
+                sm = exp_v / torch.sum(exp_v, dim=-1, keepdim=True)
+
+                return sm
             # Return the function
             return pytorch_thresholded_softmax
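The PyTorch branch now leans on `masked_fill` with `-inf` and on the fact that `torch.softmax` assigns zero probability to `-inf` logits. The following standalone sketch (hypothetical values, independent of PsyNeuLink) checks that the manual normalization used above and `torch.softmax` agree once sub-threshold logits are masked:

```python
import torch

gain, mask_threshold = 2.0, 1.0           # hypothetical values for illustration
x = torch.tensor([0.1, 0.8, -0.9, 0.05])

v = gain * x
mask = torch.abs(v) > mask_threshold
v = v.masked_fill(~mask, float('-inf'))

manual = torch.exp(v - v[mask].max())     # shift by the unmasked max, then exponentiate
manual = manual / manual.sum()

assert torch.allclose(manual, torch.softmax(v, dim=-1))
```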

tests/composition/test_emcomposition.py (+22 −14)
@@ -225,14 +225,17 @@ def test_memory_fill(start, memory_fill):
             test_memory_fill(start=repeat, memory_fill=memory_fill)
 
     @pytest.mark.parametrize("softmax_choice, expected",
-                             [(pnl.WEIGHTED_AVG, [[0.93016008, 0.1, 0.16983992]]),
+                             [(pnl.WEIGHTED_AVG, [[0.8479525858370621, 0.1, 0.25204741416293786]]),
                               (pnl.ARG_MAX, [[1, .1, .1]]),
                               (pnl.PROBABILISTIC, [[1, .1, .1]]),  # NOTE: actual stochasticity not tested here
                               ])
     def test_softmax_choice(self, softmax_choice, expected):
         em = EMComposition(memory_template=[[[1,.1,.1]], [[1,.1,.1]], [[.1,.1,1]]],
                            softmax_choice=softmax_choice,
-                           enable_learning=False)
+                           enable_learning=False,
+                           softmax_threshold=None,
+                           memory_decay_rate=0,
+                           normalize_memories=False)
         result = em.run(inputs={em.query_input_nodes[0]:[[1,0,0]]})
 
         np.testing.assert_allclose(result, expected)
@@ -739,11 +742,12 @@ def test_assign_field_weights_and_0_vs_None(self,
 
         em = pnl.EMComposition(memory_template=memory_template,
                                memory_capacity=4,
-                               memory_decay_rate= 0,
-                               learn_field_weights = False,
+                               memory_decay_rate=0,
+                               learn_field_weights=False,
                                softmax_choice=softmax_choice,
                                field_weights=field_weights,
-                               field_names=['A','B','C'])
+                               field_names=['A', 'B', 'C'],
+                               )
         # Confirm initial weight assignments (that favor A)
         assert em.nodes['A [WEIGHT]'].input_port.defaults.variable == [.75]
         assert em.nodes['B [WEIGHT]'].input_port.defaults.variable == [.25]
@@ -774,9 +778,9 @@ def test_assign_field_weights_and_0_vs_None(self,
         # Note: field_weights favors A
         if softmax_choice == pnl.MAX_VAL:
             if operation == pnl.L0:
-                expected = [[1.70381182], [0.], [3.40762364]]
+                expected = [[1.467373], [0.], [2.934746]]
             else:
-                expected = [[1.56081243, 0.0], [0.0, 1.56081243], [3.12162487, 3.12162487]]
+                expected = [[1.419423, 0.0], [0.0, 1.419423], [2.838846, 2.838846]]
         else:
             expected = memory_template[0]
         np.testing.assert_allclose(result, expected)
@@ -899,7 +903,7 @@ def test_backpropagation_of_error_in_learning(self):
                            memory_capacity=50,
                            memory_decay_rate=0,
                            softmax_gain=10,
-                           softmax_threshold=.001,
+                           softmax_threshold=0.001,
                            fields = {'STATE': {pnl.FIELD_WEIGHT: None,
                                                pnl.LEARN_FIELD_WEIGHT: False,
                                                pnl.TARGET_FIELD: True},
@@ -1026,12 +1030,16 @@ def test_backpropagation_of_error_in_learning(self):
                   [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]]
 
         result = EGO.learn(inputs={'STATE':INPUTS}, learning_rate=.5, execution_mode=pnl.ExecutionMode.PyTorch)
-        expected = [[ 0.00000000e+00, 1.35476414e-03, 1.13669378e-03, 2.20434260e-03, 6.61008388e-04, 9.88672202e-01,
-                      6.52088276e-04, 1.74149507e-03, 1.09769133e-03, 2.47971436e-03, 0.00000000e+00],
-                    [ 0.00000000e+00, -6.75284069e-02, -1.28930436e-03, -2.10726610e-01, -1.41050716e-03, -5.92286989e-01,
-                      -2.75196416e-03, -2.21010605e-03, -7.14369243e-03, -2.05167374e-02, 0.00000000e+00],
-                    [ 0.00000000e+00, 1.18578255e-03, 1.29393181e-03, 1.35476414e-03, 1.13669378e-03, 2.20434260e-03,
-                      6.61008388e-04, 9.88672202e-01, 6.52088276e-04, 2.83918640e-03, 0.00000000e+00]]
+        expected = [
+            [0.00000000e+00, 1.35933540e-03, 1.13114366e-03, 2.20590015e-03,
+             1.09314885e-03, 9.87722281e-01, 1.10371450e-03, 1.72925210e-03,
+             1.17352360e-03, 2.48170027e-03, 0.00000000e+00],
+            [0.00000000e+00, -6.54396065e-02, 1.41905061e-03, -2.08500295e-01,
+             -5.03985394e-05, -5.90196484e-01, -5.33017075e-03, -2.33024404e-03,
+             -2.02730870e-02, -1.58091223e-02, 0.00000000e+00],
+            [0.00000000e+00, 1.19576382e-03, 1.28593645e-03, 1.35933540e-03,
+             1.13114366e-03, 2.20590015e-03, 1.09314885e-03, 9.87722281e-01,
+             1.10371450e-03, 2.90277570e-03, 0.00000000e+00]]
         np.testing.assert_allclose(result, expected)
 
         # Plot (for during debugging):

tests/functions/test_transfer.py (+82)
@@ -23,6 +23,17 @@
 softmax_helper = np.exp(softmax_helper) / np.sum(np.exp(softmax_helper))
 softmax_helper2 = np.array((softmax_helper, softmax_helper)).reshape(2, -1)
 
+# Here, we use RAND1 * .5 as the threshold so that roughly 50% of the inputs are expected to be masked.
+softmax_threshold_helper = RAND1 * test_var
+softmax_threshold_helper = np.where(np.abs(softmax_threshold_helper) > RAND1 * .5, softmax_threshold_helper, -np.inf)
+if np.any(softmax_threshold_helper != -np.inf):
+    softmax_threshold_helper = softmax_threshold_helper - np.max(softmax_threshold_helper)
+softmax_threshold_helper = np.exp(softmax_threshold_helper)
+if np.any(softmax_threshold_helper):
+    softmax_threshold_helper = softmax_threshold_helper / np.sum(softmax_threshold_helper)
+softmax_threshold_helper2 = np.array((softmax_threshold_helper, softmax_threshold_helper)).reshape(2, -1)
+
+
 tanh_helper = (RAND1 * (test_var + RAND2 - RAND3) + RAND4)
 tanh_helper = np.tanh(tanh_helper)

@@ -106,6 +117,77 @@ def binomial_distort_helper(seed):
     pytest.param(pnl.SoftMax, [test_var, test_var], {kw.GAIN:RAND1, kw.OUTPUT_TYPE:kw.MAX_INDICATOR, kw.PER_ITEM: True},
                  np.where(softmax_helper2 == np.max(softmax_helper2), 1, 0), id="SOFT_MAX MAX_INDICATOR PER_ITEM"),
 
+    # SoftMax with mask_threshold 1D input
+    pytest.param(pnl.SoftMax, test_var,
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.PER_ITEM:False},
+                 softmax_threshold_helper, id="SOFT_MAX MASK_THRESHOLD ALL",
+                 marks=pytest.mark.llvm_not_implemented),
+    pytest.param(pnl.SoftMax, test_var,
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.OUTPUT_TYPE:pnl.ARG_MAX, kw.PER_ITEM:False},
+                 np.where(softmax_threshold_helper == np.max(softmax_threshold_helper), softmax_threshold_helper, 0),
+                 id="SOFT_MAX MASK_THRESHOLD ARG_MAX", marks=pytest.mark.llvm_not_implemented),
+    pytest.param(pnl.SoftMax, test_var,
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.OUTPUT_TYPE:pnl.ARG_MAX_INDICATOR, kw.PER_ITEM:False},
+                 np.where(softmax_threshold_helper == np.max(softmax_threshold_helper), 1, 0),
+                 id="SOFT_MAX MASK_THRESHOLD ARG_MAX_INDICATOR", marks=pytest.mark.llvm_not_implemented),
+    pytest.param(pnl.SoftMax, test_var,
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.OUTPUT_TYPE:kw.MAX_VAL, kw.PER_ITEM:False},
+                 np.where(softmax_threshold_helper == np.max(softmax_threshold_helper), softmax_threshold_helper, 0),
+                 id="SOFT_MAX MASK_THRESHOLD MAX_VAL", marks=pytest.mark.llvm_not_implemented),
+    pytest.param(pnl.SoftMax, test_var,
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.OUTPUT_TYPE:kw.MAX_INDICATOR, kw.PER_ITEM:False},
+                 np.where(softmax_threshold_helper == np.max(softmax_threshold_helper), 1, 0),
+                 id="SOFT_MAX MASK_THRESHOLD MAX_INDICATOR", marks=pytest.mark.llvm_not_implemented),
+    pytest.param(pnl.SoftMax, test_var,
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.OUTPUT_TYPE:kw.PROB, kw.PER_ITEM:False},
+                 [0.0, 0.0, 0.0, test_var[3], test_var[4], 0.0, 0.0, 0.0, 0.0, 0.0],
+                 id="SOFT_MAX MASK_THRESHOLD PROB", marks=pytest.mark.llvm_not_implemented),
+    #
+    # # SoftMax 2D threshold testing per-item
+    pytest.param(pnl.SoftMax, [test_var],
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.PER_ITEM:True}, [softmax_threshold_helper],
+                 id="SOFT_MAX MASK_THRESHOLD ALL 2D", marks=pytest.mark.llvm_not_implemented),
+    pytest.param(pnl.SoftMax, [test_var],
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.OUTPUT_TYPE:pnl.ARG_MAX, kw.PER_ITEM:True},
+                 [np.where(softmax_threshold_helper == np.max(softmax_threshold_helper), softmax_threshold_helper, 0)],
+                 id="SOFT_MAX MASK_THRESHOLD ARG_MAX 2D", marks=pytest.mark.llvm_not_implemented),
+    pytest.param(pnl.SoftMax, [test_var],
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.OUTPUT_TYPE:pnl.ARG_MAX_INDICATOR, kw.PER_ITEM:True},
+                 [np.where(softmax_threshold_helper == np.max(softmax_threshold_helper), 1, 0)],
+                 id="SOFT_MAX MASK_THRESHOLD ARG_MAX_INDICATOR 2D", marks=pytest.mark.llvm_not_implemented),
+    pytest.param(pnl.SoftMax, [test_var],
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.OUTPUT_TYPE:kw.MAX_VAL, kw.PER_ITEM:True},
+                 [np.where(softmax_threshold_helper == np.max(softmax_threshold_helper), softmax_threshold_helper, 0)],
+                 id="SOFT_MAX MASK_THRESHOLD MAX_VAL 2D", marks=pytest.mark.llvm_not_implemented),
+    pytest.param(pnl.SoftMax, [test_var],
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.OUTPUT_TYPE:kw.MAX_INDICATOR, kw.PER_ITEM:True},
+                 [np.where(softmax_threshold_helper == np.max(softmax_threshold_helper), 1, 0)],
+                 id="SOFT_MAX MASK_THRESHOLD MAX_INDICATOR 2D", marks=pytest.mark.llvm_not_implemented),
+    pytest.param(pnl.SoftMax, [test_var],
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.OUTPUT_TYPE:kw.PROB, kw.PER_ITEM:True},
+                 [[0.0, 0.0, 0.0, test_var[3], test_var[4], 0.0, 0.0, 0.0, 0.0, 0.0]],
+                 id="SOFT_MAX MASK_THRESHOLD PROB 2D", marks=pytest.mark.llvm_not_implemented),
+
+    # SoftMax threshold per-item with 2 elements in input
+    pytest.param(pnl.SoftMax, [test_var, test_var],
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.PER_ITEM:True}, softmax_threshold_helper2,
+                 id="SOFT_MAX MASK_THRESHOLD ALL 2D", marks=pytest.mark.llvm_not_implemented),
+    pytest.param(pnl.SoftMax, [test_var, test_var], {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.OUTPUT_TYPE:pnl.ARG_MAX, kw.PER_ITEM:True},
+                 np.where(softmax_threshold_helper2 == np.max(softmax_threshold_helper2), softmax_threshold_helper2, 0),
+                 id="SOFT_MAX MASK_THRESHOLD ARG_MAX 2D", marks=pytest.mark.llvm_not_implemented),
+    pytest.param(pnl.SoftMax, [test_var, test_var],
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.OUTPUT_TYPE:pnl.ARG_MAX_INDICATOR, kw.PER_ITEM:True},
+                 np.where(softmax_threshold_helper2 == np.max(softmax_threshold_helper2), 1, 0),
+                 id="SOFT_MAX MASK_THRESHOLD ARG_MAX_INDICATOR 2D", marks=pytest.mark.llvm_not_implemented),
+    pytest.param(pnl.SoftMax, [test_var, test_var],
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.OUTPUT_TYPE:kw.MAX_VAL, kw.PER_ITEM:True},
+                 np.where(softmax_threshold_helper == np.max(softmax_threshold_helper2), softmax_threshold_helper2, 0),
+                 id="SOFT_MAX MASK_THRESHOLD MAX_VAL 2D", marks=pytest.mark.llvm_not_implemented),
+    pytest.param(pnl.SoftMax, [test_var, test_var],
+                 {kw.GAIN:RAND1, 'mask_threshold': RAND1 * .5, kw.OUTPUT_TYPE:kw.MAX_INDICATOR, kw.PER_ITEM:True},
+                 np.where(softmax_threshold_helper2 == np.max(softmax_threshold_helper2), 1, 0),
+                 id="SOFT_MAX MASK_THRESHOLD MAX_INDICATOR 2D", marks=pytest.mark.llvm_not_implemented),
+
     # Linear Matrix
     pytest.param(pnl.MatrixTransform, test_var, {kw.MATRIX:test_matrix}, np.dot(test_var, test_matrix), id="LINEAR_MATRIX SQUARE"),
     pytest.param(pnl.MatrixTransform, test_var, {kw.MATRIX:test_matrix_l}, np.dot(test_var, test_matrix_l), id="LINEAR_MATRIX WIDE"),
