fix bug

clearhanhui · clearhanhui · commit f4368dc62e3f · 2023-01-13T16:15:11.000+08:00
diff --git a/gammagl/mpops/paddle.py b/gammagl/mpops/paddle.py
@@ -18,7 +18,7 @@ def unsorted_segment_sum(x, segment_ids, num_segments=None):
     else:
         num_segments = pd.max(segment_ids)+1
     if use_ext:
-        return paddle_segment.segment_sum(x, segment_ids, num_segments)
+        return paddle_ext.segment_sum(x, segment_ids, num_segments)
     idx_ = pd.argsort(segment_ids)
     x = pd.gather(x, idx_)
     segment_ids = pd.gather(segment_ids, idx_)
diff --git a/gammagl/mpops/paddle_ext/cuda/segment_sum_cuda.cu b/gammagl/mpops/paddle_ext/cuda/segment_sum_cuda.cu
@@ -12,10 +12,8 @@ __global__ void segment_sum_cuda_forward_kernel(const data_t *x_data, const int6
   int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
   int64_t e = (thread_idx / K) % E;
   int64_t k = thread_idx % K;
-  printf(" thread_idx = %d \n", thread_idx);
   if (thread_idx < numel)  {
     int64_t idx = index_data[e];
-    printf("%f \n", *out_data);
     atomicAdd(out_data + idx * K + k,
               x_data[thread_idx]);
   }
diff --git a/gammagl/mpops/paddle_ext/readme.md b/gammagl/mpops/paddle_ext/readme.md
@@ -5,3 +5,5 @@ Compile Steps:  (CMake, not work)
 TODO: support cmake
 
 > In `paddle/utils/cpp_extension/extension_utils.py:341L`, the flags `"-ccbin"` and `"cc"` may cause errors: Paddle needs `nvcc` to compile with a higher C++ standard, and when you are using a lower-version gcc these flags may prevent options such as `-std=c++14` from taking effect. Besides, on Linux we usually recommend setting the gcc path in `CC` rather than `cc`; using `cc` will also cause an error. Just comment them out.
+
+> Please keep the versions of nvcc and paddle-cuda consistent; otherwise the error `the provided PTX was compiled with an unsupported toolchain.` may occur.
diff --git a/gammagl/mpops/torch_ext/cpu/segment_max_cpu.cpp b/gammagl/mpops/torch_ext/cpu/segment_max_cpu.cpp
@@ -50,15 +50,13 @@ std::tuple<torch::Tensor, torch::Tensor> segment_max_cpu_forward(torch::Tensor&
   for (auto e = 0; e < E; ++e) {
     idx = index_data[e];
     for (auto k = 0; k < K; ++k) {
-      if (out_data[idx * K + k] < x_data[e * K + k]) {
 #ifdef COMPILE_WITH_OMP
-#pragma omp atomic
+#pragma omp critical
 #endif
-      {
+      if (out_data[idx * K + k] < x_data[e * K + k]) {
         out_data[idx * K + k] = x_data[e * K + k];
         arg_out_data[idx * K + k] = e;
       }
-      }
     }
   }
   out.masked_fill_(out == std::numeric_limits<int64_t>::lowest(), (scalar_t)0);
diff --git a/profiler/mpops/paddle_ext_.py b/profiler/mpops/paddle_ext_.py
@@ -2,7 +2,7 @@
 from paddle_ext import unsorted_segment_sum
 
 src = paddle.to_tensor([[1, 1], [2, 2], [3, 3]], dtype='float32', stop_gradient=False)
-## TODO: it still successfully run, but it will get wrong answer. 
+## TODO: this still runs successfully, but it produces a wrong answer on GPU.
 # src = paddle.to_tensor([1, 2, 3, 4, 5, 6], dtype='float32').reshape((2, 3)) 
 index = paddle.to_tensor([0, 1, 0], dtype=paddle.int64)
 out = unsorted_segment_sum(src, index, 3)

Original file line number	Diff line number	Diff line change
`@@ -50,15 +50,13 @@ std::tuple<torch::Tensor, torch::Tensor> segment_max_cpu_forward(torch::Tensor&`
`50`	`50`	`for (auto e = 0; e < E; ++e) {`
`51`	`51`	`idx = index_data[e];`
`52`	`52`	`for (auto k = 0; k < K; ++k) {`
`53`		`- if (out_data[idx * K + k] < x_data[e * K + k]) {`
`54`	`53`	`#ifdef COMPILE_WITH_OMP`
`55`		`-#pragma omp atomic`
	`54`	`+#pragma omp critical`
`56`	`55`	`#endif`
`57`		`- {`
	`56`	`+ if (out_data[idx * K + k] < x_data[e * K + k]) {`
`58`	`57`	`out_data[idx * K + k] = x_data[e * K + k];`
`59`	`58`	`arg_out_data[idx * K + k] = e;`
`60`	`59`	`}`
`61`		`- }`
`62`	`60`	`}`
`63`	`61`	`}`
`64`	`62`	`out.masked_fill_(out == std::numeric_limits<int64_t>::lowest(), (scalar_t)0);`