
Commit 70307b8

* Upgrade presets for TensorRT 10.6.0.26, ONNX Runtime 1.20.0
1 parent 159ba39 commit 70307b8


43 files changed: +1283 -909 lines

.github/actions/deploy-ubuntu/action.yml (+4 -4)

@@ -213,16 +213,16 @@ runs:
       if [[ "$CI_DEPLOY_PLATFORM" == "linux-arm64" ]] && [[ "$CI_DEPLOY_MODULE" == "tensorrt" ]]; then
         echo Installing TensorRT
         # python3 -m gdown 1LZRCv4ZAGiDQAu4pvADJIGntq4cGl5tU
-        curl -LO https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.5.0/tars/TensorRT-10.5.0.18.Ubuntu-24.04.aarch64-gnu.cuda-12.6.tar.gz
-        $SUDO tar -hxvf TensorRT-10.5.0.18.Ubuntu-24.04.aarch64-gnu.cuda-12.6.tar.gz -C /usr/local/
+        curl -LO https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Ubuntu-24.04.aarch64-gnu.cuda-12.6.tar.gz
+        $SUDO tar -hxvf TensorRT-10.6.0.26.Ubuntu-24.04.aarch64-gnu.cuda-12.6.tar.gz -C /usr/local/
         $SUDO ln -sf /usr/local/TensorRT* /usr/local/tensorrt
       fi

       if [[ "$CI_DEPLOY_PLATFORM" == "linux-x86_64" ]] && [[ "$CI_DEPLOY_MODULE" == "tensorrt" ]]; then
         echo Installing TensorRT
         # python3 -m gdown 1dVhD-DEYY42QbZe1GXl-vxe3k6KqWGsL
-        curl -LO https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.5.0/tars/TensorRT-10.5.0.18.Linux.x86_64-gnu.cuda-12.6.tar.gz
-        $SUDO tar -hxvf TensorRT-10.5.0.18.Linux.x86_64-gnu.cuda-12.6.tar.gz -C /usr/local/
+        curl -LO https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz
+        $SUDO tar -hxvf TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz -C /usr/local/
         $SUDO ln -sf /usr/local/TensorRT* /usr/local/tensorrt
       fi

.github/actions/deploy-windows/action.yml (+3 -3)

@@ -142,9 +142,9 @@ runs:
       if "%CI_DEPLOY_MODULE%"=="tensorrt" (
         echo Installing TensorRT
         rem python -m gdown 1GfmJ1BKbacLpUU-0i_mGu0sjrAS0Xzzi
-        curl -LO https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.5.0/zip/TensorRT-10.5.0.18.Windows.win10.cuda-12.6.zip
-        unzip TensorRT-10.5.0.18.Windows.win10.cuda-12.6.zip
-        move TensorRT-10.5.0.18 "%ProgramFiles%\NVIDIA GPU Computing Toolkit\TensorRT"
+        curl -LO https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/zip/TensorRT-10.6.0.26.Windows.win10.cuda-12.6.zip
+        unzip TensorRT-10.6.0.26.Windows.win10.cuda-12.6.zip
+        move TensorRT-10.6.0.26 "%ProgramFiles%\NVIDIA GPU Computing Toolkit\TensorRT"
       )

       if "%CI_DEPLOY_MODULE%"=="mkl" (

CHANGELOG.md (+1 -1)

@@ -9,7 +9,7 @@
 * Build FFmpeg with zimg to enable zscale filter ([pull #1481](https://github.com/bytedeco/javacpp-presets/pull/1481))
 * Enable PulseAudio support for FFmpeg on Linux ([pull #1472](https://github.com/bytedeco/javacpp-presets/pull/1472))
 * Virtualize `btCollisionWorld`, `btOverlapFilterCallback`, `btOverlapCallback` from Bullet Physics SDK ([pull #1475](https://github.com/bytedeco/javacpp-presets/pull/1475))
-* Upgrade presets for OpenCV 4.10.0, FFmpeg 7.1, Spinnaker 4.0.0.116 ([pull #1524](https://github.com/bytedeco/javacpp-presets/pull/1524)), MKL 2025.0, DNNL 3.6, OpenBLAS 0.3.28, CMINPACK 1.3.11, GSL 2.8, CPython 3.13.0, NumPy 2.1.2, SciPy 1.14.1, LLVM 19.1.3, LibRaw 0.21.2 ([pull #1520](https://github.com/bytedeco/javacpp-presets/pull/1520)), Leptonica 1.85.0, Tesseract 5.4.1, libffi 3.4.6, CUDA 12.6.2, cuDNN 9.5.1, NCCL 2.23.4, nvCOMP 4.1.0.6, OpenCL 3.0.16, NVIDIA Video Codec SDK 12.2.72, PyTorch 2.5.1 ([pull #1466](https://github.com/bytedeco/javacpp-presets/pull/1466)), SentencePiece 0.2.0, TensorFlow Lite 2.18.0, TensorRT 10.5.0.18, Triton Inference Server 2.51.0, ONNX 1.17.0, ONNX Runtime 1.19.2, TVM 0.18.0, and their dependencies
+* Upgrade presets for OpenCV 4.10.0, FFmpeg 7.1, Spinnaker 4.0.0.116 ([pull #1524](https://github.com/bytedeco/javacpp-presets/pull/1524)), MKL 2025.0, DNNL 3.6, OpenBLAS 0.3.28, CMINPACK 1.3.11, GSL 2.8, CPython 3.13.0, NumPy 2.1.2, SciPy 1.14.1, LLVM 19.1.3, LibRaw 0.21.2 ([pull #1520](https://github.com/bytedeco/javacpp-presets/pull/1520)), Leptonica 1.85.0, Tesseract 5.4.1, libffi 3.4.6, CUDA 12.6.2, cuDNN 9.5.1, NCCL 2.23.4, nvCOMP 4.1.0.6, OpenCL 3.0.16, NVIDIA Video Codec SDK 12.2.72, PyTorch 2.5.1 ([pull #1466](https://github.com/bytedeco/javacpp-presets/pull/1466)), SentencePiece 0.2.0, TensorFlow Lite 2.18.0, TensorRT 10.6.0.26, Triton Inference Server 2.51.0, ONNX 1.17.0, ONNX Runtime 1.20.0, TVM 0.18.0, and their dependencies

 ### January 29, 2024 version 1.5.10
 * Introduce `macosx-arm64` builds for PyTorch ([pull #1463](https://github.com/bytedeco/javacpp-presets/pull/1463))

README.md (+2 -2)

@@ -227,13 +227,13 @@ Each child module in turn relies by default on the included [`cppbuild.sh` scrip
 * SentencePiece 0.2.0 https://github.com/google/sentencepiece
 * TensorFlow 1.15.x https://github.com/tensorflow/tensorflow
 * TensorFlow Lite 2.18.x https://github.com/tensorflow/tensorflow
-* TensorRT 10.5.x https://developer.nvidia.com/tensorrt
+* TensorRT 10.6.x https://developer.nvidia.com/tensorrt
 * Triton Inference Server 2.51.x https://developer.nvidia.com/nvidia-triton-inference-server
 * The Arcade Learning Environment 0.8.x https://github.com/mgbellemare/Arcade-Learning-Environment
 * DepthAI 2.24.x https://github.com/luxonis/depthai-core
 * ONNX 1.17.x https://github.com/onnx/onnx
 * nGraph 0.26.0 https://github.com/NervanaSystems/ngraph
-* ONNX Runtime 1.19.x https://github.com/microsoft/onnxruntime
+* ONNX Runtime 1.20.x https://github.com/microsoft/onnxruntime
 * TVM 0.18.x https://github.com/apache/tvm
 * Bullet Physics SDK 3.25 https://pybullet.org
 * LiquidFun http://google.github.io/liquidfun/

onnxruntime/README.md (+3 -3)

@@ -9,7 +9,7 @@ Introduction
 ------------
 This directory contains the JavaCPP Presets module for:

-* ONNX Runtime 1.19.2 https://microsoft.github.io/onnxruntime/
+* ONNX Runtime 1.20.0 https://microsoft.github.io/onnxruntime/

 Please refer to the parent README.md file for more detailed information about the JavaCPP Presets.

@@ -46,14 +46,14 @@ We can use [Maven 3](http://maven.apache.org/) to download and install automatic
     <dependency>
      <groupId>org.bytedeco</groupId>
      <artifactId>onnxruntime-platform</artifactId>
-     <version>1.19.2-1.5.11-SNAPSHOT</version>
+     <version>1.20.0-1.5.11-SNAPSHOT</version>
    </dependency>

    <!-- Additional dependencies required to use CUDA and cuDNN -->
    <dependency>
      <groupId>org.bytedeco</groupId>
      <artifactId>onnxruntime-platform-gpu</artifactId>
-     <version>1.19.2-1.5.11-SNAPSHOT</version>
+     <version>1.20.0-1.5.11-SNAPSHOT</version>
    </dependency>

    <!-- Additional dependencies to use bundled CUDA and cuDNN -->
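
As a usage note (not part of the diff): the bumped artifacts are consumed exactly as before. Below is a minimal sketch of opening a model with the C++-style API these presets generate, patterned after the programs in onnxruntime/samples; the "model.onnx" path is a placeholder, and constructor overloads should be verified against the generated sources.

import org.bytedeco.javacpp.*;
import org.bytedeco.onnxruntime.*;
import static org.bytedeco.onnxruntime.global.onnxruntime.*;

public class QuickStart {
    public static void main(String[] args) {
        try (Env env = new Env(ORT_LOGGING_LEVEL_WARNING, "quickstart");
             SessionOptions options = new SessionOptions()) {
            // ORTCHAR_T is wchar_t on Windows and char elsewhere, hence the two pointer types
            Pointer modelPath = Loader.getPlatform().startsWith("windows")
                    ? new CharPointer("model.onnx") : new BytePointer("model.onnx");
            Session session = new Session(env, modelPath, options);
            System.out.println("model inputs: " + session.GetInputCount());
        }
    }
}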

onnxruntime/cppbuild.sh (+4 -2)

@@ -22,7 +22,7 @@ if [[ "$EXTENSION" == *gpu ]]; then
     GPU_FLAGS="--use_cuda"
 fi

-ONNXRUNTIME=1.19.2
+ONNXRUNTIME=1.20.0

 mkdir -p "$PLATFORM$EXTENSION"
 cd "$PLATFORM$EXTENSION"
@@ -84,7 +84,7 @@ sedinplace 's/MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeon_I8MM()/false/g' onnxrunt

 # work around toolchain issues on Mac and Windows
 patch -p1 < ../../../onnxruntime.patch
-patch -p1 < ../../../onnxruntime-cuda.patch # https://github.com/microsoft/onnxruntime/pull/22316
+#patch -p1 < ../../../onnxruntime-cuda.patch # https://github.com/microsoft/onnxruntime/pull/22316
 #patch -p1 < ../../../onnxruntime-windows.patch # https://github.com/microsoft/onnxruntime/pull/7883
 sedinplace '/--Werror/d' cmake/CMakeLists.txt
 sedinplace '/-DCMAKE_CUDA_COMPILER=/d' tools/ci_build/build.py
@@ -113,6 +113,8 @@ sedinplace 's/, data_dims);/);/g' onnxruntime/core/providers/dnnl/subgraph/dnnl_
 sedinplace 's/, dims);/);/g' onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_qdq.cc
 sedinplace '/omp_get_max_threads/d' onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
 sedinplace '/omp_set_num_threads/d' onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
+sedinplace '/cvtfp16Avx/d' cmake/onnxruntime_mlas.cmake
+sedinplace 's/MlasCastF16ToF32KernelAvx;/MlasCastF16ToF32KernelAvx2;/g' onnxruntime/core/mlas/lib/platform.cpp

 # use PTX instead of compiling for all CUDA archs to reduce library size
 sedinplace 's/-gencode=arch=compute_52,code=sm_52/-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_90,code=sm_90/g' cmake/CMakeLists.txt

onnxruntime/platform/gpu/pom.xml (+1 -1)

@@ -12,7 +12,7 @@

   <groupId>org.bytedeco</groupId>
   <artifactId>onnxruntime-platform-gpu</artifactId>
-  <version>1.19.2-${project.parent.version}</version>
+  <version>1.20.0-${project.parent.version}</version>
   <name>JavaCPP Presets Platform GPU for ONNX Runtime</name>

   <properties>

onnxruntime/platform/pom.xml (+1 -1)

@@ -12,7 +12,7 @@

   <groupId>org.bytedeco</groupId>
   <artifactId>onnxruntime-platform</artifactId>
-  <version>1.19.2-${project.parent.version}</version>
+  <version>1.20.0-${project.parent.version}</version>
   <name>JavaCPP Presets Platform for ONNX Runtime</name>

   <properties>

onnxruntime/pom.xml (+1 -1)

@@ -11,7 +11,7 @@

   <groupId>org.bytedeco</groupId>
   <artifactId>onnxruntime</artifactId>
-  <version>1.19.2-${project.parent.version}</version>
+  <version>1.20.0-${project.parent.version}</version>
   <name>JavaCPP Presets for ONNX Runtime</name>

   <properties>

onnxruntime/samples/pom.xml (+2 -2)

@@ -12,14 +12,14 @@
     <dependency>
       <groupId>org.bytedeco</groupId>
       <artifactId>onnxruntime-platform</artifactId>
-      <version>1.19.2-1.5.11-SNAPSHOT</version>
+      <version>1.20.0-1.5.11-SNAPSHOT</version>
     </dependency>

     <!-- Additional dependencies required to use CUDA and cuDNN -->
     <dependency>
       <groupId>org.bytedeco</groupId>
       <artifactId>onnxruntime-platform-gpu</artifactId>
-      <version>1.19.2-1.5.11-SNAPSHOT</version>
+      <version>1.20.0-1.5.11-SNAPSHOT</version>
     </dependency>

     <!-- Additional dependencies to use bundled CUDA and cuDNN -->

onnxruntime/src/gen/java/org/bytedeco/onnxruntime/BaseOrtLoraAdapter.java (new file, +50)

@@ -0,0 +1,50 @@
+// Targeted by JavaCPP version 1.5.11-SNAPSHOT: DO NOT EDIT THIS FILE
+
+package org.bytedeco.onnxruntime;
+
+import java.nio.*;
+import org.bytedeco.javacpp.*;
+import org.bytedeco.javacpp.annotation.*;
+
+import static org.bytedeco.javacpp.presets.javacpp.*;
+import org.bytedeco.opencl.*;
+import static org.bytedeco.opencl.global.OpenCL.*;
+import org.bytedeco.dnnl.*;
+import static org.bytedeco.dnnl.global.dnnl.*;
+
+import static org.bytedeco.onnxruntime.global.onnxruntime.*;
+
+@Name("Ort::detail::Base<OrtLoraAdapter>") @NoOffset @Properties(inherit = org.bytedeco.onnxruntime.presets.onnxruntime.class)
+public class BaseOrtLoraAdapter extends Pointer {
+    static { Loader.load(); }
+    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
+    public BaseOrtLoraAdapter(Pointer p) { super(p); }
+    /** Native array allocator. Access with {@link Pointer#position(long)}. */
+    public BaseOrtLoraAdapter(long size) { super((Pointer)null); allocateArray(size); }
+    private native void allocateArray(long size);
+    @Override public BaseOrtLoraAdapter position(long position) {
+        return (BaseOrtLoraAdapter)super.position(position);
+    }
+    @Override public BaseOrtLoraAdapter getPointer(long i) {
+        return new BaseOrtLoraAdapter((Pointer)this).offsetAddress(i);
+    }
+
+    public BaseOrtLoraAdapter() { super((Pointer)null); allocate(); }
+    private native void allocate();
+    public BaseOrtLoraAdapter(@Cast("Ort::detail::Base<OrtLoraAdapter>::contained_type*") OrtLoraAdapter p) { super((Pointer)null); allocate(p); }
+    @NoException(true) private native void allocate(@Cast("Ort::detail::Base<OrtLoraAdapter>::contained_type*") OrtLoraAdapter p);
+
+    public BaseOrtLoraAdapter(@ByRef(true) BaseOrtLoraAdapter v) { super((Pointer)null); allocate(v); }
+    @NoException(true) private native void allocate(@ByRef(true) BaseOrtLoraAdapter v);
+    public native @ByRef @Name("operator =") @NoException(true) BaseOrtLoraAdapter put(@ByRef(true) BaseOrtLoraAdapter v);
+
+    public native @Cast("Ort::detail::Base<OrtLoraAdapter>::contained_type*") @Name("operator Ort::detail::Base<OrtLoraAdapter>::contained_type*") @NoException(true) OrtLoraAdapter asOrtLoraAdapter();
+
+    /** \brief Relinquishes ownership of the contained C object pointer
+     *  The underlying object is not destroyed */
+    public native @Cast("Ort::detail::Base<OrtLoraAdapter>::contained_type*") OrtLoraAdapter release();
+}

onnxruntime/src/gen/java/org/bytedeco/onnxruntime/LoraAdapter.java (new file, +57)

@@ -0,0 +1,57 @@
+// Targeted by JavaCPP version 1.5.11-SNAPSHOT: DO NOT EDIT THIS FILE
+
+package org.bytedeco.onnxruntime;
+
+import java.nio.*;
+import org.bytedeco.javacpp.*;
+import org.bytedeco.javacpp.annotation.*;
+
+import static org.bytedeco.javacpp.presets.javacpp.*;
+import org.bytedeco.opencl.*;
+import static org.bytedeco.opencl.global.OpenCL.*;
+import org.bytedeco.dnnl.*;
+import static org.bytedeco.dnnl.global.dnnl.*;
+
+import static org.bytedeco.onnxruntime.global.onnxruntime.*;
+
+
+/** \brief LoraAdapter holds a set of Lora Parameters loaded from a single file */
+@Namespace("Ort") @Properties(inherit = org.bytedeco.onnxruntime.presets.onnxruntime.class)
+public class LoraAdapter extends BaseOrtLoraAdapter {
+    static { Loader.load(); }
+    /** Default native constructor. */
+    public LoraAdapter() { super((Pointer)null); allocate(); }
+    /** Native array allocator. Access with {@link Pointer#position(long)}. */
+    public LoraAdapter(long size) { super((Pointer)null); allocateArray(size); }
+    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
+    public LoraAdapter(Pointer p) { super(p); }
+    private native void allocate();
+    private native void allocateArray(long size);
+    @Override public LoraAdapter position(long position) {
+        return (LoraAdapter)super.position(position);
+    }
+    @Override public LoraAdapter getPointer(long i) {
+        return new LoraAdapter((Pointer)this).offsetAddress(i);
+    }
+
+    /** \brief Wraps OrtApi::CreateLoraAdapter
+     *
+     *  The function attempts to load the adapter from the specified file
+     *  @param adapter_path The path to the Lora adapter
+     *  @param allocator optional pointer to a device allocator. If nullptr, the data stays on CPU. It would still
+     *  be copied to device if required by the model at inference time. */
+
+    ///
+    public static native @ByVal LoraAdapter CreateLoraAdapter(@Cast("const std::basic_string<ORTCHAR_T>*") @ByRef Pointer adapter_path,
+                OrtAllocator allocator);
+
+    /** \brief Wraps OrtApi::CreateLoraAdapterFromArray
+     *
+     *  The function attempts to load the adapter from the specified byte array.
+     *  @param bytes The byte array containing file LoraAdapter format
+     *  @param num_bytes The number of bytes in the byte array
+     *  @param allocator optional pointer to a device allocator. If nullptr, the data stays on CPU. It would still
+     *  be copied to device if required by the model at inference time. */
+    public static native @ByVal LoraAdapter CreateLoraAdapterFromArray(@Const Pointer bytes, @Cast("size_t") long num_bytes,
+                OrtAllocator allocator);
+}
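
The new LoraAdapter class mirrors Ort::LoraAdapter from the ONNX Runtime 1.20.0 C++ API. As a hedged sketch of how the binding could be exercised from Java (not code from this commit): read the adapter into memory and activate it for a run. The "adapter.onnx_adapter" path is a placeholder, and RunOptions.AddActiveLoraAdapter is assumed to be generated alongside this class in the same update; verify both against the generated sources.

import java.nio.file.*;
import org.bytedeco.javacpp.*;
import org.bytedeco.onnxruntime.*;

public class LoraAdapterSketch {
    public static void main(String[] args) throws Exception {
        // placeholder path to an adapter exported in the ONNX Runtime LoraAdapter format
        byte[] bytes = Files.readAllBytes(Paths.get("adapter.onnx_adapter"));
        try (BytePointer data = new BytePointer(bytes)) {
            // a null allocator keeps the parameters on CPU; they are still copied
            // to the device at inference time if the model requires it
            LoraAdapter adapter = LoraAdapter.CreateLoraAdapterFromArray(data, bytes.length, null);
            RunOptions runOptions = new RunOptions();
            runOptions.AddActiveLoraAdapter(adapter); // assumed binding, mirrors Ort::RunOptions
        }
    }
}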
