tsukumijima
diff --git a/‎docs/Rockchip_FAQ_RGA_CN.md
+72-8 b/‎docs/Rockchip_FAQ_RGA_CN.md
+72-8
diff --git a/‎docs/Rockchip_FAQ_RGA_EN.md
+66-4 b/‎docs/Rockchip_FAQ_RGA_EN.md
+66-4
diff --git a/‎include/im2d_type.h
+2-2 b/‎include/im2d_type.h
+2-2
diff --git a/‎include/im2d_version.h
+1-1 b/‎include/im2d_version.h
+1-1
diff --git a/‎libs/AndroidNdk/arm64-v8a/librga.a
-8.35 KB b/‎libs/AndroidNdk/arm64-v8a/librga.a
-8.35 KB
diff --git a/‎libs/AndroidNdk/arm64-v8a/librga.so
0 Bytes b/‎libs/AndroidNdk/arm64-v8a/librga.so
0 Bytes
diff --git a/‎libs/AndroidNdk/armeabi-v7a/librga.a
984 Bytes b/‎libs/AndroidNdk/armeabi-v7a/librga.a
984 Bytes
diff --git a/‎libs/AndroidNdk/armeabi-v7a/librga.so
-4 KB b/‎libs/AndroidNdk/armeabi-v7a/librga.so
-4 KB
diff --git a/‎libs/Linux/gcc-aarch64/librga.a
-880 Bytes b/‎libs/Linux/gcc-aarch64/librga.a
-880 Bytes
diff --git a/‎libs/Linux/gcc-aarch64/librga.so
0 Bytes b/‎libs/Linux/gcc-aarch64/librga.so
0 Bytes
diff --git a/‎libs/Linux/gcc-armhf/librga.a
-552 Bytes b/‎libs/Linux/gcc-armhf/librga.a
-552 Bytes
diff --git a/‎libs/Linux/gcc-armhf/librga.so
8 Bytes b/‎libs/Linux/gcc-armhf/librga.so
8 Bytes
diff --git a/‎libs/Linux/gcc-uclib-armhf/librga.a
-552 Bytes b/‎libs/Linux/gcc-uclib-armhf/librga.a
-552 Bytes
diff --git a/‎libs/Linux/gcc-uclib-armhf/librga.so
8 Bytes b/‎libs/Linux/gcc-uclib-armhf/librga.so
8 Bytes
diff --git a/‎samples/allocator_demo/src/rga_allocator_graphicbuffer_demo.cpp
+7-3 b/‎samples/allocator_demo/src/rga_allocator_graphicbuffer_demo.cpp
+7-3
diff --git a/‎samples/async_demo/src/rga_async_demo.cpp
-1 b/‎samples/async_demo/src/rga_async_demo.cpp
-1
diff --git a/‎samples/config_demo/src/rga_config_single_core_demo.cpp
-1 b/‎samples/config_demo/src/rga_config_single_core_demo.cpp
-1
diff --git a/‎samples/config_demo/src/rga_config_thread_core_demo.cpp
-1 b/‎samples/config_demo/src/rga_config_thread_core_demo.cpp
-1
@@ -568,13 +568,27 @@ rga2: 00000000 00000000 00000000 00000000
   rga2: sync one cmd end time 2414					//打印本次工作RGA硬件的耗时，单位为us
   ```
 
-  - multi
+  - multi-rga
 
+  > 1.3.0以下版本
+  
   ```
-  rga3_reg: set cmd use time = 196					//开始处理请求到配置寄存器的耗时
-  rga_job: hw use time = 554							//硬件启动到硬件中断返回耗时
-  rga_job: (pid:3197) job done use time = 751			//开始处理请求到请求完成的耗时
-  rga_job: (pid:3197) job clean use time = 933		//开始处理请求到请求资源处理完毕的耗时
+  rga3_reg: set cmd use time = 196					//开始处理请求到配置寄存器的耗时，单位为us
+  rga_job: hw use time = 554							//硬件启动到硬件中断返回耗时，单位为us
+  rga_job: (pid:3197) job done use time = 751			//开始处理请求到请求完成的耗时，单位为us
+  rga_job: (pid:3197) job clean use time = 933		//开始处理请求到请求资源处理完毕的耗时，单位为us
+  ```
+  
+  > 1.3.0及以上版本
+  
+  ```
+  rga_mm: request[3300], get buffer_handle info cost 188 us		//获取当前buffer_handle信息耗时（虚拟地址则包含cache同步的耗时）
+  rga3_reg: request[3300], generate register cost time 2 us		//生成寄存器配置耗时
+  rga3_reg: request[3300], set register cost time 301 us			//配置寄存器耗时
+  rga_job: request[3300], hardware[RGA3_core0] cost time 539 us	//对应的硬件核心完成任务耗时
+  rga_mm: request[3300], put buffer_handle info cost 153 us		//释放当前buffer_handle信息耗时（虚拟地址则包含cache同步的耗时）
+  rga_job: request[3300], job done total cost time 1023 us		//当前job从提交到完成返回用户态的全部耗时
+  rga_job: request[3300], job cleanup total cost time 1030 us		//当前job从提交到资源释放完毕的全部耗时
   ```
 
 
@@ -813,7 +827,7 @@ rga_debugger: dump image to: /data/rga_image/1_core1_dst_plane0_virt_addr_w1280_
 
 **Q1.4**：RGA的效率不能满足我们产品的需求，有什么办法可以提升么？
 
-**A1.4**：部分芯片的出厂固件的RGA频率并不是最高频率，例如3399、1126等芯片RGA的频率最高可以到400M，可以通过以下两种方式实现RGA提频：
+**A1.4**：部分芯片早期（2021年之前）的出厂固件的RGA频率并不是最高频率，例如3399、1126等芯片RGA的频率最高可以到400M，可以通过以下两种方式实现RGA提频：
 
 - 通过命令设置（临时修改，设备重启则恢复频率）
 
@@ -876,15 +890,15 @@ index 02938b0..10a1dc4 100644
 
 			2). 当使用的虚拟地址是cacheable的，由于使能了cache，RGA驱动会在硬件访问内存前后强制同步cache数据，因此会增加CPU同步cache和内存的负载。由于常见的虚拟地址分配器并不是设计用于给其他硬件访问的，并存在同步cache的接口，因此驱动针对虚拟地址强制同步cache也是必要的。
 
-			3). 当时用dma-buf fd调用RGA时，有些分配器默认分配的是cacheable的buffer，并且kernel中dma-buf的处理会强制同步cache的情况，这是也会存在每次调用RGA时会有较大的CPU负载，也是因为CPU同步cache和内存引入的负载。该种情况下建议分配禁用cache的dma-buf。
+			3). 当使用dma-buf fd调用RGA时，有些分配器默认分配的是cacheable的buffer，并且kernel中dma-buf的处理会强制同步cache，这时也会存在每次调用RGA时有较大CPU负载的情况，同样是因为CPU同步cache和内存引入的负载。该种情况下建议分配禁用cache的dma-buf。
 
 
 
 **Q1.9**：为什么当搭载8G DDR时，RGA效率较于4G时性能下降严重？
 
 **A1.9**：由于部分RGA1/RGA2的IOMMU仅支持最大32位的物理地址，而RGA Device Driver、RGA2 Device Driver中对于不满足硬件内存要求的调用申请，默认是通过swiotlb机制进行访问访问受限制的内存（原理上相当于通过CPU将高位内存拷贝至复合硬件要求的低位内存中，再交由硬件进行处理，处理完毕后再通过CPU将低位内存搬运回目标的高位内存上。）因此效率十分低下，通常在正常耗时的3-4倍之间浮动，并且引入受CPU负载影响。
 
-RGA Multicore Device Driver中针对访问受限制的内存会禁用swiotlb机制，直接通过调用失败的方式显示的通知调用者申请合理的内存再调用，来保证RGA的高效。通常伴随着以下日志：
+RGA Multicore Device Driver中针对访问受限制的内存会禁用swiotlb机制，直接通过调用失败的方式显式地通知调用者申请符合要求的内存再调用，来保证RGA的高效。通常伴随着以下日志：
 
 > HAL层日志：
 
@@ -925,6 +939,56 @@ rga_policy: start policy on core = 4
 
 
 
+**Q1.10**：为什么调用RGA API时发现API返回耗时远高于驱动打印硬件耗时？
+
+			**Q1.10.1**：通过“TIME”运行日志发现map/unmap buffer耗时过大。
+			**Q1.10.2**：对比kernel日志时间戳发现打印参数日志到寄存器打印之间存在较大的空白时间。
+			**Q1.10.3**：相同的参数配置，仅使用不同的内存分配器得到的运行耗时差异较大。
+
+**A1.10**：这里的耗时异常的原因均为外部buffer的内存映射行为（map/unmap）导致。所有的外部buffer都需要映射、绑定到RGA驱动中才能保证硬件最终能够访问指定的buffer。而不同的分配器对应的底层实现差异会导致驱动映射、绑定内存时耗时不一，从而导致看起来好像API耗时会比硬件实际耗时高很多的情况。常见的会存在较高额外耗时的dma-buf分配器有ION、V4L2等，通常这些差异与cache的同步有关，针对这类型问题可以通过横向对比不同分配器进行确认。
+
+这类问题通常可以通过以下几种方式进行优化：
+
+			1). 使用map/unmap耗时合理的内存分配器，常见的有dma_heap、DRM以及对应的封装内存分配器，以下是对应内存分配器分配内存调用RGA的示例代码：
+
+**<librga_source_path>/samples/allocator_demo/src/rga_allocator_dma_demo.cpp**
+
+**<librga_source_path>/samples/allocator_demo/src/rga_allocator_drm_demo.cpp**
+
+			2). 该问题对应的调用场景为通过wrapbuffer_fd()封装rga_buffer_t，或者使用importbuffer_fd()后仅运行一帧就立即调用releasebuffer_handle()，这对于临时的测试或者每一帧buffer都是变化的场景是正常的，但在实际产品中反复重新分配buffer这个行为本身就是性能较差且不合理的，建议对buffer流程进行整体性的优化。
+
+通常我们建议整体流程按照以下方式进行设计：
+
+> 1. 构造buffer_pool，分配n个buffer用于作为轮转buffer，n的大小视实际场景进行配置。
+>
+> 2. 将这部分buffer 通过importbuffer_fd()导入RGA，获取到RGA的buffer_handle。
+>
+> 3. 使用轮转到的buffer_handle调用RGA执行图像操作，反复轮转、循环。
+> 4. 当不再需要这个buffer_pool内的buffer时，调用releasebuffer_handle()释放这部分buffer在RGA内部的引用，以保证后续该buffer能够被释放、销毁。
+> 5. 释放buffer_pool内不需要的buffer。
+
+按照上述流程设计，那么即使分配器的map/unmap行为会导致异常耗时也被收敛到importbuffer_fd()/releasebuffer_handle()的调用上，对于实际运行时每一帧调用将不再会有影响，这是一种很好的规避由于内存分配器实现差异引入性能差异的方案。
+
+			3). 对于无法更改内存分配器以及业务流程的场景，将只能通过修改使用的内存分配器map/unmap流程进行优化耗时，这是十分危险的行为，需要确保自己知晓全部使用该内存分配器的模块的应用行为后，提交redmine咨询对应内存分配器维护者来获取技术支持。
+
+
+
+**Q1.11**：为什么importbuffer_fd()/importbuffer_virtualaddr()调用耗时很高，为什么要调用该API？
+
+**A1.11**：该接口相关用法以及说明可以查看源码目录下docs文件夹内的[《Rockchip_Developer_Guide_RGA_CN》](./Rockchip_Developer_Guide_RGA_CN.md)中 “概述” 章节——“[图像缓冲区预处理](./Rockchip_Developer_Guide_RGA_CN.md#图像缓冲区预处理)” 了解用法说明。importbuffer_xx()的作用是将外部的buffer导入到RGA驱动内，使后续每一帧RGA调用都可以通过buffer_handle快速的访问该buffer，而导入外部buffer是比较耗时的操作，需要将外部的buffer映射到RGA驱动内，并保存对应的物理地址以及buffer信息，这对于调用RGA来说是不可缺少的行为。
+
+
+
+**Q1.12**：RGA支持并行的操作么？为什么多线程调用RGA时会出现个别帧耗时增多、翻倍的情况？
+
+**A1.12**：RGA API是可以支持多线程/进程并行调用的，但实际硬件上是否并行执行图像操作取决于当前使用芯片搭载的RGA核心数量，即搭载的核心数量则为最大支持的并行任务数量，超过核心数量的任务则会进入等待状态，直到有核心进入空闲状态。因此当并行调用的数量超过了硬件最大支持的并行数量后，那么个别帧的调用将会增加等待硬件空闲的耗时。具体可以通过以下调试节点（具体说明可以查看“驱动调试节点”小节中“硬件信息查询”部分）获取当前芯片搭载的核心数量以及支持的功能：
+
+```shell
+/# cat hardware
+```
+
+
+
 ### 功能咨询
 
 **Q2.1**：如何知道我当前的芯片平台搭载的RGA版本以及可以实现的功能？
 
@@ -565,16 +565,30 @@ rga2: 00000000 00000000 00000000 00000000
   rga2: sync one cmd end time 2414					//Print the RGA. hardware time of the work,in us.
   ```
 
-  - multi
+  - multi-rga
 
+  > Versions below 1.3.0
+  
   ```
   rga3_reg: set cmd use time = 196					//Time elapsed from start processing request to configuration register.
   rga_job: hw use time = 554							//Time-consuming from hardware startup to hardware interrupt return.
   rga_job: (pid:3197) job done use time = 751			//Time-consuming from the start of processing the request to the completion of the request.
   rga_job: (pid:3197) job clean use time = 933		//Time-consuming from the start of processing the request to the completion of the request resource processing.
   ```
-
-
+  
+  > Version 1.3.0 and above
+  
+  ```
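+  rga_mm: request[3300], get buffer_handle info cost 188 us		//Time taken to get the current buffer_handle info (for virtual addresses this includes cache synchronization time).
+  rga3_reg: request[3300], generate register cost time 2 us		//Time taken to generate the register configuration.
+  rga3_reg: request[3300], set register cost time 301 us			//Time taken to configure the registers.
+  rga_job: request[3300], hardware[RGA3_core0] cost time 539 us	//Time taken by the corresponding hardware core to complete the task.
+  rga_mm: request[3300], put buffer_handle info cost 153 us		//Time taken to release the current buffer_handle info (for virtual addresses this includes cache synchronization time).
+  rga_job: request[3300], job done total cost time 1023 us		//Total time from job submission until it completes and returns to user space.
+  rga_job: request[3300], job cleanup total cost time 1030 us		//Total time from job submission until its resources have been released.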
+  ```
+  
+  
 
 ##### Version Information Query
 
@@ -807,7 +821,7 @@ This section introduces common questions about RGA in the form of Q&A. If the pr
 
 **Q1.4**：The efficiency of RGA cannot meet the needs of our products. Is there any way to improve it?
 
-**A1.4**：The RGA frequency of the factory firmware of some chips is not the highest frequency. For example, the RGA frequency of chips such as 3399 and 1126 can be up to 400M. The RGA frequency can be improved in the following two ways:
+**A1.4**：The RGA frequency in the early (pre-2021) factory firmware of some chips is not the highest frequency. For example, the RGA frequency of chips such as 3399 and 1126 can be up to 400M. The RGA frequency can be improved in the following two ways:
 
 - Set by command (temporarily modified, frequency restored upon device restart)
 
@@ -919,6 +933,54 @@ Therefore, for this scenario, it is recommended to apply for memory within 4G to
 
 
 
+**Q1.10**: Why is the time taken by an RGA API call much higher than the hardware time printed by the driver?
+
+ **Q1.10.1**: Through the "TIME" running log, it is found that the map/unmap buffer takes too much time.
+ **Q1.10.2**: A comparison of the kernel log timestamps reveals a large gap between the timestamps of the "MSG" log and the "REG" log.
+ **Q1.10.3**: With the same parameter configuration, merely using a different memory allocator results in a large difference in running time.
+
+**A1.10**: The abnormal time consumption in all of these cases is caused by the memory mapping behavior (map/unmap) of external buffers. All external buffers need to be mapped and bound to the RGA driver to ensure that the hardware can eventually access the specified buffer. Differences in the underlying implementations of the various allocators lead to different time costs when the driver maps and binds the memory, so the API call appears to take much longer than the hardware actually does. Common dma-buf allocators with high extra time consumption are ION, V4L2, etc. These differences are usually related to cache synchronization, and this type of problem can be confirmed by comparing the time consumption when using different allocators.
+
+This type of issue can usually be optimized in the following ways:
+
+ 1). You can choose a memory allocator that is relatively more reasonable in terms of time consumption for the map/unmap process. Common ones are dma_heap, DRM, and the corresponding wrapper memory allocator. The following is sample code for calling RGA using memory allocated by these memory allocators:
+
+**<librga_source_path>/samples/allocator_demo/src/rga_allocator_dma_demo.cpp**
+
+**<librga_source_path>/samples/allocator_demo/src/rga_allocator_drm_demo.cpp**
+
+2). The calling scenario corresponding to this problem is wrapping an rga_buffer_t through wrapbuffer_fd(), or calling importbuffer_fd(), running only one frame, and then immediately calling releasebuffer_handle(). This is normal for temporary tests or scenarios where the buffer changes every frame, but in actual products repeatedly reallocating buffers is itself a poorly performing and unreasonable behavior. It is recommended to optimize the buffer flow as a whole.
+
+Generally we recommend that the overall process be designed in the following way:
+
+> 1. Construct a buffer_pool and allocate n buffers to be used as rotation buffers. The value of n is configured according to the actual scenario.
+> 2. Import these buffers into RGA through importbuffer_fd() and obtain the corresponding RGA buffer_handles.
+> 3. Use the rotated buffer_handle to call RGA to perform image operations, and repeatedly rotate and loop.
+> 4. When the buffer in this buffer_pool is no longer needed, call releasebuffer_handle() to release the reference of this part of the buffer in RGA to ensure that the buffer can be released and destroyed subsequently.
+> 5. Release unnecessary buffers in buffer_pool.
+
+With the above design, even if the allocator's map/unmap behavior causes abnormal time consumption, that cost is confined to the importbuffer_fd()/releasebuffer_handle() calls and no longer affects the per-frame calls at runtime. This is a good way to avoid performance differences caused by differences in memory allocator implementations.
+
+ 3). For scenarios where neither the memory allocator nor the business process can be changed, the time consumption can only be optimized by modifying the map/unmap process of the memory allocator in use. This is a very dangerous operation: first make sure you understand the behavior of every module that uses this memory allocator, and then submit a redmine ticket to consult the maintainer of the corresponding memory allocator for technical support.
+
+
+
+**Q1.11**: Why is the importbuffer_fd()/importbuffer_virtualaddr() call time-consuming? Why do we need to call this API?
+
+**A1.11**: For the usage and description of this interface, see the "[Image Buffer Preprocessing](./Rockchip_Developer_Guide_RGA_EN.md#image-buffer-preprocessing)" section of the "Overview" chapter of ["Rockchip_Developer_Guide_RGA_EN"](./Rockchip_Developer_Guide_RGA_EN.md) in the docs folder of the source code directory. The function of importbuffer_xx() is to import an external buffer into the RGA driver, so that every subsequent per-frame RGA call can quickly access the buffer through its buffer_handle. Importing an external buffer is a relatively time-consuming operation: the external buffer must be mapped into the RGA driver, and the corresponding physical address and buffer information must be saved. This is an indispensable step for calling RGA.
+
+
+
+**Q1.12**: Does RGA support parallel operations? Why does the time consumption of individual frames increase or double when calling RGA from multiple threads?
+
+**A1.12**: The RGA API supports parallel calls from multiple threads/processes, but whether image operations are actually executed in parallel on the hardware depends on the number of RGA cores on the chip in use. That is, the number of cores on the chip is the maximum number of tasks that can run in parallel. Tasks beyond the number of cores enter a waiting state until a core becomes idle. Therefore, when the number of parallel calls exceeds the maximum supported by the hardware, some frames will additionally spend time waiting for the hardware to become idle. You can obtain the number of cores and the supported functions of the current chip through the following debugging node (for specific instructions, please see the "Hardware Information Query" section in the "Drive Debugging Node" section):
+
+```shell
+/# cat hardware
+```
+
+
+
 ### Functions Consulting
 
 **Q2.1**：How do I know what version of RGA is available on my current chip platform and what functions are available?
 
@@ -294,10 +294,10 @@ typedef struct {
     int format;                         /* format */
 
     int color_space_mode;               /* color_space_mode */
-    int global_alpha;                   /* global_alpha */
+    int global_alpha;                   /* global_alpha, the default should be 0xff */
     int rd_mode;
 
-    /* legarcy */
+    /* legacy */
     int color;                          /* color, used by color fill */
     im_colorkey_range colorkey_range;   /* range value of color key */
     im_nn_t nn;
 
@@ -26,7 +26,7 @@
 #define RGA_API_MAJOR_VERSION       1
 #define RGA_API_MINOR_VERSION       10
 #define RGA_API_REVISION_VERSION    0
-#define RGA_API_BUILD_VERSION       2
+#define RGA_API_BUILD_VERSION       5
 
 #define RGA_API_SUFFIX
 
 
@@ -76,6 +76,10 @@ int main(void) {
     dst_buf_size = dst_width * dst_height * get_bpp_from_format(dst_format);
 
     /* allocate GraphicBuffer */
+    src_gb_flags |= GRALLOC_USAGE_SW_WRITE_OFTEN | GRALLOC_USAGE_SW_READ_OFTEN;
+    dst_gb_flags |= GRALLOC_USAGE_SW_WRITE_OFTEN | GRALLOC_USAGE_SW_READ_OFTEN;
+
+    /* for CORE_RGA2 */
     src_gb_flags |= RK_GRALLOC_USAGE_WITHIN_4G;
     dst_gb_flags |= RK_GRALLOC_USAGE_WITHIN_4G;
 
@@ -90,7 +94,7 @@ int main(void) {
         return -1;
     }
 
-    ret = src_gb->lock(0, (void **)&src_buf);
+    ret = src_gb->lock(GRALLOC_USAGE_SW_WRITE_OFTEN, (void **)&src_buf);
     if (ret) {
         printf("lock buffer error : %s\n",strerror(errno));
         return -1;
@@ -108,7 +112,7 @@ int main(void) {
         return -1;
     }
 
-    ret = dst_gb->lock(0, (void **)&dst_buf);
+    ret = dst_gb->lock(GRALLOC_USAGE_SW_WRITE_OFTEN, (void **)&dst_buf);
     if (ret) {
         printf("lock buffer error : %s\n",strerror(errno));
         return -1;
@@ -154,7 +158,7 @@ int main(void) {
         goto release_buffer;
     }
 
-    ret = dst_gb->lock(0, (void **)&dst_buf);
+    ret = dst_gb->lock(GRALLOC_USAGE_SW_READ_OFTEN, (void **)&dst_buf);
     if (ret) {
         printf("lock buffer error : %s\n",strerror(errno));
         return -1;
 
@@ -29,7 +29,6 @@
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/mman.h>
-#include <math.h>
 #include <fcntl.h>
 #include <signal.h>
 #include <unistd.h>
 
@@ -29,7 +29,6 @@
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/mman.h>
-#include <math.h>
 #include <fcntl.h>
 #include <signal.h>
 #include <unistd.h>
 
@@ -29,7 +29,6 @@
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/mman.h>
-#include <math.h>
 #include <fcntl.h>
 #include <signal.h>
 #include <unistd.h>