perf: improve add and sub performance by avoiding `array::IntoIter` #316

DaniPopes · 2023-09-30T04:05:27Z

Motivation

array::IntoIter is known to have worse performance than slice::Iter.

Solution

Use slice::Iter instead of array::IntoIter.

This makes codegen of U256::add identical to LLVM native add i256, while slightly improves the one in sub.

Rest of changes are &(\w+)\.limbs -> $1.as_limbs().

Ideally we would deny inclusion of new array::IntoIter calls unless explicitly allowed, but I don't know how to do this since clippy does not catch it with disallowed-methods.

Test

pub extern "C" fn u256_add(a: &mut aliases::U256, b: &aliases::U256) {
    *a += *b;
}

pub extern "C" fn u256_sub(a: &mut aliases::U256, b: &aliases::U256) {
    *a -= *b;
}

`u256_add` before/after diff

diff --git a/add.before.s b/add.after.s
index 2970898..528fed6 100644
--- a/add.before.s
+++ b/add.after.s
@@ -4,25 +4,14 @@ u256_add:
 	mov rax, qword ptr [rsi]
 	mov rcx, qword ptr [rsi + 8]
 
-	add rax, qword ptr [rdi]
+	mov rdx, qword ptr [rsi + 16]
 
-	adc rcx, 0
-	setb dl
-	add rcx, qword ptr [rdi + 8]
+	mov rsi, qword ptr [rsi + 24]
 
-	movzx edx, dl
-	adc rdx, qword ptr [rsi + 16]
-	setb r8b
-	add rdx, qword ptr [rdi + 16]
-
-	mov qword ptr [rdi], rax
-	mov qword ptr [rdi + 8], rcx
-
-	movzx r8d, r8b
-	adc r8, qword ptr [rsi + 24]
-
-	mov qword ptr [rdi + 16], rdx
-	add qword ptr [rdi + 24], r8
+	add qword ptr [rdi], rax
+	adc qword ptr [rdi + 8], rcx
+	adc qword ptr [rdi + 16], rdx
+	adc qword ptr [rdi + 24], rsi
 
 	ret

`u256_sub` before/after diff

diff --git a/sub.before.s b/sub.after.s
index bc0ce52..2a87c7a 100644
--- a/sub.before.s
+++ b/sub.after.s
@@ -1,47 +1,50 @@
 u256_sub:
 
 	.cfi_startproc
-	mov rdx, qword ptr [rdi + 24]
-
 	mov rcx, qword ptr [rdi + 16]
 
 	xor r8d, r8d
 
 	mov rax, qword ptr [rdi + 8]
 
+	mov rdx, qword ptr [rsi]
+
 	mov r9d, 0
+	mov r10d, 0
 
-	mov r10, qword ptr [rsi]
+	mov r11, qword ptr [rdi + 24]
 
-	sub rdx, qword ptr [rsi + 24]
 	sub rcx, qword ptr [rsi + 16]
 	sbb r9, r9
+
 	sub rax, qword ptr [rsi + 8]
-	mov esi, 0
+	sbb r10, r10
 
-	sbb rsi, rsi
+	sub r11, qword ptr [rsi + 24]
 
-	sub qword ptr [rdi], r10
+	sub qword ptr [rdi], rdx
 
 	sbb r8, r8
-	mov r10, r8
-	sar r10, 63
+
+	mov rdx, r8
+	sar rdx, 63
+
 	add r8, rax
 
-	adc r10, rsi
+	adc rdx, r10
 
 	mov qword ptr [rdi + 8], r8
 
-	mov rax, r10
+	mov rax, rdx
 	sar rax, 63
 
-	add r10, rcx
+	add rdx, rcx
 
 	adc rax, r9
 
-	mov qword ptr [rdi + 16], r10
+	mov qword ptr [rdi + 16], rdx
 
-	add rax, rdx
+	add rax, r11
 
 	mov qword ptr [rdi + 24], rax

Optimal codegen

From Zig (Godbolt):

pub export fn u256_add(a: *u256, b: *u256) void {
    a.* += b.*;
}

pub export fn u256_sub(a: *u256, b: *u256) void {
    a.* -= b.*;
}

u256_add:
        mov     rax, qword ptr [rsi + 24]
        mov     rcx, qword ptr [rsi + 16]
        mov     rdx, qword ptr [rsi]
        mov     rsi, qword ptr [rsi + 8]
        add     qword ptr [rdi], rdx
        adc     qword ptr [rdi + 8], rsi
        adc     qword ptr [rdi + 16], rcx
        adc     qword ptr [rdi + 24], rax
        ret

u256_sub:
        mov     rax, qword ptr [rsi + 24]
        mov     rcx, qword ptr [rsi + 16]
        mov     rdx, qword ptr [rsi]
        mov     rsi, qword ptr [rsi + 8]
        sub     qword ptr [rdi], rdx
        sbb     qword ptr [rdi + 8], rsi
        sbb     qword ptr [rdi + 16], rcx
        sbb     qword ptr [rdi + 24], rax
        ret

Actual codegen (after this PR)

u256_add:

	.cfi_startproc
	mov rax, qword ptr [rsi]
	mov rcx, qword ptr [rsi + 8]

	mov rdx, qword ptr [rsi + 16]

	mov rsi, qword ptr [rsi + 24]

	add qword ptr [rdi], rax
	adc qword ptr [rdi + 8], rcx
	adc qword ptr [rdi + 16], rdx
	adc qword ptr [rdi + 24], rsi

	ret

u256_sub:

	.cfi_startproc
	mov rcx, qword ptr [rdi + 16]

	xor r8d, r8d

	mov rax, qword ptr [rdi + 8]

	mov rdx, qword ptr [rsi]

	mov r9d, 0
	mov r10d, 0

	mov r11, qword ptr [rdi + 24]

	sub rcx, qword ptr [rsi + 16]
	sbb r9, r9

	sub rax, qword ptr [rsi + 8]
	sbb r10, r10

	sub r11, qword ptr [rsi + 24]

	sub qword ptr [rdi], rdx

	sbb r8, r8

	mov rdx, r8
	sar rdx, 63

	add r8, rax

	adc rdx, r10

	mov qword ptr [rdi + 8], r8

	mov rax, rdx
	sar rax, 63

	add rdx, rcx

	adc rax, r9

	mov qword ptr [rdi + 16], rdx

	add rax, r11

	mov qword ptr [rdi + 24], rax

	ret

codecov · 2023-09-30T17:53:19Z

Codecov Report

All modified lines are covered by tests ✅

❗ No coverage uploaded for pull request base (main@d070e8b). Click here to learn what that means.

Additional details and impacted files

@@           Coverage Diff           @@
##             main     #316   +/-   ##
=======================================
  Coverage        ?   80.64%           
=======================================
  Files           ?       54           
  Lines           ?     6045           
  Branches        ?        0           
=======================================
  Hits            ?     4875           
  Misses          ?     1170           
  Partials        ?        0

Files	Coverage Δ
src/add.rs	`74.10% <100.00%> (ø)`
src/base_convert.rs	`87.01% <100.00%> (ø)`
src/bits.rs	`77.82% <100.00%> (ø)`
src/modular.rs	`99.43% <100.00%> (ø)`
src/mul.rs	`90.79% <100.00%> (ø)`

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

DaniPopes requested a review from prestwich as a code owner September 30, 2023 04:05

This comment was marked as outdated.

Sign in to view

perf: improve add and sub performance by avoiding array::IntoIter

480da03

DaniPopes force-pushed the perf-array-intoiter branch from aafab82 to 480da03 Compare September 30, 2023 17:46

prestwich approved these changes Sep 30, 2023

View reviewed changes

prestwich merged commit 567f786 into recmo:main Sep 30, 2023

DaniPopes deleted the perf-array-intoiter branch September 30, 2023 18:21

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

perf: improve add and sub performance by avoiding `array::IntoIter` #316

perf: improve add and sub performance by avoiding `array::IntoIter` #316

DaniPopes commented Sep 30, 2023

This comment was marked as outdated.

codecov bot commented Sep 30, 2023

perf: improve add and sub performance by avoiding array::IntoIter #316

perf: improve add and sub performance by avoiding array::IntoIter #316

Conversation

DaniPopes commented Sep 30, 2023

Motivation

Solution

Test

u256_add before/after diff

u256_sub before/after diff

Optimal codegen

Actual codegen (after this PR)

This comment was marked as outdated.

codecov bot commented Sep 30, 2023

Codecov Report

perf: improve add and sub performance by avoiding `array::IntoIter` #316

perf: improve add and sub performance by avoiding `array::IntoIter` #316

`u256_add` before/after diff

`u256_sub` before/after diff