Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: improve add and sub performance by avoiding array::IntoIter #316

Merged
merged 1 commit into from
Sep 30, 2023

Conversation

DaniPopes
Copy link
Contributor

Motivation

array::IntoIter is known to have worse performance than slice::Iter.

Solution

Use slice::Iter instead of array::IntoIter.

This makes codegen of U256::add identical to LLVM native add i256, while slightly improves the one in sub.

Rest of changes are &(\w+)\.limbs -> $1.as_limbs().

Ideally we would deny inclusion of new array::IntoIter calls unless explicitly allowed, but I don't know how to do this since clippy does not catch it with disallowed-methods.

Test

pub extern "C" fn u256_add(a: &mut aliases::U256, b: &aliases::U256) {
    *a += *b;
}

pub extern "C" fn u256_sub(a: &mut aliases::U256, b: &aliases::U256) {
    *a -= *b;
}

u256_add before/after diff

diff --git a/add.before.s b/add.after.s
index 2970898..528fed6 100644
--- a/add.before.s
+++ b/add.after.s
@@ -4,25 +4,14 @@ u256_add:
 	mov rax, qword ptr [rsi]
 	mov rcx, qword ptr [rsi + 8]
 
-	add rax, qword ptr [rdi]
+	mov rdx, qword ptr [rsi + 16]
 
-	adc rcx, 0
-	setb dl
-	add rcx, qword ptr [rdi + 8]
+	mov rsi, qword ptr [rsi + 24]
 
-	movzx edx, dl
-	adc rdx, qword ptr [rsi + 16]
-	setb r8b
-	add rdx, qword ptr [rdi + 16]
-
-	mov qword ptr [rdi], rax
-	mov qword ptr [rdi + 8], rcx
-
-	movzx r8d, r8b
-	adc r8, qword ptr [rsi + 24]
-
-	mov qword ptr [rdi + 16], rdx
-	add qword ptr [rdi + 24], r8
+	add qword ptr [rdi], rax
+	adc qword ptr [rdi + 8], rcx
+	adc qword ptr [rdi + 16], rdx
+	adc qword ptr [rdi + 24], rsi
 
 	ret

u256_sub before/after diff

diff --git a/sub.before.s b/sub.after.s
index bc0ce52..2a87c7a 100644
--- a/sub.before.s
+++ b/sub.after.s
@@ -1,47 +1,50 @@
 u256_sub:
 
 	.cfi_startproc
-	mov rdx, qword ptr [rdi + 24]
-
 	mov rcx, qword ptr [rdi + 16]
 
 	xor r8d, r8d
 
 	mov rax, qword ptr [rdi + 8]
 
+	mov rdx, qword ptr [rsi]
+
 	mov r9d, 0
+	mov r10d, 0
 
-	mov r10, qword ptr [rsi]
+	mov r11, qword ptr [rdi + 24]
 
-	sub rdx, qword ptr [rsi + 24]
 	sub rcx, qword ptr [rsi + 16]
 	sbb r9, r9
+
 	sub rax, qword ptr [rsi + 8]
-	mov esi, 0
+	sbb r10, r10
 
-	sbb rsi, rsi
+	sub r11, qword ptr [rsi + 24]
 
-	sub qword ptr [rdi], r10
+	sub qword ptr [rdi], rdx
 
 	sbb r8, r8
-	mov r10, r8
-	sar r10, 63
+
+	mov rdx, r8
+	sar rdx, 63
+
 	add r8, rax
 
-	adc r10, rsi
+	adc rdx, r10
 
 	mov qword ptr [rdi + 8], r8
 
-	mov rax, r10
+	mov rax, rdx
 	sar rax, 63
 
-	add r10, rcx
+	add rdx, rcx
 
 	adc rax, r9
 
-	mov qword ptr [rdi + 16], r10
+	mov qword ptr [rdi + 16], rdx
 
-	add rax, rdx
+	add rax, r11
 
 	mov qword ptr [rdi + 24], rax

Optimal codegen

From Zig (Godbolt):

pub export fn u256_add(a: *u256, b: *u256) void {
    a.* += b.*;
}

pub export fn u256_sub(a: *u256, b: *u256) void {
    a.* -= b.*;
}
u256_add:
        mov     rax, qword ptr [rsi + 24]
        mov     rcx, qword ptr [rsi + 16]
        mov     rdx, qword ptr [rsi]
        mov     rsi, qword ptr [rsi + 8]
        add     qword ptr [rdi], rdx
        adc     qword ptr [rdi + 8], rsi
        adc     qword ptr [rdi + 16], rcx
        adc     qword ptr [rdi + 24], rax
        ret

u256_sub:
        mov     rax, qword ptr [rsi + 24]
        mov     rcx, qword ptr [rsi + 16]
        mov     rdx, qword ptr [rsi]
        mov     rsi, qword ptr [rsi + 8]
        sub     qword ptr [rdi], rdx
        sbb     qword ptr [rdi + 8], rsi
        sbb     qword ptr [rdi + 16], rcx
        sbb     qword ptr [rdi + 24], rax
        ret

Actual codegen (after this PR)

u256_add:

	.cfi_startproc
	mov rax, qword ptr [rsi]
	mov rcx, qword ptr [rsi + 8]

	mov rdx, qword ptr [rsi + 16]

	mov rsi, qword ptr [rsi + 24]

	add qword ptr [rdi], rax
	adc qword ptr [rdi + 8], rcx
	adc qword ptr [rdi + 16], rdx
	adc qword ptr [rdi + 24], rsi

	ret

u256_sub:

	.cfi_startproc
	mov rcx, qword ptr [rdi + 16]

	xor r8d, r8d

	mov rax, qword ptr [rdi + 8]

	mov rdx, qword ptr [rsi]

	mov r9d, 0
	mov r10d, 0

	mov r11, qword ptr [rdi + 24]

	sub rcx, qword ptr [rsi + 16]
	sbb r9, r9

	sub rax, qword ptr [rsi + 8]
	sbb r10, r10

	sub r11, qword ptr [rsi + 24]

	sub qword ptr [rdi], rdx

	sbb r8, r8

	mov rdx, r8
	sar rdx, 63

	add r8, rax

	adc rdx, r10

	mov qword ptr [rdi + 8], r8

	mov rax, rdx
	sar rax, 63

	add rdx, rcx

	adc rax, r9

	mov qword ptr [rdi + 16], rdx

	add rax, r11

	mov qword ptr [rdi + 24], rax

	ret

@DaniPopes

This comment was marked as outdated.

@codecov
Copy link

codecov bot commented Sep 30, 2023

Codecov Report

All modified lines are covered by tests ✅

❗ No coverage uploaded for pull request base (main@d070e8b). Click here to learn what that means.

Additional details and impacted files
@@           Coverage Diff           @@
##             main     #316   +/-   ##
=======================================
  Coverage        ?   80.64%           
=======================================
  Files           ?       54           
  Lines           ?     6045           
  Branches        ?        0           
=======================================
  Hits            ?     4875           
  Misses          ?     1170           
  Partials        ?        0           
Files Coverage Δ
src/add.rs 74.10% <100.00%> (ø)
src/base_convert.rs 87.01% <100.00%> (ø)
src/bits.rs 77.82% <100.00%> (ø)
src/modular.rs 99.43% <100.00%> (ø)
src/mul.rs 90.79% <100.00%> (ø)

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

@prestwich prestwich merged commit 567f786 into recmo:main Sep 30, 2023
@DaniPopes DaniPopes deleted the perf-array-intoiter branch September 30, 2023 18:21
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants