Skip to content

Commit b61427f

Browse files
authored
AVRO-4060: Use JDK to Hash Byte Array in UTF8 (#3175)
1 parent c75e6d5 commit b61427f

File tree

2 files changed

+34
-3
lines changed

2 files changed

+34
-3
lines changed

lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java

+14-3
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,11 @@ public Utf8(byte[] bytes) {
6868
this.length = length;
6969
}
7070

71+
Utf8(String string, int length) {
72+
this(string);
73+
this.length = length;
74+
}
75+
7176
/**
7277
* Return UTF-8 encoded bytes. Only valid through {@link #getByteLength()}
7378
* assuming the bytes have been fully copied into the underlying buffer from the
@@ -173,9 +178,15 @@ public int hashCode() {
173178
if (h == 0) {
174179
byte[] bytes = this.bytes;
175180
int length = this.length;
176-
h = 1;
177-
for (int i = 0; i < length; i++) {
178-
h = h * 31 + bytes[i];
181+
// If the buffer is completely filled (bytes.length == length), use the JDK's Arrays.hashCode.
182+
// Starting with JDK 21, the underlying implementation is vectorized.
183+
if (length > 7 && bytes.length == length) {
184+
h = Arrays.hashCode(bytes);
185+
} else {
186+
h = 1;
187+
for (int i = 0; i < length; i++) {
188+
h = h * 31 + bytes[i];
189+
}
179190
}
180191
this.hash = h;
181192
}

lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java

+20
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,26 @@ void hashCodeReused() {
9999
assertEquals(4122302, u.hashCode());
100100
}
101101

102+
/**
103+
* There are two different code paths that hashcode() can call depending on the
104+
* state of the internal buffer. If the buffer is full (string length is equal
105+
* to buffer length) then the JDK hashcode function can be used. However, if the
106+
* buffer is not full (string length is less than the internal buffer length),
107+
* then the JDK does not support this prior to JDK 23 and a scalar
108+
* implementation is the only option today. This difference can be resolved with
109+
* JDK 23 as it supports both cases.
110+
*/
111+
@Test
112+
void hashCodeBasedOnCapacity() {
113+
// string length = 8; buffer length = 8 (buffer is full)
114+
Utf8 fullCapacity = new Utf8("abcdefgh", 8);
115+
116+
// string length = 8; buffer length = 9 (buffer is only partially filled)
117+
Utf8 partialCapacity = new Utf8("abcdefghX", 8);
118+
119+
assertEquals(fullCapacity.hashCode(), partialCapacity.hashCode());
120+
}
121+
102122
@Test
103123
void oversizeUtf8() {
104124
Utf8 u = new Utf8();

0 commit comments

Comments
 (0)