Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: implement dynamic resizing array buffers #54656

Merged
Merged
99 changes: 99 additions & 0 deletions src/libs/DynamicArrayBuffer.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
type TypedArray = Int8Array | Uint8Array | Uint8ClampedArray | Int16Array | Uint16Array | Int32Array | Uint32Array | Float32Array | Float64Array;

type TypedArrayConstructor<T extends TypedArray> = {
    new (buffer: ArrayBuffer): T;
    new (buffer: ArrayBuffer, byteOffset: number, length: number): T;
    BYTES_PER_ELEMENT: number;
};

/**
 * A TypedArray that can grow dynamically (similar to c++ std::vector).
 * You still need to provide an initial capacity (0 is allowed). Whenever a write would go beyond the current
 * capacity, the capacity is doubled until the write fits.
 *
 * Note: `array` is replaced by a new TypedArray instance on every resize/truncate, so always read it through
 * the `DynamicArrayBuffer` instance instead of caching a reference to it.
 */
class DynamicArrayBuffer<T extends TypedArray> {
    private buffer: ArrayBuffer;

    /** View over the whole underlying buffer. Only the first `length` elements are considered "in use". */
    public array: T;

    /** Number of elements in use (always <= capacity). */
    private size: number;

    private readonly TypedArrayConstructor: TypedArrayConstructor<T>;

    /**
     * @param initialCapacity - Number of elements to allocate upfront
     * @param TypedArrayConstructor - The TypedArray class used as view over the buffer (e.g. `Uint8Array`)
     */
    constructor(initialCapacity: number, TypedArrayConstructor: TypedArrayConstructor<T>) {
        this.buffer = new ArrayBuffer(initialCapacity * this.getBytesPerElement(TypedArrayConstructor));
        this.array = new TypedArrayConstructor(this.buffer);
        this.size = 0;
        this.TypedArrayConstructor = TypedArrayConstructor;
    }

    private getBytesPerElement(constructor: TypedArrayConstructor<T>): number {
        return constructor.BYTES_PER_ELEMENT;
    }

    /** Number of elements the current buffer can hold without resizing. */
    get capacity(): number {
        return this.array.length;
    }

    /** Number of elements in use. */
    get length(): number {
        return this.size;
    }

    /**
     * Appends a value at position `length`, growing the buffer if it is full.
     */
    push(value: number): void {
        const capacity = this.array.length; // avoid function calls for performance
        if (this.size === capacity) {
            this.grow(capacity + 1);
        }
        this.array[this.size++] = value;
    }

    /**
     * Doubles the capacity until at least `minCapacity` elements fit, then resizes once.
     * Starts from 1 when the current capacity is 0, as doubling 0 would never grow the buffer.
     */
    private grow(minCapacity: number): void {
        let newCapacity = Math.max(this.array.length, 1);
        while (newCapacity < minCapacity) {
            newCapacity *= 2;
        }
        this.resize(newCapacity);
    }

    /**
     * Replaces the underlying buffer with one that holds `newCapacity` elements, keeping the existing content.
     */
    private resize(newCapacity: number): void {
        const newByteLength = newCapacity * this.getBytesPerElement(this.TypedArrayConstructor);
        if (typeof this.buffer.transfer === 'function') {
            // Preferred path: lets the engine move the memory instead of us copying it manually.
            this.buffer = this.buffer.transfer(newByteLength);
            this.array = new this.TypedArrayConstructor(this.buffer);
        } else {
            // Fallback for environments without ArrayBuffer.transfer: allocate and copy.
            const newBuffer = new ArrayBuffer(newByteLength);
            const newArray = new this.TypedArrayConstructor(newBuffer);
            newArray.set(this.array);
            this.buffer = newBuffer;
            this.array = newArray;
        }
    }

    /**
     * Writes a value at the given index, growing the buffer if the index is beyond the current capacity.
     * `length` becomes at least `index + 1`.
     *
     * @throws Error if the index is negative or not an integer
     */
    set(index: number, value: number): void {
        if (!Number.isInteger(index) || index < 0) {
            throw new Error('Index out of bounds');
        }

        // If the index is beyond our current capacity, resize
        if (index >= this.array.length) {
            this.grow(index + 1);
        }

        this.size = Math.max(this.size, index + 1);
        this.array[index] = value;
    }

    /**
     * Shrinks the underlying buffer to `end` elements (by default to the number of elements in use).
     * `end` is clamped to the range [0, capacity], so `length` can never exceed the real capacity.
     *
     * @returns this instance, to allow chaining
     */
    truncate(end = this.size): DynamicArrayBuffer<T> {
        const length = Math.min(Math.max(end, 0), this.array.length);
        this.buffer = this.buffer.slice(0, length * this.getBytesPerElement(this.TypedArrayConstructor));
        this.array = new this.TypedArrayConstructor(this.buffer);

        this.size = length;
        return this;
    }

    /** Iterates over the elements in use (indices 0 to `length - 1`), not over the spare capacity. */
    [Symbol.iterator](): Iterator<number> {
        let index = 0;
        return {
            next: (): IteratorResult<number> => {
                if (index < this.size) {
                    return {value: this.array[index++], done: false};
                }
                return {value: undefined, done: true};
            },
        };
    }
}

export default DynamicArrayBuffer;
45 changes: 26 additions & 19 deletions src/libs/FastSearch.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/* eslint-disable rulesdir/prefer-at */
import CONST from '@src/CONST';
import Timing from './actions/Timing';
import DynamicArrayBuffer from './DynamicArrayBuffer';
import SuffixUkkonenTree from './SuffixUkkonenTree';

type SearchableData<T> = {
Expand All @@ -25,6 +26,8 @@ type SearchableData<T> = {

// Certain characters appear very often in our search data (e.g. in email addresses), and we don't need to search for them.
const charSetToSkip = new Set(['@', '.', '#', '$', '%', '&', '*', '+', '-', '/', ':', ';', '<', '=', '>', '?', '_', '~', '!', ' ', ',', '(', ')']);
// For an account with 12k+ personal details the average search value length was ~60 characters.
const averageSearchValueLength = 60;

/**
* Creates a new "FastSearch" instance. "FastSearch" uses a suffix tree to search for substrings in a list of strings.
Expand All @@ -35,27 +38,30 @@ const charSetToSkip = new Set(['@', '.', '#', '$', '%', '&', '*', '+', '-', '/',
*/
function createFastSearch<T>(dataSets: Array<SearchableData<T>>) {
Timing.start(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES);
const maxNumericListSize = 400_000;
const itemsCount = dataSets.reduce((acc, {data}) => acc + data.length, 0);
// An approximation of how many chars the final search string will have (if it gets bigger, the underlying buffer will resize automatically, but it's best to avoid resizes):
const initialListSize = itemsCount * averageSearchValueLength;
// The user might provide multiple data sets, but internally, the search values will be stored in this one list:
let concatenatedNumericList = new Uint8Array(maxNumericListSize);
const concatenatedNumericList = new DynamicArrayBuffer(initialListSize, Uint8Array);
// Here we store the index of the data item in the original data list, so we can map the found occurrences back to the original data:
const occurrenceToIndex = new Uint32Array(maxNumericListSize * 4);
// As we are working with ArrayBuffers, we need to keep track of the current offset:
const offset = {value: 1};
const occurrenceToIndex = new DynamicArrayBuffer(initialListSize, Uint32Array);
// We store the last offset for a dataSet, so we can map the found occurrences to the correct dataSet:
const listOffsets: number[] = [];

// The tree is 1-indexed, so we need to add a 0 at the beginning:
concatenatedNumericList.push(0);

for (const {data, toSearchableString} of dataSets) {
// Performance critical: the array parameters are passed by reference, so we don't have to create new arrays every time:
dataToNumericRepresentation(concatenatedNumericList, occurrenceToIndex, offset, {data, toSearchableString});
listOffsets.push(offset.value);
dataToNumericRepresentation(concatenatedNumericList, occurrenceToIndex, {data, toSearchableString});
listOffsets.push(concatenatedNumericList.length);
}
concatenatedNumericList[offset.value++] = SuffixUkkonenTree.END_CHAR_CODE;
listOffsets[listOffsets.length - 1] = offset.value;
concatenatedNumericList.push(SuffixUkkonenTree.END_CHAR_CODE);
listOffsets[listOffsets.length - 1] = concatenatedNumericList.length;
Timing.end(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES);

// The list might be larger than necessary, so we clamp it to the actual size:
concatenatedNumericList = concatenatedNumericList.slice(0, offset.value);
concatenatedNumericList.truncate();

// Create & build the suffix tree:
Timing.start(CONST.TIMING.SEARCH_MAKE_TREE);
Expand Down Expand Up @@ -84,7 +90,7 @@ function createFastSearch<T>(dataSets: Array<SearchableData<T>>) {
// eslint-disable-next-line @typescript-eslint/prefer-for-of
for (let i = 0; i < result.length; i++) {
const occurrenceIndex = result[i];
const itemIndexInDataSet = occurrenceToIndex[occurrenceIndex];
const itemIndexInDataSet = occurrenceToIndex.array[occurrenceIndex];
const dataSetIndex = listOffsets.findIndex((listOffset) => occurrenceIndex < listOffset);

if (dataSetIndex === -1) {
Expand Down Expand Up @@ -128,7 +134,11 @@ function createFastSearch<T>(dataSets: Array<SearchableData<T>>) {
* This function converts the user data (which are most likely objects) to a numeric representation.
* Additionally a list of the original data and their index position in the numeric list is created, which is used to map the found occurrences back to the original data.
*/
function dataToNumericRepresentation<T>(concatenatedNumericList: Uint8Array, occurrenceToIndex: Uint32Array, offset: {value: number}, {data, toSearchableString}: SearchableData<T>): void {
function dataToNumericRepresentation<T>(
concatenatedNumericList: DynamicArrayBuffer<Uint8Array>,
occurrenceToIndex: DynamicArrayBuffer<Uint32Array>,
{data, toSearchableString}: SearchableData<T>,
): void {
data.forEach((option, index) => {
const searchStringForTree = toSearchableString(option);
const cleanedSearchStringForTree = cleanString(searchStringForTree);
Expand All @@ -140,16 +150,13 @@ function dataToNumericRepresentation<T>(concatenatedNumericList: Uint8Array, occ
SuffixUkkonenTree.stringToNumeric(cleanedSearchStringForTree, {
charSetToSkip,
out: {
outArray: concatenatedNumericList,
offset,
outOccurrenceToIndex: occurrenceToIndex,
index,
occurrenceToIndex,
array: concatenatedNumericList,
},
});
// eslint-disable-next-line no-param-reassign
occurrenceToIndex[offset.value] = index;
// eslint-disable-next-line no-param-reassign
concatenatedNumericList[offset.value++] = SuffixUkkonenTree.DELIMITER_CHAR_CODE;
occurrenceToIndex.set(concatenatedNumericList.length, index);
concatenatedNumericList.push(SuffixUkkonenTree.DELIMITER_CHAR_CODE);
});
}

Expand Down
15 changes: 8 additions & 7 deletions src/libs/SuffixUkkonenTree/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// .at() has a performance overhead we explicitly want to avoid here

/* eslint-disable no-continue */
import type DynamicArrayBuffer from '@libs/DynamicArrayBuffer';
import {ALPHABET_SIZE, DELIMITER_CHAR_CODE, END_CHAR_CODE, SPECIAL_CHAR_CODE, stringToNumeric} from './utils';

/**
Expand All @@ -20,7 +21,7 @@ import {ALPHABET_SIZE, DELIMITER_CHAR_CODE, END_CHAR_CODE, SPECIAL_CHAR_CODE, st
*
* The tree will be built using the Ukkonen's algorithm: https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
*/
function makeTree(numericSearchValues: Uint8Array) {
function makeTree(numericSearchValues: DynamicArrayBuffer<Uint8Array>) {
// Every leaf represents a suffix. There can't be more than n suffixes.
// Every internal node has to have at least 2 children. So the total size of ukkonen tree is not bigger than 2n - 1.
// + 1 is because an extra character at the beginning to offset the 1-based indexing.
Expand Down Expand Up @@ -85,7 +86,7 @@ function makeTree(numericSearchValues: Uint8Array) {
currentNode = transitionNodes[currentNode * ALPHABET_SIZE + char];
currentPosition = rangeStart[currentNode];
}
if (currentPosition === 0 || char === numericSearchValues[currentPosition]) {
if (currentPosition === 0 || char === numericSearchValues.array[currentPosition]) {
currentPosition++;
} else {
splitEdge(char);
Expand All @@ -109,14 +110,14 @@ function makeTree(numericSearchValues: Uint8Array) {
rangeEnd[nodeCounter] = currentPosition - 1;
parent[nodeCounter] = parent[currentNode];

transitionNodes[nodeCounter * ALPHABET_SIZE + numericSearchValues[currentPosition]] = currentNode;
transitionNodes[nodeCounter * ALPHABET_SIZE + numericSearchValues.array[currentPosition]] = currentNode;
transitionNodes[nodeCounter * ALPHABET_SIZE + c] = nodeCounter + 1;
rangeStart[nodeCounter + 1] = currentIndex;
parent[nodeCounter + 1] = nodeCounter;
rangeStart[currentNode] = currentPosition;
parent[currentNode] = nodeCounter;

transitionNodes[parent[nodeCounter] * ALPHABET_SIZE + numericSearchValues[rangeStart[nodeCounter]]] = nodeCounter;
transitionNodes[parent[nodeCounter] * ALPHABET_SIZE + numericSearchValues.array[rangeStart[nodeCounter]]] = nodeCounter;
nodeCounter += 2;
handleDescent(nodeCounter);
}
Expand All @@ -125,7 +126,7 @@ function makeTree(numericSearchValues: Uint8Array) {
currentNode = suffixLink[parent[latestNodeIndex - 2]];
currentPosition = rangeStart[latestNodeIndex - 2];
while (currentPosition <= rangeEnd[latestNodeIndex - 2]) {
currentNode = transitionNodes[currentNode * ALPHABET_SIZE + numericSearchValues[currentPosition]];
currentNode = transitionNodes[currentNode * ALPHABET_SIZE + numericSearchValues.array[currentPosition]];
currentPosition += rangeEnd[currentNode] - rangeStart[currentNode] + 1;
}
if (currentPosition === rangeEnd[latestNodeIndex - 2] + 1) {
Expand All @@ -139,7 +140,7 @@ function makeTree(numericSearchValues: Uint8Array) {
function build() {
initializeTree();
for (currentIndex = 1; currentIndex < numericSearchValues.length; ++currentIndex) {
const c = numericSearchValues[currentIndex];
const c = numericSearchValues.array[currentIndex];
processCharacter(c);
}
}
Expand All @@ -165,7 +166,7 @@ function makeTree(numericSearchValues: Uint8Array) {
const rangeLen = node === 1 ? 0 : rightRange - leftRange + 1;

for (let i = 0; i < rangeLen && depth + i < searchValue.length && leftRange + i < numericSearchValues.length; i++) {
if (searchValue[depth + i] !== numericSearchValues[leftRange + i]) {
if (searchValue[depth + i] !== numericSearchValues.array[leftRange + i]) {
return;
}
}
Expand Down
40 changes: 20 additions & 20 deletions src/libs/SuffixUkkonenTree/utils.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
/* eslint-disable rulesdir/prefer-at */ // .at() has a performance overhead we explicitly want to avoid here
/* eslint-disable rulesdir/prefer-at */
// .at() has a performance overhead we explicitly want to avoid here

/* eslint-disable no-continue */
import DynamicArrayBuffer from '@libs/DynamicArrayBuffer';

const CHAR_CODE_A = 'a'.charCodeAt(0);
const ALPHABET = 'abcdefghijklmnopqrstuvwxyz';
Expand Down Expand Up @@ -57,58 +60,55 @@ function stringToNumeric(
charSetToSkip?: Set<string>;
// When out is provided, the function will write the result to the provided arrays instead of creating new ones (performance)
out?: {
outArray: Uint8Array;
// As outArray is a ArrayBuffer we need to keep track of the current offset
offset: {value: number};
array: DynamicArrayBuffer<Uint8Array>;
// A map of <PositionInOutArray, IndexInOriginalData> to map the found occurrences to the correct data set
// As the search string can be very long for high traffic accounts (500k+), this has to be big enough, thus it's a Uint32Array
outOccurrenceToIndex?: Uint32Array;
occurrenceToIndex?: DynamicArrayBuffer<Uint32Array>;
// The index that will be written to the occurrenceToIndex buffer (this is the index of your original data position)
index?: number;
};
// By default false. By default the outArray may be larger than necessary. If clamp is set to true the outArray will be clamped to the actual size.
clamp?: boolean;
},
): {
numeric: Uint8Array;
occurrenceToIndex: Uint32Array;
offset: {value: number};
numeric: DynamicArrayBuffer<Uint8Array>;
occurrenceToIndex: DynamicArrayBuffer<Uint32Array>;
} {
// The out array might be longer than our input string length, because we encode special characters as multiple numbers using the base26 encoding.
// * 6 is because the upper limit of encoding any char in UTF-8 to base26 is at max 6 numbers.
const outArray = options?.out?.outArray ?? new Uint8Array(input.length * 6);
const offset = options?.out?.offset ?? {value: 0};
const occurrenceToIndex = options?.out?.outOccurrenceToIndex ?? new Uint32Array(input.length * 16 * 4);
const outArray = options?.out?.array ?? new DynamicArrayBuffer(input.length * 6, Uint8Array);
const occurrenceToIndex = options?.out?.occurrenceToIndex ?? new DynamicArrayBuffer(input.length * 16 * 4, Uint32Array);
const index = options?.out?.index ?? 0;

// eslint-disable-next-line @typescript-eslint/prefer-for-of -- for-i is slightly faster
for (let i = 0; i < input.length; i++) {
const char = input[i];

if (options?.charSetToSkip?.has(char)) {
continue;
}

const charCode = char.charCodeAt(0);

if (char >= 'a' && char <= 'z') {
// char is an alphabet character
occurrenceToIndex[offset.value] = index;
outArray[offset.value++] = char.charCodeAt(0) - CHAR_CODE_A;
occurrenceToIndex.push(index);
outArray.push(charCode - CHAR_CODE_A);
} else {
const charCode = input.charCodeAt(i);
occurrenceToIndex[offset.value] = index;
outArray[offset.value++] = SPECIAL_CHAR_CODE;
occurrenceToIndex.push(index);
outArray.push(SPECIAL_CHAR_CODE);
const asBase26Numeric = convertToBase26(charCode);
// eslint-disable-next-line @typescript-eslint/prefer-for-of
for (let j = 0; j < asBase26Numeric.length; j++) {
occurrenceToIndex[offset.value] = index;
outArray[offset.value++] = asBase26Numeric[j];
occurrenceToIndex.push(index);
outArray.push(asBase26Numeric[j]);
}
}
}

return {
numeric: options?.clamp ? outArray.slice(0, offset.value) : outArray,
numeric: options?.clamp ? outArray.truncate() : outArray,
occurrenceToIndex,
offset,
};
}

Expand Down
7 changes: 7 additions & 0 deletions src/types/global.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,10 @@ interface NodeRequire {
// eslint-disable-next-line @typescript-eslint/prefer-function-type, @typescript-eslint/no-explicit-any
<T = any>(id: string): T;
}

// Declare ArrayBuffer.transfer, as it's a relatively new API and not yet present in all environments.
// eslint-disable-next-line @typescript-eslint/consistent-type-definitions
interface ArrayBuffer {
// Declared as optional: it might be defined in browsers, but in RN hermes it's not implemented yet,
// so callers have to check for its existence before calling it.
transfer?: (length: number) => ArrayBuffer;
}
Loading
Loading