Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RFC] Support references to multiple DDicts #2446

Merged
merged 11 commits into from
Jan 8, 2021
2 changes: 1 addition & 1 deletion contrib/freestanding_lib/freestanding.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,7 @@ def _replace_xxh64_prefix(self):
)
if self._xxh64_prefix is not None:
replacements.append(
(re.compile(r"([^\w]|^)(?P<orig>XXH64)_"), self._xxh64_prefix)
(re.compile(r"([^\w]|^)(?P<orig>XXH64)[\(_]"), self._xxh64_prefix)
)
for filepath in self._dst_lib_file_paths():
file = FileLines(filepath)
Expand Down
203 changes: 202 additions & 1 deletion lib/decompress/zstd_decompress.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
#include "../common/fse.h"
#define HUF_STATIC_LINKING_ONLY
#include "../common/huf.h"
#include "../common/xxhash.h" /* XXH64_reset, XXH64_update, XXH64_digest, XXH64 */
#include "../common/zstd_internal.h" /* blockProperties_t */
#include "zstd_decompress_internal.h" /* ZSTD_DCtx */
#include "zstd_ddict.h" /* ZSTD_DDictDictContent */
Expand All @@ -72,6 +73,144 @@
#endif



/*************************************
* Multiple DDicts Hashset internals *
*************************************/

#define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4
#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float.
* Currently, that means a 0.75 load factor.
* So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded
* the load factor of the ddict hash set.
*/

#define DDICT_HASHSET_TABLE_BASE_SIZE 64
#define DDICT_HASHSET_RESIZE_FACTOR 2

/* Hash function to determine starting position of dict insertion within the table
* Returns an index between [0, hashSet->ddictPtrTableSize]
*/
static size_t ZSTD_DDictHashSet_getIndex(const ZSTD_DDictHashSet* hashSet, U32 dictID) {
const U64 hash = XXH64(&dictID, sizeof(U32), 0);
/* DDict ptr table size is a multiple of 2, use size - 1 as mask to get index within [0, hashSet->ddictPtrTableSize) */
return hash & (hashSet->ddictPtrTableSize - 1);
}

/* Adds DDict to a hashset without resizing it.
* If inserting a DDict with a dictID that already exists in the set, replaces the one in the set.
* Returns 0 if successful, or a zstd error code if something went wrong.
*/
static size_t ZSTD_DDictHashSet_emplaceDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict) {
const U32 dictID = ZSTD_getDictID_fromDDict(ddict);
size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID);
const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1;
RETURN_ERROR_IF(hashSet->ddictPtrCount == hashSet->ddictPtrTableSize, GENERIC, "Hash set is full!");
DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx);
while (hashSet->ddictPtrTable[idx] != NULL) {
/* Replace existing ddict if inserting ddict with same dictID */
if (ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]) == dictID) {
DEBUGLOG(4, "DictID already exists, replacing rather than adding");
hashSet->ddictPtrTable[idx] = ddict;
return 0;
}
idx &= idxRangeMask;
idx++;
}
DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx);
hashSet->ddictPtrTable[idx] = ddict;
hashSet->ddictPtrCount++;
return 0;
}

/* Expands hash table by factor of DDICT_HASHSET_RESIZE_FACTOR and
* rehashes all values, allocates new table, frees old table.
* Returns 0 on success, otherwise a zstd error code.
*/
static size_t ZSTD_DDictHashSet_expand(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) {
size_t newTableSize = hashSet->ddictPtrTableSize * DDICT_HASHSET_RESIZE_FACTOR;
const ZSTD_DDict** newTable = (const ZSTD_DDict**)ZSTD_customCalloc(sizeof(ZSTD_DDict*) * newTableSize, customMem);
const ZSTD_DDict** oldTable = hashSet->ddictPtrTable;
size_t oldTableSize = hashSet->ddictPtrTableSize;
size_t i;

DEBUGLOG(4, "Expanding DDict hash table! Old size: %zu new size: %zu", oldTableSize, newTableSize);
RETURN_ERROR_IF(!newTable, memory_allocation, "Expanded hashset allocation failed!");
hashSet->ddictPtrTable = newTable;
hashSet->ddictPtrTableSize = newTableSize;
hashSet->ddictPtrCount = 0;
for (i = 0; i < oldTableSize; ++i) {
if (oldTable[i] != NULL) {
FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, oldTable[i]), "");
}
}
ZSTD_customFree((void*)oldTable, customMem);
DEBUGLOG(4, "Finished re-hash");
return 0;
}

/* Fetches a DDict with the given dictID
* Returns the ZSTD_DDict* with the requested dictID. If it doesn't exist, then returns NULL.
*/
static const ZSTD_DDict* ZSTD_DDictHashSet_getDDict(ZSTD_DDictHashSet* hashSet, U32 dictID) {
size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID);
const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1;
DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx);
for (;;) {
size_t currDictID = ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]);
if (currDictID == dictID || currDictID == 0) {
/* currDictID == 0 implies a NULL ddict entry */
break;
} else {
idx &= idxRangeMask; /* Goes to start of table when we reach the end */
idx++;
}
}
DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx);
return hashSet->ddictPtrTable[idx];
}

/* Allocates space for and returns a ddict hash set
* The hash set's ZSTD_DDict* table has all values automatically set to NULL to begin with.
* Returns NULL if allocation failed.
*/
static ZSTD_DDictHashSet* ZSTD_createDDictHashSet(ZSTD_customMem customMem) {
ZSTD_DDictHashSet* ret = (ZSTD_DDictHashSet*)ZSTD_customMalloc(sizeof(ZSTD_DDictHashSet), customMem);
DEBUGLOG(4, "Allocating new hash set");
ret->ddictPtrTable = (const ZSTD_DDict**)ZSTD_customCalloc(DDICT_HASHSET_TABLE_BASE_SIZE * sizeof(ZSTD_DDict*), customMem);
ret->ddictPtrTableSize = DDICT_HASHSET_TABLE_BASE_SIZE;
ret->ddictPtrCount = 0;
if (!ret || !ret->ddictPtrTable) {
return NULL;
}
return ret;
}

/* Frees the table of ZSTD_DDict* within a hashset, then frees the hashset itself.
* Note: The ZSTD_DDict* within the table are NOT freed.
*/
static void ZSTD_freeDDictHashSet(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) {
DEBUGLOG(4, "Freeing ddict hash set");
if (hashSet && hashSet->ddictPtrTable) {
ZSTD_customFree((void*)hashSet->ddictPtrTable, customMem);
}
if (hashSet) {
ZSTD_customFree(hashSet, customMem);
}
}

/* Public function: Adds a DDict into the ZSTD_DDictHashSet, possibly triggering a resize of the hash set.
* Returns 0 on success, or a ZSTD error.
*/
static size_t ZSTD_DDictHashSet_addDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict, ZSTD_customMem customMem) {
DEBUGLOG(4, "Adding dict ID: %u to hashset with - Count: %zu Tablesize: %zu", ZSTD_getDictID_fromDDict(ddict), hashSet->ddictPtrCount, hashSet->ddictPtrTableSize);
if (hashSet->ddictPtrCount * DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT / hashSet->ddictPtrTableSize * DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT != 0) {
FORWARD_IF_ERROR(ZSTD_DDictHashSet_expand(hashSet, customMem), "");
}
FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, ddict), "");
return 0;
}

/*-*************************************************************
* Context management
***************************************************************/
Expand Down Expand Up @@ -101,6 +240,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx)
dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
dctx->outBufferMode = ZSTD_bm_buffered;
dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum;
dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict;
}

static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
Expand All @@ -120,8 +260,8 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
dctx->noForwardProgress = 0;
dctx->oversizedDuration = 0;
dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
dctx->ddictSet = NULL;
ZSTD_DCtx_resetParameters(dctx);
dctx->validateChecksum = 1;
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
dctx->dictContentEndForFuzzing = NULL;
#endif
Expand Down Expand Up @@ -178,6 +318,10 @@ size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
if (dctx->legacyContext)
ZSTD_freeLegacyStreamContext(dctx->legacyContext, dctx->previousLegacyVersion);
#endif
if (dctx->ddictSet) {
ZSTD_freeDDictHashSet(dctx->ddictSet, cMem);
dctx->ddictSet = NULL;
}
ZSTD_customFree(dctx, cMem);
return 0;
}
Expand All @@ -190,6 +334,29 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
ZSTD_memcpy(dstDCtx, srcDCtx, toCopy); /* no need to copy workspace */
}

/* Given a dctx with a digested frame params, re-selects the correct ZSTD_DDict based on
* the requested dict ID from the frame. If there exists a reference to the correct ZSTD_DDict, then
* accordingly sets the ddict to be used to decompress the frame.
*
* If no DDict is found, then no action is taken, and the ZSTD_DCtx::ddict remains as-is.
*
* ZSTD_d_refMultipleDDicts must be enabled for this function to be called.
*/
static void ZSTD_DCtx_selectFrameDDict(ZSTD_DCtx* dctx) {
assert(dctx->refMultipleDDicts && dctx->ddictSet);
DEBUGLOG(4, "Adjusting DDict based on requested dict ID from frame");
if (dctx->ddict) {
const ZSTD_DDict* frameDDict = ZSTD_DDictHashSet_getDDict(dctx->ddictSet, dctx->fParams.dictID);
if (frameDDict) {
DEBUGLOG(4, "DDict found!");
ZSTD_clearDict(dctx);
dctx->dictID = dctx->fParams.dictID;
dctx->ddict = frameDDict;
dctx->dictUses = ZSTD_use_indefinitely;
}
}
}


/*-*************************************************************
* Frame header decoding
Expand Down Expand Up @@ -441,12 +608,19 @@ unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize)

/** ZSTD_decodeFrameHeader() :
* `headerSize` must be the size provided by ZSTD_frameHeaderSize().
* If multiple DDict references are enabled, also will choose the correct DDict to use.
* @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */
static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize)
{
size_t const result = ZSTD_getFrameHeader_advanced(&(dctx->fParams), src, headerSize, dctx->format);
if (ZSTD_isError(result)) return result; /* invalid header */
RETURN_ERROR_IF(result>0, srcSize_wrong, "headerSize too small");

/* Reference DDict requested by frame if dctx references multiple ddicts */
if (dctx->refMultipleDDicts == ZSTD_rmd_refMultipleDDicts && dctx->ddictSet) {
ZSTD_DCtx_selectFrameDDict(dctx);
}

#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
/* Skip the dictID check in fuzzing mode, because it makes the search
* harder.
Expand Down Expand Up @@ -1391,6 +1565,16 @@ size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
if (ddict) {
dctx->ddict = ddict;
dctx->dictUses = ZSTD_use_indefinitely;
if (dctx->refMultipleDDicts == ZSTD_rmd_refMultipleDDicts) {
if (dctx->ddictSet == NULL) {
dctx->ddictSet = ZSTD_createDDictHashSet(dctx->customMem);
if (!dctx->ddictSet) {
RETURN_ERROR(memory_allocation, "Failed to allocate memory for hash set!");
}
}
assert(!dctx->staticSize); /* Impossible: ddictSet cannot have been allocated if static dctx */
FORWARD_IF_ERROR(ZSTD_DDictHashSet_addDDict(dctx->ddictSet, ddict, dctx->customMem), "");
}
}
return 0;
}
Expand Down Expand Up @@ -1436,6 +1620,10 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam)
bounds.lowerBound = (int)ZSTD_d_validateChecksum;
bounds.upperBound = (int)ZSTD_d_ignoreChecksum;
return bounds;
case ZSTD_d_refMultipleDDicts:
bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict;
bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts;
return bounds;
default:;
}
bounds.error = ERROR(parameter_unsupported);
Expand Down Expand Up @@ -1473,6 +1661,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value
case ZSTD_d_forceIgnoreChecksum:
*value = (int)dctx->forceIgnoreChecksum;
return 0;
case ZSTD_d_refMultipleDDicts:
*value = (int)dctx->refMultipleDDicts;
return 0;
default:;
}
RETURN_ERROR(parameter_unsupported, "");
Expand All @@ -1499,6 +1690,13 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value
CHECK_DBOUNDS(ZSTD_d_forceIgnoreChecksum, value);
dctx->forceIgnoreChecksum = (ZSTD_forceIgnoreChecksum_e)value;
return 0;
case ZSTD_d_refMultipleDDicts:
CHECK_DBOUNDS(ZSTD_d_refMultipleDDicts, value);
if (dctx->staticSize != 0) {
RETURN_ERROR(parameter_unsupported, "Static dctx does not support multiple DDicts!");
}
dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value;
return 0;
default:;
}
RETURN_ERROR(parameter_unsupported, "");
Expand Down Expand Up @@ -1680,6 +1878,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
} }
#endif
{ size_t const hSize = ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format);
if (zds->refMultipleDDicts && zds->ddictSet) {
ZSTD_DCtx_selectFrameDDict(zds);
}
DEBUGLOG(5, "header size : %u", (U32)hSize);
if (ZSTD_isError(hSize)) {
#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
Expand Down
9 changes: 9 additions & 0 deletions lib/decompress/zstd_decompress_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@ typedef enum {
ZSTD_use_once = 1 /* Use the dictionary once and set to ZSTD_dont_use */
} ZSTD_dictUses_e;

/* Hashset for storing references to multiple ZSTD_DDict within ZSTD_DCtx */
typedef struct {
const ZSTD_DDict** ddictPtrTable;
size_t ddictPtrTableSize;
size_t ddictPtrCount;
} ZSTD_DDictHashSet;

struct ZSTD_DCtx_s
{
const ZSTD_seqSymbol* LLTptr;
Expand Down Expand Up @@ -136,6 +143,8 @@ struct ZSTD_DCtx_s
U32 dictID;
int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
ZSTD_dictUses_e dictUses;
ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */
ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */

/* streaming */
ZSTD_dStreamStage streamStage;
Expand Down
41 changes: 40 additions & 1 deletion lib/zstd.h
Original file line number Diff line number Diff line change
Expand Up @@ -546,12 +546,14 @@ typedef enum {
* ZSTD_d_format
* ZSTD_d_stableOutBuffer
* ZSTD_d_forceIgnoreChecksum
* ZSTD_d_refMultipleDDicts
* Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
* note : never ever use experimentalParam? names directly
*/
ZSTD_d_experimentalParam1=1000,
ZSTD_d_experimentalParam2=1001,
ZSTD_d_experimentalParam3=1002
ZSTD_d_experimentalParam3=1002,
ZSTD_d_experimentalParam4=1003

} ZSTD_dParameter;

Expand Down Expand Up @@ -999,6 +1001,13 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s
/*! ZSTD_DCtx_refDDict() :
* Reference a prepared dictionary, to be used to decompress next frames.
* The dictionary remains active for decompression of future frames using same DCtx.
*
* If called with ZSTD_d_refMultipleDDicts enabled, repeated calls of this function
* will store the DDict references in a table, and the DDict used for decompression
* will be determined at decompression time, as per the dict ID in the frame.
* The memory for the table is allocated on the first call to refDDict, and can be
* freed with ZSTD_freeDCtx().
*
* @result : 0, or an error code (which can be tested with ZSTD_isError()).
* Note 1 : Currently, only one dictionary can be managed.
* Referencing a new dictionary effectively "discards" any previous one.
Expand Down Expand Up @@ -1205,6 +1214,12 @@ typedef enum {
ZSTD_d_ignoreChecksum = 1
} ZSTD_forceIgnoreChecksum_e;

typedef enum {
/* Note: this enum controls ZSTD_d_refMultipleDDicts */
ZSTD_rmd_refSingleDDict = 0,
ZSTD_rmd_refMultipleDDicts = 1
} ZSTD_refMultipleDDicts_e;

typedef enum {
/* Note: this enum and the behavior it controls are effectively internal
* implementation details of the compressor. They are expected to continue
Expand Down Expand Up @@ -2000,6 +2015,30 @@ ZSTDLIB_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param
*/
#define ZSTD_d_forceIgnoreChecksum ZSTD_d_experimentalParam3

/* ZSTD_d_refMultipleDDicts
* Experimental parameter.
* Default is 0 == disabled. Set to 1 to enable
*
* If enabled and dctx is allocated on the heap, then additional memory will be allocated
* to store references to multiple ZSTD_DDict. That is, multiple calls of ZSTD_refDDict()
* using a given ZSTD_DCtx, rather than overwriting the previous DDict reference, will instead
* store all references. At decompression time, the appropriate dictID is selected
* from the set of DDicts based on the dictID in the frame.
*
* Usage is simply calling ZSTD_refDDict() on multiple dict buffers.
*
* Param has values of byte ZSTD_refMultipleDDicts_e
*
* WARNING: Enabling this parameter and calling ZSTD_DCtx_refDDict(), will trigger memory
* allocation for the hash table. ZSTD_freeDCtx() also frees this memory.
* Memory is allocated as per ZSTD_DCtx::customMem.
*
* Although this function allocates memory for the table, the user is still responsible for
* memory management of the underlying ZSTD_DDict* themselves.
*/
#define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4


/*! ZSTD_DCtx_setFormat() :
* Instruct the decoder context about what kind of data to decode next.
* This instruction is mandatory to decode data without a fully-formed header,
Expand Down
Loading