Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Block-Level Sequence Producer API #3333

Merged
merged 60 commits into from
Dec 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
e43bf48
First building commit with sample matchfinder
embg Nov 10, 2022
625bf62
Set up ZSTD_externalMatchCtx struct
embg Nov 21, 2022
a3c5c2b
move seqBuffer to ZSTD_Sequence*
embg Nov 21, 2022
201588a
support non-contiguous dictionary
embg Nov 21, 2022
5a01759
clean up parens
embg Nov 21, 2022
c6c5b5b
add clearExternalMatchfinder, handle allocation errors
embg Nov 21, 2022
462e967
Add useExternalMatchfinder cParam
embg Nov 23, 2022
a3a5f3d
validate useExternalMatchfinder cParam
embg Nov 23, 2022
8562b10
Disable LDM + external matchfinder
embg Nov 23, 2022
6488d8d
Check for static CCtx
embg Nov 23, 2022
50df826
Validate mState and mStateDestructor
embg Nov 23, 2022
f03da10
Improve LDM check to cover both branches
embg Nov 24, 2022
d955d16
Error API with optional fallback
embg Nov 24, 2022
a5241db
handle RLE properly for external matchfinder
embg Nov 27, 2022
4defd0e
nit
embg Nov 27, 2022
f9fc1b1
Move to a CDict-like model for resource ownership
embg Nov 29, 2022
c126ecd
Add hidden useExternalMatchfinder bool to CCtx_params_s
embg Nov 30, 2022
aeb060d
Eliminate malloc, move to cwksp allocation
embg Nov 30, 2022
349c36e
Handle CCtx reset properly
embg Nov 30, 2022
80ec8aa
Ensure seqStore has enough space for external sequences
embg Nov 30, 2022
bf69b2c
fix capitalization
embg Dec 1, 2022
5c4891f
Add DEBUGLOG statements
embg Dec 1, 2022
72e50fa
Add compressionLevel param to matchfinder API
embg Dec 1, 2022
56e6633
fix c99 issues and add a param combination error code
embg Dec 1, 2022
da90ae8
nits
embg Dec 1, 2022
4d12960
Test external matchfinder API
embg Dec 5, 2022
a968472
C90 compat for simpleExternalMatchFinder
embg Dec 6, 2022
80765c6
Fix some @nocommits and an ASAN bug
embg Dec 6, 2022
15d909a
nit
embg Dec 7, 2022
901ab34
nit
embg Dec 7, 2022
da18842
nits
embg Dec 7, 2022
ec9b6b0
forward declare copySequencesToSeqStore functions in zstd_compress_in…
embg Dec 7, 2022
b8b5547
nit
embg Dec 7, 2022
38182da
nit
embg Dec 7, 2022
d9534ad
nits
embg Dec 7, 2022
aabfc34
Update copyright headers
embg Dec 8, 2022
f866d0e
Fix CMake zstreamtest build
embg Dec 8, 2022
3a0efdf
Fix copyright headers (again)
embg Dec 8, 2022
b1e2422
typo
embg Dec 8, 2022
b6fe61d
Add externalMatchfinder demo program to make contrib
embg Dec 19, 2022
3de6d5c
Reduce memory consumption for small blockSize
embg Dec 19, 2022
fc37297
ZSTD_postProcessExternalMatchFinderResult nits
embg Dec 19, 2022
029ba01
test sum(matchlen) + sum(litlen) == srcSize in debug builds
embg Dec 19, 2022
c0be839
refExternalMatchFinder -> registerExternalMatchFinder
embg Dec 19, 2022
f4685d2
C90 nit
embg Dec 19, 2022
153be31
zstreamtest nits
embg Dec 19, 2022
beab112
contrib nits
embg Dec 19, 2022
b6d48a2
contrib nits
embg Dec 19, 2022
a40cea4
allow block splitter + external matchfinder, refactor
embg Dec 21, 2022
31260d8
add windowSize param
embg Dec 21, 2022
f89cedc
add contrib/externalMatchfinder/README.md
embg Dec 21, 2022
628755c
docs
embg Dec 21, 2022
b12c8e4
go back to old RLE heuristic because of the first block issue
embg Dec 21, 2022
12b7ca1
fix initializer element is not a constant expression
embg Dec 21, 2022
4b15448
ref contrib from zstd.h
embg Dec 21, 2022
c2574e7
extremely pedantic compiler warning fix, meson fix, typo fix
embg Dec 22, 2022
8052b10
Additional docs on API limitations
embg Dec 28, 2022
1e60543
minor nits
embg Dec 28, 2022
49cd2e8
Refactor maxNbSeq calculation into a helper function
embg Dec 28, 2022
241f2a7
Fix copyright
embg Dec 28, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ contrib: lib
$(MAKE) -C contrib/seekable_format/examples all
$(MAKE) -C contrib/seekable_format/tests test
$(MAKE) -C contrib/largeNbDicts all
$(MAKE) -C contrib/externalMatchfinder all
cd build/single_file_libs/ ; ./build_decoder_test.sh
cd build/single_file_libs/ ; ./build_library_test.sh

Expand All @@ -142,6 +143,7 @@ clean:
$(Q)$(MAKE) -C contrib/seekable_format/examples $@ > $(VOID)
$(Q)$(MAKE) -C contrib/seekable_format/tests $@ > $(VOID)
$(Q)$(MAKE) -C contrib/largeNbDicts $@ > $(VOID)
$(Q)$(MAKE) -C contrib/externalMatchfinder $@ > $(VOID)
$(Q)$(RM) zstd$(EXT) zstdmt$(EXT) tmp*
$(Q)$(RM) -r lz4
@echo Cleaning completed
Expand Down
2 changes: 1 addition & 1 deletion build/cmake/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ add_test(NAME fuzzer COMMAND fuzzer ${ZSTD_FUZZER_FLAGS})
#
# zstreamtest
#
add_executable(zstreamtest ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${TESTS_DIR}/seqgen.c ${TESTS_DIR}/zstreamtest.c)
add_executable(zstreamtest ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${TESTS_DIR}/seqgen.c ${TESTS_DIR}/zstreamtest.c ${TESTS_DIR}/external_matchfinder.c)
if (NOT MSVC)
target_compile_options(zstreamtest PRIVATE "-Wno-deprecated-declarations")
endif()
Expand Down
6 changes: 4 additions & 2 deletions build/meson/tests/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,10 @@ fuzzer = executable('fuzzer',
dependencies: [ testcommon_dep, thread_dep ],
install: false)

zstreamtest_sources = [join_paths(zstd_rootdir, 'tests/seqgen.c'),
join_paths(zstd_rootdir, 'tests/zstreamtest.c')]
zstreamtest_sources = [
join_paths(zstd_rootdir, 'tests/seqgen.c'),
join_paths(zstd_rootdir, 'tests/zstreamtest.c'),
join_paths(zstd_rootdir, 'tests/external_matchfinder.c')]
zstreamtest = executable('zstreamtest',
zstreamtest_sources,
include_directories: test_includes,
Expand Down
2 changes: 2 additions & 0 deletions contrib/externalMatchfinder/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# build artifacts
externalMatchfinder
40 changes: 40 additions & 0 deletions contrib/externalMatchfinder/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# ################################################################
# Copyright (c) Yann Collet, Meta Platforms, Inc.
# All rights reserved.
#
# This source code is licensed under both the BSD-style license (found in the
# LICENSE file in the root directory of this source tree) and the GPLv2 (found
# in the COPYING file in the root directory of this source tree).
# ################################################################

# Locations of the zstd programs and library sources, relative to this directory.
PROGDIR = ../../programs
LIBDIR = ../../lib

# Static library the demo links against.
LIBZSTD = $(LIBDIR)/libzstd.a

# matchfinder.c includes internal headers, so the compress/ and common/
# directories are on the include path in addition to the public lib/ directory.
CPPFLAGS+= -I$(LIBDIR) -I$(LIBDIR)/compress -I$(LIBDIR)/common

CFLAGS ?= -O3
CFLAGS += -std=gnu99
DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
            -Wstrict-aliasing=1 -Wswitch-enum \
            -Wstrict-prototypes -Wundef -Wpointer-arith \
            -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
            -Wredundant-decls
CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS)

# These targets do not produce a file of the same name; declaring them phony
# keeps them working even if a file called "all", "default" or "clean" exists.
.PHONY: default all clean

default: externalMatchfinder

all: externalMatchfinder

externalMatchfinder: matchfinder.c main.c $(LIBZSTD)
	$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@

# The library is phony so that its own Makefile always decides whether a
# rebuild is needed.
.PHONY: $(LIBZSTD)
$(LIBZSTD):
	$(MAKE) -C $(LIBDIR) libzstd.a CFLAGS="$(CFLAGS)"

clean:
	$(RM) *.o
	$(MAKE) -C $(LIBDIR) clean > /dev/null
	$(RM) externalMatchfinder
14 changes: 14 additions & 0 deletions contrib/externalMatchfinder/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
externalMatchfinder
=====================

`externalMatchfinder` is a test tool for the external matchfinder API.
It demonstrates how to use the API to perform a simple round-trip test.

A sample matchfinder is provided in matchfinder.c, but the user can swap
this out with a different one if desired. The sample matchfinder implements
LZ compression with a 1024-entry hashtable. Dictionary compression is not currently supported.

Command line :
```
externalMatchfinder filename
```
107 changes: 107 additions & 0 deletions contrib/externalMatchfinder/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* Copyright (c) Yann Collet, Meta Platforms, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#define ZSTD_STATIC_LINKING_ONLY
#include "zstd.h"
#include "zstd_errors.h"
#include "matchfinder.h" // simpleExternalMatchFinder

/* CHECK(res): if `res` is a zstd error code, print the error name and make
 * the enclosing function return 1; otherwise do nothing.
 * The argument is evaluated exactly once, so it is safe to pass a call
 * expression directly. */
#define CHECK(res)                                              \
do {                                                            \
    size_t const check_res_ = (res);                            \
    if (ZSTD_isError(check_res_)) {                             \
        printf("ERROR: %s\n", ZSTD_getErrorName(check_res_));   \
        return 1;                                               \
    }                                                           \
} while (0)

/* Round-trip demo for the external matchfinder API.
 *
 * Reads the file named by argv[1], compresses it with a compression context
 * that has simpleExternalMatchFinder registered, decompresses the result and
 * compares it with the original input.
 *
 * Returns 0 when the round trip succeeds and 1 on any failure. Failure paths
 * return immediately without releasing resources; the process is about to
 * terminate at that point. */
int main(int argc, char *argv[]) {
    if (argc != 2) {
        printf("Usage: externalMatchfinder <file>\n");
        return 1;
    }

    ZSTD_CCtx* const zc = ZSTD_createCCtx();
    if (zc == NULL) {
        printf("ERROR: could not create a compression context\n");
        return 1;
    }

    /* Opaque state handed to the matchfinder; the sample matchfinder ignores it. */
    int simpleExternalMatchState = 0xdeadbeef;

    // Here is the crucial bit of code!
    ZSTD_registerExternalMatchFinder(
        zc,
        &simpleExternalMatchState,
        simpleExternalMatchFinder
    );

    {
        size_t const res = ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, 1);
        CHECK(res);
    }

    /* Load the whole input file into memory. */
    FILE* const f = fopen(argv[1], "rb");
    if (f == NULL) {
        printf("ERROR: could not open %s\n", argv[1]);
        return 1;
    }
    if (fseek(f, 0, SEEK_END) != 0) {
        printf("ERROR: could not determine the size of %s\n", argv[1]);
        return 1;
    }
    /* ftell() reports failure as -1, which must not be converted to size_t. */
    long const fileSize = ftell(f);
    if (fileSize < 0) {
        printf("ERROR: could not determine the size of %s\n", argv[1]);
        return 1;
    }
    size_t const srcSize = (size_t)fileSize;
    if (fseek(f, 0, SEEK_SET) != 0) {
        printf("ERROR: could not rewind %s\n", argv[1]);
        return 1;
    }

    char* const src = malloc(srcSize + 1);
    if (src == NULL) {
        printf("ERROR: could not allocate the input buffer\n");
        return 1;
    }
    /* A zero-sized fread() returns 0, so only treat that as an error when
     * there is actually something to read. */
    if (srcSize > 0 && fread(src, srcSize, 1, f) != 1) {
        printf("ERROR: could not read %s\n", argv[1]);
        return 1;
    }
    if (fclose(f) != 0) {
        printf("ERROR: could not close %s\n", argv[1]);
        return 1;
    }

    size_t const dstSize = ZSTD_compressBound(srcSize);
    char* const dst = malloc(dstSize);
    if (dst == NULL) {
        printf("ERROR: could not allocate the compression buffer\n");
        return 1;
    }

    size_t const cSize = ZSTD_compress2(zc, dst, dstSize, src, srcSize);
    CHECK(cSize);

    /* +1 so that an empty input does not request malloc(0), which is allowed
     * to return NULL. */
    char* const val = malloc(srcSize + 1);
    if (val == NULL) {
        printf("ERROR: could not allocate the validation buffer\n");
        return 1;
    }

    {
        size_t const res = ZSTD_decompress(val, srcSize, dst, cSize);
        CHECK(res);
        /* Comparing buffers is only meaningful if all srcSize bytes were regenerated. */
        if (res != srcSize) {
            printf("ERROR: decompressed size doesn't match the original size!\n");
            return 1;
        }
    }

    if (memcmp(src, val, srcSize) == 0) {
        printf("Compression and decompression were successful!\n");
        printf("Original size: %zu\n", srcSize);
        printf("Compressed size: %zu\n", cSize);
    } else {
        printf("ERROR: input and validation buffers don't match!\n");
        for (size_t i = 0; i < srcSize; i++) {
            if (src[i] != val[i]) {
                printf("First bad index: %zu\n", i);
                break;
            }
        }
        return 1;
    }

    ZSTD_freeCCtx(zc);
    free(src);
    free(dst);
    free(val);
    return 0;
}
80 changes: 80 additions & 0 deletions contrib/externalMatchfinder/matchfinder.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright (c) Yann Collet, Meta Platforms, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/

#include "zstd_compress_internal.h"
#include "matchfinder.h"

/* Number of entries in the matchfinder's hash table. */
#define HSIZE 1024
/* Hash width passed to ZSTD_hashPtr(); presumably log2(HSIZE), since 1 << 10 == 1024. */
static U32 const HLOG = 10;
/* Length argument passed to ZSTD_hashPtr(); also bounds how close to the end
 * of the input the search loop may run. */
static U32 const MLS = 4;
/* Sentinel stored in hash table slots that do not hold a position yet. */
static U32 const BADIDX = 0xffffffff;

/* Sample external matchfinder.
 *
 * Scans `src` from start to end with a single hash table that remembers, per
 * hash value, the most recent position seen. Whenever the remembered position
 * yields a match of at least ZSTD_MINMATCH_MIN bytes whose offset fits in
 * `windowSize`, a sequence (offset, literal length, match length) is appended
 * to `outSeqs`. A final sequence with offset 0 and match length 0 carries the
 * remaining literals.
 *
 * externalMatchState, dict, dictSize, outSeqsCapacity and compressionLevel are
 * ignored.
 * NOTE(review): because outSeqsCapacity is ignored, writes to outSeqs are not
 * bounds-checked; presumably the caller sizes the buffer for the worst case —
 * confirm against the API documentation.
 *
 * Returns the number of sequences written to `outSeqs`. */
size_t simpleExternalMatchFinder(
  void* externalMatchState,
  ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
  const void* src, size_t srcSize,
  const void* dict, size_t dictSize,
  int compressionLevel,
  size_t windowSize
) {
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* const iend = istart + srcSize;
    const BYTE* ip = istart;        /* current scan position */
    const BYTE* anchor = istart;    /* start of the literals not yet emitted */
    size_t seqCount = 0;
    U32 hashTable[HSIZE];

    (void)externalMatchState;
    (void)dict;
    (void)dictSize;
    (void)outSeqsCapacity;
    (void)compressionLevel;

    /* Mark every slot as empty. */
    {   int i;
        for (i=0; i < HSIZE; i++) {
            hashTable[i] = BADIDX;
    }   }

    /* Compare the remaining length instead of forming `ip + MLS`: for inputs
     * shorter than MLS that pointer would lie beyond one-past-the-end of the
     * buffer, which is undefined behavior. The condition is otherwise
     * equivalent to `ip + MLS < iend`. */
    while ((size_t)(iend - ip) > MLS) {
        size_t const hash = ZSTD_hashPtr(ip, HLOG, MLS);
        U32 const matchIndex = hashTable[hash];
        /* Record the current position before testing the candidate, so the
         * slot always holds the most recent position for this hash. */
        hashTable[hash] = (U32)(ip - istart);

        if (matchIndex != BADIDX) {
            const BYTE* const match = istart + matchIndex;
            U32 const matchLen = (U32)ZSTD_count(ip, match, iend);
            if (matchLen >= ZSTD_MINMATCH_MIN) {
                U32 const litLen = (U32)(ip - anchor);
                U32 const offset = (U32)(ip - match);
                ZSTD_Sequence const seq = {
                    offset, litLen, matchLen, 0
                };

                /* Note: it's crucial to stay within the window size! */
                if (offset <= windowSize) {
                    outSeqs[seqCount++] = seq;
                    ip += matchLen;
                    anchor = ip;
                    continue;
                }
            }
        }

        ip++;
    }

    /* Trailing literals go into a last sequence with no match. */
    {   ZSTD_Sequence const finalSeq = {
            0, (U32)(iend - anchor), 0, 0
        };
        outSeqs[seqCount++] = finalSeq;
    }

    return seqCount;
}
26 changes: 26 additions & 0 deletions contrib/externalMatchfinder/matchfinder.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/*
* Copyright (c) Yann Collet, Meta Platforms, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/

#ifndef MATCHFINDER_H
#define MATCHFINDER_H

#define ZSTD_STATIC_LINKING_ONLY
#include "zstd.h"

/* Sample external matchfinder, implemented in matchfinder.c.
 *
 * Parses the `srcSize` bytes at `src` into sequences written to `outSeqs`,
 * only emitting matches whose offset is at most `windowSize`, and returns the
 * number of sequences written.
 *
 * The sample implementation ignores externalMatchState, outSeqsCapacity,
 * dict, dictSize and compressionLevel. */
size_t simpleExternalMatchFinder(
void* externalMatchState,
ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
const void* src, size_t srcSize,
const void* dict, size_t dictSize,
int compressionLevel,
size_t windowSize
);

#endif
2 changes: 2 additions & 0 deletions lib/common/error_private.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ const char* ERR_getErrorString(ERR_enum code)
case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification";
case PREFIX(parameter_unsupported): return "Unsupported parameter";
case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters";
case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
case PREFIX(init_missing): return "Context should be init first";
case PREFIX(memory_allocation): return "Allocation error : not enough memory";
Expand All @@ -51,6 +52,7 @@ const char* ERR_getErrorString(ERR_enum code)
case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong";
case PREFIX(srcBuffer_wrong): return "Source buffer is wrong";
case PREFIX(externalMatchFinder_failed): return "External matchfinder returned an error code";
case PREFIX(maxCode):
default: return notErrorCode;
}
Expand Down
Loading