Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'zstd-linus-v6.2' of https://github.com/terrelln/linux

Pull zstd updates from Nick Terrell:
"Update the kernel to upstream zstd v1.5.2 [0]. Specifically to the tag
v1.5.2-kernel [1] which includes several cherrypicked fixes for the
kernel on top of v1.5.2.

Excepting the MAINTAINERS change, all the changes in this pull can be
generated by:

git clone https://github.com/facebook/zstd
cd zstd/contrib/linux-kernel
git checkout v1.5.2-kernel
LINUX=/path/to/linux/repo make import

Additionally, this includes several minor typo fixes, which have all
been fixed upstream so they are maintained on the next import"

Link: https://github.com/facebook/zstd/releases/tag/v1.5.2 [0]
Link: https://github.com/facebook/zstd/tree/v1.5.2-kernel [1]
Link: https://lore.kernel.org/lkml/20221024202606.404049-1-nickrterrell@gmail.com/
Link: https://github.com/torvalds/linux/commit/637a642f5ca5e850186bb64ac75ebb0f124b458d

* tag 'zstd-linus-v6.2' of https://github.com/terrelln/linux:
zstd: import upstream v1.5.2
zstd: Move zstd-common module exports to zstd_common_module.c
lib: zstd: Fix comment typo
lib: zstd: fix repeated words in comments
MAINTAINERS: git://github -> https://github.com for terrelln
lib: zstd: clean up double word in comment.

+6991 -2626
+1 -1
MAINTAINERS
··· 23073 23073 M: Nick Terrell <terrelln@fb.com> 23074 23074 S: Maintained 23075 23075 B: https://github.com/facebook/zstd/issues 23076 - T: git git://github.com/terrelln/linux.git 23076 + T: git https://github.com/terrelln/linux.git 23077 23077 F: include/linux/zstd* 23078 23078 F: lib/zstd/ 23079 23079 F: lib/decompress_unzstd.c
+297 -178
include/linux/zstd_lib.h
··· 17 17 18 18 19 19 /* ===== ZSTDLIB_API : control library symbols visibility ===== */ 20 - #define ZSTDLIB_VISIBILITY 21 - #define ZSTDLIB_API ZSTDLIB_VISIBILITY 20 + #ifndef ZSTDLIB_VISIBLE 21 + # if (__GNUC__ >= 4) && !defined(__MINGW32__) 22 + # define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) 23 + # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) 24 + # else 25 + # define ZSTDLIB_VISIBLE 26 + # define ZSTDLIB_HIDDEN 27 + # endif 28 + #endif 29 + #define ZSTDLIB_API ZSTDLIB_VISIBLE 22 30 23 31 24 32 /* ***************************************************************************** ··· 64 56 65 57 /*------ Version ------*/ 66 58 #define ZSTD_VERSION_MAJOR 1 67 - #define ZSTD_VERSION_MINOR 4 68 - #define ZSTD_VERSION_RELEASE 10 59 + #define ZSTD_VERSION_MINOR 5 60 + #define ZSTD_VERSION_RELEASE 2 69 61 #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) 70 62 71 63 /*! ZSTD_versionNumber() : ··· 100 92 101 93 #define ZSTD_BLOCKSIZELOG_MAX 17 102 94 #define ZSTD_BLOCKSIZE_MAX (1<<ZSTD_BLOCKSIZELOG_MAX) 103 - 104 95 105 96 106 97 /* ************************************* ··· 158 151 * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ 159 152 ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); 160 153 161 - /*! ZSTD_findFrameCompressedSize() : 154 + /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ 162 155 * `src` should point to the start of a ZSTD frame or skippable frame. 
163 156 * `srcSize` must be >= first frame size 164 157 * @return : the compressed size of the first frame starting at `src`, ··· 172 165 ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ 173 166 ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ 174 167 ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ 175 - ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed */ 168 + ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ 176 169 ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ 170 + ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ 177 171 178 172 179 173 /* ************************************* ··· 227 219 const void* src, size_t srcSize); 228 220 229 221 230 - /* ************************************* 231 - * Advanced compression API 232 - ***************************************/ 222 + /* ******************************************* 223 + * Advanced compression API (Requires v1.4.0+) 224 + **********************************************/ 233 225 234 226 /* API design : 235 227 * Parameters are pushed one by one into an existing context, ··· 240 232 * 241 233 * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). 242 234 * 243 - * This API supercedes all other "advanced" API entry points in the experimental section. 235 + * This API supersedes all other "advanced" API entry points in the experimental section. 244 236 * In the future, we expect to remove from experimental API entry points which are redundant with this API. 245 237 */ 246 238 ··· 258 250 /* note : new strategies _might_ be added in the future. 
259 251 Only the order (from fast to strong) is guaranteed */ 260 252 } ZSTD_strategy; 261 - 262 253 263 254 typedef enum { 264 255 ··· 324 317 * The higher the value of selected strategy, the more complex it is, 325 318 * resulting in stronger and slower compression. 326 319 * Special: value 0 means "use default strategy". */ 327 - 328 320 /* LDM mode parameters */ 329 321 ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. 330 322 * This parameter is designed to improve compression ratio ··· 380 374 ZSTD_c_jobSize=401, /* Size of a compression job. This value is enforced only when nbWorkers >= 1. 381 375 * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads. 382 376 * 0 means default, which is dynamically determined based on compression parameters. 383 - * Job size must be a minimum of overlap size, or 1 MB, whichever is largest. 377 + * Job size must be a minimum of overlap size, or ZSTDMT_JOBSIZE_MIN (= 512 KB), whichever is largest. 384 378 * The minimum size is automatically and transparently enforced. */ 385 379 ZSTD_c_overlapLog=402, /* Control the overlap size, as a fraction of window size. 386 380 * The overlap size is an amount of data reloaded from previous job at the beginning of a new job. ··· 410 404 * ZSTD_c_stableOutBuffer 411 405 * ZSTD_c_blockDelimiters 412 406 * ZSTD_c_validateSequences 407 + * ZSTD_c_useBlockSplitter 408 + * ZSTD_c_useRowMatchFinder 413 409 * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. 414 410 * note : never ever use experimentalParam? names directly; 415 411 * also, the enums values themselves are unstable and can still change. 
··· 427 419 ZSTD_c_experimentalParam9=1006, 428 420 ZSTD_c_experimentalParam10=1007, 429 421 ZSTD_c_experimentalParam11=1008, 430 - ZSTD_c_experimentalParam12=1009 422 + ZSTD_c_experimentalParam12=1009, 423 + ZSTD_c_experimentalParam13=1010, 424 + ZSTD_c_experimentalParam14=1011, 425 + ZSTD_c_experimentalParam15=1012 431 426 } ZSTD_cParameter; 432 427 433 428 typedef struct { ··· 515 504 const void* src, size_t srcSize); 516 505 517 506 518 - /* ************************************* 519 - * Advanced decompression API 520 - ***************************************/ 507 + /* ********************************************* 508 + * Advanced decompression API (Requires v1.4.0+) 509 + ************************************************/ 521 510 522 511 /* The advanced API pushes parameters one by one into an existing DCtx context. 523 512 * Parameters are sticky, and remain valid for all following frames ··· 679 668 : note : multithreaded compression will block to flush as much output as possible. */ 680 669 } ZSTD_EndDirective; 681 670 682 - /*! ZSTD_compressStream2() : 671 + /*! ZSTD_compressStream2() : Requires v1.4.0+ 683 672 * Behaves about the same as ZSTD_compressStream, with additional control on end directive. 684 673 * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() 685 674 * - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode) ··· 725 714 726 715 727 716 /* ***************************************************************************** 728 - * This following is a legacy streaming API. 717 + * This following is a legacy streaming API, available since v1.0+ . 729 718 * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). 730 719 * It is redundant, but remains fully supported. 731 - * Advanced parameters and dictionary compression can only be used through the 732 - * new API. 
720 + * Streaming in combination with advanced parameters and dictionary compression 721 + * can only be used through the new API. 733 722 ******************************************************************************/ 734 723 735 724 /*! ··· 807 796 /*! ZSTD_compress_usingDict() : 808 797 * Compression at an explicit compression level using a Dictionary. 809 798 * A dictionary can be any arbitrary data segment (also called a prefix), 810 - * or a buffer with specified information (see dictBuilder/zdict.h). 799 + * or a buffer with specified information (see zdict.h). 811 800 * Note : This function loads the dictionary, resulting in significant startup delay. 812 801 * It's intended for a dictionary used only once. 813 802 * Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */ ··· 890 879 * Dictionary helper functions 891 880 *******************************/ 892 881 893 - /*! ZSTD_getDictID_fromDict() : 882 + /*! ZSTD_getDictID_fromDict() : Requires v1.4.0+ 894 883 * Provides the dictID stored within dictionary. 895 884 * if @return == 0, the dictionary is not conformant with Zstandard specification. 896 885 * It can still be loaded, but as a content-only dictionary. */ 897 886 ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize); 898 887 899 - /*! ZSTD_getDictID_fromDDict() : 888 + /*! ZSTD_getDictID_fromCDict() : Requires v1.5.0+ 889 + * Provides the dictID of the dictionary loaded into `cdict`. 890 + * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. 891 + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ 892 + ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict); 893 + 894 + /*! ZSTD_getDictID_fromDDict() : Requires v1.4.0+ 900 895 * Provides the dictID of the dictionary loaded into `ddict`. 901 896 * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. 
902 897 * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ 903 898 ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); 904 899 905 - /*! ZSTD_getDictID_fromFrame() : 900 + /*! ZSTD_getDictID_fromFrame() : Requires v1.4.0+ 906 901 * Provides the dictID required to decompressed the frame stored within `src`. 907 902 * If @return == 0, the dictID could not be decoded. 908 903 * This could for one of the following reasons : ··· 922 905 923 906 924 907 /* ***************************************************************************** 925 - * Advanced dictionary and prefix API 908 + * Advanced dictionary and prefix API (Requires v1.4.0+) 926 909 * 927 910 * This API allows dictionaries to be used with ZSTD_compress2(), 928 - * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and 911 + * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and 929 912 * only reset with the context is reset with ZSTD_reset_parameters or 930 913 * ZSTD_reset_session_and_parameters. Prefixes are single-use. 931 914 ******************************************************************************/ 932 915 933 916 934 - /*! ZSTD_CCtx_loadDictionary() : 917 + /*! ZSTD_CCtx_loadDictionary() : Requires v1.4.0+ 935 918 * Create an internal CDict from `dict` buffer. 936 919 * Decompression will have to use same dictionary. 937 920 * @result : 0, or an error code (which can be tested with ZSTD_isError()). ··· 950 933 * to precisely select how dictionary content must be interpreted. */ 951 934 ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); 952 935 953 - /*! ZSTD_CCtx_refCDict() : 936 + /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ 954 937 * Reference a prepared dictionary, to be used for all next compressed frames. 
955 938 * Note that compression parameters are enforced from within CDict, 956 939 * and supersede any compression parameter previously set within CCtx. ··· 964 947 * Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. */ 965 948 ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); 966 949 967 - /*! ZSTD_CCtx_refPrefix() : 950 + /*! ZSTD_CCtx_refPrefix() : Requires v1.4.0+ 968 951 * Reference a prefix (single-usage dictionary) for next compressed frame. 969 952 * A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end). 970 953 * Decompression will need same prefix to properly regenerate data. ··· 985 968 ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, 986 969 const void* prefix, size_t prefixSize); 987 970 988 - /*! ZSTD_DCtx_loadDictionary() : 971 + /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ 989 972 * Create an internal DDict from dict buffer, 990 973 * to be used to decompress next frames. 991 974 * The dictionary remains valid for all future frames, until explicitly invalidated. ··· 1002 985 */ 1003 986 ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); 1004 987 1005 - /*! ZSTD_DCtx_refDDict() : 988 + /*! ZSTD_DCtx_refDDict() : Requires v1.4.0+ 1006 989 * Reference a prepared dictionary, to be used to decompress next frames. 1007 990 * The dictionary remains active for decompression of future frames using same DCtx. 1008 991 * ··· 1020 1003 */ 1021 1004 ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); 1022 1005 1023 - /*! ZSTD_DCtx_refPrefix() : 1006 + /*! ZSTD_DCtx_refPrefix() : Requires v1.4.0+ 1024 1007 * Reference a prefix (single-usage dictionary) to decompress next frame. 1025 1008 * This is the reverse operation of ZSTD_CCtx_refPrefix(), 1026 1009 * and must use the same prefix as the one used during compression. 
··· 1041 1024 1042 1025 /* === Memory management === */ 1043 1026 1044 - /*! ZSTD_sizeof_*() : 1027 + /*! ZSTD_sizeof_*() : Requires v1.4.0+ 1045 1028 * These functions give the _current_ memory usage of selected object. 1046 1029 * Note that object memory usage can evolve (increase or decrease) over time. */ 1047 1030 ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx); ··· 1065 1048 1066 1049 #if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) 1067 1050 #define ZSTD_H_ZSTD_STATIC_LINKING_ONLY 1051 + 1052 + /* This can be overridden externally to hide static symbols. */ 1053 + #ifndef ZSTDLIB_STATIC_API 1054 + #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE 1055 + #endif 1056 + 1057 + /* Deprecation warnings : 1058 + * Should these warnings be a problem, it is generally possible to disable them, 1059 + * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. 1060 + * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. 1061 + */ 1062 + #ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS 1063 + # define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ 1064 + #else 1065 + # if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) 1066 + # define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) 1067 + # elif (__GNUC__ >= 3) 1068 + # define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) 1069 + # else 1070 + # pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") 1071 + # define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API 1072 + # endif 1073 + #endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ 1068 1074 1069 1075 /* ************************************************************************************** 1070 1076 * experimental API (static linking only) ··· 1150 1110 #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX 1151 1111 #define ZSTD_SRCSIZEHINT_MIN 0 1152 1112 #define 
ZSTD_SRCSIZEHINT_MAX INT_MAX 1153 - 1154 - /* internal */ 1155 - #define ZSTD_HASHLOG3_MAX 17 1156 1113 1157 1114 1158 1115 /* --- Advanced types --- */ ··· 1292 1255 ZSTD_lcm_uncompressed = 2 /*< Always emit uncompressed literals. */ 1293 1256 } ZSTD_literalCompressionMode_e; 1294 1257 1258 + typedef enum { 1259 + /* Note: This enum controls features which are conditionally beneficial. Zstd typically will make a final 1260 + * decision on whether or not to enable the feature (ZSTD_ps_auto), but setting the switch to ZSTD_ps_enable 1261 + * or ZSTD_ps_disable allow for a force enable/disable the feature. 1262 + */ 1263 + ZSTD_ps_auto = 0, /* Let the library automatically determine whether the feature shall be enabled */ 1264 + ZSTD_ps_enable = 1, /* Force-enable the feature */ 1265 + ZSTD_ps_disable = 2 /* Do not use the feature */ 1266 + } ZSTD_paramSwitch_e; 1295 1267 1296 1268 /* ************************************* 1297 1269 * Frame size functions ··· 1327 1281 * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to 1328 1282 * read each contained frame header. This is fast as most of the data is skipped, 1329 1283 * however it does mean that all frame data must be present and valid. */ 1330 - ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize); 1284 + ZSTDLIB_STATIC_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize); 1331 1285 1332 1286 /*! 
ZSTD_decompressBound() : 1333 1287 * `src` should point to the start of a series of ZSTD encoded and/or skippable frames ··· 1342 1296 * note 3 : when the decompressed size field isn't available, the upper-bound for that frame is calculated by: 1343 1297 * upper-bound = # blocks * min(128 KB, Window_Size) 1344 1298 */ 1345 - ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); 1299 + ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); 1346 1300 1347 1301 /*! ZSTD_frameHeaderSize() : 1348 1302 * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX. 1349 1303 * @return : size of the Frame Header, 1350 1304 * or an error code (if srcSize is too small) */ 1351 - ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); 1305 + ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); 1352 1306 1353 1307 typedef enum { 1354 1308 ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ ··· 1371 1325 * @return : number of sequences generated 1372 1326 */ 1373 1327 1374 - ZSTDLIB_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, 1328 + ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, 1375 1329 size_t outSeqsSize, const void* src, size_t srcSize); 1376 1330 1377 1331 /*! ZSTD_mergeBlockDelimiters() : 1378 1332 * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals 1379 - * by merging them into into the literals of the next sequence. 1333 + * by merging them into the literals of the next sequence. 1380 1334 * 1381 1335 * As such, the final generated result has no explicit representation of block boundaries, 1382 1336 * and the final last literals segment is not represented in the sequences. 
··· 1385 1339 * setting of ZSTD_c_blockDelimiters as ZSTD_sf_noBlockDelimiters 1386 1340 * @return : number of sequences left after merging 1387 1341 */ 1388 - ZSTDLIB_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); 1342 + ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); 1389 1343 1390 1344 /*! ZSTD_compressSequences() : 1391 1345 * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. ··· 1415 1369 * and cannot emit an RLE block that disagrees with the repcode history 1416 1370 * @return : final compressed size or a ZSTD error. 1417 1371 */ 1418 - ZSTDLIB_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, 1372 + ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, 1419 1373 const ZSTD_Sequence* inSeqs, size_t inSeqsSize, 1420 1374 const void* src, size_t srcSize); 1421 1375 ··· 1423 1377 /*! ZSTD_writeSkippableFrame() : 1424 1378 * Generates a zstd skippable frame containing data given by src, and writes it to dst buffer. 1425 1379 * 1426 - * Skippable frames begin with a a 4-byte magic number. There are 16 possible choices of magic number, 1380 + * Skippable frames begin with a 4-byte magic number. There are 16 possible choices of magic number, 1427 1381 * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15. 1428 1382 * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, so 1429 1383 * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. ··· 1433 1387 * 1434 1388 * @return : number of bytes written or a ZSTD error. 1435 1389 */ 1436 - ZSTDLIB_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, 1390 + ZSTDLIB_STATIC_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, 1437 1391 const void* src, size_t srcSize, unsigned magicVariant); 1392 + 1393 + /*! 
ZSTD_readSkippableFrame() : 1394 + * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. 1395 + * 1396 + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, 1397 + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested 1398 + * in the magicVariant. 1399 + * 1400 + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. 1401 + * 1402 + * @return : number of bytes written or a ZSTD error. 1403 + */ 1404 + ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, 1405 + const void* src, size_t srcSize); 1406 + 1407 + /*! ZSTD_isSkippableFrame() : 1408 + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. 1409 + */ 1410 + ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); 1411 + 1438 1412 1439 1413 1440 1414 /* ************************************* ··· 1484 1418 * Note 2 : only single-threaded compression is supported. 1485 1419 * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. 1486 1420 */ 1487 - ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel); 1488 - ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); 1489 - ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); 1490 - ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void); 1421 + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); 1422 + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); 1423 + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); 1424 + ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); 1491 1425 1492 1426 /*! 
ZSTD_estimateCStreamSize() : 1493 1427 * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. ··· 1502 1436 * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), 1503 1437 * an internal ?Dict will be created, which additional size is not estimated here. 1504 1438 * In this case, get total size by adding ZSTD_estimate?DictSize */ 1505 - ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel); 1506 - ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); 1507 - ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); 1508 - ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize); 1509 - ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); 1439 + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); 1440 + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); 1441 + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); 1442 + ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); 1443 + ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); 1510 1444 1511 1445 /*! ZSTD_estimate?DictSize() : 1512 1446 * ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict(). 1513 1447 * ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced(). 1514 1448 * Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller. 
1515 1449 */ 1516 - ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel); 1517 - ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod); 1518 - ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod); 1450 + ZSTDLIB_STATIC_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel); 1451 + ZSTDLIB_STATIC_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod); 1452 + ZSTDLIB_STATIC_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod); 1519 1453 1520 1454 /*! ZSTD_initStatic*() : 1521 1455 * Initialize an object using a pre-allocated fixed-size buffer. ··· 1538 1472 * Limitation 2 : static cctx currently not compatible with multi-threading. 1539 1473 * Limitation 3 : static dctx is incompatible with legacy support. 
1540 1474 */ 1541 - ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize); 1542 - ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /*< same as ZSTD_initStaticCCtx() */ 1475 + ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize); 1476 + ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /*< same as ZSTD_initStaticCCtx() */ 1543 1477 1544 - ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize); 1545 - ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /*< same as ZSTD_initStaticDCtx() */ 1478 + ZSTDLIB_STATIC_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize); 1479 + ZSTDLIB_STATIC_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /*< same as ZSTD_initStaticDCtx() */ 1546 1480 1547 - ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict( 1481 + ZSTDLIB_STATIC_API const ZSTD_CDict* ZSTD_initStaticCDict( 1548 1482 void* workspace, size_t workspaceSize, 1549 1483 const void* dict, size_t dictSize, 1550 1484 ZSTD_dictLoadMethod_e dictLoadMethod, 1551 1485 ZSTD_dictContentType_e dictContentType, 1552 1486 ZSTD_compressionParameters cParams); 1553 1487 1554 - ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict( 1488 + ZSTDLIB_STATIC_API const ZSTD_DDict* ZSTD_initStaticDDict( 1555 1489 void* workspace, size_t workspaceSize, 1556 1490 const void* dict, size_t dictSize, 1557 1491 ZSTD_dictLoadMethod_e dictLoadMethod, ··· 1570 1504 __attribute__((__unused__)) 1571 1505 ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /*< this constant defers to stdlib's functions */ 1572 1506 1573 - ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); 1574 - ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); 1575 - ZSTDLIB_API ZSTD_DCtx* 
ZSTD_createDCtx_advanced(ZSTD_customMem customMem); 1576 - ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem); 1507 + ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); 1508 + ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); 1509 + ZSTDLIB_STATIC_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem); 1510 + ZSTDLIB_STATIC_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem); 1577 1511 1578 - ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, 1512 + ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize, 1579 1513 ZSTD_dictLoadMethod_e dictLoadMethod, 1580 1514 ZSTD_dictContentType_e dictContentType, 1581 1515 ZSTD_compressionParameters cParams, 1582 1516 ZSTD_customMem customMem); 1583 1517 1584 - /* ! Thread pool : 1585 - * These prototypes make it possible to share a thread pool among multiple compression contexts. 1586 - * This can limit resources for applications with multiple threads where each one uses 1587 - * a threaded compression mode (via ZSTD_c_nbWorkers parameter). 1588 - * ZSTD_createThreadPool creates a new thread pool with a given number of threads. 1589 - * Note that the lifetime of such pool must exist while being used. 1590 - * ZSTD_CCtx_refThreadPool assigns a thread pool to a context (use NULL argument value 1591 - * to use an internal thread pool). 1592 - * ZSTD_freeThreadPool frees a thread pool, accepts NULL pointer. 1518 + /*! Thread pool : 1519 + * These prototypes make it possible to share a thread pool among multiple compression contexts. 1520 + * This can limit resources for applications with multiple threads where each one uses 1521 + * a threaded compression mode (via ZSTD_c_nbWorkers parameter). 1522 + * ZSTD_createThreadPool creates a new thread pool with a given number of threads. 
1523 + * Note that the lifetime of such pool must exist while being used. 1524 + * ZSTD_CCtx_refThreadPool assigns a thread pool to a context (use NULL argument value 1525 + * to use an internal thread pool). 1526 + * ZSTD_freeThreadPool frees a thread pool, accepts NULL pointer. 1593 1527 */ 1594 1528 typedef struct POOL_ctx_s ZSTD_threadPool; 1595 - ZSTDLIB_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThreads); 1596 - ZSTDLIB_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool); /* accept NULL pointer */ 1597 - ZSTDLIB_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool); 1529 + ZSTDLIB_STATIC_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThreads); 1530 + ZSTDLIB_STATIC_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool); /* accept NULL pointer */ 1531 + ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool); 1598 1532 1599 1533 1600 1534 /* 1601 1535 * This API is temporary and is expected to change or disappear in the future! 1602 1536 */ 1603 - ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( 1537 + ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_advanced2( 1604 1538 const void* dict, size_t dictSize, 1605 1539 ZSTD_dictLoadMethod_e dictLoadMethod, 1606 1540 ZSTD_dictContentType_e dictContentType, 1607 1541 const ZSTD_CCtx_params* cctxParams, 1608 1542 ZSTD_customMem customMem); 1609 1543 1610 - ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced( 1544 + ZSTDLIB_STATIC_API ZSTD_DDict* ZSTD_createDDict_advanced( 1611 1545 const void* dict, size_t dictSize, 1612 1546 ZSTD_dictLoadMethod_e dictLoadMethod, 1613 1547 ZSTD_dictContentType_e dictContentType, ··· 1624 1558 * As a consequence, `dictBuffer` **must** outlive CDict, 1625 1559 * and its content must remain unmodified throughout the lifetime of CDict. 
1626 1560 * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */ 1627 - ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel); 1628 - 1629 - /*! ZSTD_getDictID_fromCDict() : 1630 - * Provides the dictID of the dictionary loaded into `cdict`. 1631 - * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. 1632 - * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ 1633 - ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict); 1561 + ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel); 1634 1562 1635 1563 /*! ZSTD_getCParams() : 1636 1564 * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize. 1637 1565 * `estimatedSrcSize` value is optional, select 0 if not known */ 1638 - ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); 1566 + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); 1639 1567 1640 1568 /*! ZSTD_getParams() : 1641 1569 * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`. 1642 1570 * All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */ 1643 - ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); 1571 + ZSTDLIB_STATIC_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize); 1644 1572 1645 1573 /*! ZSTD_checkCParams() : 1646 1574 * Ensure param values remain within authorized range. 
1647 1575 * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */ 1648 - ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); 1576 + ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); 1649 1577 1650 1578 /*! ZSTD_adjustCParams() : 1651 1579 * optimize params for a given `srcSize` and `dictSize`. ··· 1647 1587 * `dictSize` must be `0` when there is no dictionary. 1648 1588 * cPar can be invalid : all parameters will be clamped within valid range in the @return struct. 1649 1589 * This function never fails (wide contract) */ 1650 - ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); 1590 + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); 1651 1591 1652 1592 /*! ZSTD_compress_advanced() : 1653 1593 * Note : this function is now DEPRECATED. 1654 1594 * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. 1655 - * This prototype will be marked as deprecated and generate compilation warning on reaching v1.5.x */ 1656 - ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, 1595 + * This prototype will generate compilation warnings. */ 1596 + ZSTD_DEPRECATED("use ZSTD_compress2") 1597 + size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, 1657 1598 void* dst, size_t dstCapacity, 1658 1599 const void* src, size_t srcSize, 1659 1600 const void* dict,size_t dictSize, 1660 1601 ZSTD_parameters params); 1661 1602 1662 1603 /*! ZSTD_compress_usingCDict_advanced() : 1663 - * Note : this function is now REDUNDANT. 1604 + * Note : this function is now DEPRECATED. 1664 1605 * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. 
1665 - * This prototype will be marked as deprecated and generate compilation warning in some future version */ 1666 - ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, 1606 + * This prototype will generate compilation warnings. */ 1607 + ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") 1608 + size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, 1667 1609 void* dst, size_t dstCapacity, 1668 1610 const void* src, size_t srcSize, 1669 1611 const ZSTD_CDict* cdict, ··· 1675 1613 /*! ZSTD_CCtx_loadDictionary_byReference() : 1676 1614 * Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx. 1677 1615 * It saves some memory, but also requires that `dict` outlives its usage within `cctx` */ 1678 - ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); 1616 + ZSTDLIB_STATIC_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); 1679 1617 1680 1618 /*! ZSTD_CCtx_loadDictionary_advanced() : 1681 1619 * Same as ZSTD_CCtx_loadDictionary(), but gives finer control over 1682 1620 * how to load the dictionary (by copy ? by reference ?) 1683 1621 * and how to interpret it (automatic ? force raw mode ? full mode only ?) */ 1684 - ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); 1622 + ZSTDLIB_STATIC_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); 1685 1623 1686 1624 /*! ZSTD_CCtx_refPrefix_advanced() : 1687 1625 * Same as ZSTD_CCtx_refPrefix(), but gives finer control over 1688 1626 * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) 
*/ 1689 - ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); 1627 + ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); 1690 1628 1691 1629 /* === experimental parameters === */ 1692 1630 /* these parameters can be used with ZSTD_setParameter() ··· 1725 1663 * See the comments on that enum for an explanation of the feature. */ 1726 1664 #define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 1727 1665 1728 - /* Controls how the literals are compressed (default is auto). 1729 - * The value must be of type ZSTD_literalCompressionMode_e. 1730 - * See ZSTD_literalCompressionMode_t enum definition for details. 1666 + /* Controlled with ZSTD_paramSwitch_e enum. 1667 + * Default is ZSTD_ps_auto. 1668 + * Set to ZSTD_ps_disable to never compress literals. 1669 + * Set to ZSTD_ps_enable to always compress literals. (Note: uncompressed literals 1670 + * may still be emitted if huffman is not beneficial to use.) 1671 + * 1672 + * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use 1673 + * literals compression based on the compression parameters - specifically, 1674 + * negative compression levels do not use literal compression. 1731 1675 */ 1732 1676 #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 1733 1677 ··· 1796 1728 * 1797 1729 * Note that this means that the CDict tables can no longer be copied into the 1798 1730 * CCtx, so the dict attachment mode ZSTD_dictForceCopy will no longer be 1799 - * useable. The dictionary can only be attached or reloaded. 1731 + * usable. The dictionary can only be attached or reloaded. 1800 1732 * 1801 1733 * In general, you should expect compression to be faster--sometimes very much 1802 1734 * so--and CDict creation to be slightly slower. 
Eventually, we will probably ··· 1885 1817 */ 1886 1818 #define ZSTD_c_validateSequences ZSTD_c_experimentalParam12 1887 1819 1820 + /* ZSTD_c_useBlockSplitter 1821 + * Controlled with ZSTD_paramSwitch_e enum. 1822 + * Default is ZSTD_ps_auto. 1823 + * Set to ZSTD_ps_disable to never use block splitter. 1824 + * Set to ZSTD_ps_enable to always use block splitter. 1825 + * 1826 + * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use 1827 + * block splitting based on the compression parameters. 1828 + */ 1829 + #define ZSTD_c_useBlockSplitter ZSTD_c_experimentalParam13 1830 + 1831 + /* ZSTD_c_useRowMatchFinder 1832 + * Controlled with ZSTD_paramSwitch_e enum. 1833 + * Default is ZSTD_ps_auto. 1834 + * Set to ZSTD_ps_disable to never use row-based matchfinder. 1835 + * Set to ZSTD_ps_enable to force usage of row-based matchfinder. 1836 + * 1837 + * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use 1838 + * the row-based matchfinder based on support for SIMD instructions and the window log. 1839 + * Note that this only pertains to compression strategies: greedy, lazy, and lazy2 1840 + */ 1841 + #define ZSTD_c_useRowMatchFinder ZSTD_c_experimentalParam14 1842 + 1843 + /* ZSTD_c_deterministicRefPrefix 1844 + * Default is 0 == disabled. Set to 1 to enable. 1845 + * 1846 + * Zstd produces different results for prefix compression when the prefix is 1847 + * directly adjacent to the data about to be compressed vs. when it isn't. 1848 + * This is because zstd detects that the two buffers are contiguous and it can 1849 + * use a more efficient match finding algorithm. However, this produces different 1850 + * results than when the two buffers are non-contiguous. This flag forces zstd 1851 + * to always load the prefix in non-contiguous mode, even if it happens to be 1852 + * adjacent to the data, to guarantee determinism. 
1853 + * 1854 + * If you really care about determinism when using a dictionary or prefix, 1855 + * like when doing delta compression, you should select this option. It comes 1856 + * at a speed penalty of about ~2.5% if the dictionary and data happened to be 1857 + * contiguous, and is free if they weren't contiguous. We don't expect that 1858 + * intentionally making the dictionary and data contiguous will be worth the 1859 + * cost to memcpy() the data. 1860 + */ 1861 + #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 1862 + 1888 1863 /*! ZSTD_CCtx_getParameter() : 1889 1864 * Get the requested compression parameter value, selected by enum ZSTD_cParameter, 1890 1865 * and store it into int* value. 1891 1866 * @return : 0, or an error code (which can be tested with ZSTD_isError()). 1892 1867 */ 1893 - ZSTDLIB_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value); 1868 + ZSTDLIB_STATIC_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value); 1894 1869 1895 1870 1896 1871 /*! ZSTD_CCtx_params : ··· 1953 1842 * This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams() 1954 1843 * for static allocation of CCtx for single-threaded compression. 1955 1844 */ 1956 - ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); 1957 - ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); /* accept NULL pointer */ 1845 + ZSTDLIB_STATIC_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); 1846 + ZSTDLIB_STATIC_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); /* accept NULL pointer */ 1958 1847 1959 1848 /*! ZSTD_CCtxParams_reset() : 1960 1849 * Reset params to default values. 1961 1850 */ 1962 - ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); 1851 + ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); 1963 1852 1964 1853 /*! 
ZSTD_CCtxParams_init() : 1965 1854 * Initializes the compression parameters of cctxParams according to 1966 1855 * compression level. All other parameters are reset to their default values. 1967 1856 */ 1968 - ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel); 1857 + ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel); 1969 1858 1970 1859 /*! ZSTD_CCtxParams_init_advanced() : 1971 1860 * Initializes the compression and frame parameters of cctxParams according to 1972 1861 * params. All other parameters are reset to their default values. 1973 1862 */ 1974 - ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params); 1863 + ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params); 1975 1864 1976 - /*! ZSTD_CCtxParams_setParameter() : 1865 + /*! ZSTD_CCtxParams_setParameter() : Requires v1.4.0+ 1977 1866 * Similar to ZSTD_CCtx_setParameter. 1978 1867 * Set one compression parameter, selected by enum ZSTD_cParameter. 1979 1868 * Parameters must be applied to a ZSTD_CCtx using ··· 1981 1870 * @result : a code representing success or failure (which can be tested with 1982 1871 * ZSTD_isError()). 1983 1872 */ 1984 - ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value); 1873 + ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value); 1985 1874 1986 1875 /*! ZSTD_CCtxParams_getParameter() : 1987 1876 * Similar to ZSTD_CCtx_getParameter. 1988 1877 * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter. 1989 1878 * @result : 0, or an error code (which can be tested with ZSTD_isError()). 
1990 1879 */ 1991 - ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value); 1880 + ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value); 1992 1881 1993 1882 /*! ZSTD_CCtx_setParametersUsingCCtxParams() : 1994 1883 * Apply a set of ZSTD_CCtx_params to the compression context. ··· 1997 1886 * if nbWorkers>=1, new parameters will be picked up at next job, 1998 1887 * with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated). 1999 1888 */ 2000 - ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( 1889 + ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( 2001 1890 ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params); 2002 1891 2003 1892 /*! ZSTD_compressStream2_simpleArgs() : ··· 2006 1895 * This variant might be helpful for binders from dynamic languages 2007 1896 * which have troubles handling structures containing memory pointers. 2008 1897 */ 2009 - ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs ( 1898 + ZSTDLIB_STATIC_API size_t ZSTD_compressStream2_simpleArgs ( 2010 1899 ZSTD_CCtx* cctx, 2011 1900 void* dst, size_t dstCapacity, size_t* dstPos, 2012 1901 const void* src, size_t srcSize, size_t* srcPos, ··· 2022 1911 * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. 2023 1912 * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled. 2024 1913 * Note 3 : Skippable Frame Identifiers are considered valid. */ 2025 - ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size); 1914 + ZSTDLIB_STATIC_API unsigned ZSTD_isFrame(const void* buffer, size_t size); 2026 1915 2027 1916 /*! ZSTD_createDDict_byReference() : 2028 1917 * Create a digested dictionary, ready to start decompression operation without startup delay. 2029 1918 * Dictionary content is referenced, and therefore stays in dictBuffer. 
2030 1919 * It is important that dictBuffer outlives DDict, 2031 1920 * it must remain read accessible throughout the lifetime of DDict */ 2032 - ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize); 1921 + ZSTDLIB_STATIC_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize); 2033 1922 2034 1923 /*! ZSTD_DCtx_loadDictionary_byReference() : 2035 1924 * Same as ZSTD_DCtx_loadDictionary(), 2036 1925 * but references `dict` content instead of copying it into `dctx`. 2037 1926 * This saves memory if `dict` remains around., 2038 1927 * However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */ 2039 - ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); 1928 + ZSTDLIB_STATIC_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); 2040 1929 2041 1930 /*! ZSTD_DCtx_loadDictionary_advanced() : 2042 1931 * Same as ZSTD_DCtx_loadDictionary(), 2043 1932 * but gives direct control over 2044 1933 * how to load the dictionary (by copy ? by reference ?) 2045 1934 * and how to interpret it (automatic ? force raw mode ? full mode only ?). */ 2046 - ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); 1935 + ZSTDLIB_STATIC_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType); 2047 1936 2048 1937 /*! ZSTD_DCtx_refPrefix_advanced() : 2049 1938 * Same as ZSTD_DCtx_refPrefix(), but gives finer control over 2050 1939 * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) 
*/ 2051 - ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); 1940 + ZSTDLIB_STATIC_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); 2052 1941 2053 1942 /*! ZSTD_DCtx_setMaxWindowSize() : 2054 1943 * Refuses allocating internal buffers for frames requiring a window size larger than provided limit. ··· 2057 1946 * By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT) 2058 1947 * @return : 0, or an error code (which can be tested using ZSTD_isError()). 2059 1948 */ 2060 - ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize); 1949 + ZSTDLIB_STATIC_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize); 2061 1950 2062 1951 /*! ZSTD_DCtx_getParameter() : 2063 1952 * Get the requested decompression parameter value, selected by enum ZSTD_dParameter, 2064 1953 * and store it into int* value. 2065 1954 * @return : 0, or an error code (which can be tested with ZSTD_isError()). 2066 1955 */ 2067 - ZSTDLIB_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value); 1956 + ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value); 2068 1957 2069 1958 /* ZSTD_d_format 2070 1959 * experimental parameter, ··· 2139 2028 2140 2029 2141 2030 /*! ZSTD_DCtx_setFormat() : 2031 + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). 2142 2032 * Instruct the decoder context about what kind of data to decode next. 2143 2033 * This instruction is mandatory to decode data without a fully-formed header, 2144 2034 * such ZSTD_f_zstd1_magicless for example. 2145 2035 * @return : 0, or an error code (which can be tested using ZSTD_isError()). 
*/ 2146 - ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); 2036 + ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") 2037 + size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); 2147 2038 2148 2039 /*! ZSTD_decompressStream_simpleArgs() : 2149 2040 * Same as ZSTD_decompressStream(), ··· 2153 2040 * This can be helpful for binders from dynamic languages 2154 2041 * which have troubles handling structures containing memory pointers. 2155 2042 */ 2156 - ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( 2043 + ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( 2157 2044 ZSTD_DCtx* dctx, 2158 2045 void* dst, size_t dstCapacity, size_t* dstPos, 2159 2046 const void* src, size_t srcSize, size_t* srcPos); ··· 2169 2056 /*===== Advanced Streaming compression functions =====*/ 2170 2057 2171 2058 /*! ZSTD_initCStream_srcSize() : 2172 - * This function is deprecated, and equivalent to: 2059 + * This function is DEPRECATED, and equivalent to: 2173 2060 * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); 2174 2061 * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) 2175 2062 * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); ··· 2178 2065 * pledgedSrcSize must be correct. If it is not known at init time, use 2179 2066 * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs, 2180 2067 * "0" also disables frame content size field. It may be enabled in the future. 2181 - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x 2068 + * This prototype will generate compilation warnings. 2182 2069 */ 2183 - ZSTDLIB_API size_t 2184 - ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, 2070 + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") 2071 + size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, 2185 2072 int compressionLevel, 2186 2073 unsigned long long pledgedSrcSize); 2187 2074 2188 2075 /*! 
ZSTD_initCStream_usingDict() : 2189 - * This function is deprecated, and is equivalent to: 2076 + * This function is DEPRECATED, and is equivalent to: 2190 2077 * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); 2191 2078 * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); 2192 2079 * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); ··· 2195 2082 * dict == NULL or dictSize < 8, in which case no dict is used. 2196 2083 * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if 2197 2084 * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy. 2198 - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x 2085 + * This prototype will generate compilation warnings. 2199 2086 */ 2200 - ZSTDLIB_API size_t 2201 - ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, 2087 + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") 2088 + size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, 2202 2089 const void* dict, size_t dictSize, 2203 2090 int compressionLevel); 2204 2091 2205 2092 /*! ZSTD_initCStream_advanced() : 2206 - * This function is deprecated, and is approximately equivalent to: 2093 + * This function is DEPRECATED, and is approximately equivalent to: 2207 2094 * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); 2208 2095 * // Pseudocode: Set each zstd parameter and leave the rest as-is. 2209 2096 * for ((param, value) : params) { ··· 2215 2102 * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy. 2216 2103 * pledgedSrcSize must be correct. 2217 2104 * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. 2218 - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x 2105 + * This prototype will generate compilation warnings. 
2219 2106 */ 2220 - ZSTDLIB_API size_t 2221 - ZSTD_initCStream_advanced(ZSTD_CStream* zcs, 2107 + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") 2108 + size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, 2222 2109 const void* dict, size_t dictSize, 2223 2110 ZSTD_parameters params, 2224 2111 unsigned long long pledgedSrcSize); 2225 2112 2226 2113 /*! ZSTD_initCStream_usingCDict() : 2227 - * This function is deprecated, and equivalent to: 2114 + * This function is DEPRECATED, and equivalent to: 2228 2115 * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); 2229 2116 * ZSTD_CCtx_refCDict(zcs, cdict); 2230 2117 * 2231 2118 * note : cdict will just be referenced, and must outlive compression session 2232 - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x 2119 + * This prototype will generate compilation warnings. 2233 2120 */ 2234 - ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); 2121 + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") 2122 + size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); 2235 2123 2236 2124 /*! ZSTD_initCStream_usingCDict_advanced() : 2237 2125 * This function is DEPRECATED, and is approximately equivalent to: ··· 2247 2133 * same as ZSTD_initCStream_usingCDict(), with control over frame parameters. 2248 2134 * pledgedSrcSize must be correct. If srcSize is not known at init time, use 2249 2135 * value ZSTD_CONTENTSIZE_UNKNOWN. 2250 - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x 2136 + * This prototype will generate compilation warnings. 
2251 2137 */ 2252 - ZSTDLIB_API size_t 2253 - ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, 2138 + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") 2139 + size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, 2254 2140 const ZSTD_CDict* cdict, 2255 2141 ZSTD_frameParameters fParams, 2256 2142 unsigned long long pledgedSrcSize); 2257 2143 2258 2144 /*! ZSTD_resetCStream() : 2259 - * This function is deprecated, and is equivalent to: 2145 + * This function is DEPRECATED, and is equivalent to: 2260 2146 * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); 2261 2147 * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); 2148 + * Note: ZSTD_resetCStream() interprets pledgedSrcSize == 0 as ZSTD_CONTENTSIZE_UNKNOWN, but 2149 + * ZSTD_CCtx_setPledgedSrcSize() does not do the same, so ZSTD_CONTENTSIZE_UNKNOWN must be 2150 + * explicitly specified. 2262 2151 * 2263 2152 * start a new frame, using same parameters from previous frame. 2264 2153 * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. ··· 2271 2154 * For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs, 2272 2155 * but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead. 2273 2156 * @return : 0, or an error code (which can be tested using ZSTD_isError()) 2274 - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x 2157 + * This prototype will generate compilation warnings. 
2275 2158 */ 2276 - ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); 2159 + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") 2160 + size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); 2277 2161 2278 2162 2279 2163 typedef struct { ··· 2292 2174 * Note : (ingested - consumed) is amount of input data buffered internally, not yet compressed. 2293 2175 * Aggregates progression inside active worker threads. 2294 2176 */ 2295 - ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx); 2177 + ZSTDLIB_STATIC_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx); 2296 2178 2297 2179 /*! ZSTD_toFlushNow() : 2298 2180 * Tell how many bytes are ready to be flushed immediately. ··· 2307 2189 * therefore flush speed is limited by production speed of oldest job 2308 2190 * irrespective of the speed of concurrent (and newer) jobs. 2309 2191 */ 2310 - ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); 2192 + ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); 2311 2193 2312 2194 2313 2195 /*===== Advanced Streaming decompression functions =====*/ ··· 2321 2203 * note: no dictionary will be used if dict == NULL or dictSize < 8 2322 2204 * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x 2323 2205 */ 2324 - ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); 2206 + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); 2325 2207 2326 2208 /*! 
2327 2209 * This function is deprecated, and is equivalent to: ··· 2332 2214 * note : ddict is referenced, it must outlive decompression session 2333 2215 * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x 2334 2216 */ 2335 - ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); 2217 + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); 2336 2218 2337 2219 /*! 2338 2220 * This function is deprecated, and is equivalent to: ··· 2342 2224 * re-use decompression parameters from previous init; saves dictionary loading 2343 2225 * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x 2344 2226 */ 2345 - ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); 2227 + ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); 2346 2228 2347 2229 2348 2230 /* ******************************************************************* ··· 2361 2243 ZSTD_CCtx object can be re-used multiple times within successive compression operations. 2362 2244 2363 2245 Start by initializing a context. 2364 - Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression, 2365 - or ZSTD_compressBegin_advanced(), for finer parameter control. 2246 + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. 2366 2247 It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() 2367 2248 2368 2249 Then, consume your input using ZSTD_compressContinue(). 
··· 2384 2267 */ 2385 2268 2386 2269 /*===== Buffer-less streaming compression functions =====*/ 2387 - ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); 2388 - ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); 2389 - ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ 2390 - ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ 2391 - ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ 2392 - ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ 2270 + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); 2271 + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); 2272 + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ 2273 + ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ 2393 2274 2394 - ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); 2395 - ZSTDLIB_API size_t 
ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); 2275 + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); 2276 + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); 2396 2277 2397 - 2278 + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ 2279 + ZSTD_DEPRECATED("use advanced API to access custom parameters") 2280 + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ 2281 + ZSTD_DEPRECATED("use advanced API to access custom parameters") 2282 + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ 2398 2283 /* 2399 2284 Buffer-less streaming decompression (synchronous mode) 2400 2285 ··· 2487 2368 * @return : 0, `zfhPtr` is correctly filled, 2488 2369 * >0, `srcSize` is too small, value is wanted `srcSize` amount, 2489 2370 * or an error code, which can be tested using ZSTD_isError() */ 2490 - ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ 2371 + ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ 2491 2372 /*! 
ZSTD_getFrameHeader_advanced() : 2492 2373 * same as ZSTD_getFrameHeader(), 2493 2374 * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ 2494 - ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); 2495 - ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ 2375 + ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); 2376 + ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ 2496 2377 2497 - ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); 2498 - ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); 2499 - ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); 2378 + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); 2379 + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); 2380 + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); 2500 2381 2501 - ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); 2502 - ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); 2382 + ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); 2383 + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); 2503 2384 2504 2385 /* misc */ 2505 - ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, 
const ZSTD_DCtx* preparedDCtx); 2386 + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); 2506 2387 typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; 2507 - ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); 2388 + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); 2508 2389 2509 2390 2510 2391 ··· 2541 2422 */ 2542 2423 2543 2424 /*===== Raw zstd block functions =====*/ 2544 - ZSTDLIB_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); 2545 - ZSTDLIB_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); 2546 - ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); 2547 - ZSTDLIB_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ 2425 + ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); 2426 + ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); 2427 + ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); 2428 + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ 2548 2429 2549 2430 2550 2431 #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
+1
lib/zstd/Makefile
··· 35 35 decompress/zstd_decompress_block.o \ 36 36 37 37 zstd_common-y := \ 38 + zstd_common_module.o \ 38 39 common/debug.o \ 39 40 common/entropy_common.o \ 40 41 common/error_private.o \
+9
lib/zstd/common/bitstream.h
··· 313 313 U32 const regMask = sizeof(bitContainer)*8 - 1; 314 314 /* if start > regMask, bitstream is corrupted, and result is undefined */ 315 315 assert(nbBits < BIT_MASK_SIZE); 316 + /* x86 transform & ((1 << nbBits) - 1) to bzhi instruction, it is better 317 + * than accessing memory. When bmi2 instruction is not present, we consider 318 + * such cpus old (pre-Haswell, 2013) and their performance is not of that 319 + * importance. 320 + */ 321 + #if defined(__x86_64__) || defined(_M_X86) 322 + return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1); 323 + #else 316 324 return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; 325 + #endif 317 326 } 318 327 319 328 MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
+37 -30
lib/zstd/common/compiler.h
··· 11 11 #ifndef ZSTD_COMPILER_H 12 12 #define ZSTD_COMPILER_H 13 13 14 + #include "portability_macros.h" 15 + 14 16 /*-******************************************************* 15 17 * Compiler specifics 16 18 *********************************************************/ ··· 36 34 37 35 /* 38 36 On MSVC qsort requires that functions passed into it use the __cdecl calling conversion(CC). 39 - This explictly marks such functions as __cdecl so that the code will still compile 37 + This explicitly marks such functions as __cdecl so that the code will still compile 40 38 if a CC other than __cdecl has been made the default. 41 39 */ 42 40 #define WIN_CDECL ··· 72 70 73 71 74 72 /* target attribute */ 75 - #ifndef __has_attribute 76 - #define __has_attribute(x) 0 /* Compatibility with non-clang compilers. */ 77 - #endif 78 73 #define TARGET_ATTRIBUTE(target) __attribute__((__target__(target))) 79 74 80 - /* Enable runtime BMI2 dispatch based on the CPU. 81 - * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default. 75 + /* Target attribute for BMI2 dynamic dispatch. 76 + * Enable lzcnt, bmi, and bmi2. 77 + * We test for bmi1 & bmi2. lzcnt is included in bmi1. 
82 78 */ 83 - #ifndef DYNAMIC_BMI2 84 - #if ((defined(__clang__) && __has_attribute(__target__)) \ 85 - || (defined(__GNUC__) \ 86 - && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \ 87 - && (defined(__x86_64__) || defined(_M_X86)) \ 88 - && !defined(__BMI2__) 89 - # define DYNAMIC_BMI2 1 90 - #else 91 - # define DYNAMIC_BMI2 0 92 - #endif 93 - #endif 79 + #define BMI2_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("lzcnt,bmi,bmi2") 94 80 95 81 /* prefetch 96 82 * can be disabled, by declaring NO_PREFETCH build macro */ ··· 105 115 } 106 116 107 117 /* vectorization 108 - * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax */ 109 - #if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__) 118 + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, 119 + * and some compilers, like Intel ICC and MCST LCC, do not support it at all. */ 120 + #if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__) && !defined(__LCC__) 110 121 # if (__GNUC__ == 4 && __GNUC_MINOR__ > 3) || (__GNUC__ >= 5) 111 122 # define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize"))) 112 123 # else ··· 125 134 #define LIKELY(x) (__builtin_expect((x), 1)) 126 135 #define UNLIKELY(x) (__builtin_expect((x), 0)) 127 136 137 + #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) 138 + # define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); } 139 + #else 140 + # define ZSTD_UNREACHABLE { assert(0); } 141 + #endif 142 + 128 143 /* disable warnings */ 129 144 130 145 /*Like DYNAMIC_BMI2 but for compile time determination of BMI2 support*/ 131 146 132 147 133 - /* compat. with non-clang compilers */ 134 - #ifndef __has_builtin 135 - # define __has_builtin(x) 0 136 - #endif 137 - 138 - /* compat. 
with non-clang compilers */ 139 - #ifndef __has_feature 140 - # define __has_feature(x) 0 141 - #endif 148 + /* compile time determination of SIMD support */ 142 149 143 150 /* C-language Attributes are added in C23. */ 144 151 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute) ··· 157 168 */ 158 169 #define ZSTD_FALLTHROUGH fallthrough 159 170 160 - /* detects whether we are being compiled under msan */ 171 + /*-************************************************************** 172 + * Alignment check 173 + *****************************************************************/ 161 174 175 + /* this test was initially positioned in mem.h, 176 + * but this file is removed (or replaced) for linux kernel 177 + * so it's now hosted in compiler.h, 178 + * which remains valid for both user & kernel spaces. 179 + */ 162 180 163 - /* detects whether we are being compiled under asan */ 181 + #ifndef ZSTD_ALIGNOF 182 + /* covers gcc, clang & MSVC */ 183 + /* note : this section must come first, before C11, 184 + * due to a limitation in the kernel source generator */ 185 + # define ZSTD_ALIGNOF(T) __alignof(T) 186 + 187 + #endif /* ZSTD_ALIGNOF */ 188 + 189 + /*-************************************************************** 190 + * Sanitizer 191 + *****************************************************************/ 192 + 164 193 165 194 166 195 #endif /* ZSTD_COMPILER_H */
+4 -7
lib/zstd/common/entropy_common.c
··· 15 15 /* ************************************* 16 16 * Dependencies 17 17 ***************************************/ 18 - #include <linux/module.h> 19 18 #include "mem.h" 20 19 #include "error_private.h" /* ERR_*, ERROR */ 21 20 #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ ··· 212 213 } 213 214 214 215 #if DYNAMIC_BMI2 215 - TARGET_ATTRIBUTE("bmi2") static size_t FSE_readNCount_body_bmi2( 216 + BMI2_TARGET_ATTRIBUTE static size_t FSE_readNCount_body_bmi2( 216 217 short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, 217 218 const void* headerBuffer, size_t hbSize) 218 219 { ··· 239 240 { 240 241 return FSE_readNCount_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, headerBuffer, hbSize, /* bmi2 */ 0); 241 242 } 242 - EXPORT_SYMBOL_GPL(FSE_readNCount); 243 + 243 244 244 245 /*! HUF_readStats() : 245 246 Read compact Huffman tree, saved by HUF_writeCTable(). ··· 255 256 U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; 256 257 return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); 257 258 } 258 - EXPORT_SYMBOL_GPL(HUF_readStats); 259 259 260 260 FORCE_INLINE_TEMPLATE size_t 261 261 HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, ··· 294 296 ZSTD_memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32)); 295 297 weightTotal = 0; 296 298 { U32 n; for (n=0; n<oSize; n++) { 297 - if (huffWeight[n] >= HUF_TABLELOG_MAX) return ERROR(corruption_detected); 299 + if (huffWeight[n] > HUF_TABLELOG_MAX) return ERROR(corruption_detected); 298 300 rankStats[huffWeight[n]]++; 299 301 weightTotal += (1 << huffWeight[n]) >> 1; 300 302 } } ··· 332 334 } 333 335 334 336 #if DYNAMIC_BMI2 335 - static TARGET_ATTRIBUTE("bmi2") size_t HUF_readStats_body_bmi2(BYTE* huffWeight, size_t hwSize, U32* rankStats, 337 + static BMI2_TARGET_ATTRIBUTE size_t HUF_readStats_body_bmi2(BYTE* huffWeight, size_t hwSize, U32* rankStats, 336 338 U32* nbSymbolsPtr, U32* tableLogPtr, 337 339 
const void* src, size_t srcSize, 338 340 void* workSpace, size_t wkspSize) ··· 355 357 (void)bmi2; 356 358 return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); 357 359 } 358 - EXPORT_SYMBOL_GPL(HUF_readStats_wksp);
+80 -1
lib/zstd/common/error_private.h
··· 18 18 /* **************************************** 19 19 * Dependencies 20 20 ******************************************/ 21 - #include "zstd_deps.h" /* size_t */ 22 21 #include <linux/zstd_errors.h> /* enum list */ 22 + #include "compiler.h" 23 + #include "debug.h" 24 + #include "zstd_deps.h" /* size_t */ 23 25 24 26 25 27 /* **************************************** ··· 63 61 { 64 62 return ERR_getErrorString(ERR_getErrorCode(code)); 65 63 } 64 + 65 + /* 66 + * Ignore: this is an internal helper. 67 + * 68 + * This is a helper function to help force C99-correctness during compilation. 69 + * Under strict compilation modes, variadic macro arguments can't be empty. 70 + * However, variadic function arguments can be. Using a function therefore lets 71 + * us statically check that at least one (string) argument was passed, 72 + * independent of the compilation flags. 73 + */ 74 + static INLINE_KEYWORD UNUSED_ATTR 75 + void _force_has_format_string(const char *format, ...) { 76 + (void)format; 77 + } 78 + 79 + /* 80 + * Ignore: this is an internal helper. 81 + * 82 + * We want to force this function invocation to be syntactically correct, but 83 + * we don't want to force runtime evaluation of its arguments. 84 + */ 85 + #define _FORCE_HAS_FORMAT_STRING(...) \ 86 + if (0) { \ 87 + _force_has_format_string(__VA_ARGS__); \ 88 + } 89 + 90 + #define ERR_QUOTE(str) #str 91 + 92 + /* 93 + * Return the specified error if the condition evaluates to true. 94 + * 95 + * In debug modes, prints additional information. 96 + * In order to do that (particularly, printing the conditional that failed), 97 + * this can't just wrap RETURN_ERROR(). 98 + */ 99 + #define RETURN_ERROR_IF(cond, err, ...) 
\ 100 + if (cond) { \ 101 + RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ 102 + __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ 103 + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ 104 + RAWLOG(3, ": " __VA_ARGS__); \ 105 + RAWLOG(3, "\n"); \ 106 + return ERROR(err); \ 107 + } 108 + 109 + /* 110 + * Unconditionally return the specified error. 111 + * 112 + * In debug modes, prints additional information. 113 + */ 114 + #define RETURN_ERROR(err, ...) \ 115 + do { \ 116 + RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ 117 + __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ 118 + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ 119 + RAWLOG(3, ": " __VA_ARGS__); \ 120 + RAWLOG(3, "\n"); \ 121 + return ERROR(err); \ 122 + } while(0); 123 + 124 + /* 125 + * If the provided expression evaluates to an error code, returns that error code. 126 + * 127 + * In debug modes, prints additional information. 128 + */ 129 + #define FORWARD_IF_ERROR(err, ...) \ 130 + do { \ 131 + size_t const err_code = (err); \ 132 + if (ERR_isError(err_code)) { \ 133 + RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ 134 + __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ 135 + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ 136 + RAWLOG(3, ": " __VA_ARGS__); \ 137 + RAWLOG(3, "\n"); \ 138 + return err_code; \ 139 + } \ 140 + } while(0); 66 141 67 142 68 143 #endif /* ERROR_H_MODULE */
+2 -1
lib/zstd/common/fse.h
··· 333 333 /* FSE_buildCTable_wksp() : 334 334 * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`). 335 335 * `wkspSize` must be >= `FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)` of `unsigned`. 336 + * See FSE_buildCTable_wksp() for breakdown of workspace usage. 336 337 */ 337 - #define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (maxSymbolValue + 2 + (1ull << (tableLog - 2))) 338 + #define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (((maxSymbolValue + 2) + (1ull << (tableLog)))/2 + sizeof(U64)/sizeof(U32) /* additional 8 bytes for potential table overwrite */) 338 339 #define FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) (sizeof(unsigned) * FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)) 339 340 size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); 340 341
+1 -1
lib/zstd/common/fse_decompress.c
··· 365 365 } 366 366 367 367 #if DYNAMIC_BMI2 368 - TARGET_ATTRIBUTE("bmi2") static size_t FSE_decompress_wksp_body_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) 368 + BMI2_TARGET_ATTRIBUTE static size_t FSE_decompress_wksp_body_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) 369 369 { 370 370 return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, 1); 371 371 }
+24 -22
lib/zstd/common/huf.h
··· 86 86 87 87 /* HUF_compress4X_wksp() : 88 88 * Same as HUF_compress2(), but uses externally allocated `workSpace`. 89 - * `workspace` must have minimum alignment of 4, and be at least as large as HUF_WORKSPACE_SIZE */ 90 - #define HUF_WORKSPACE_SIZE ((6 << 10) + 256) 91 - #define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32)) 89 + * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ 90 + #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) 91 + #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) 92 92 HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, 93 93 const void* src, size_t srcSize, 94 94 unsigned maxSymbolValue, unsigned tableLog, ··· 113 113 114 114 115 115 /* *** Constants *** */ 116 - #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */ 116 + #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ 117 117 #define HUF_TABLELOG_DEFAULT 11 /* default tableLog value when none specified */ 118 118 #define HUF_SYMBOLVALUE_MAX 255 119 119 120 - #define HUF_TABLELOG_ABSOLUTEMAX 15 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */ 120 + #define HUF_TABLELOG_ABSOLUTEMAX 12 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */ 121 121 #if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX) 122 122 # error "HUF_TABLELOG_MAX is too large !" 123 123 #endif ··· 133 133 134 134 /* static allocation of HUF's Compression Table */ 135 135 /* this is a private definition, just exposed for allocation and strict aliasing purpose. 
never EVER access its members directly */ 136 - struct HUF_CElt_s { 137 - U16 val; 138 - BYTE nbBits; 139 - }; /* typedef'd to HUF_CElt */ 140 - typedef struct HUF_CElt_s HUF_CElt; /* consider it an incomplete type */ 141 - #define HUF_CTABLE_SIZE_U32(maxSymbolValue) ((maxSymbolValue)+1) /* Use tables of U32, for proper alignment */ 142 - #define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_U32(maxSymbolValue) * sizeof(U32)) 136 + typedef size_t HUF_CElt; /* consider it an incomplete type */ 137 + #define HUF_CTABLE_SIZE_ST(maxSymbolValue) ((maxSymbolValue)+2) /* Use tables of size_t, for proper alignment */ 138 + #define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_ST(maxSymbolValue) * sizeof(size_t)) 143 139 #define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \ 144 - HUF_CElt name[HUF_CTABLE_SIZE_U32(maxSymbolValue)] /* no final ; */ 140 + HUF_CElt name[HUF_CTABLE_SIZE_ST(maxSymbolValue)] /* no final ; */ 145 141 146 142 /* static allocation of HUF's DTable */ 147 143 typedef U32 HUF_DTable; ··· 187 191 size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); 188 192 size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); 189 193 size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); 194 + size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); 190 195 size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); 191 196 int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); 192 197 ··· 200 203 * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. 201 204 * If it uses hufTable it does not modify hufTable or repeat. 
202 205 * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. 203 - * If preferRepeat then the old table will always be used if valid. */ 206 + * If preferRepeat then the old table will always be used if valid. 207 + * If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ 204 208 size_t HUF_compress4X_repeat(void* dst, size_t dstSize, 205 209 const void* src, size_t srcSize, 206 210 unsigned maxSymbolValue, unsigned tableLog, 207 211 void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ 208 - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); 212 + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); 209 213 210 214 /* HUF_buildCTable_wksp() : 211 215 * Same as HUF_buildCTable(), but using externally allocated scratch buffer. ··· 244 246 * Loading a CTable saved with HUF_writeCTable() */ 245 247 size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights); 246 248 247 - /* HUF_getNbBits() : 249 + /* HUF_getNbBitsFromCTable() : 248 250 * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX 249 - * Note 1 : is not inlined, as HUF_CElt definition is private 250 - * Note 2 : const void* used, so that it can provide a statically allocated table as argument (which uses type U32) */ 251 - U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue); 251 + * Note 1 : is not inlined, as HUF_CElt definition is private */ 252 + U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); 252 253 253 254 /* 254 255 * HUF_decompress() does the following: ··· 299 302 /* ====================== */ 300 303 301 304 size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned 
tableLog); 302 - size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ 305 + size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ 303 306 size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); 307 + size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); 304 308 /* HUF_compress1X_repeat() : 305 309 * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. 306 310 * If it uses hufTable it does not modify hufTable or repeat. 307 311 * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. 308 - * If preferRepeat then the old table will always be used if valid. */ 312 + * If preferRepeat then the old table will always be used if valid. 
313 + * If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ 309 314 size_t HUF_compress1X_repeat(void* dst, size_t dstSize, 310 315 const void* src, size_t srcSize, 311 316 unsigned maxSymbolValue, unsigned tableLog, 312 317 void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ 313 - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); 318 + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); 314 319 315 320 size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ 316 321 #ifndef HUF_FORCE_DECOMPRESS_X1 ··· 349 350 size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); 350 351 #ifndef HUF_FORCE_DECOMPRESS_X2 351 352 size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); 353 + #endif 354 + #ifndef HUF_FORCE_DECOMPRESS_X1 355 + size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); 352 356 #endif 353 357 354 358 #endif /* HUF_STATIC_LINKING_ONLY */
+2
lib/zstd/common/mem.h
··· 30 30 * Basic Types 31 31 *****************************************************************/ 32 32 typedef uint8_t BYTE; 33 + typedef uint8_t U8; 34 + typedef int8_t S8; 33 35 typedef uint16_t U16; 34 36 typedef int16_t S16; 35 37 typedef uint32_t U32;
+93
lib/zstd/common/portability_macros.h
··· 1 + /* 2 + * Copyright (c) Facebook, Inc. 3 + * All rights reserved. 4 + * 5 + * This source code is licensed under both the BSD-style license (found in the 6 + * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 + * in the COPYING file in the root directory of this source tree). 8 + * You may select, at your option, one of the above-listed licenses. 9 + */ 10 + 11 + #ifndef ZSTD_PORTABILITY_MACROS_H 12 + #define ZSTD_PORTABILITY_MACROS_H 13 + 14 + /* 15 + * This header file contains macro definitions to support portability. 16 + * This header is shared between C and ASM code, so it MUST only 17 + * contain macro definitions. It MUST not contain any C code. 18 + * 19 + * This header ONLY defines macros to detect platforms/feature support. 20 + * 21 + */ 22 + 23 + 24 + /* compat. with non-clang compilers */ 25 + #ifndef __has_attribute 26 + #define __has_attribute(x) 0 27 + #endif 28 + 29 + /* compat. with non-clang compilers */ 30 + #ifndef __has_builtin 31 + # define __has_builtin(x) 0 32 + #endif 33 + 34 + /* compat. with non-clang compilers */ 35 + #ifndef __has_feature 36 + # define __has_feature(x) 0 37 + #endif 38 + 39 + /* detects whether we are being compiled under msan */ 40 + 41 + /* detects whether we are being compiled under asan */ 42 + 43 + /* detects whether we are being compiled under dfsan */ 44 + 45 + /* Mark the internal assembly functions as hidden */ 46 + #ifdef __ELF__ 47 + # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func 48 + #else 49 + # define ZSTD_HIDE_ASM_FUNCTION(func) 50 + #endif 51 + 52 + /* Enable runtime BMI2 dispatch based on the CPU. 53 + * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default. 
54 + */ 55 + #ifndef DYNAMIC_BMI2 56 + #if ((defined(__clang__) && __has_attribute(__target__)) \ 57 + || (defined(__GNUC__) \ 58 + && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \ 59 + && (defined(__x86_64__) || defined(_M_X64)) \ 60 + && !defined(__BMI2__) 61 + # define DYNAMIC_BMI2 1 62 + #else 63 + # define DYNAMIC_BMI2 0 64 + #endif 65 + #endif 66 + 67 + /* 68 + * Only enable assembly for GNUC compatible compilers, 69 + * because other platforms may not support GAS assembly syntax. 70 + * 71 + * Only enable assembly for Linux / MacOS, other platforms may 72 + * work, but they haven't been tested. This could likely be 73 + * extended to BSD systems. 74 + * 75 + * Disable assembly when MSAN is enabled, because MSAN requires 76 + * 100% of code to be instrumented to work. 77 + */ 78 + #define ZSTD_ASM_SUPPORTED 1 79 + 80 + /* 81 + * Determines whether we should enable assembly for x86-64 82 + * with BMI2. 83 + * 84 + * Enable if all of the following conditions hold: 85 + * - ASM hasn't been explicitly disabled by defining ZSTD_DISABLE_ASM 86 + * - Assembly is supported 87 + * - We are compiling for x86-64 and either: 88 + * - DYNAMIC_BMI2 is enabled 89 + * - BMI2 is supported at compile time 90 + */ 91 + #define ZSTD_ENABLE_ASM_X86_64_BMI2 0 92 + 93 + #endif /* ZSTD_PORTABILITY_MACROS_H */
-10
lib/zstd/common/zstd_common.c
··· 13 13 /*-************************************* 14 14 * Dependencies 15 15 ***************************************/ 16 - #include <linux/module.h> 17 16 #define ZSTD_DEPS_NEED_MALLOC 18 17 #include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ 19 18 #include "error_private.h" ··· 35 36 * tells if a return value is an error code 36 37 * symbol is required for external callers */ 37 38 unsigned ZSTD_isError(size_t code) { return ERR_isError(code); } 38 - EXPORT_SYMBOL_GPL(ZSTD_isError); 39 39 40 40 /*! ZSTD_getErrorName() : 41 41 * provides error code string from function result (useful for debugging) */ 42 42 const char* ZSTD_getErrorName(size_t code) { return ERR_getErrorName(code); } 43 - EXPORT_SYMBOL_GPL(ZSTD_getErrorName); 44 43 45 44 /*! ZSTD_getError() : 46 45 * convert a `size_t` function result into a proper ZSTD_errorCode enum */ 47 46 ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } 48 - EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); 49 47 50 48 /*! ZSTD_getErrorString() : 51 49 * provides error code string from enum */ ··· 59 63 return customMem.customAlloc(customMem.opaque, size); 60 64 return ZSTD_malloc(size); 61 65 } 62 - EXPORT_SYMBOL_GPL(ZSTD_customMalloc); 63 66 64 67 void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) 65 68 { ··· 71 76 } 72 77 return ZSTD_calloc(1, size); 73 78 } 74 - EXPORT_SYMBOL_GPL(ZSTD_customCalloc); 75 79 76 80 void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) 77 81 { ··· 81 87 ZSTD_free(ptr); 82 88 } 83 89 } 84 - EXPORT_SYMBOL_GPL(ZSTD_customFree); 85 - 86 - MODULE_LICENSE("Dual BSD/GPL"); 87 - MODULE_DESCRIPTION("Zstd Common");
+84 -91
lib/zstd/common/zstd_internal.h
··· 20 20 * Dependencies 21 21 ***************************************/ 22 22 #include "compiler.h" 23 + #include "cpu.h" 23 24 #include "mem.h" 24 25 #include "debug.h" /* assert, DEBUGLOG, RAWLOG, g_debuglevel */ 25 26 #include "error_private.h" ··· 48 47 #undef MAX 49 48 #define MIN(a,b) ((a)<(b) ? (a) : (b)) 50 49 #define MAX(a,b) ((a)>(b) ? (a) : (b)) 51 - 52 - /* 53 - * Ignore: this is an internal helper. 54 - * 55 - * This is a helper function to help force C99-correctness during compilation. 56 - * Under strict compilation modes, variadic macro arguments can't be empty. 57 - * However, variadic function arguments can be. Using a function therefore lets 58 - * us statically check that at least one (string) argument was passed, 59 - * independent of the compilation flags. 60 - */ 61 - static INLINE_KEYWORD UNUSED_ATTR 62 - void _force_has_format_string(const char *format, ...) { 63 - (void)format; 64 - } 65 - 66 - /* 67 - * Ignore: this is an internal helper. 68 - * 69 - * We want to force this function invocation to be syntactically correct, but 70 - * we don't want to force runtime evaluation of its arguments. 71 - */ 72 - #define _FORCE_HAS_FORMAT_STRING(...) \ 73 - if (0) { \ 74 - _force_has_format_string(__VA_ARGS__); \ 75 - } 76 - 77 - /* 78 - * Return the specified error if the condition evaluates to true. 79 - * 80 - * In debug modes, prints additional information. 81 - * In order to do that (particularly, printing the conditional that failed), 82 - * this can't just wrap RETURN_ERROR(). 83 - */ 84 - #define RETURN_ERROR_IF(cond, err, ...) \ 85 - if (cond) { \ 86 - RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ 87 - __FILE__, __LINE__, ZSTD_QUOTE(cond), ZSTD_QUOTE(ERROR(err))); \ 88 - _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ 89 - RAWLOG(3, ": " __VA_ARGS__); \ 90 - RAWLOG(3, "\n"); \ 91 - return ERROR(err); \ 92 - } 93 - 94 - /* 95 - * Unconditionally return the specified error. 96 - * 97 - * In debug modes, prints additional information. 
98 - */ 99 - #define RETURN_ERROR(err, ...) \ 100 - do { \ 101 - RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ 102 - __FILE__, __LINE__, ZSTD_QUOTE(ERROR(err))); \ 103 - _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ 104 - RAWLOG(3, ": " __VA_ARGS__); \ 105 - RAWLOG(3, "\n"); \ 106 - return ERROR(err); \ 107 - } while(0); 108 - 109 - /* 110 - * If the provided expression evaluates to an error code, returns that error code. 111 - * 112 - * In debug modes, prints additional information. 113 - */ 114 - #define FORWARD_IF_ERROR(err, ...) \ 115 - do { \ 116 - size_t const err_code = (err); \ 117 - if (ERR_isError(err_code)) { \ 118 - RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ 119 - __FILE__, __LINE__, ZSTD_QUOTE(err), ERR_getErrorName(err_code)); \ 120 - _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ 121 - RAWLOG(3, ": " __VA_ARGS__); \ 122 - RAWLOG(3, "\n"); \ 123 - return err_code; \ 124 - } \ 125 - } while(0); 50 + #define BOUNDED(min,val,max) (MAX(min,MIN(val,max))) 126 51 127 52 128 53 /*-************************************* ··· 57 130 #define ZSTD_OPT_NUM (1<<12) 58 131 59 132 #define ZSTD_REP_NUM 3 /* number of repcodes */ 60 - #define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) 61 133 static UNUSED_ATTR const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 }; 62 134 63 135 #define KB *(1 <<10) ··· 108 182 /* Each table cannot take more than #symbols * FSELog bits */ 109 183 #define ZSTD_MAX_FSE_HEADERS_SIZE (((MaxML + 1) * MLFSELog + (MaxLL + 1) * LLFSELog + (MaxOff + 1) * OffFSELog + 7) / 8) 110 184 111 - static UNUSED_ATTR const U32 LL_bits[MaxLL+1] = { 185 + static UNUSED_ATTR const U8 LL_bits[MaxLL+1] = { 112 186 0, 0, 0, 0, 0, 0, 0, 0, 113 187 0, 0, 0, 0, 0, 0, 0, 0, 114 188 1, 1, 1, 1, 2, 2, 3, 3, ··· 125 199 #define LL_DEFAULTNORMLOG 6 /* for static allocation */ 126 200 static UNUSED_ATTR const U32 LL_defaultNormLog = LL_DEFAULTNORMLOG; 127 201 128 - static UNUSED_ATTR const U32 ML_bits[MaxML+1] = { 202 + static UNUSED_ATTR const U8 
ML_bits[MaxML+1] = { 129 203 0, 0, 0, 0, 0, 0, 0, 0, 130 204 0, 0, 0, 0, 0, 0, 0, 0, 131 205 0, 0, 0, 0, 0, 0, 0, 0, ··· 160 234 * Shared functions to include for inlining 161 235 *********************************************/ 162 236 static void ZSTD_copy8(void* dst, const void* src) { 237 + #if defined(ZSTD_ARCH_ARM_NEON) 238 + vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src)); 239 + #else 163 240 ZSTD_memcpy(dst, src, 8); 241 + #endif 164 242 } 165 - 166 243 #define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; } 244 + 245 + /* Need to use memmove here since the literal buffer can now be located within 246 + the dst buffer. In circumstances where the op "catches up" to where the 247 + literal buffer is, there can be partial overlaps in this call on the final 248 + copy if the literal is being shifted by less than 16 bytes. */ 167 249 static void ZSTD_copy16(void* dst, const void* src) { 168 - ZSTD_memcpy(dst, src, 16); 250 + #if defined(ZSTD_ARCH_ARM_NEON) 251 + vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src)); 252 + #elif defined(ZSTD_ARCH_X86_SSE2) 253 + _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((const __m128i*)src)); 254 + #elif defined(__clang__) 255 + ZSTD_memmove(dst, src, 16); 256 + #else 257 + /* ZSTD_memmove is not inlined properly by gcc */ 258 + BYTE copy16_buf[16]; 259 + ZSTD_memcpy(copy16_buf, src, 16); 260 + ZSTD_memcpy(dst, copy16_buf, 16); 261 + #endif 169 262 } 170 263 #define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; } 171 264 ··· 211 266 const BYTE* ip = (const BYTE*)src; 212 267 BYTE* op = (BYTE*)dst; 213 268 BYTE* const oend = op + length; 214 - 215 - assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff <= -WILDCOPY_VECLEN)); 216 269 217 270 if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) { 218 271 /* Handle short offset copies. 
*/ ··· 274 331 * Private declarations 275 332 *********************************************/ 276 333 typedef struct seqDef_s { 277 - U32 offset; /* Offset code of the sequence */ 334 + U32 offBase; /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */ 278 335 U16 litLength; 279 - U16 matchLength; 336 + U16 mlBase; /* mlBase == matchLength - MINMATCH */ 280 337 } seqDef; 338 + 339 + /* Controls whether seqStore has a single "long" litLength or matchLength. See seqStore_t. */ 340 + typedef enum { 341 + ZSTD_llt_none = 0, /* no longLengthType */ 342 + ZSTD_llt_literalLength = 1, /* represents a long literal */ 343 + ZSTD_llt_matchLength = 2 /* represents a long match */ 344 + } ZSTD_longLengthType_e; 281 345 282 346 typedef struct { 283 347 seqDef* sequencesStart; ··· 297 347 size_t maxNbSeq; 298 348 size_t maxNbLit; 299 349 300 - /* longLengthPos and longLengthID to allow us to represent either a single litLength or matchLength 350 + /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength 301 351 * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment 302 352 * the existing value of the litLength or matchLength by 0x10000. 303 353 */ 304 - U32 longLengthID; /* 0 == no longLength; 1 == Represent the long literal; 2 == Represent the long match; */ 305 - U32 longLengthPos; /* Index of the sequence to apply long length modification to */ 354 + ZSTD_longLengthType_e longLengthType; 355 + U32 longLengthPos; /* Index of the sequence to apply long length modification to */ 306 356 } seqStore_t; 307 357 308 358 typedef struct { ··· 312 362 313 363 /* 314 364 * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences 315 - * indicated by longLengthPos and longLengthID, and adds MINMATCH back to matchLength. 365 + * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength. 
316 366 */ 317 367 MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq) 318 368 { 319 369 ZSTD_sequenceLength seqLen; 320 370 seqLen.litLength = seq->litLength; 321 - seqLen.matchLength = seq->matchLength + MINMATCH; 371 + seqLen.matchLength = seq->mlBase + MINMATCH; 322 372 if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { 323 - if (seqStore->longLengthID == 1) { 373 + if (seqStore->longLengthType == ZSTD_llt_literalLength) { 324 374 seqLen.litLength += 0xFFFF; 325 375 } 326 - if (seqStore->longLengthID == 2) { 376 + if (seqStore->longLengthType == ZSTD_llt_matchLength) { 327 377 seqLen.matchLength += 0xFFFF; 328 378 } 329 379 } ··· 369 419 } 370 420 } 371 421 422 + /* 423 + * Counts the number of trailing zeros of a `size_t`. 424 + * Most compilers should support CTZ as a builtin. A backup 425 + * implementation is provided if the builtin isn't supported, but 426 + * it may not be terribly efficient. 427 + */ 428 + MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) 429 + { 430 + if (MEM_64bits()) { 431 + # if (__GNUC__ >= 4) 432 + return __builtin_ctzll((U64)val); 433 + # else 434 + static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, 435 + 4, 25, 14, 28, 9, 34, 20, 56, 436 + 5, 17, 26, 54, 15, 41, 29, 43, 437 + 10, 31, 38, 35, 21, 45, 49, 57, 438 + 63, 6, 12, 18, 24, 27, 33, 55, 439 + 16, 53, 40, 42, 30, 37, 44, 48, 440 + 62, 11, 23, 32, 52, 39, 36, 47, 441 + 61, 22, 51, 46, 60, 50, 59, 58 }; 442 + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; 443 + # endif 444 + } else { /* 32 bits */ 445 + # if (__GNUC__ >= 3) 446 + return __builtin_ctz((U32)val); 447 + # else 448 + static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 449 + 30, 22, 20, 15, 25, 17, 4, 8, 450 + 31, 27, 13, 23, 21, 19, 16, 7, 451 + 26, 12, 18, 6, 11, 5, 10, 9 }; 452 + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; 453 + # endif 
454 + } 455 + } 456 + 372 457 373 458 /* ZSTD_invalidateRepCodes() : 374 459 * ensures next compression will not use repcodes from previous block. ··· 430 445 size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, 431 446 const void* src, size_t srcSize); 432 447 448 + /* 449 + * @returns true iff the CPU supports dynamic BMI2 dispatch. 450 + */ 451 + MEM_STATIC int ZSTD_cpuSupportsBmi2(void) 452 + { 453 + ZSTD_cpuid_t cpuid = ZSTD_cpuid(); 454 + return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid); 455 + } 433 456 434 457 435 458 #endif /* ZSTD_CCOMMON_H_MODULE */
+132
lib/zstd/compress/clevels.h
··· 1 + /* 2 + * Copyright (c) Yann Collet, Facebook, Inc. 3 + * All rights reserved. 4 + * 5 + * This source code is licensed under both the BSD-style license (found in the 6 + * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 + * in the COPYING file in the root directory of this source tree). 8 + * You may select, at your option, one of the above-listed licenses. 9 + */ 10 + 11 + #ifndef ZSTD_CLEVELS_H 12 + #define ZSTD_CLEVELS_H 13 + 14 + #define ZSTD_STATIC_LINKING_ONLY /* ZSTD_compressionParameters */ 15 + #include <linux/zstd.h> 16 + 17 + /*-===== Pre-defined compression levels =====-*/ 18 + 19 + #define ZSTD_MAX_CLEVEL 22 20 + 21 + __attribute__((__unused__)) 22 + 23 + static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = { 24 + { /* "default" - for any srcSize > 256 KB */ 25 + /* W, C, H, S, L, TL, strat */ 26 + { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels */ 27 + { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */ 28 + { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */ 29 + { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */ 30 + { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */ 31 + { 21, 18, 19, 3, 5, 2, ZSTD_greedy }, /* level 5 */ 32 + { 21, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6 */ 33 + { 21, 19, 20, 4, 5, 8, ZSTD_lazy }, /* level 7 */ 34 + { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 8 */ 35 + { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */ 36 + { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 10 */ 37 + { 22, 21, 22, 6, 5, 16, ZSTD_lazy2 }, /* level 11 */ 38 + { 22, 22, 23, 6, 5, 32, ZSTD_lazy2 }, /* level 12 */ 39 + { 22, 22, 22, 4, 5, 32, ZSTD_btlazy2 }, /* level 13 */ 40 + { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */ 41 + { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */ 42 + { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */ 43 + { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */ 44 + { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */ 45 
+ { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */ 46 + { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */ 47 + { 26, 26, 24, 7, 3,512, ZSTD_btultra2}, /* level 21 */ 48 + { 27, 27, 25, 9, 3,999, ZSTD_btultra2}, /* level 22 */ 49 + }, 50 + { /* for srcSize <= 256 KB */ 51 + /* W, C, H, S, L, T, strat */ 52 + { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ 53 + { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */ 54 + { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */ 55 + { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */ 56 + { 18, 16, 17, 3, 5, 2, ZSTD_greedy }, /* level 4.*/ 57 + { 18, 17, 18, 5, 5, 2, ZSTD_greedy }, /* level 5.*/ 58 + { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/ 59 + { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */ 60 + { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ 61 + { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ 62 + { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ 63 + { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/ 64 + { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/ 65 + { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */ 66 + { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ 67 + { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/ 68 + { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/ 69 + { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/ 70 + { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/ 71 + { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ 72 + { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/ 73 + { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/ 74 + { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/ 75 + }, 76 + { /* for srcSize <= 128 KB */ 77 + /* W, C, H, S, L, T, strat */ 78 + { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ 79 + { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */ 80 + { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */ 81 + { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */ 82 + { 17, 17, 17, 
2, 4, 0, ZSTD_dfast }, /* level 4 */ 83 + { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */ 84 + { 17, 16, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */ 85 + { 17, 16, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */ 86 + { 17, 16, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ 87 + { 17, 16, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ 88 + { 17, 16, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ 89 + { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */ 90 + { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */ 91 + { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/ 92 + { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ 93 + { 17, 18, 17, 6, 3,256, ZSTD_btopt }, /* level 15.*/ 94 + { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/ 95 + { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/ 96 + { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/ 97 + { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/ 98 + { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/ 99 + { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ 100 + { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/ 101 + }, 102 + { /* for srcSize <= 16 KB */ 103 + /* W, C, H, S, L, T, strat */ 104 + { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ 105 + { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */ 106 + { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */ 107 + { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */ 108 + { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */ 109 + { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/ 110 + { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */ 111 + { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */ 112 + { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/ 113 + { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/ 114 + { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/ 115 + { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/ 116 + { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/ 117 + { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/ 118 
+ { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/ 119 + { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/ 120 + { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/ 121 + { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/ 122 + { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/ 123 + { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ 124 + { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/ 125 + { 14, 15, 15, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ 126 + { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/ 127 + }, 128 + }; 129 + 130 + 131 + 132 + #endif /* ZSTD_CLEVELS_H */
+63 -20
lib/zstd/compress/fse_compress.c
··· 75 75 void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ; 76 76 FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); 77 77 U32 const step = FSE_TABLESTEP(tableSize); 78 + U32 const maxSV1 = maxSymbolValue+1; 78 79 79 - U32* cumul = (U32*)workSpace; 80 - FSE_FUNCTION_TYPE* tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSymbolValue + 2)); 80 + U16* cumul = (U16*)workSpace; /* size = maxSV1 */ 81 + FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSV1+1)); /* size = tableSize */ 81 82 82 83 U32 highThreshold = tableSize-1; 83 84 84 - if ((size_t)workSpace & 3) return ERROR(GENERIC); /* Must be 4 byte aligned */ 85 + assert(((size_t)workSpace & 1) == 0); /* Must be 2 bytes-aligned */ 85 86 if (FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) > wkspSize) return ERROR(tableLog_tooLarge); 86 87 /* CTable header */ 87 88 tableU16[-2] = (U16) tableLog; ··· 99 98 /* symbol start positions */ 100 99 { U32 u; 101 100 cumul[0] = 0; 102 - for (u=1; u <= maxSymbolValue+1; u++) { 101 + for (u=1; u <= maxSV1; u++) { 103 102 if (normalizedCounter[u-1]==-1) { /* Low proba symbol */ 104 103 cumul[u] = cumul[u-1] + 1; 105 104 tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1); 106 105 } else { 107 - cumul[u] = cumul[u-1] + normalizedCounter[u-1]; 106 + assert(normalizedCounter[u-1] >= 0); 107 + cumul[u] = cumul[u-1] + (U16)normalizedCounter[u-1]; 108 + assert(cumul[u] >= cumul[u-1]); /* no overflow */ 108 109 } } 109 - cumul[maxSymbolValue+1] = tableSize+1; 110 + cumul[maxSV1] = (U16)(tableSize+1); 110 111 } 111 112 112 113 /* Spread symbols */ 113 - { U32 position = 0; 114 + if (highThreshold == tableSize - 1) { 115 + /* Case for no low prob count symbols. 
Lay down 8 bytes at a time 116 + * to reduce branch misses since we are operating on a small block 117 + */ 118 + BYTE* const spread = tableSymbol + tableSize; /* size = tableSize + 8 (may write beyond tableSize) */ 119 + { U64 const add = 0x0101010101010101ull; 120 + size_t pos = 0; 121 + U64 sv = 0; 122 + U32 s; 123 + for (s=0; s<maxSV1; ++s, sv += add) { 124 + int i; 125 + int const n = normalizedCounter[s]; 126 + MEM_write64(spread + pos, sv); 127 + for (i = 8; i < n; i += 8) { 128 + MEM_write64(spread + pos + i, sv); 129 + } 130 + assert(n>=0); 131 + pos += (size_t)n; 132 + } 133 + } 134 + /* Spread symbols across the table. Lack of lowprob symbols means that 135 + * we don't need variable sized inner loop, so we can unroll the loop and 136 + * reduce branch misses. 137 + */ 138 + { size_t position = 0; 139 + size_t s; 140 + size_t const unroll = 2; /* Experimentally determined optimal unroll */ 141 + assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */ 142 + for (s = 0; s < (size_t)tableSize; s += unroll) { 143 + size_t u; 144 + for (u = 0; u < unroll; ++u) { 145 + size_t const uPosition = (position + (u * step)) & tableMask; 146 + tableSymbol[uPosition] = spread[s + u]; 147 + } 148 + position = (position + (unroll * step)) & tableMask; 149 + } 150 + assert(position == 0); /* Must have initialized all positions */ 151 + } 152 + } else { 153 + U32 position = 0; 114 154 U32 symbol; 115 - for (symbol=0; symbol<=maxSymbolValue; symbol++) { 155 + for (symbol=0; symbol<maxSV1; symbol++) { 116 156 int nbOccurrences; 117 157 int const freq = normalizedCounter[symbol]; 118 158 for (nbOccurrences=0; nbOccurrences<freq; nbOccurrences++) { ··· 162 120 while (position > highThreshold) 163 121 position = (position + step) & tableMask; /* Low proba area */ 164 122 } } 165 - 166 123 assert(position==0); /* Must have initialized all positions */ 167 124 } 168 125 ··· 185 144 case -1: 186 145 case 1: 187 146 symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog); 
188 - symbolTT[s].deltaFindState = total - 1; 147 + assert(total <= INT_MAX); 148 + symbolTT[s].deltaFindState = (int)(total - 1); 189 149 total ++; 190 150 break; 191 151 default : 192 - { 193 - U32 const maxBitsOut = tableLog - BIT_highbit32 (normalizedCounter[s]-1); 194 - U32 const minStatePlus = normalizedCounter[s] << maxBitsOut; 152 + assert(normalizedCounter[s] > 1); 153 + { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); 154 + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; 195 155 symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; 196 - symbolTT[s].deltaFindState = total - normalizedCounter[s]; 197 - total += normalizedCounter[s]; 156 + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); 157 + total += (unsigned)normalizedCounter[s]; 198 158 } } } } 199 159 200 160 #if 0 /* debug : symbol costs */ ··· 206 164 symbol, normalizedCounter[symbol], 207 165 FSE_getMaxNbBits(symbolTT, symbol), 208 166 (double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256); 209 - } 210 - } 167 + } } 211 168 #endif 212 169 213 170 return 0; ··· 214 173 215 174 216 175 217 - 218 176 #ifndef FSE_COMMONDEFS_ONLY 219 - 220 177 221 178 /*-************************************************************** 222 179 * FSE NCount encoding 223 180 ****************************************************************/ 224 181 size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) 225 182 { 226 - size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 3; 183 + size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog 184 + + 4 /* bitCount initialized at 4 */ 185 + + 2 /* first two symbols may use one additional bit each */) / 8) 186 + + 1 /* round up to whole nb bytes */ 187 + + 2 /* additional two bytes for bitstream flush */; 227 188 return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ 228 189 } 229 190
+536 -106
lib/zstd/compress/huf_compress.c
··· 50 50 /* ******************************************************* 51 51 * HUF : Huffman block compression 52 52 *********************************************************/ 53 + #define HUF_WORKSPACE_MAX_ALIGNMENT 8 54 + 55 + static void* HUF_alignUpWorkspace(void* workspace, size_t* workspaceSizePtr, size_t align) 56 + { 57 + size_t const mask = align - 1; 58 + size_t const rem = (size_t)workspace & mask; 59 + size_t const add = (align - rem) & mask; 60 + BYTE* const aligned = (BYTE*)workspace + add; 61 + assert((align & (align - 1)) == 0); /* pow 2 */ 62 + assert(align <= HUF_WORKSPACE_MAX_ALIGNMENT); 63 + if (*workspaceSizePtr >= add) { 64 + assert(add < align); 65 + assert(((size_t)aligned & mask) == 0); 66 + *workspaceSizePtr -= add; 67 + return aligned; 68 + } else { 69 + *workspaceSizePtr = 0; 70 + return NULL; 71 + } 72 + } 73 + 74 + 53 75 /* HUF_compressWeights() : 54 76 * Same as FSE_compress(), but dedicated to huff0's weights compression. 55 77 * The use case needs much less stack memory. 
··· 94 72 95 73 unsigned maxSymbolValue = HUF_TABLELOG_MAX; 96 74 U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER; 97 - HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)workspace; 75 + HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32)); 98 76 99 77 if (workspaceSize < sizeof(HUF_CompressWeightsWksp)) return ERROR(GENERIC); 100 78 ··· 125 103 return (size_t)(op-ostart); 126 104 } 127 105 106 + static size_t HUF_getNbBits(HUF_CElt elt) 107 + { 108 + return elt & 0xFF; 109 + } 110 + 111 + static size_t HUF_getNbBitsFast(HUF_CElt elt) 112 + { 113 + return elt; 114 + } 115 + 116 + static size_t HUF_getValue(HUF_CElt elt) 117 + { 118 + return elt & ~0xFF; 119 + } 120 + 121 + static size_t HUF_getValueFast(HUF_CElt elt) 122 + { 123 + return elt; 124 + } 125 + 126 + static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits) 127 + { 128 + assert(nbBits <= HUF_TABLELOG_ABSOLUTEMAX); 129 + *elt = nbBits; 130 + } 131 + 132 + static void HUF_setValue(HUF_CElt* elt, size_t value) 133 + { 134 + size_t const nbBits = HUF_getNbBits(*elt); 135 + if (nbBits > 0) { 136 + assert((value >> nbBits) == 0); 137 + *elt |= value << (sizeof(HUF_CElt) * 8 - nbBits); 138 + } 139 + } 128 140 129 141 typedef struct { 130 142 HUF_CompressWeightsWksp wksp; ··· 170 114 const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, 171 115 void* workspace, size_t workspaceSize) 172 116 { 117 + HUF_CElt const* const ct = CTable + 1; 173 118 BYTE* op = (BYTE*)dst; 174 119 U32 n; 175 - HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)workspace; 120 + HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32)); 176 121 177 122 /* check conditions */ 178 123 if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); ··· 184 127 for (n=1; n<huffLog+1; n++) 185 128 wksp->bitsToWeight[n] = (BYTE)(huffLog + 1 - n); 186 129 for (n=0; n<maxSymbolValue; n++) 
187 - wksp->huffWeight[n] = wksp->bitsToWeight[CTable[n].nbBits]; 130 + wksp->huffWeight[n] = wksp->bitsToWeight[HUF_getNbBits(ct[n])]; 188 131 189 132 /* attempt weights compression by FSE */ 133 + if (maxDstSize < 1) return ERROR(dstSize_tooSmall); 190 134 { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, wksp->huffWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)) ); 191 135 if ((hSize>1) & (hSize < maxSymbolValue/2)) { /* FSE compressed */ 192 136 op[0] = (BYTE)hSize; ··· 221 163 U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */ 222 164 U32 tableLog = 0; 223 165 U32 nbSymbols = 0; 166 + HUF_CElt* const ct = CTable + 1; 224 167 225 168 /* get symbol weights */ 226 169 CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize)); ··· 230 171 /* check result */ 231 172 if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); 232 173 if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); 174 + 175 + CTable[0] = tableLog; 233 176 234 177 /* Prepare base value per rank */ 235 178 { U32 n, nextRankStart = 0; ··· 244 183 /* fill nbBits */ 245 184 { U32 n; for (n=0; n<nbSymbols; n++) { 246 185 const U32 w = huffWeight[n]; 247 - CTable[n].nbBits = (BYTE)(tableLog + 1 - w) & -(w != 0); 186 + HUF_setNbBits(ct + n, (BYTE)(tableLog + 1 - w) & -(w != 0)); 248 187 } } 249 188 250 189 /* fill val */ 251 190 { U16 nbPerRank[HUF_TABLELOG_MAX+2] = {0}; /* support w=0=>n=tableLog+1 */ 252 191 U16 valPerRank[HUF_TABLELOG_MAX+2] = {0}; 253 - { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; } 192 + { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[HUF_getNbBits(ct[n])]++; } 254 193 /* determine stating value per rank */ 255 194 valPerRank[tableLog+1] = 0; /* for w==0 */ 256 195 { U16 min = 0; ··· 260 199 min >>= 1; 261 200 } } 262 201 /* assign value within rank, symbol order */ 263 - { U32 n; for (n=0; n<nbSymbols; n++) CTable[n].val 
= valPerRank[CTable[n].nbBits]++; } 202 + { U32 n; for (n=0; n<nbSymbols; n++) HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); } 264 203 } 265 204 266 205 *maxSymbolValuePtr = nbSymbols - 1; 267 206 return readSize; 268 207 } 269 208 270 - U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue) 209 + U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue) 271 210 { 272 - const HUF_CElt* table = (const HUF_CElt*)symbolTable; 211 + const HUF_CElt* ct = CTable + 1; 273 212 assert(symbolValue <= HUF_SYMBOLVALUE_MAX); 274 - return table[symbolValue].nbBits; 213 + return (U32)HUF_getNbBits(ct[symbolValue]); 275 214 } 276 215 277 216 ··· 425 364 } 426 365 427 366 typedef struct { 428 - U32 base; 429 - U32 curr; 367 + U16 base; 368 + U16 curr; 430 369 } rankPos; 431 370 432 371 typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; 433 372 434 - #define RANK_POSITION_TABLE_SIZE 32 373 + /* Number of buckets available for HUF_sort() */ 374 + #define RANK_POSITION_TABLE_SIZE 192 435 375 436 376 typedef struct { 437 377 huffNodeTable huffNodeTbl; 438 378 rankPos rankPosition[RANK_POSITION_TABLE_SIZE]; 439 379 } HUF_buildCTable_wksp_tables; 440 380 381 + /* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing. 382 + * Strategy is to use as many buckets as possible for representing distinct 383 + * counts while using the remainder to represent all "large" counts. 384 + * 385 + * To satisfy this requirement for 192 buckets, we can do the following: 386 + * Let buckets 0-166 represent distinct counts of [0, 166] 387 + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. 
388 + */ 389 + #define RANK_POSITION_MAX_COUNT_LOG 32 390 + #define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ 391 + #define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ 392 + 393 + /* Return the appropriate bucket index for a given count. See definition of 394 + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. 395 + */ 396 + static U32 HUF_getIndex(U32 const count) { 397 + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) 398 + ? count 399 + : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; 400 + } 401 + 402 + /* Helper swap function for HUF_quickSortPartition() */ 403 + static void HUF_swapNodes(nodeElt* a, nodeElt* b) { 404 + nodeElt tmp = *a; 405 + *a = *b; 406 + *b = tmp; 407 + } 408 + 409 + /* Returns 0 if the huffNode array is not sorted by descending count */ 410 + MEM_STATIC int HUF_isSorted(nodeElt huffNode[], U32 const maxSymbolValue1) { 411 + U32 i; 412 + for (i = 1; i < maxSymbolValue1; ++i) { 413 + if (huffNode[i].count > huffNode[i-1].count) { 414 + return 0; 415 + } 416 + } 417 + return 1; 418 + } 419 + 420 + /* Insertion sort by descending order */ 421 + HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int const high) { 422 + int i; 423 + int const size = high-low+1; 424 + huffNode += low; 425 + for (i = 1; i < size; ++i) { 426 + nodeElt const key = huffNode[i]; 427 + int j = i - 1; 428 + while (j >= 0 && huffNode[j].count < key.count) { 429 + huffNode[j + 1] = huffNode[j]; 430 + j--; 431 + } 432 + huffNode[j + 1] = key; 433 + } 434 + } 435 + 436 + /* Pivot helper function for quicksort. */ 437 + static int HUF_quickSortPartition(nodeElt arr[], int const low, int const high) { 438 + /* Simply select rightmost element as pivot. "Better" selectors like 439 + * median-of-three don't experimentally appear to have any benefit. 
440 + */ 441 + U32 const pivot = arr[high].count; 442 + int i = low - 1; 443 + int j = low; 444 + for ( ; j < high; j++) { 445 + if (arr[j].count > pivot) { 446 + i++; 447 + HUF_swapNodes(&arr[i], &arr[j]); 448 + } 449 + } 450 + HUF_swapNodes(&arr[i + 1], &arr[high]); 451 + return i + 1; 452 + } 453 + 454 + /* Classic quicksort by descending with partially iterative calls 455 + * to reduce worst case callstack size. 456 + */ 457 + static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) { 458 + int const kInsertionSortThreshold = 8; 459 + if (high - low < kInsertionSortThreshold) { 460 + HUF_insertionSort(arr, low, high); 461 + return; 462 + } 463 + while (low < high) { 464 + int const idx = HUF_quickSortPartition(arr, low, high); 465 + if (idx - low < high - idx) { 466 + HUF_simpleQuickSort(arr, low, idx - 1); 467 + low = idx + 1; 468 + } else { 469 + HUF_simpleQuickSort(arr, idx + 1, high); 470 + high = idx - 1; 471 + } 472 + } 473 + } 474 + 441 475 /* 442 476 * HUF_sort(): 443 477 * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order. 478 + * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket. 444 479 * 445 480 * @param[out] huffNode Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled. 446 481 * Must have (maxSymbolValue + 1) entries. ··· 544 387 * @param[in] maxSymbolValue Maximum symbol value. 545 388 * @param rankPosition This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries. 546 389 */ 547 - static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymbolValue, rankPos* rankPosition) 548 - { 549 - int n; 550 - int const maxSymbolValue1 = (int)maxSymbolValue + 1; 390 + static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSymbolValue, rankPos rankPosition[]) { 391 + U32 n; 392 + U32 const maxSymbolValue1 = maxSymbolValue+1; 551 393 552 394 /* Compute base and set curr to base. 
553 - * For symbol s let lowerRank = BIT_highbit32(count[n]+1) and rank = lowerRank + 1. 554 - * Then 2^lowerRank <= count[n]+1 <= 2^rank. 395 + * For symbol s let lowerRank = HUF_getIndex(count[n]) and rank = lowerRank + 1. 396 + * See HUF_getIndex to see bucketing strategy. 555 397 * We attribute each symbol to lowerRank's base value, because we want to know where 556 398 * each rank begins in the output, so for rank R we want to count ranks R+1 and above. 557 399 */ 558 400 ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE); 559 401 for (n = 0; n < maxSymbolValue1; ++n) { 560 - U32 lowerRank = BIT_highbit32(count[n] + 1); 402 + U32 lowerRank = HUF_getIndex(count[n]); 403 + assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1); 561 404 rankPosition[lowerRank].base++; 562 405 } 406 + 563 407 assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base == 0); 408 + /* Set up the rankPosition table */ 564 409 for (n = RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) { 565 410 rankPosition[n-1].base += rankPosition[n].base; 566 411 rankPosition[n-1].curr = rankPosition[n-1].base; 567 412 } 568 - /* Sort */ 413 + 414 + /* Insert each symbol into their appropriate bucket, setting up rankPosition table. */ 569 415 for (n = 0; n < maxSymbolValue1; ++n) { 570 416 U32 const c = count[n]; 571 - U32 const r = BIT_highbit32(c+1) + 1; 572 - U32 pos = rankPosition[r].curr++; 573 - /* Insert into the correct position in the rank. 574 - * We have at most 256 symbols, so this insertion should be fine. 575 - */ 576 - while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)) { 577 - huffNode[pos] = huffNode[pos-1]; 578 - pos--; 579 - } 417 + U32 const r = HUF_getIndex(c) + 1; 418 + U32 const pos = rankPosition[r].curr++; 419 + assert(pos < maxSymbolValue1); 580 420 huffNode[pos].count = c; 581 421 huffNode[pos].byte = (BYTE)n; 582 422 } 583 - } 584 423 424 + /* Sort each bucket. 
*/ 425 + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { 426 + U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; 427 + U32 const bucketStartIdx = rankPosition[n].base; 428 + if (bucketSize > 1) { 429 + assert(bucketStartIdx < maxSymbolValue1); 430 + HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize-1); 431 + } 432 + } 433 + 434 + assert(HUF_isSorted(huffNode, maxSymbolValue1)); 435 + } 585 436 586 437 /* HUF_buildCTable_wksp() : 587 438 * Same as HUF_buildCTable(), but using externally allocated scratch buffer. ··· 652 487 */ 653 488 static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits) 654 489 { 490 + HUF_CElt* const ct = CTable + 1; 655 491 /* fill result into ctable (val, nbBits) */ 656 492 int n; 657 493 U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0}; ··· 668 502 min >>= 1; 669 503 } } 670 504 for (n=0; n<alphabetSize; n++) 671 - CTable[huffNode[n].byte].nbBits = huffNode[n].nbBits; /* push nbBits per symbol, symbol order */ 505 + HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */ 672 506 for (n=0; n<alphabetSize; n++) 673 - CTable[n].val = valPerRank[CTable[n].nbBits]++; /* assign value within rank, symbol order */ 507 + HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); /* assign value within rank, symbol order */ 508 + CTable[0] = maxNbBits; 674 509 } 675 510 676 - size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize) 511 + size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize) 677 512 { 678 - HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)workSpace; 513 + HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, 
&wkspSize, ZSTD_ALIGNOF(U32)); 679 514 nodeElt* const huffNode0 = wksp_tables->huffNodeTbl; 680 515 nodeElt* const huffNode = huffNode0+1; 681 516 int nonNullRank; 682 517 683 518 /* safety checks */ 684 - if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ 685 519 if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) 686 520 return ERROR(workSpace_tooSmall); 687 521 if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; ··· 699 533 maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); 700 534 if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ 701 535 702 - HUF_buildCTableFromTree(tree, huffNode, nonNullRank, maxSymbolValue, maxNbBits); 536 + HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue, maxNbBits); 703 537 704 538 return maxNbBits; 705 539 } 706 540 707 541 size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) 708 542 { 543 + HUF_CElt const* ct = CTable + 1; 709 544 size_t nbBits = 0; 710 545 int s; 711 546 for (s = 0; s <= (int)maxSymbolValue; ++s) { 712 - nbBits += CTable[s].nbBits * count[s]; 547 + nbBits += HUF_getNbBits(ct[s]) * count[s]; 713 548 } 714 549 return nbBits >> 3; 715 550 } 716 551 717 552 int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { 553 + HUF_CElt const* ct = CTable + 1; 718 554 int bad = 0; 719 555 int s; 720 556 for (s = 0; s <= (int)maxSymbolValue; ++s) { 721 - bad |= (count[s] != 0) & (CTable[s].nbBits == 0); 557 + bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); 722 558 } 723 559 return !bad; 724 560 } 725 561 726 562 size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } 727 563 728 - FORCE_INLINE_TEMPLATE void 729 - HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable) 564 + /* HUF_CStream_t: 565 + * Huffman uses its own BIT_CStream_t implementation. 
566 + * There are three major differences from BIT_CStream_t: 567 + * 1. HUF_addBits() takes a HUF_CElt (size_t) which is 568 + * the pair (nbBits, value) in the format: 569 + * format: 570 + * - Bits [0, 4) = nbBits 571 + * - Bits [4, 64 - nbBits) = 0 572 + * - Bits [64 - nbBits, 64) = value 573 + * 2. The bitContainer is built from the upper bits and 574 + * right shifted. E.g. to add a new value of N bits 575 + * you right shift the bitContainer by N, then or in 576 + * the new value into the N upper bits. 577 + * 3. The bitstream has two bit containers. You can add 578 + * bits to the second container and merge them into 579 + * the first container. 580 + */ 581 + 582 + #define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8) 583 + 584 + typedef struct { 585 + size_t bitContainer[2]; 586 + size_t bitPos[2]; 587 + 588 + BYTE* startPtr; 589 + BYTE* ptr; 590 + BYTE* endPtr; 591 + } HUF_CStream_t; 592 + 593 + /*! HUF_initCStream(): 594 + * Initializes the bitstream. 595 + * @returns 0 or an error code. 596 + */ 597 + static size_t HUF_initCStream(HUF_CStream_t* bitC, 598 + void* startPtr, size_t dstCapacity) 730 599 { 731 - BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); 600 + ZSTD_memset(bitC, 0, sizeof(*bitC)); 601 + bitC->startPtr = (BYTE*)startPtr; 602 + bitC->ptr = bitC->startPtr; 603 + bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer[0]); 604 + if (dstCapacity <= sizeof(bitC->bitContainer[0])) return ERROR(dstSize_tooSmall); 605 + return 0; 732 606 } 733 607 734 - #define HUF_FLUSHBITS(s) BIT_flushBits(s) 608 + /*! HUF_addBits(): 609 + * Adds the symbol stored in HUF_CElt elt to the bitstream. 610 + * 611 + * @param elt The element we're adding. This is a (nbBits, value) pair. 612 + * See the HUF_CStream_t docs for the format. 613 + * @param idx Insert into the bitstream at this idx. 614 + * @param kFast This is a template parameter. 
If the bitstream is guaranteed 615 + * to have at least 4 unused bits after this call it may be 1, 616 + * otherwise it must be 0. HUF_addBits() is faster when fast is set. 617 + */ 618 + FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int idx, int kFast) 619 + { 620 + assert(idx <= 1); 621 + assert(HUF_getNbBits(elt) <= HUF_TABLELOG_ABSOLUTEMAX); 622 + /* This is efficient on x86-64 with BMI2 because shrx 623 + * only reads the low 6 bits of the register. The compiler 624 + * knows this and elides the mask. When fast is set, 625 + * every operation can use the same value loaded from elt. 626 + */ 627 + bitC->bitContainer[idx] >>= HUF_getNbBits(elt); 628 + bitC->bitContainer[idx] |= kFast ? HUF_getValueFast(elt) : HUF_getValue(elt); 629 + /* We only read the low 8 bits of bitC->bitPos[idx] so it 630 + * doesn't matter that the high bits have noise from the value. 631 + */ 632 + bitC->bitPos[idx] += HUF_getNbBitsFast(elt); 633 + assert((bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER); 634 + /* The last 4-bits of elt are dirty if fast is set, 635 + * so we must not be overwriting bits that have already been 636 + * inserted into the bit container. 637 + */ 638 + #if DEBUGLEVEL >= 1 639 + { 640 + size_t const nbBits = HUF_getNbBits(elt); 641 + size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; 642 + (void)dirtyBits; 643 + /* Middle bits are 0. */ 644 + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); 645 + /* We didn't overwrite any bits in the bit container. 
*/ 646 + assert(!kFast || (bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER); 647 + (void)dirtyBits; 648 + } 649 + #endif 650 + } 735 651 736 - #define HUF_FLUSHBITS_1(stream) \ 737 - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream) 652 + FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC) 653 + { 654 + bitC->bitContainer[1] = 0; 655 + bitC->bitPos[1] = 0; 656 + } 738 657 739 - #define HUF_FLUSHBITS_2(stream) \ 740 - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream) 658 + /*! HUF_mergeIndex1() : 659 + * Merges the bit container @ index 1 into the bit container @ index 0 660 + * and zeros the bit container @ index 1. 661 + */ 662 + FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC) 663 + { 664 + assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER); 665 + bitC->bitContainer[0] >>= (bitC->bitPos[1] & 0xFF); 666 + bitC->bitContainer[0] |= bitC->bitContainer[1]; 667 + bitC->bitPos[0] += bitC->bitPos[1]; 668 + assert((bitC->bitPos[0] & 0xFF) <= HUF_BITS_IN_CONTAINER); 669 + } 670 + 671 + /*! HUF_flushBits() : 672 + * Flushes the bits in the bit container @ index 0. 673 + * 674 + * @post bitPos will be < 8. 675 + * @param kFast If kFast is set then we must know a-priori that 676 + * the bit container will not overflow. 677 + */ 678 + FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast) 679 + { 680 + /* The upper bits of bitPos are noisy, so we must mask by 0xFF. */ 681 + size_t const nbBits = bitC->bitPos[0] & 0xFF; 682 + size_t const nbBytes = nbBits >> 3; 683 + /* The top nbBits bits of bitContainer are the ones we need. */ 684 + size_t const bitContainer = bitC->bitContainer[0] >> (HUF_BITS_IN_CONTAINER - nbBits); 685 + /* Mask bitPos to account for the bytes we consumed. 
*/ 686 + bitC->bitPos[0] &= 7; 687 + assert(nbBits > 0); 688 + assert(nbBits <= sizeof(bitC->bitContainer[0]) * 8); 689 + assert(bitC->ptr <= bitC->endPtr); 690 + MEM_writeLEST(bitC->ptr, bitContainer); 691 + bitC->ptr += nbBytes; 692 + assert(!kFast || bitC->ptr <= bitC->endPtr); 693 + if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr; 694 + /* bitContainer doesn't need to be modified because the leftover 695 + * bits are already the top bitPos bits. And we don't care about 696 + * noise in the lower values. 697 + */ 698 + } 699 + 700 + /*! HUF_endMark() 701 + * @returns The Huffman stream end mark: A 1-bit value = 1. 702 + */ 703 + static HUF_CElt HUF_endMark(void) 704 + { 705 + HUF_CElt endMark; 706 + HUF_setNbBits(&endMark, 1); 707 + HUF_setValue(&endMark, 1); 708 + return endMark; 709 + } 710 + 711 + /*! HUF_closeCStream() : 712 + * @return Size of CStream, in bytes, 713 + * or 0 if it could not fit into dstBuffer */ 714 + static size_t HUF_closeCStream(HUF_CStream_t* bitC) 715 + { 716 + HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0); 717 + HUF_flushBits(bitC, /* kFast */ 0); 718 + { 719 + size_t const nbBits = bitC->bitPos[0] & 0xFF; 720 + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ 721 + return (bitC->ptr - bitC->startPtr) + (nbBits > 0); 722 + } 723 + } 724 + 725 + FORCE_INLINE_TEMPLATE void 726 + HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable, int idx, int fast) 727 + { 728 + HUF_addBits(bitCPtr, CTable[symbol], idx, fast); 729 + } 730 + 731 + FORCE_INLINE_TEMPLATE void 732 + HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC, 733 + const BYTE* ip, size_t srcSize, 734 + const HUF_CElt* ct, 735 + int kUnroll, int kFastFlush, int kLastFast) 736 + { 737 + /* Join to kUnroll */ 738 + int n = (int)srcSize; 739 + int rem = n % kUnroll; 740 + if (rem > 0) { 741 + for (; rem > 0; --rem) { 742 + HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0); 743 + } 744 + 
HUF_flushBits(bitC, kFastFlush); 745 + } 746 + assert(n % kUnroll == 0); 747 + 748 + /* Join to 2 * kUnroll */ 749 + if (n % (2 * kUnroll)) { 750 + int u; 751 + for (u = 1; u < kUnroll; ++u) { 752 + HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1); 753 + } 754 + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast); 755 + HUF_flushBits(bitC, kFastFlush); 756 + n -= kUnroll; 757 + } 758 + assert(n % (2 * kUnroll) == 0); 759 + 760 + for (; n>0; n-= 2 * kUnroll) { 761 + /* Encode kUnroll symbols into the bitstream @ index 0. */ 762 + int u; 763 + for (u = 1; u < kUnroll; ++u) { 764 + HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ 1); 765 + } 766 + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast */ kLastFast); 767 + HUF_flushBits(bitC, kFastFlush); 768 + /* Encode kUnroll symbols into the bitstream @ index 1. 769 + * This allows us to start filling the bit container 770 + * without any data dependencies. 771 + */ 772 + HUF_zeroIndex1(bitC); 773 + for (u = 1; u < kUnroll; ++u) { 774 + HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /* fast */ 1); 775 + } 776 + HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1, /* fast */ kLastFast); 777 + /* Merge bitstream @ index 1 into the bitstream @ index 0 */ 778 + HUF_mergeIndex1(bitC); 779 + HUF_flushBits(bitC, kFastFlush); 780 + } 781 + assert(n == 0); 782 + 783 + } 784 + 785 + /* 786 + * Returns a tight upper bound on the output space needed by Huffman 787 + * with 8 bytes buffer to handle over-writes. If the output is at least 788 + * this large we don't need to do bounds checks during Huffman encoding. 
789 + */ 790 + static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog) 791 + { 792 + return ((srcSize * tableLog) >> 3) + 8; 793 + } 794 + 741 795 742 796 FORCE_INLINE_TEMPLATE size_t 743 797 HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, 744 798 const void* src, size_t srcSize, 745 799 const HUF_CElt* CTable) 746 800 { 801 + U32 const tableLog = (U32)CTable[0]; 802 + HUF_CElt const* ct = CTable + 1; 747 803 const BYTE* ip = (const BYTE*) src; 748 804 BYTE* const ostart = (BYTE*)dst; 749 805 BYTE* const oend = ostart + dstSize; 750 806 BYTE* op = ostart; 751 - size_t n; 752 - BIT_CStream_t bitC; 807 + HUF_CStream_t bitC; 753 808 754 809 /* init */ 755 810 if (dstSize < 8) return 0; /* not enough space to compress */ 756 - { size_t const initErr = BIT_initCStream(&bitC, op, (size_t)(oend-op)); 811 + { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); 757 812 if (HUF_isError(initErr)) return 0; } 758 813 759 - n = srcSize & ~3; /* join to mod 4 */ 760 - switch (srcSize & 3) 761 - { 762 - case 3: 763 - HUF_encodeSymbol(&bitC, ip[n+ 2], CTable); 764 - HUF_FLUSHBITS_2(&bitC); 765 - ZSTD_FALLTHROUGH; 766 - case 2: 767 - HUF_encodeSymbol(&bitC, ip[n+ 1], CTable); 768 - HUF_FLUSHBITS_1(&bitC); 769 - ZSTD_FALLTHROUGH; 770 - case 1: 771 - HUF_encodeSymbol(&bitC, ip[n+ 0], CTable); 772 - HUF_FLUSHBITS(&bitC); 773 - ZSTD_FALLTHROUGH; 774 - case 0: ZSTD_FALLTHROUGH; 775 - default: break; 814 + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) 815 + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ MEM_32bits() ? 
2 : 4, /* kFast */ 0, /* kLastFast */ 0); 816 + else { 817 + if (MEM_32bits()) { 818 + switch (tableLog) { 819 + case 11: 820 + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0); 821 + break; 822 + case 10: ZSTD_FALLTHROUGH; 823 + case 9: ZSTD_FALLTHROUGH; 824 + case 8: 825 + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1); 826 + break; 827 + case 7: ZSTD_FALLTHROUGH; 828 + default: 829 + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1); 830 + break; 831 + } 832 + } else { 833 + switch (tableLog) { 834 + case 11: 835 + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0); 836 + break; 837 + case 10: 838 + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1); 839 + break; 840 + case 9: 841 + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0); 842 + break; 843 + case 8: 844 + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0); 845 + break; 846 + case 7: 847 + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0); 848 + break; 849 + case 6: ZSTD_FALLTHROUGH; 850 + default: 851 + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1); 852 + break; 853 + } 854 + } 776 855 } 856 + assert(bitC.ptr <= bitC.endPtr); 777 857 778 - for (; n>0; n-=4) { /* note : n&3==0 at this stage */ 779 - HUF_encodeSymbol(&bitC, ip[n- 1], CTable); 780 - HUF_FLUSHBITS_1(&bitC); 781 - HUF_encodeSymbol(&bitC, ip[n- 2], CTable); 782 - 
HUF_FLUSHBITS_2(&bitC); 783 - HUF_encodeSymbol(&bitC, ip[n- 3], CTable); 784 - HUF_FLUSHBITS_1(&bitC); 785 - HUF_encodeSymbol(&bitC, ip[n- 4], CTable); 786 - HUF_FLUSHBITS(&bitC); 787 - } 788 - 789 - return BIT_closeCStream(&bitC); 858 + return HUF_closeCStream(&bitC); 790 859 } 791 860 792 861 #if DYNAMIC_BMI2 793 862 794 - static TARGET_ATTRIBUTE("bmi2") size_t 863 + static BMI2_TARGET_ATTRIBUTE size_t 795 864 HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize, 796 865 const void* src, size_t srcSize, 797 866 const HUF_CElt* CTable) ··· 1068 667 1069 668 size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) 1070 669 { 1071 - return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); 670 + return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); 1072 671 } 1073 672 673 + size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) 674 + { 675 + return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); 676 + } 1074 677 1075 678 static size_t 1076 679 HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, ··· 1094 689 1095 690 assert(op <= oend); 1096 691 { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); 1097 - if (cSize==0) return 0; 1098 - assert(cSize <= 65535); 692 + if (cSize == 0 || cSize > 65535) return 0; 1099 693 MEM_writeLE16(ostart, (U16)cSize); 1100 694 op += cSize; 1101 695 } ··· 1102 698 ip += segmentSize; 1103 699 assert(op <= oend); 1104 700 { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); 1105 - if (cSize==0) return 0; 1106 - assert(cSize <= 65535); 701 + if (cSize == 0 || cSize > 65535) return 0; 1107 702 MEM_writeLE16(ostart+2, (U16)cSize); 1108 703 op += cSize; 1109 704 } 
··· 1110 707 ip += segmentSize; 1111 708 assert(op <= oend); 1112 709 { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); 1113 - if (cSize==0) return 0; 1114 - assert(cSize <= 65535); 710 + if (cSize == 0 || cSize > 65535) return 0; 1115 711 MEM_writeLE16(ostart+4, (U16)cSize); 1116 712 op += cSize; 1117 713 } ··· 1119 717 assert(op <= oend); 1120 718 assert(ip <= iend); 1121 719 { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); 1122 - if (cSize==0) return 0; 720 + if (cSize == 0 || cSize > 65535) return 0; 1123 721 op += cSize; 1124 722 } 1125 723 ··· 1128 726 1129 727 size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) 1130 728 { 1131 - return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); 729 + return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); 730 + } 731 + 732 + size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) 733 + { 734 + return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); 1132 735 } 1133 736 1134 737 typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; ··· 1157 750 1158 751 typedef struct { 1159 752 unsigned count[HUF_SYMBOLVALUE_MAX + 1]; 1160 - HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1]; 753 + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)]; 1161 754 union { 1162 755 HUF_buildCTable_wksp_tables buildCTable_wksp; 1163 756 HUF_WriteCTableWksp writeCTable_wksp; 757 + U32 hist_wksp[HIST_WKSP_SIZE_U32]; 1164 758 } wksps; 1165 759 } HUF_compress_tables_t; 1166 760 761 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 762 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ 763 + 1167 764 /* HUF_compress_internal() : 1168 765 * 
`workSpace_align4` must be aligned on 4-bytes boundaries, 1169 - * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U32 unsigned */ 766 + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ 1170 767 static size_t 1171 768 HUF_compress_internal (void* dst, size_t dstSize, 1172 769 const void* src, size_t srcSize, 1173 770 unsigned maxSymbolValue, unsigned huffLog, 1174 771 HUF_nbStreams_e nbStreams, 1175 - void* workSpace_align4, size_t wkspSize, 772 + void* workSpace, size_t wkspSize, 1176 773 HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, 1177 - const int bmi2) 774 + const int bmi2, unsigned suspectUncompressible) 1178 775 { 1179 - HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace_align4; 776 + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); 1180 777 BYTE* const ostart = (BYTE*)dst; 1181 778 BYTE* const oend = ostart + dstSize; 1182 779 BYTE* op = ostart; 1183 780 1184 - HUF_STATIC_ASSERT(sizeof(*table) <= HUF_WORKSPACE_SIZE); 1185 - assert(((size_t)workSpace_align4 & 3) == 0); /* must be aligned on 4-bytes boundaries */ 781 + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); 1186 782 1187 783 /* checks & inits */ 1188 - if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall); 784 + if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall); 1189 785 if (!srcSize) return 0; /* Uncompressed */ 1190 786 if (!dstSize) return 0; /* cannot fit anything within dst budget */ 1191 787 if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */ ··· 1204 794 nbStreams, oldHufTable, bmi2); 1205 795 } 1206 796 797 + /* If uncompressible data is suspected, do a smaller sampling first */ 798 + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); 799 + if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * 
SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { 800 + size_t largestTotal = 0; 801 + { unsigned maxSymbolValueBegin = maxSymbolValue; 802 + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); 803 + largestTotal += largestBegin; 804 + } 805 + { unsigned maxSymbolValueEnd = maxSymbolValue; 806 + CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); 807 + largestTotal += largestEnd; 808 + } 809 + if (largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7)+4) return 0; /* heuristic : probably not compressible enough */ 810 + } 811 + 1207 812 /* Scan input and build symbol stats */ 1208 - { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace_align4, wkspSize) ); 813 + { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.hist_wksp)) ); 1209 814 if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ 1210 815 if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ 1211 816 } ··· 1245 820 &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); 1246 821 CHECK_F(maxBits); 1247 822 huffLog = (U32)maxBits; 1248 - /* Zero unused symbols in CTable, so we can check it for validity */ 1249 - ZSTD_memset(table->CTable + (maxSymbolValue + 1), 0, 1250 - sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt))); 823 + } 824 + /* Zero unused symbols in CTable, so we can check it for validity */ 825 + { 826 + size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); 827 + size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); 828 + ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); 1251 829 } 1252 830 1253 831 
/* Write table description header */ ··· 1287 859 return HUF_compress_internal(dst, dstSize, src, srcSize, 1288 860 maxSymbolValue, huffLog, HUF_singleStream, 1289 861 workSpace, wkspSize, 1290 - NULL, NULL, 0, 0 /*bmi2*/); 862 + NULL, NULL, 0, 0 /*bmi2*/, 0); 1291 863 } 1292 864 1293 865 size_t HUF_compress1X_repeat (void* dst, size_t dstSize, 1294 866 const void* src, size_t srcSize, 1295 867 unsigned maxSymbolValue, unsigned huffLog, 1296 868 void* workSpace, size_t wkspSize, 1297 - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) 869 + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, 870 + int bmi2, unsigned suspectUncompressible) 1298 871 { 1299 872 return HUF_compress_internal(dst, dstSize, src, srcSize, 1300 873 maxSymbolValue, huffLog, HUF_singleStream, 1301 874 workSpace, wkspSize, hufTable, 1302 - repeat, preferRepeat, bmi2); 875 + repeat, preferRepeat, bmi2, suspectUncompressible); 1303 876 } 1304 877 1305 878 /* HUF_compress4X_repeat(): ··· 1314 885 return HUF_compress_internal(dst, dstSize, src, srcSize, 1315 886 maxSymbolValue, huffLog, HUF_fourStreams, 1316 887 workSpace, wkspSize, 1317 - NULL, NULL, 0, 0 /*bmi2*/); 888 + NULL, NULL, 0, 0 /*bmi2*/, 0); 1318 889 } 1319 890 1320 891 /* HUF_compress4X_repeat(): 1321 892 * compress input using 4 streams. 
893 + * consider skipping quickly 1322 894 * re-use an existing huffman compression table */ 1323 895 size_t HUF_compress4X_repeat (void* dst, size_t dstSize, 1324 896 const void* src, size_t srcSize, 1325 897 unsigned maxSymbolValue, unsigned huffLog, 1326 898 void* workSpace, size_t wkspSize, 1327 - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) 899 + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) 1328 900 { 1329 901 return HUF_compress_internal(dst, dstSize, src, srcSize, 1330 902 maxSymbolValue, huffLog, HUF_fourStreams, 1331 903 workSpace, wkspSize, 1332 - hufTable, repeat, preferRepeat, bmi2); 904 + hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); 1333 905 } 1334 906
+1511 -493
lib/zstd/compress/zstd_compress.c
··· 12 12 * Dependencies 13 13 ***************************************/ 14 14 #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ 15 - #include "../common/cpu.h" 16 15 #include "../common/mem.h" 17 16 #include "hist.h" /* HIST_countFast_wksp */ 18 17 #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ ··· 38 39 * Note that functions with explicit context such as ZSTD_compressCCtx() are unaffected. 39 40 */ 40 41 42 + /*! 43 + * ZSTD_HASHLOG3_MAX : 44 + * Maximum size of the hash table dedicated to find 3-bytes matches, 45 + * in log format, aka 17 => 1 << 17 == 128Ki positions. 46 + * This structure is only used in zstd_opt. 47 + * Since allocation is centralized for all strategies, it has to be known here. 48 + * The actual (selected) size of the hash table is then stored in ZSTD_matchState_t.hashLog3, 49 + * so that zstd_opt.c doesn't need to know about this constant. 50 + */ 51 + #ifndef ZSTD_HASHLOG3_MAX 52 + # define ZSTD_HASHLOG3_MAX 17 53 + #endif 41 54 42 55 /*-************************************* 43 56 * Helper functions ··· 80 69 ZSTD_customMem customMem; 81 70 U32 dictID; 82 71 int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */ 72 + ZSTD_paramSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use 73 + * row-based matchfinder. Unless the cdict is reloaded, we will use 74 + * the same greedy/lazy matchfinder at compression time. 
75 + */ 83 76 }; /* typedef'd to ZSTD_CDict within "zstd.h" */ 84 77 85 78 ZSTD_CCtx* ZSTD_createCCtx(void) ··· 96 81 assert(cctx != NULL); 97 82 ZSTD_memset(cctx, 0, sizeof(*cctx)); 98 83 cctx->customMem = memManager; 99 - cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); 84 + cctx->bmi2 = ZSTD_cpuSupportsBmi2(); 100 85 { size_t const err = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters); 101 86 assert(!ZSTD_isError(err)); 102 87 (void)err; ··· 207 192 /* private API call, for dictBuilder only */ 208 193 const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } 209 194 195 + /* Returns true if the strategy supports using a row based matchfinder */ 196 + static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { 197 + return (strategy >= ZSTD_greedy && strategy <= ZSTD_lazy2); 198 + } 199 + 200 + /* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder 201 + * for this compression. 202 + */ 203 + static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_paramSwitch_e mode) { 204 + assert(mode != ZSTD_ps_auto); 205 + return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_ps_enable); 206 + } 207 + 208 + /* Returns row matchfinder usage given an initial mode and cParams */ 209 + static ZSTD_paramSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_paramSwitch_e mode, 210 + const ZSTD_compressionParameters* const cParams) { 211 + #if defined(ZSTD_ARCH_X86_SSE2) || defined(ZSTD_ARCH_ARM_NEON) 212 + int const kHasSIMD128 = 1; 213 + #else 214 + int const kHasSIMD128 = 0; 215 + #endif 216 + if (mode != ZSTD_ps_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */ 217 + mode = ZSTD_ps_disable; 218 + if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode; 219 + if (kHasSIMD128) { 220 + if (cParams->windowLog > 14) mode = ZSTD_ps_enable; 221 + } else { 222 + if (cParams->windowLog > 17) mode = ZSTD_ps_enable; 223 + } 224 + 
return mode; 225 + } 226 + 227 + /* Returns block splitter usage (generally speaking, when using slower/stronger compression modes) */ 228 + static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode, 229 + const ZSTD_compressionParameters* const cParams) { 230 + if (mode != ZSTD_ps_auto) return mode; 231 + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17) ? ZSTD_ps_enable : ZSTD_ps_disable; 232 + } 233 + 234 + /* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */ 235 + static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, 236 + const ZSTD_paramSwitch_e useRowMatchFinder, 237 + const U32 forDDSDict) { 238 + assert(useRowMatchFinder != ZSTD_ps_auto); 239 + /* We always should allocate a chaintable if we are allocating a matchstate for a DDS dictionary matchstate. 240 + * We do not allocate a chaintable if we are using ZSTD_fast, or are using the row-based matchfinder. 241 + */ 242 + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); 243 + } 244 + 210 245 /* Returns 1 if compression parameters are such that we should 211 246 * enable long distance matching (wlog >= 27, strategy >= btopt). 212 247 * Returns 0 otherwise. 213 248 */ 214 - static U32 ZSTD_CParams_shouldEnableLdm(const ZSTD_compressionParameters* const cParams) { 215 - return cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27; 249 + static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, 250 + const ZSTD_compressionParameters* const cParams) { 251 + if (mode != ZSTD_ps_auto) return mode; 252 + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? 
ZSTD_ps_enable : ZSTD_ps_disable; 216 253 } 217 254 218 255 static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( ··· 275 208 ZSTD_CCtxParams_init(&cctxParams, ZSTD_CLEVEL_DEFAULT); 276 209 cctxParams.cParams = cParams; 277 210 278 - if (ZSTD_CParams_shouldEnableLdm(&cParams)) { 279 - DEBUGLOG(4, "ZSTD_makeCCtxParamsFromCParams(): Including LDM into cctx params"); 280 - cctxParams.ldmParams.enableLdm = 1; 281 - /* LDM is enabled by default for optimal parser and window size >= 128MB */ 211 + /* Adjust advanced params according to cParams */ 212 + cctxParams.ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams.ldmParams.enableLdm, &cParams); 213 + if (cctxParams.ldmParams.enableLdm == ZSTD_ps_enable) { 282 214 ZSTD_ldm_adjustParameters(&cctxParams.ldmParams, &cParams); 283 215 assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog); 284 216 assert(cctxParams.ldmParams.hashRateLog < 32); 285 217 } 286 - 218 + cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); 219 + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); 287 220 assert(!ZSTD_checkCParams(cParams)); 288 221 return cctxParams; 289 222 } ··· 342 275 * But, set it for tracing anyway. 
343 276 */ 344 277 cctxParams->compressionLevel = compressionLevel; 278 + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams); 279 + cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams); 280 + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams); 281 + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", 282 + cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); 345 283 } 346 284 347 285 size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params) ··· 503 431 return bounds; 504 432 505 433 case ZSTD_c_literalCompressionMode: 506 - ZSTD_STATIC_ASSERT(ZSTD_lcm_auto < ZSTD_lcm_huffman && ZSTD_lcm_huffman < ZSTD_lcm_uncompressed); 507 - bounds.lowerBound = ZSTD_lcm_auto; 508 - bounds.upperBound = ZSTD_lcm_uncompressed; 434 + ZSTD_STATIC_ASSERT(ZSTD_ps_auto < ZSTD_ps_enable && ZSTD_ps_enable < ZSTD_ps_disable); 435 + bounds.lowerBound = (int)ZSTD_ps_auto; 436 + bounds.upperBound = (int)ZSTD_ps_disable; 509 437 return bounds; 510 438 511 439 case ZSTD_c_targetCBlockSize: ··· 530 458 return bounds; 531 459 532 460 case ZSTD_c_validateSequences: 461 + bounds.lowerBound = 0; 462 + bounds.upperBound = 1; 463 + return bounds; 464 + 465 + case ZSTD_c_useBlockSplitter: 466 + bounds.lowerBound = (int)ZSTD_ps_auto; 467 + bounds.upperBound = (int)ZSTD_ps_disable; 468 + return bounds; 469 + 470 + case ZSTD_c_useRowMatchFinder: 471 + bounds.lowerBound = (int)ZSTD_ps_auto; 472 + bounds.upperBound = (int)ZSTD_ps_disable; 473 + return bounds; 474 + 475 + case ZSTD_c_deterministicRefPrefix: 533 476 bounds.lowerBound = 0; 534 477 bounds.upperBound = 1; 535 478 return bounds; ··· 610 523 case ZSTD_c_stableOutBuffer: 611 524 case ZSTD_c_blockDelimiters: 612 525 case ZSTD_c_validateSequences: 526 + case 
ZSTD_c_useBlockSplitter: 527 + case ZSTD_c_useRowMatchFinder: 528 + case ZSTD_c_deterministicRefPrefix: 613 529 default: 614 530 return 0; 615 531 } ··· 665 575 case ZSTD_c_stableOutBuffer: 666 576 case ZSTD_c_blockDelimiters: 667 577 case ZSTD_c_validateSequences: 578 + case ZSTD_c_useBlockSplitter: 579 + case ZSTD_c_useRowMatchFinder: 580 + case ZSTD_c_deterministicRefPrefix: 668 581 break; 669 582 670 583 default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); ··· 765 672 } 766 673 767 674 case ZSTD_c_literalCompressionMode : { 768 - const ZSTD_literalCompressionMode_e lcm = (ZSTD_literalCompressionMode_e)value; 675 + const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; 769 676 BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); 770 677 CCtxParams->literalCompressionMode = lcm; 771 678 return CCtxParams->literalCompressionMode; ··· 792 699 return CCtxParams->enableDedicatedDictSearch; 793 700 794 701 case ZSTD_c_enableLongDistanceMatching : 795 - CCtxParams->ldmParams.enableLdm = (value!=0); 702 + CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; 796 703 return CCtxParams->ldmParams.enableLdm; 797 704 798 705 case ZSTD_c_ldmHashLog : ··· 850 757 BOUNDCHECK(ZSTD_c_validateSequences, value); 851 758 CCtxParams->validateSequences = value; 852 759 return CCtxParams->validateSequences; 760 + 761 + case ZSTD_c_useBlockSplitter: 762 + BOUNDCHECK(ZSTD_c_useBlockSplitter, value); 763 + CCtxParams->useBlockSplitter = (ZSTD_paramSwitch_e)value; 764 + return CCtxParams->useBlockSplitter; 765 + 766 + case ZSTD_c_useRowMatchFinder: 767 + BOUNDCHECK(ZSTD_c_useRowMatchFinder, value); 768 + CCtxParams->useRowMatchFinder = (ZSTD_paramSwitch_e)value; 769 + return CCtxParams->useRowMatchFinder; 770 + 771 + case ZSTD_c_deterministicRefPrefix: 772 + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); 773 + CCtxParams->deterministicRefPrefix = !!value; 774 + return CCtxParams->deterministicRefPrefix; 853 775 854 776 default: RETURN_ERROR(parameter_unsupported, 
"unknown parameter"); 855 777 } ··· 971 863 case ZSTD_c_validateSequences : 972 864 *value = (int)CCtxParams->validateSequences; 973 865 break; 866 + case ZSTD_c_useBlockSplitter : 867 + *value = (int)CCtxParams->useBlockSplitter; 868 + break; 869 + case ZSTD_c_useRowMatchFinder : 870 + *value = (int)CCtxParams->useRowMatchFinder; 871 + break; 872 + case ZSTD_c_deterministicRefPrefix: 873 + *value = (int)CCtxParams->deterministicRefPrefix; 874 + break; 974 875 default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); 975 876 } 976 877 return 0; ··· 1006 889 return 0; 1007 890 } 1008 891 1009 - ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) 892 + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) 1010 893 { 1011 894 DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); 1012 895 RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, ··· 1086 969 return 0; 1087 970 } 1088 971 1089 - ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference( 972 + size_t ZSTD_CCtx_loadDictionary_byReference( 1090 973 ZSTD_CCtx* cctx, const void* dict, size_t dictSize) 1091 974 { 1092 975 return ZSTD_CCtx_loadDictionary_advanced( 1093 976 cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); 1094 977 } 1095 978 1096 - ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize) 979 + size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize) 1097 980 { 1098 981 return ZSTD_CCtx_loadDictionary_advanced( 1099 982 cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto); ··· 1263 1146 break; 1264 1147 case ZSTD_cpm_createCDict: 1265 1148 /* Assume a small source size when creating a dictionary 1266 - * with an unkown source size. 1149 + * with an unknown source size. 
1267 1150 */ 1268 1151 if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN) 1269 1152 srcSize = minSrcSize; ··· 1337 1220 srcSizeHint = CCtxParams->srcSizeHint; 1338 1221 } 1339 1222 cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode); 1340 - if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; 1223 + if (CCtxParams->ldmParams.enableLdm == ZSTD_ps_enable) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; 1341 1224 ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); 1342 1225 assert(!ZSTD_checkCParams(cParams)); 1343 1226 /* srcSizeHint == 0 means 0 */ ··· 1346 1229 1347 1230 static size_t 1348 1231 ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, 1232 + const ZSTD_paramSwitch_e useRowMatchFinder, 1233 + const U32 enableDedicatedDictSearch, 1349 1234 const U32 forCCtx) 1350 1235 { 1351 - size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog); 1236 + /* chain table size should be 0 for fast or row-hash strategies */ 1237 + size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, enableDedicatedDictSearch && !forCCtx) 1238 + ? ((size_t)1 << cParams->chainLog) 1239 + : 0; 1352 1240 size_t const hSize = ((size_t)1) << cParams->hashLog; 1353 1241 U32 const hashLog3 = (forCCtx && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; 1354 1242 size_t const h3Size = hashLog3 ? 
((size_t)1) << hashLog3 : 0; ··· 1363 1241 + hSize * sizeof(U32) 1364 1242 + h3Size * sizeof(U32); 1365 1243 size_t const optPotentialSpace = 1366 - ZSTD_cwksp_alloc_size((MaxML+1) * sizeof(U32)) 1367 - + ZSTD_cwksp_alloc_size((MaxLL+1) * sizeof(U32)) 1368 - + ZSTD_cwksp_alloc_size((MaxOff+1) * sizeof(U32)) 1369 - + ZSTD_cwksp_alloc_size((1<<Litbits) * sizeof(U32)) 1370 - + ZSTD_cwksp_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)) 1371 - + ZSTD_cwksp_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); 1244 + ZSTD_cwksp_aligned_alloc_size((MaxML+1) * sizeof(U32)) 1245 + + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32)) 1246 + + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32)) 1247 + + ZSTD_cwksp_aligned_alloc_size((1<<Litbits) * sizeof(U32)) 1248 + + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)) 1249 + + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); 1250 + size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder) 1251 + ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) 1252 + : 0; 1372 1253 size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) 1373 1254 ? 
optPotentialSpace 1374 1255 : 0; 1256 + size_t const slackSpace = ZSTD_cwksp_slack_space_required(); 1257 + 1258 + /* tables are guaranteed to be sized in multiples of 64 bytes (or 16 uint32_t) */ 1259 + ZSTD_STATIC_ASSERT(ZSTD_HASHLOG_MIN >= 4 && ZSTD_WINDOWLOG_MIN >= 4 && ZSTD_CHAINLOG_MIN >= 4); 1260 + assert(useRowMatchFinder != ZSTD_ps_auto); 1261 + 1375 1262 DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u", 1376 1263 (U32)chainSize, (U32)hSize, (U32)h3Size); 1377 - return tableSpace + optSpace; 1264 + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; 1378 1265 } 1379 1266 1380 1267 static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( 1381 1268 const ZSTD_compressionParameters* cParams, 1382 1269 const ldmParams_t* ldmParams, 1383 1270 const int isStatic, 1271 + const ZSTD_paramSwitch_e useRowMatchFinder, 1384 1272 const size_t buffInSize, 1385 1273 const size_t buffOutSize, 1386 1274 const U64 pledgedSrcSize) 1387 1275 { 1388 - size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << cParams->windowLog), pledgedSrcSize)); 1276 + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); 1389 1277 size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); 1390 1278 U32 const divider = (cParams->minMatch==3) ? 
3 : 4; 1391 1279 size_t const maxNbSeq = blockSize / divider; 1392 1280 size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) 1393 - + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqDef)) 1281 + + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) 1394 1282 + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); 1395 1283 size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE); 1396 1284 size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); 1397 - size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, /* forCCtx */ 1); 1285 + size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1); 1398 1286 1399 1287 size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams); 1400 1288 size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize); 1401 - size_t const ldmSeqSpace = ldmParams->enableLdm ? 1402 - ZSTD_cwksp_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; 1289 + size_t const ldmSeqSpace = ldmParams->enableLdm == ZSTD_ps_enable ? 1290 + ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; 1403 1291 1404 1292 1405 1293 size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) ··· 1435 1303 { 1436 1304 ZSTD_compressionParameters const cParams = 1437 1305 ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); 1306 + ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, 1307 + &cParams); 1438 1308 1439 1309 RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); 1440 1310 /* estimateCCtxSize is for one-shot compression. So no buffers should 1441 1311 * be needed. However, we still allocate two 0-sized buffers, which can 1442 1312 * take space under ASAN. 
*/ 1443 1313 return ZSTD_estimateCCtxSize_usingCCtxParams_internal( 1444 - &cParams, &params->ldmParams, 1, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); 1314 + &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); 1445 1315 } 1446 1316 1447 1317 size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) 1448 1318 { 1449 - ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams); 1450 - return ZSTD_estimateCCtxSize_usingCCtxParams(&params); 1319 + ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams); 1320 + if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { 1321 + /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */ 1322 + size_t noRowCCtxSize; 1323 + size_t rowCCtxSize; 1324 + initialParams.useRowMatchFinder = ZSTD_ps_disable; 1325 + noRowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); 1326 + initialParams.useRowMatchFinder = ZSTD_ps_enable; 1327 + rowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); 1328 + return MAX(noRowCCtxSize, rowCCtxSize); 1329 + } else { 1330 + return ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); 1331 + } 1451 1332 } 1452 1333 1453 1334 static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel) ··· 1500 1355 size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered) 1501 1356 ? 
ZSTD_compressBound(blockSize) + 1 1502 1357 : 0; 1358 + ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &params->cParams); 1503 1359 1504 1360 return ZSTD_estimateCCtxSize_usingCCtxParams_internal( 1505 - &cParams, &params->ldmParams, 1, inBuffSize, outBuffSize, 1361 + &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, 1506 1362 ZSTD_CONTENTSIZE_UNKNOWN); 1507 1363 } 1508 1364 } 1509 1365 1510 1366 size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams) 1511 1367 { 1512 - ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams); 1513 - return ZSTD_estimateCStreamSize_usingCCtxParams(&params); 1368 + ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams); 1369 + if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { 1370 + /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */ 1371 + size_t noRowCCtxSize; 1372 + size_t rowCCtxSize; 1373 + initialParams.useRowMatchFinder = ZSTD_ps_disable; 1374 + noRowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); 1375 + initialParams.useRowMatchFinder = ZSTD_ps_enable; 1376 + rowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); 1377 + return MAX(noRowCCtxSize, rowCCtxSize); 1378 + } else { 1379 + return ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); 1380 + } 1514 1381 } 1515 1382 1516 1383 static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) ··· 1637 1480 ZSTD_resetTarget_CCtx 1638 1481 } ZSTD_resetTarget_e; 1639 1482 1483 + 1640 1484 static size_t 1641 1485 ZSTD_reset_matchState(ZSTD_matchState_t* ms, 1642 1486 ZSTD_cwksp* ws, 1643 1487 const ZSTD_compressionParameters* cParams, 1488 + const ZSTD_paramSwitch_e useRowMatchFinder, 1644 1489 const ZSTD_compResetPolicy_e crp, 1645 1490 const ZSTD_indexResetPolicy_e forceResetIndex, 1646 1491 const ZSTD_resetTarget_e forWho) 1647 1492 { 1648 - 
size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog); 1493 + /* disable chain table allocation for fast or row-based strategies */ 1494 + size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, 1495 + ms->dedicatedDictSearch && (forWho == ZSTD_resetTarget_CDict)) 1496 + ? ((size_t)1 << cParams->chainLog) 1497 + : 0; 1649 1498 size_t const hSize = ((size_t)1) << cParams->hashLog; 1650 1499 U32 const hashLog3 = ((forWho == ZSTD_resetTarget_CCtx) && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; 1651 1500 size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0; 1652 1501 1653 1502 DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset); 1503 + assert(useRowMatchFinder != ZSTD_ps_auto); 1654 1504 if (forceResetIndex == ZSTDirp_reset) { 1655 1505 ZSTD_window_init(&ms->window); 1656 1506 ZSTD_cwksp_mark_tables_dirty(ws); ··· 1696 1532 ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); 1697 1533 } 1698 1534 1535 + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { 1536 + { /* Row match finder needs an additional table of hashes ("tags") */ 1537 + size_t const tagTableSize = hSize*sizeof(U16); 1538 + ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); 1539 + if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); 1540 + } 1541 + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ 1542 + U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); 1543 + assert(cParams->hashLog >= rowLog); 1544 + ms->rowHashLog = cParams->hashLog - rowLog; 1545 + } 1546 + } 1547 + 1699 1548 ms->cParams = *cParams; 1700 1549 1701 1550 RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, 1702 1551 "failed a workspace allocation in ZSTD_reset_matchState"); 1703 - 1704 1552 return 0; 1705 1553 } 1706 1554 ··· 1729 1553 return (size_t)(w.nextSrc - w.base) > 
(ZSTD_CURRENT_MAX - ZSTD_INDEXOVERFLOW_MARGIN); 1730 1554 } 1731 1555 1556 + /* ZSTD_dictTooBig(): 1557 + * When dictionaries are larger than ZSTD_CHUNKSIZE_MAX they can't be loaded in 1558 + * one go generically. So we ensure that in that case we reset the tables to zero, 1559 + * so that we can load as much of the dictionary as possible. 1560 + */ 1561 + static int ZSTD_dictTooBig(size_t const loadedDictSize) 1562 + { 1563 + return loadedDictSize > ZSTD_CHUNKSIZE_MAX; 1564 + } 1565 + 1732 1566 /*! ZSTD_resetCCtx_internal() : 1733 - note : `params` are assumed fully validated at this stage */ 1567 + * @param loadedDictSize The size of the dictionary to be loaded 1568 + * into the context, if any. If no dictionary is used, or the 1569 + * dictionary is being attached / copied, then pass 0. 1570 + * note : `params` are assumed fully validated at this stage. 1571 + */ 1734 1572 static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, 1735 - ZSTD_CCtx_params params, 1573 + ZSTD_CCtx_params const* params, 1736 1574 U64 const pledgedSrcSize, 1575 + size_t const loadedDictSize, 1737 1576 ZSTD_compResetPolicy_e const crp, 1738 1577 ZSTD_buffered_policy_e const zbuff) 1739 1578 { 1740 1579 ZSTD_cwksp* const ws = &zc->workspace; 1741 - DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u", 1742 - (U32)pledgedSrcSize, params.cParams.windowLog); 1743 - assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); 1580 + DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d", 1581 + (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->useBlockSplitter); 1582 + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); 1744 1583 1745 1584 zc->isFirstBlock = 1; 1746 1585 1747 - if (params.ldmParams.enableLdm) { 1586 + /* Set applied params early so we can modify them for LDM, 1587 + * and point params at the applied params. 
1588 + */ 1589 + zc->appliedParams = *params; 1590 + params = &zc->appliedParams; 1591 + 1592 + assert(params->useRowMatchFinder != ZSTD_ps_auto); 1593 + assert(params->useBlockSplitter != ZSTD_ps_auto); 1594 + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); 1595 + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { 1748 1596 /* Adjust long distance matching parameters */ 1749 - ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams); 1750 - assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog); 1751 - assert(params.ldmParams.hashRateLog < 32); 1597 + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams); 1598 + assert(params->ldmParams.hashLog >= params->ldmParams.bucketSizeLog); 1599 + assert(params->ldmParams.hashRateLog < 32); 1752 1600 } 1753 1601 1754 - { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize)); 1602 + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); 1755 1603 size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); 1756 - U32 const divider = (params.cParams.minMatch==3) ? 3 : 4; 1604 + U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; 1757 1605 size_t const maxNbSeq = blockSize / divider; 1758 - size_t const buffOutSize = (zbuff == ZSTDb_buffered && params.outBufferMode == ZSTD_bm_buffered) 1606 + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) 1759 1607 ? ZSTD_compressBound(blockSize) + 1 1760 1608 : 0; 1761 - size_t const buffInSize = (zbuff == ZSTDb_buffered && params.inBufferMode == ZSTD_bm_buffered) 1609 + size_t const buffInSize = (zbuff == ZSTDb_buffered && params->inBufferMode == ZSTD_bm_buffered) 1762 1610 ? 
windowSize + blockSize 1763 1611 : 0; 1764 - size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params.ldmParams, blockSize); 1612 + size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize); 1765 1613 1766 1614 int const indexTooClose = ZSTD_indexTooCloseToMax(zc->blockState.matchState.window); 1615 + int const dictTooBig = ZSTD_dictTooBig(loadedDictSize); 1767 1616 ZSTD_indexResetPolicy_e needsIndexReset = 1768 - (!indexTooClose && zc->initialized) ? ZSTDirp_continue : ZSTDirp_reset; 1617 + (indexTooClose || dictTooBig || !zc->initialized) ? ZSTDirp_reset : ZSTDirp_continue; 1769 1618 1770 1619 size_t const neededSpace = 1771 1620 ZSTD_estimateCCtxSize_usingCCtxParams_internal( 1772 - &params.cParams, &params.ldmParams, zc->staticSize != 0, 1621 + &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, 1773 1622 buffInSize, buffOutSize, pledgedSrcSize); 1623 + int resizeWorkspace; 1624 + 1774 1625 FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); 1775 1626 1776 1627 if (!zc->staticSize) ZSTD_cwksp_bump_oversized_duration(ws, 0); 1777 1628 1778 - /* Check if workspace is large enough, alloc a new one if needed */ 1779 - { 1629 + { /* Check if workspace is large enough, alloc a new one if needed */ 1780 1630 int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; 1781 1631 int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); 1782 - 1632 + resizeWorkspace = workspaceTooSmall || workspaceWasteful; 1783 1633 DEBUGLOG(4, "Need %zu B workspace", neededSpace); 1784 1634 DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); 1785 1635 1786 - if (workspaceTooSmall || workspaceWasteful) { 1636 + if (resizeWorkspace) { 1787 1637 DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB", 1788 1638 ZSTD_cwksp_sizeof(ws) >> 10, 1789 1639 neededSpace >> 10); ··· 1831 1629 zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, 
sizeof(ZSTD_compressedBlockState_t)); 1832 1630 RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock"); 1833 1631 zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, ENTROPY_WORKSPACE_SIZE); 1834 - RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate entropyWorkspace"); 1632 + RETURN_ERROR_IF(zc->entropyWorkspace == NULL, memory_allocation, "couldn't allocate entropyWorkspace"); 1835 1633 } } 1836 1634 1837 1635 ZSTD_cwksp_clear(ws); 1838 1636 1839 1637 /* init params */ 1840 - zc->appliedParams = params; 1841 - zc->blockState.matchState.cParams = params.cParams; 1638 + zc->blockState.matchState.cParams = params->cParams; 1842 1639 zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; 1843 1640 zc->consumedSrcSize = 0; 1844 1641 zc->producedCSize = 0; ··· 1868 1667 zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize); 1869 1668 1870 1669 /* ldm bucketOffsets table */ 1871 - if (params.ldmParams.enableLdm) { 1670 + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { 1872 1671 /* TODO: avoid memset? */ 1873 1672 size_t const numBuckets = 1874 - ((size_t)1) << (params.ldmParams.hashLog - 1875 - params.ldmParams.bucketSizeLog); 1673 + ((size_t)1) << (params->ldmParams.hashLog - 1674 + params->ldmParams.bucketSizeLog); 1876 1675 zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, numBuckets); 1877 1676 ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets); 1878 1677 } ··· 1888 1687 FORWARD_IF_ERROR(ZSTD_reset_matchState( 1889 1688 &zc->blockState.matchState, 1890 1689 ws, 1891 - &params.cParams, 1690 + &params->cParams, 1691 + params->useRowMatchFinder, 1892 1692 crp, 1893 1693 needsIndexReset, 1894 1694 ZSTD_resetTarget_CCtx), ""); 1895 1695 1896 1696 /* ldm hash table */ 1897 - if (params.ldmParams.enableLdm) { 1697 + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { 1898 1698 /* TODO: avoid memset? 
*/ 1899 - size_t const ldmHSize = ((size_t)1) << params.ldmParams.hashLog; 1699 + size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; 1900 1700 zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); 1901 1701 ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); 1902 1702 zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); 1903 1703 zc->maxNbLdmSequences = maxNbLdmSeq; 1904 1704 1905 1705 ZSTD_window_init(&zc->ldmState.window); 1906 - ZSTD_window_clear(&zc->ldmState.window); 1907 1706 zc->ldmState.loadedDictEnd = 0; 1908 1707 } 1909 1708 1910 - /* Due to alignment, when reusing a workspace, we can actually consume 1911 - * up to 3 extra bytes for alignment. See the comments in zstd_cwksp.h 1912 - */ 1913 - assert(ZSTD_cwksp_used(ws) >= neededSpace && 1914 - ZSTD_cwksp_used(ws) <= neededSpace + 3); 1915 - 1916 1709 DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); 1710 + assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); 1711 + 1917 1712 zc->initialized = 1; 1918 1713 1919 1714 return 0; ··· 1965 1768 U64 pledgedSrcSize, 1966 1769 ZSTD_buffered_policy_e zbuff) 1967 1770 { 1771 + DEBUGLOG(4, "ZSTD_resetCCtx_byAttachingCDict() pledgedSrcSize=%llu", 1772 + (unsigned long long)pledgedSrcSize); 1968 1773 { 1969 1774 ZSTD_compressionParameters adjusted_cdict_cParams = cdict->matchState.cParams; 1970 1775 unsigned const windowLog = params.cParams.windowLog; ··· 1982 1783 params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, 1983 1784 cdict->dictContentSize, ZSTD_cpm_attachDict); 1984 1785 params.cParams.windowLog = windowLog; 1985 - FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, 1786 + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ 1787 + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, 
pledgedSrcSize, 1788 + /* loadedDictSize */ 0, 1986 1789 ZSTDcrp_makeClean, zbuff), ""); 1987 1790 assert(cctx->appliedParams.cParams.strategy == adjusted_cdict_cParams.strategy); 1988 1791 } ··· 2028 1827 const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams; 2029 1828 2030 1829 assert(!cdict->matchState.dedicatedDictSearch); 2031 - 2032 - DEBUGLOG(4, "copying dictionary into context"); 1830 + DEBUGLOG(4, "ZSTD_resetCCtx_byCopyingCDict() pledgedSrcSize=%llu", 1831 + (unsigned long long)pledgedSrcSize); 2033 1832 2034 1833 { unsigned const windowLog = params.cParams.windowLog; 2035 1834 assert(windowLog != 0); 2036 1835 /* Copy only compression parameters related to tables. */ 2037 1836 params.cParams = *cdict_cParams; 2038 1837 params.cParams.windowLog = windowLog; 2039 - FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, 1838 + params.useRowMatchFinder = cdict->useRowMatchFinder; 1839 + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize, 1840 + /* loadedDictSize */ 0, 2040 1841 ZSTDcrp_leaveDirty, zbuff), ""); 2041 1842 assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy); 2042 1843 assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog); ··· 2046 1843 } 2047 1844 2048 1845 ZSTD_cwksp_mark_tables_dirty(&cctx->workspace); 1846 + assert(params.useRowMatchFinder != ZSTD_ps_auto); 2049 1847 2050 1848 /* copy tables */ 2051 - { size_t const chainSize = (cdict_cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict_cParams->chainLog); 1849 + { size_t const chainSize = ZSTD_allocateChainTable(cdict_cParams->strategy, cdict->useRowMatchFinder, 0 /* DDS guaranteed disabled */) 1850 + ? 
((size_t)1 << cdict_cParams->chainLog) 1851 + : 0; 2052 1852 size_t const hSize = (size_t)1 << cdict_cParams->hashLog; 2053 1853 2054 1854 ZSTD_memcpy(cctx->blockState.matchState.hashTable, 2055 1855 cdict->matchState.hashTable, 2056 1856 hSize * sizeof(U32)); 2057 - ZSTD_memcpy(cctx->blockState.matchState.chainTable, 1857 + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ 1858 + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { 1859 + ZSTD_memcpy(cctx->blockState.matchState.chainTable, 2058 1860 cdict->matchState.chainTable, 2059 1861 chainSize * sizeof(U32)); 1862 + } 1863 + /* copy tag table */ 1864 + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { 1865 + size_t const tagTableSize = hSize*sizeof(U16); 1866 + ZSTD_memcpy(cctx->blockState.matchState.tagTable, 1867 + cdict->matchState.tagTable, 1868 + tagTableSize); 1869 + } 2060 1870 } 2061 1871 2062 1872 /* Zero the hashTable3, since the cdict never fills it */ ··· 2133 1917 U64 pledgedSrcSize, 2134 1918 ZSTD_buffered_policy_e zbuff) 2135 1919 { 2136 - DEBUGLOG(5, "ZSTD_copyCCtx_internal"); 2137 1920 RETURN_ERROR_IF(srcCCtx->stage!=ZSTDcs_init, stage_wrong, 2138 1921 "Can't copy a ctx that's not in init stage."); 2139 - 1922 + DEBUGLOG(5, "ZSTD_copyCCtx_internal"); 2140 1923 ZSTD_memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem)); 2141 1924 { ZSTD_CCtx_params params = dstCCtx->requestedParams; 2142 1925 /* Copy only compression parameters related to tables. 
*/ 2143 1926 params.cParams = srcCCtx->appliedParams.cParams; 1927 + assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto); 1928 + assert(srcCCtx->appliedParams.useBlockSplitter != ZSTD_ps_auto); 1929 + assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto); 1930 + params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder; 1931 + params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; 1932 + params.ldmParams = srcCCtx->appliedParams.ldmParams; 2144 1933 params.fParams = fParams; 2145 - ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize, 1934 + ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize, 1935 + /* loadedDictSize */ 0, 2146 1936 ZSTDcrp_leaveDirty, zbuff); 2147 1937 assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog); 2148 1938 assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy); ··· 2160 1938 ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace); 2161 1939 2162 1940 /* copy tables */ 2163 - { size_t const chainSize = (srcCCtx->appliedParams.cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog); 1941 + { size_t const chainSize = ZSTD_allocateChainTable(srcCCtx->appliedParams.cParams.strategy, 1942 + srcCCtx->appliedParams.useRowMatchFinder, 1943 + 0 /* forDDSDict */) 1944 + ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog) 1945 + : 0; 2164 1946 size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog; 2165 1947 int const h3log = srcCCtx->blockState.matchState.hashLog3; 2166 1948 size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; ··· 2231 2005 int const nbRows = (int)size / ZSTD_ROWSIZE; 2232 2006 int cellNb = 0; 2233 2007 int rowNb; 2008 + /* Protect special index values < ZSTD_WINDOW_START_INDEX. 
*/ 2009 + U32 const reducerThreshold = reducerValue + ZSTD_WINDOW_START_INDEX; 2234 2010 assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */ 2235 2011 assert(size < (1U<<31)); /* can be casted to int */ 2236 2012 ··· 2240 2012 for (rowNb=0 ; rowNb < nbRows ; rowNb++) { 2241 2013 int column; 2242 2014 for (column=0; column<ZSTD_ROWSIZE; column++) { 2243 - if (preserveMark) { 2244 - U32 const adder = (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0; 2245 - table[cellNb] += adder; 2015 + U32 newVal; 2016 + if (preserveMark && table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) { 2017 + /* This write is pointless, but is required(?) for the compiler 2018 + * to auto-vectorize the loop. */ 2019 + newVal = ZSTD_DUBT_UNSORTED_MARK; 2020 + } else if (table[cellNb] < reducerThreshold) { 2021 + newVal = 0; 2022 + } else { 2023 + newVal = table[cellNb] - reducerValue; 2246 2024 } 2247 - if (table[cellNb] < reducerValue) table[cellNb] = 0; 2248 - else table[cellNb] -= reducerValue; 2025 + table[cellNb] = newVal; 2249 2026 cellNb++; 2250 2027 } } 2251 2028 } ··· 2273 2040 ZSTD_reduceTable(ms->hashTable, hSize, reducerValue); 2274 2041 } 2275 2042 2276 - if (params->cParams.strategy != ZSTD_fast) { 2043 + if (ZSTD_allocateChainTable(params->cParams.strategy, params->useRowMatchFinder, (U32)ms->dedicatedDictSearch)) { 2277 2044 U32 const chainSize = (U32)1 << params->cParams.chainLog; 2278 2045 if (params->cParams.strategy == ZSTD_btlazy2) 2279 2046 ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue); ··· 2305 2072 assert(nbSeq <= seqStorePtr->maxNbSeq); 2306 2073 for (u=0; u<nbSeq; u++) { 2307 2074 U32 const llv = sequences[u].litLength; 2308 - U32 const mlv = sequences[u].matchLength; 2075 + U32 const mlv = sequences[u].mlBase; 2309 2076 llCodeTable[u] = (BYTE)ZSTD_LLcode(llv); 2310 - ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset); 2077 + ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offBase); 2311 2078 mlCodeTable[u] = 
(BYTE)ZSTD_MLcode(mlv); 2312 2079 } 2313 - if (seqStorePtr->longLengthID==1) 2080 + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) 2314 2081 llCodeTable[seqStorePtr->longLengthPos] = MaxLL; 2315 - if (seqStorePtr->longLengthID==2) 2082 + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) 2316 2083 mlCodeTable[seqStorePtr->longLengthPos] = MaxML; 2317 2084 } 2318 2085 ··· 2326 2093 return (cctxParams->targetCBlockSize != 0); 2327 2094 } 2328 2095 2329 - /* ZSTD_entropyCompressSequences_internal(): 2330 - * actually compresses both literals and sequences */ 2096 + /* ZSTD_blockSplitterEnabled(): 2097 + * Returns if block splitting param is being used 2098 + * If used, compression will do best effort to split a block in order to improve compression ratio. 2099 + * At the time this function is called, the parameter must be finalized. 2100 + * Returns 1 if true, 0 otherwise. */ 2101 + static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams) 2102 + { 2103 + DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=%d)", cctxParams->useBlockSplitter); 2104 + assert(cctxParams->useBlockSplitter != ZSTD_ps_auto); 2105 + return (cctxParams->useBlockSplitter == ZSTD_ps_enable); 2106 + } 2107 + 2108 + /* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types 2109 + * and size of the sequences statistics 2110 + */ 2111 + typedef struct { 2112 + U32 LLtype; 2113 + U32 Offtype; 2114 + U32 MLtype; 2115 + size_t size; 2116 + size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ 2117 + } ZSTD_symbolEncodingTypeStats_t; 2118 + 2119 + /* ZSTD_buildSequencesStatistics(): 2120 + * Returns a ZSTD_symbolEncodingTypeStats_t, or a zstd error code in the `size` field. 2121 + * Modifies `nextEntropy` to have the appropriate values as a side effect. 2122 + * nbSeq must be greater than 0. 
2123 + * 2124 + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) 2125 + */ 2126 + static ZSTD_symbolEncodingTypeStats_t 2127 + ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, 2128 + const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, 2129 + BYTE* dst, const BYTE* const dstEnd, 2130 + ZSTD_strategy strategy, unsigned* countWorkspace, 2131 + void* entropyWorkspace, size_t entropyWkspSize) { 2132 + BYTE* const ostart = dst; 2133 + const BYTE* const oend = dstEnd; 2134 + BYTE* op = ostart; 2135 + FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable; 2136 + FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable; 2137 + FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable; 2138 + const BYTE* const ofCodeTable = seqStorePtr->ofCode; 2139 + const BYTE* const llCodeTable = seqStorePtr->llCode; 2140 + const BYTE* const mlCodeTable = seqStorePtr->mlCode; 2141 + ZSTD_symbolEncodingTypeStats_t stats; 2142 + 2143 + stats.lastCountSize = 0; 2144 + /* convert length/distances into codes */ 2145 + ZSTD_seqToCodes(seqStorePtr); 2146 + assert(op <= oend); 2147 + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ 2148 + /* build CTable for Literal Lengths */ 2149 + { unsigned max = MaxLL; 2150 + size_t const mostFrequent = HIST_countFast_wksp(countWorkspace, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ 2151 + DEBUGLOG(5, "Building LL table"); 2152 + nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; 2153 + stats.LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, 2154 + countWorkspace, max, mostFrequent, nbSeq, 2155 + LLFSELog, prevEntropy->litlengthCTable, 2156 + LL_defaultNorm, LL_defaultNormLog, 2157 + ZSTD_defaultAllowed, strategy); 2158 + assert(set_basic < set_compressed && set_rle < set_compressed); 2159 + assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_repeatMode != 
FSE_repeat_none)); /* We don't copy tables */ 2160 + { size_t const countSize = ZSTD_buildCTable( 2161 + op, (size_t)(oend - op), 2162 + CTable_LitLength, LLFSELog, (symbolEncodingType_e)stats.LLtype, 2163 + countWorkspace, max, llCodeTable, nbSeq, 2164 + LL_defaultNorm, LL_defaultNormLog, MaxLL, 2165 + prevEntropy->litlengthCTable, 2166 + sizeof(prevEntropy->litlengthCTable), 2167 + entropyWorkspace, entropyWkspSize); 2168 + if (ZSTD_isError(countSize)) { 2169 + DEBUGLOG(3, "ZSTD_buildCTable for LitLens failed"); 2170 + stats.size = countSize; 2171 + return stats; 2172 + } 2173 + if (stats.LLtype == set_compressed) 2174 + stats.lastCountSize = countSize; 2175 + op += countSize; 2176 + assert(op <= oend); 2177 + } } 2178 + /* build CTable for Offsets */ 2179 + { unsigned max = MaxOff; 2180 + size_t const mostFrequent = HIST_countFast_wksp( 2181 + countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ 2182 + /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ 2183 + ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; 2184 + DEBUGLOG(5, "Building OF table"); 2185 + nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; 2186 + stats.Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, 2187 + countWorkspace, max, mostFrequent, nbSeq, 2188 + OffFSELog, prevEntropy->offcodeCTable, 2189 + OF_defaultNorm, OF_defaultNormLog, 2190 + defaultPolicy, strategy); 2191 + assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ 2192 + { size_t const countSize = ZSTD_buildCTable( 2193 + op, (size_t)(oend - op), 2194 + CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)stats.Offtype, 2195 + countWorkspace, max, ofCodeTable, nbSeq, 2196 + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, 2197 + prevEntropy->offcodeCTable, 2198 + sizeof(prevEntropy->offcodeCTable), 2199 + entropyWorkspace, entropyWkspSize); 2200 + if (ZSTD_isError(countSize)) { 2201 + DEBUGLOG(3, "ZSTD_buildCTable for Offsets failed"); 2202 + stats.size = countSize; 2203 + return stats; 2204 + } 2205 + if (stats.Offtype == set_compressed) 2206 + stats.lastCountSize = countSize; 2207 + op += countSize; 2208 + assert(op <= oend); 2209 + } } 2210 + /* build CTable for MatchLengths */ 2211 + { unsigned max = MaxML; 2212 + size_t const mostFrequent = HIST_countFast_wksp( 2213 + countWorkspace, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ 2214 + DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); 2215 + nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; 2216 + stats.MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, 2217 + countWorkspace, max, mostFrequent, nbSeq, 2218 + MLFSELog, prevEntropy->matchlengthCTable, 2219 + ML_defaultNorm, ML_defaultNormLog, 2220 + ZSTD_defaultAllowed, strategy); 2221 + assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* 
We don't copy tables */ 2222 + { size_t const countSize = ZSTD_buildCTable( 2223 + op, (size_t)(oend - op), 2224 + CTable_MatchLength, MLFSELog, (symbolEncodingType_e)stats.MLtype, 2225 + countWorkspace, max, mlCodeTable, nbSeq, 2226 + ML_defaultNorm, ML_defaultNormLog, MaxML, 2227 + prevEntropy->matchlengthCTable, 2228 + sizeof(prevEntropy->matchlengthCTable), 2229 + entropyWorkspace, entropyWkspSize); 2230 + if (ZSTD_isError(countSize)) { 2231 + DEBUGLOG(3, "ZSTD_buildCTable for MatchLengths failed"); 2232 + stats.size = countSize; 2233 + return stats; 2234 + } 2235 + if (stats.MLtype == set_compressed) 2236 + stats.lastCountSize = countSize; 2237 + op += countSize; 2238 + assert(op <= oend); 2239 + } } 2240 + stats.size = (size_t)(op-ostart); 2241 + return stats; 2242 + } 2243 + 2244 + /* ZSTD_entropyCompressSeqStore_internal(): 2245 + * compresses both literals and sequences 2246 + * Returns compressed size of block, or a zstd error. 2247 + */ 2248 + #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 2331 2249 MEM_STATIC size_t 2332 - ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr, 2250 + ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, 2333 2251 const ZSTD_entropyCTables_t* prevEntropy, 2334 2252 ZSTD_entropyCTables_t* nextEntropy, 2335 2253 const ZSTD_CCtx_params* cctxParams, ··· 2494 2110 FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; 2495 2111 FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; 2496 2112 FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; 2497 - U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ 2498 2113 const seqDef* const sequences = seqStorePtr->sequencesStart; 2114 + const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; 2499 2115 const BYTE* const ofCodeTable = seqStorePtr->ofCode; 2500 2116 const BYTE* const llCodeTable = seqStorePtr->llCode; 2501 2117 const BYTE* const mlCodeTable = seqStorePtr->mlCode; 2502 2118 BYTE* const ostart = 
(BYTE*)dst; 2503 2119 BYTE* const oend = ostart + dstCapacity; 2504 2120 BYTE* op = ostart; 2505 - size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); 2506 - BYTE* seqHead; 2507 - BYTE* lastNCount = NULL; 2121 + size_t lastCountSize; 2508 2122 2509 2123 entropyWorkspace = count + (MaxSeq + 1); 2510 2124 entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); 2511 2125 2512 - DEBUGLOG(4, "ZSTD_entropyCompressSequences_internal (nbSeq=%zu)", nbSeq); 2126 + DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); 2513 2127 ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); 2514 2128 assert(entropyWkspSize >= HUF_WORKSPACE_SIZE); 2515 2129 2516 2130 /* Compress literals */ 2517 2131 { const BYTE* const literals = seqStorePtr->litStart; 2132 + size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; 2133 + size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; 2134 + /* Base suspicion of uncompressibility on ratio of literals to sequences */ 2135 + unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); 2518 2136 size_t const litSize = (size_t)(seqStorePtr->lit - literals); 2519 2137 size_t const cSize = ZSTD_compressLiterals( 2520 2138 &prevEntropy->huf, &nextEntropy->huf, 2521 2139 cctxParams->cParams.strategy, 2522 - ZSTD_disableLiteralsCompression(cctxParams), 2140 + ZSTD_literalsCompressionIsDisabled(cctxParams), 2523 2141 op, dstCapacity, 2524 2142 literals, litSize, 2525 2143 entropyWorkspace, entropyWkspSize, 2526 - bmi2); 2144 + bmi2, suspectUncompressible); 2527 2145 FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); 2528 2146 assert(cSize <= dstCapacity); 2529 2147 op += cSize; ··· 2551 2165 ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); 2552 2166 return (size_t)(op - ostart); 2553 2167 } 2554 - 2555 - /* seqHead : flags for FSE encoding type */ 
2556 - seqHead = op++; 2557 - assert(op <= oend); 2558 - 2559 - /* convert length/distances into codes */ 2560 - ZSTD_seqToCodes(seqStorePtr); 2561 - /* build CTable for Literal Lengths */ 2562 - { unsigned max = MaxLL; 2563 - size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ 2564 - DEBUGLOG(5, "Building LL table"); 2565 - nextEntropy->fse.litlength_repeatMode = prevEntropy->fse.litlength_repeatMode; 2566 - LLtype = ZSTD_selectEncodingType(&nextEntropy->fse.litlength_repeatMode, 2567 - count, max, mostFrequent, nbSeq, 2568 - LLFSELog, prevEntropy->fse.litlengthCTable, 2569 - LL_defaultNorm, LL_defaultNormLog, 2570 - ZSTD_defaultAllowed, strategy); 2571 - assert(set_basic < set_compressed && set_rle < set_compressed); 2572 - assert(!(LLtype < set_compressed && nextEntropy->fse.litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ 2573 - { size_t const countSize = ZSTD_buildCTable( 2574 - op, (size_t)(oend - op), 2575 - CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, 2576 - count, max, llCodeTable, nbSeq, 2577 - LL_defaultNorm, LL_defaultNormLog, MaxLL, 2578 - prevEntropy->fse.litlengthCTable, 2579 - sizeof(prevEntropy->fse.litlengthCTable), 2580 - entropyWorkspace, entropyWkspSize); 2581 - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed"); 2582 - if (LLtype == set_compressed) 2583 - lastNCount = op; 2584 - op += countSize; 2585 - assert(op <= oend); 2586 - } } 2587 - /* build CTable for Offsets */ 2588 - { unsigned max = MaxOff; 2589 - size_t const mostFrequent = HIST_countFast_wksp( 2590 - count, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ 2591 - /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ 2592 - ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; 2593 - DEBUGLOG(5, "Building OF table"); 2594 - nextEntropy->fse.offcode_repeatMode = prevEntropy->fse.offcode_repeatMode; 2595 - Offtype = ZSTD_selectEncodingType(&nextEntropy->fse.offcode_repeatMode, 2596 - count, max, mostFrequent, nbSeq, 2597 - OffFSELog, prevEntropy->fse.offcodeCTable, 2598 - OF_defaultNorm, OF_defaultNormLog, 2599 - defaultPolicy, strategy); 2600 - assert(!(Offtype < set_compressed && nextEntropy->fse.offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ 2601 - { size_t const countSize = ZSTD_buildCTable( 2602 - op, (size_t)(oend - op), 2603 - CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, 2604 - count, max, ofCodeTable, nbSeq, 2605 - OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, 2606 - prevEntropy->fse.offcodeCTable, 2607 - sizeof(prevEntropy->fse.offcodeCTable), 2608 - entropyWorkspace, entropyWkspSize); 2609 - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed"); 2610 - if (Offtype == set_compressed) 2611 - lastNCount = op; 2612 - op += countSize; 2613 - assert(op <= oend); 2614 - } } 2615 - /* build CTable for MatchLengths */ 2616 - { unsigned max = MaxML; 2617 - size_t const mostFrequent = HIST_countFast_wksp( 2618 - count, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ 2619 - DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); 2620 - nextEntropy->fse.matchlength_repeatMode = prevEntropy->fse.matchlength_repeatMode; 2621 - MLtype = ZSTD_selectEncodingType(&nextEntropy->fse.matchlength_repeatMode, 2622 - count, max, mostFrequent, nbSeq, 2623 - MLFSELog, prevEntropy->fse.matchlengthCTable, 2624 - ML_defaultNorm, ML_defaultNormLog, 2625 - ZSTD_defaultAllowed, strategy); 2626 - assert(!(MLtype < set_compressed && nextEntropy->fse.matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ 2627 - { size_t const countSize = ZSTD_buildCTable( 2628 - op, (size_t)(oend - op), 2629 - 
CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, 2630 - count, max, mlCodeTable, nbSeq, 2631 - ML_defaultNorm, ML_defaultNormLog, MaxML, 2632 - prevEntropy->fse.matchlengthCTable, 2633 - sizeof(prevEntropy->fse.matchlengthCTable), 2634 - entropyWorkspace, entropyWkspSize); 2635 - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed"); 2636 - if (MLtype == set_compressed) 2637 - lastNCount = op; 2638 - op += countSize; 2639 - assert(op <= oend); 2640 - } } 2641 - 2642 - *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); 2168 + { 2169 + ZSTD_symbolEncodingTypeStats_t stats; 2170 + BYTE* seqHead = op++; 2171 + /* build stats for sequences */ 2172 + stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, 2173 + &prevEntropy->fse, &nextEntropy->fse, 2174 + op, oend, 2175 + strategy, count, 2176 + entropyWorkspace, entropyWkspSize); 2177 + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); 2178 + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); 2179 + lastCountSize = stats.lastCountSize; 2180 + op += stats.size; 2181 + } 2643 2182 2644 2183 { size_t const bitstreamSize = ZSTD_encodeSequences( 2645 2184 op, (size_t)(oend - op), ··· 2584 2273 * In this exceedingly rare case, we will simply emit an uncompressed 2585 2274 * block, since it isn't worth optimizing. 
2586 2275 */ 2587 - if (lastNCount && (op - lastNCount) < 4) { 2588 - /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ 2589 - assert(op - lastNCount == 3); 2276 + if (lastCountSize && (lastCountSize + bitstreamSize) < 4) { 2277 + /* lastCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ 2278 + assert(lastCountSize + bitstreamSize == 3); 2590 2279 DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by " 2591 2280 "emitting an uncompressed block."); 2592 2281 return 0; ··· 2598 2287 } 2599 2288 2600 2289 MEM_STATIC size_t 2601 - ZSTD_entropyCompressSequences(seqStore_t* seqStorePtr, 2290 + ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, 2602 2291 const ZSTD_entropyCTables_t* prevEntropy, 2603 2292 ZSTD_entropyCTables_t* nextEntropy, 2604 2293 const ZSTD_CCtx_params* cctxParams, ··· 2607 2296 void* entropyWorkspace, size_t entropyWkspSize, 2608 2297 int bmi2) 2609 2298 { 2610 - size_t const cSize = ZSTD_entropyCompressSequences_internal( 2299 + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( 2611 2300 seqStorePtr, prevEntropy, nextEntropy, cctxParams, 2612 2301 dst, dstCapacity, 2613 2302 entropyWorkspace, entropyWkspSize, bmi2); ··· 2617 2306 */ 2618 2307 if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) 2619 2308 return 0; /* block not compressed */ 2620 - FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSequences_internal failed"); 2309 + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); 2621 2310 2622 2311 /* Check compressibility */ 2623 2312 { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); 2624 2313 if (cSize >= maxCSize) return 0; /* block not compressed */ 2625 2314 } 2626 - DEBUGLOG(4, "ZSTD_entropyCompressSequences() cSize: %zu\n", cSize); 2315 + DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); 2627 2316 return cSize; 2628 2317 } 2629 2318 2630 2319 /* ZSTD_selectBlockCompressor() : 2631 2320 * Not static, but 
internal use only (used by long distance matcher) 2632 2321 * assumption : strat is a valid strategy */ 2633 - ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode) 2322 + ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) 2634 2323 { 2635 2324 static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { 2636 2325 { ZSTD_compressBlock_fast /* default for 0 */, ··· 2678 2367 ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); 2679 2368 2680 2369 assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); 2681 - selectedCompressor = blockCompressor[(int)dictMode][(int)strat]; 2370 + DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); 2371 + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { 2372 + static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { 2373 + { ZSTD_compressBlock_greedy_row, 2374 + ZSTD_compressBlock_lazy_row, 2375 + ZSTD_compressBlock_lazy2_row }, 2376 + { ZSTD_compressBlock_greedy_extDict_row, 2377 + ZSTD_compressBlock_lazy_extDict_row, 2378 + ZSTD_compressBlock_lazy2_extDict_row }, 2379 + { ZSTD_compressBlock_greedy_dictMatchState_row, 2380 + ZSTD_compressBlock_lazy_dictMatchState_row, 2381 + ZSTD_compressBlock_lazy2_dictMatchState_row }, 2382 + { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, 2383 + ZSTD_compressBlock_lazy_dedicatedDictSearch_row, 2384 + ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } 2385 + }; 2386 + DEBUGLOG(4, "Selecting a row-based matchfinder"); 2387 + assert(useRowMatchFinder != ZSTD_ps_auto); 2388 + selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy]; 2389 + } else { 2390 + selectedCompressor = blockCompressor[(int)dictMode][(int)strat]; 2391 + } 2682 2392 assert(selectedCompressor != NULL); 2683 2393 return selectedCompressor; 2684 2394 } ··· 2715 
2383 { 2716 2384 ssPtr->lit = ssPtr->litStart; 2717 2385 ssPtr->sequences = ssPtr->sequencesStart; 2718 - ssPtr->longLengthID = 0; 2386 + ssPtr->longLengthType = ZSTD_llt_none; 2719 2387 } 2720 2388 2721 2389 typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; ··· 2762 2430 zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i]; 2763 2431 } 2764 2432 if (zc->externSeqStore.pos < zc->externSeqStore.size) { 2765 - assert(!zc->appliedParams.ldmParams.enableLdm); 2433 + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); 2766 2434 /* Updates ldmSeqStore.pos */ 2767 2435 lastLLSize = 2768 2436 ZSTD_ldm_blockCompress(&zc->externSeqStore, 2769 2437 ms, &zc->seqStore, 2770 2438 zc->blockState.nextCBlock->rep, 2439 + zc->appliedParams.useRowMatchFinder, 2771 2440 src, srcSize); 2772 2441 assert(zc->externSeqStore.pos <= zc->externSeqStore.size); 2773 - } else if (zc->appliedParams.ldmParams.enableLdm) { 2442 + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { 2774 2443 rawSeqStore_t ldmSeqStore = kNullRawSeqStore; 2775 2444 2776 2445 ldmSeqStore.seq = zc->ldmSequences; ··· 2785 2452 ZSTD_ldm_blockCompress(&ldmSeqStore, 2786 2453 ms, &zc->seqStore, 2787 2454 zc->blockState.nextCBlock->rep, 2455 + zc->appliedParams.useRowMatchFinder, 2788 2456 src, srcSize); 2789 2457 assert(ldmSeqStore.pos == ldmSeqStore.size); 2790 2458 } else { /* not long range mode */ 2791 - ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, dictMode); 2459 + ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, 2460 + zc->appliedParams.useRowMatchFinder, 2461 + dictMode); 2792 2462 ms->ldmSeqStore = NULL; 2793 2463 lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); 2794 2464 } ··· 2819 2483 assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); 2820 2484 
ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); 2821 2485 for (i = 0; i < seqStoreSeqSize; ++i) { 2822 - U32 rawOffset = seqStoreSeqs[i].offset - ZSTD_REP_NUM; 2486 + U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM; 2823 2487 outSeqs[i].litLength = seqStoreSeqs[i].litLength; 2824 - outSeqs[i].matchLength = seqStoreSeqs[i].matchLength + MINMATCH; 2488 + outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH; 2825 2489 outSeqs[i].rep = 0; 2826 2490 2827 2491 if (i == seqStore->longLengthPos) { 2828 - if (seqStore->longLengthID == 1) { 2492 + if (seqStore->longLengthType == ZSTD_llt_literalLength) { 2829 2493 outSeqs[i].litLength += 0x10000; 2830 - } else if (seqStore->longLengthID == 2) { 2494 + } else if (seqStore->longLengthType == ZSTD_llt_matchLength) { 2831 2495 outSeqs[i].matchLength += 0x10000; 2832 2496 } 2833 2497 } 2834 2498 2835 - if (seqStoreSeqs[i].offset <= ZSTD_REP_NUM) { 2499 + if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) { 2836 2500 /* Derive the correct offset corresponding to a repcode */ 2837 - outSeqs[i].rep = seqStoreSeqs[i].offset; 2501 + outSeqs[i].rep = seqStoreSeqs[i].offBase; 2838 2502 if (outSeqs[i].litLength != 0) { 2839 2503 rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1]; 2840 2504 } else { ··· 2848 2512 outSeqs[i].offset = rawOffset; 2849 2513 /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode 2850 2514 so we provide seqStoreSeqs[i].offset - 1 */ 2851 - updatedRepcodes = ZSTD_updateRep(updatedRepcodes.rep, 2852 - seqStoreSeqs[i].offset - 1, 2853 - seqStoreSeqs[i].litLength == 0); 2515 + ZSTD_updateRep(updatedRepcodes.rep, 2516 + seqStoreSeqs[i].offBase - 1, 2517 + seqStoreSeqs[i].litLength == 0); 2854 2518 literalsRead += outSeqs[i].litLength; 2855 2519 } 2856 2520 /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. 
··· 2938 2602 return nbSeqs < 4 && nbLits < 10; 2939 2603 } 2940 2604 2941 - static void ZSTD_confirmRepcodesAndEntropyTables(ZSTD_CCtx* zc) 2605 + static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) 2942 2606 { 2943 - ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock; 2944 - zc->blockState.prevCBlock = zc->blockState.nextCBlock; 2945 - zc->blockState.nextCBlock = tmp; 2607 + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; 2608 + bs->prevCBlock = bs->nextCBlock; 2609 + bs->nextCBlock = tmp; 2946 2610 } 2947 2611 2948 - static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, 2949 - void* dst, size_t dstCapacity, 2950 - const void* src, size_t srcSize, U32 frame) 2612 + /* Writes the block header */ 2613 + static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { 2614 + U32 const cBlockHeader = cSize == 1 ? 2615 + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : 2616 + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); 2617 + MEM_writeLE24(op, cBlockHeader); 2618 + DEBUGLOG(3, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); 2619 + } 2620 + 2621 + /* ZSTD_buildBlockEntropyStats_literals() : 2622 + * Builds entropy for the literals. 2623 + * Stores literals block type (raw, rle, compressed, repeat) and 2624 + * huffman description table to hufMetadata. 
2625 + * Requires ENTROPY_WORKSPACE_SIZE workspace 2626 + * @return : size of huffman description table or error code */ 2627 + static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, 2628 + const ZSTD_hufCTables_t* prevHuf, 2629 + ZSTD_hufCTables_t* nextHuf, 2630 + ZSTD_hufCTablesMetadata_t* hufMetadata, 2631 + const int literalsCompressionIsDisabled, 2632 + void* workspace, size_t wkspSize) 2633 + { 2634 + BYTE* const wkspStart = (BYTE*)workspace; 2635 + BYTE* const wkspEnd = wkspStart + wkspSize; 2636 + BYTE* const countWkspStart = wkspStart; 2637 + unsigned* const countWksp = (unsigned*)workspace; 2638 + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); 2639 + BYTE* const nodeWksp = countWkspStart + countWkspSize; 2640 + const size_t nodeWkspSize = wkspEnd-nodeWksp; 2641 + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; 2642 + unsigned huffLog = HUF_TABLELOG_DEFAULT; 2643 + HUF_repeat repeat = prevHuf->repeatMode; 2644 + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); 2645 + 2646 + /* Prepare nextEntropy assuming reusing the existing table */ 2647 + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); 2648 + 2649 + if (literalsCompressionIsDisabled) { 2650 + DEBUGLOG(5, "set_basic - disabled"); 2651 + hufMetadata->hType = set_basic; 2652 + return 0; 2653 + } 2654 + 2655 + /* small ? don't even attempt compression (speed opt) */ 2656 + #ifndef COMPRESS_LITERALS_SIZE_MIN 2657 + #define COMPRESS_LITERALS_SIZE_MIN 63 2658 + #endif 2659 + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; 2660 + if (srcSize <= minLitSize) { 2661 + DEBUGLOG(5, "set_basic - too small"); 2662 + hufMetadata->hType = set_basic; 2663 + return 0; 2664 + } 2665 + } 2666 + 2667 + /* Scan input and build symbol stats */ 2668 + { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); 2669 + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); 2670 + if (largest == srcSize) { 2671 + DEBUGLOG(5, "set_rle"); 2672 + hufMetadata->hType = set_rle; 2673 + return 0; 2674 + } 2675 + if (largest <= (srcSize >> 7)+4) { 2676 + DEBUGLOG(5, "set_basic - no gain"); 2677 + hufMetadata->hType = set_basic; 2678 + return 0; 2679 + } 2680 + } 2681 + 2682 + /* Validate the previous Huffman table */ 2683 + if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { 2684 + repeat = HUF_repeat_none; 2685 + } 2686 + 2687 + /* Build Huffman Tree */ 2688 + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); 2689 + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); 2690 + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, 2691 + maxSymbolValue, huffLog, 2692 + nodeWksp, nodeWkspSize); 2693 + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); 2694 + huffLog = (U32)maxBits; 2695 + { /* Build and write the CTable */ 2696 + size_t const newCSize = HUF_estimateCompressedSize( 2697 + (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); 2698 + size_t const hSize = HUF_writeCTable_wksp( 2699 + hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), 2700 + (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, 2701 + nodeWksp, nodeWkspSize); 2702 + /* Check against repeating the previous CTable */ 2703 + if (repeat != HUF_repeat_none) { 2704 + size_t const oldCSize = HUF_estimateCompressedSize( 2705 + (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); 2706 + if (oldCSize < srcSize && (oldCSize 
<= hSize + newCSize || hSize + 12 >= srcSize)) { 2707 + DEBUGLOG(5, "set_repeat - smaller"); 2708 + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); 2709 + hufMetadata->hType = set_repeat; 2710 + return 0; 2711 + } 2712 + } 2713 + if (newCSize + hSize >= srcSize) { 2714 + DEBUGLOG(5, "set_basic - no gains"); 2715 + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); 2716 + hufMetadata->hType = set_basic; 2717 + return 0; 2718 + } 2719 + DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); 2720 + hufMetadata->hType = set_compressed; 2721 + nextHuf->repeatMode = HUF_repeat_check; 2722 + return hSize; 2723 + } 2724 + } 2725 + } 2726 + 2727 + 2728 + /* ZSTD_buildDummySequencesStatistics(): 2729 + * Returns a ZSTD_symbolEncodingTypeStats_t with all encoding types as set_basic, 2730 + * and updates nextEntropy to the appropriate repeatMode. 2731 + */ 2732 + static ZSTD_symbolEncodingTypeStats_t 2733 + ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { 2734 + ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; 2735 + nextEntropy->litlength_repeatMode = FSE_repeat_none; 2736 + nextEntropy->offcode_repeatMode = FSE_repeat_none; 2737 + nextEntropy->matchlength_repeatMode = FSE_repeat_none; 2738 + return stats; 2739 + } 2740 + 2741 + /* ZSTD_buildBlockEntropyStats_sequences() : 2742 + * Builds entropy for the sequences. 2743 + * Stores symbol compression modes and fse table to fseMetadata. 2744 + * Requires ENTROPY_WORKSPACE_SIZE wksp. 
2745 + * @return : size of fse tables or error code */ 2746 + static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, 2747 + const ZSTD_fseCTables_t* prevEntropy, 2748 + ZSTD_fseCTables_t* nextEntropy, 2749 + const ZSTD_CCtx_params* cctxParams, 2750 + ZSTD_fseCTablesMetadata_t* fseMetadata, 2751 + void* workspace, size_t wkspSize) 2752 + { 2753 + ZSTD_strategy const strategy = cctxParams->cParams.strategy; 2754 + size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; 2755 + BYTE* const ostart = fseMetadata->fseTablesBuffer; 2756 + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); 2757 + BYTE* op = ostart; 2758 + unsigned* countWorkspace = (unsigned*)workspace; 2759 + unsigned* entropyWorkspace = countWorkspace + (MaxSeq + 1); 2760 + size_t entropyWorkspaceSize = wkspSize - (MaxSeq + 1) * sizeof(*countWorkspace); 2761 + ZSTD_symbolEncodingTypeStats_t stats; 2762 + 2763 + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_sequences (nbSeq=%zu)", nbSeq); 2764 + stats = nbSeq != 0 ? ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, 2765 + prevEntropy, nextEntropy, op, oend, 2766 + strategy, countWorkspace, 2767 + entropyWorkspace, entropyWorkspaceSize) 2768 + : ZSTD_buildDummySequencesStatistics(nextEntropy); 2769 + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); 2770 + fseMetadata->llType = (symbolEncodingType_e) stats.LLtype; 2771 + fseMetadata->ofType = (symbolEncodingType_e) stats.Offtype; 2772 + fseMetadata->mlType = (symbolEncodingType_e) stats.MLtype; 2773 + fseMetadata->lastCountSize = stats.lastCountSize; 2774 + return stats.size; 2775 + } 2776 + 2777 + 2778 + /* ZSTD_buildBlockEntropyStats() : 2779 + * Builds entropy for the block. 
2780 + * Requires workspace size ENTROPY_WORKSPACE_SIZE 2781 + * 2782 + * @return : 0 on success or error code 2783 + */ 2784 + size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, 2785 + const ZSTD_entropyCTables_t* prevEntropy, 2786 + ZSTD_entropyCTables_t* nextEntropy, 2787 + const ZSTD_CCtx_params* cctxParams, 2788 + ZSTD_entropyCTablesMetadata_t* entropyMetadata, 2789 + void* workspace, size_t wkspSize) 2790 + { 2791 + size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; 2792 + entropyMetadata->hufMetadata.hufDesSize = 2793 + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, 2794 + &prevEntropy->huf, &nextEntropy->huf, 2795 + &entropyMetadata->hufMetadata, 2796 + ZSTD_literalsCompressionIsDisabled(cctxParams), 2797 + workspace, wkspSize); 2798 + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); 2799 + entropyMetadata->fseMetadata.fseTablesSize = 2800 + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, 2801 + &prevEntropy->fse, &nextEntropy->fse, 2802 + cctxParams, 2803 + &entropyMetadata->fseMetadata, 2804 + workspace, wkspSize); 2805 + FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildBlockEntropyStats_sequences failed"); 2806 + return 0; 2807 + } 2808 + 2809 + /* Returns the size estimate for the literals section (header + content) of a block */ 2810 + static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, 2811 + const ZSTD_hufCTables_t* huf, 2812 + const ZSTD_hufCTablesMetadata_t* hufMetadata, 2813 + void* workspace, size_t wkspSize, 2814 + int writeEntropy) 2815 + { 2816 + unsigned* const countWksp = (unsigned*)workspace; 2817 + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; 2818 + size_t literalSectionHeaderSize = 3 + (litSize >= 1 KB) + (litSize >= 16 KB); 2819 + U32 singleStream = litSize < 256; 2820 + 2821 + if (hufMetadata->hType == set_basic) return litSize; 2822 + else if (hufMetadata->hType == set_rle) 
return 1; 2823 + else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) { 2824 + size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize); 2825 + if (ZSTD_isError(largest)) return litSize; 2826 + { size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue); 2827 + if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize; 2828 + if (!singleStream) cLitSizeEstimate += 6; /* multi-stream huffman uses 6-byte jump table */ 2829 + return cLitSizeEstimate + literalSectionHeaderSize; 2830 + } } 2831 + assert(0); /* impossible */ 2832 + return 0; 2833 + } 2834 + 2835 + /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ 2836 + static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, 2837 + const BYTE* codeTable, size_t nbSeq, unsigned maxCode, 2838 + const FSE_CTable* fseCTable, 2839 + const U8* additionalBits, 2840 + short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, 2841 + void* workspace, size_t wkspSize) 2842 + { 2843 + unsigned* const countWksp = (unsigned*)workspace; 2844 + const BYTE* ctp = codeTable; 2845 + const BYTE* const ctStart = ctp; 2846 + const BYTE* const ctEnd = ctStart + nbSeq; 2847 + size_t cSymbolTypeSizeEstimateInBits = 0; 2848 + unsigned max = maxCode; 2849 + 2850 + HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize); /* can't fail */ 2851 + if (type == set_basic) { 2852 + /* We selected this encoding type, so it must be valid. 
*/ 2853 + assert(max <= defaultMax); 2854 + (void)defaultMax; 2855 + cSymbolTypeSizeEstimateInBits = ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max); 2856 + } else if (type == set_rle) { 2857 + cSymbolTypeSizeEstimateInBits = 0; 2858 + } else if (type == set_compressed || type == set_repeat) { 2859 + cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max); 2860 + } 2861 + if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) { 2862 + return nbSeq * 10; 2863 + } 2864 + while (ctp < ctEnd) { 2865 + if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp]; 2866 + else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */ 2867 + ctp++; 2868 + } 2869 + return cSymbolTypeSizeEstimateInBits >> 3; 2870 + } 2871 + 2872 + /* Returns the size estimate for the sequences section (header + content) of a block */ 2873 + static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, 2874 + const BYTE* llCodeTable, 2875 + const BYTE* mlCodeTable, 2876 + size_t nbSeq, 2877 + const ZSTD_fseCTables_t* fseTables, 2878 + const ZSTD_fseCTablesMetadata_t* fseMetadata, 2879 + void* workspace, size_t wkspSize, 2880 + int writeEntropy) 2881 + { 2882 + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); 2883 + size_t cSeqSizeEstimate = 0; 2884 + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, 2885 + fseTables->offcodeCTable, NULL, 2886 + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, 2887 + workspace, wkspSize); 2888 + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, 2889 + fseTables->litlengthCTable, LL_bits, 2890 + LL_defaultNorm, LL_defaultNormLog, MaxLL, 2891 + workspace, wkspSize); 2892 + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, 2893 + 
fseTables->matchlengthCTable, ML_bits, 2894 + ML_defaultNorm, ML_defaultNormLog, MaxML, 2895 + workspace, wkspSize); 2896 + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; 2897 + return cSeqSizeEstimate + sequencesSectionHeaderSize; 2898 + } 2899 + 2900 + /* Returns the size estimate for a given stream of literals, of, ll, ml */ 2901 + static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, 2902 + const BYTE* ofCodeTable, 2903 + const BYTE* llCodeTable, 2904 + const BYTE* mlCodeTable, 2905 + size_t nbSeq, 2906 + const ZSTD_entropyCTables_t* entropy, 2907 + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, 2908 + void* workspace, size_t wkspSize, 2909 + int writeLitEntropy, int writeSeqEntropy) { 2910 + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, 2911 + &entropy->huf, &entropyMetadata->hufMetadata, 2912 + workspace, wkspSize, writeLitEntropy); 2913 + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, 2914 + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, 2915 + workspace, wkspSize, writeSeqEntropy); 2916 + return seqSize + literalsSize + ZSTD_blockHeaderSize; 2917 + } 2918 + 2919 + /* Builds entropy statistics and uses them for blocksize estimation. 2920 + * 2921 + * Returns the estimated compressed size of the seqStore, or a zstd error. 
2922 + */ 2923 + static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { 2924 + ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; 2925 + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); 2926 + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, 2927 + &zc->blockState.prevCBlock->entropy, 2928 + &zc->blockState.nextCBlock->entropy, 2929 + &zc->appliedParams, 2930 + entropyMetadata, 2931 + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); 2932 + return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), 2933 + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, 2934 + (size_t)(seqStore->sequences - seqStore->sequencesStart), 2935 + &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, 2936 + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); 2937 + } 2938 + 2939 + /* Returns literals bytes represented in a seqStore */ 2940 + static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { 2941 + size_t literalsBytes = 0; 2942 + size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; 2943 + size_t i; 2944 + for (i = 0; i < nbSeqs; ++i) { 2945 + seqDef seq = seqStore->sequencesStart[i]; 2946 + literalsBytes += seq.litLength; 2947 + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { 2948 + literalsBytes += 0x10000; 2949 + } 2950 + } 2951 + return literalsBytes; 2952 + } 2953 + 2954 + /* Returns match bytes represented in a seqStore */ 2955 + static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { 2956 + size_t matchBytes = 0; 2957 + size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; 2958 + size_t i; 2959 + for (i = 0; i < nbSeqs; ++i) { 2960 + seqDef seq = seqStore->sequencesStart[i]; 2961 + matchBytes += seq.mlBase + 
MINMATCH; 2962 + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { 2963 + matchBytes += 0x10000; 2964 + } 2965 + } 2966 + return matchBytes; 2967 + } 2968 + 2969 + /* Derives the seqStore that is a chunk of the originalSeqStore from [startIdx, endIdx). 2970 + * Stores the result in resultSeqStore. 2971 + */ 2972 + static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, 2973 + const seqStore_t* originalSeqStore, 2974 + size_t startIdx, size_t endIdx) { 2975 + BYTE* const litEnd = originalSeqStore->lit; 2976 + size_t literalsBytes; 2977 + size_t literalsBytesPreceding = 0; 2978 + 2979 + *resultSeqStore = *originalSeqStore; 2980 + if (startIdx > 0) { 2981 + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; 2982 + literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); 2983 + } 2984 + 2985 + /* Move longLengthPos into the correct position if necessary */ 2986 + if (originalSeqStore->longLengthType != ZSTD_llt_none) { 2987 + if (originalSeqStore->longLengthPos < startIdx || originalSeqStore->longLengthPos > endIdx) { 2988 + resultSeqStore->longLengthType = ZSTD_llt_none; 2989 + } else { 2990 + resultSeqStore->longLengthPos -= (U32)startIdx; 2991 + } 2992 + } 2993 + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; 2994 + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; 2995 + literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); 2996 + resultSeqStore->litStart += literalsBytesPreceding; 2997 + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { 2998 + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ 2999 + resultSeqStore->lit = litEnd; 3000 + } else { 3001 + resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; 3002 + } 3003 + resultSeqStore->llCode += startIdx; 3004 + resultSeqStore->mlCode += startIdx; 3005 + 
resultSeqStore->ofCode += startIdx; 3006 + } 3007 + 3008 + /* 3009 + * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. 3010 + * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). 3011 + */ 3012 + static U32 3013 + ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) 3014 + { 3015 + U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ 3016 + assert(STORED_IS_REPCODE(offCode)); 3017 + if (adjustedOffCode == ZSTD_REP_NUM) { 3018 + /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ 3019 + assert(rep[0] > 0); 3020 + return rep[0] - 1; 3021 + } 3022 + return rep[adjustedOffCode]; 3023 + } 3024 + 3025 + /* 3026 + * ZSTD_seqStore_resolveOffCodes() reconciles any possible divergences in offset history that may arise 3027 + * due to emission of RLE/raw blocks that disturb the offset history, 3028 + * and replaces any repcodes within the seqStore that may be invalid. 3029 + * 3030 + * dRepcodes are updated as would be on the decompression side. 3031 + * cRepcodes are updated exactly in accordance with the seqStore. 
3032 + * 3033 + * Note : this function assumes seq->offBase respects the following numbering scheme : 3034 + * 0 : invalid 3035 + * 1-3 : repcode 1-3 3036 + * 4+ : real_offset+3 3037 + */ 3038 + static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, 3039 + seqStore_t* const seqStore, U32 const nbSeq) { 3040 + U32 idx = 0; 3041 + for (; idx < nbSeq; ++idx) { 3042 + seqDef* const seq = seqStore->sequencesStart + idx; 3043 + U32 const ll0 = (seq->litLength == 0); 3044 + U32 const offCode = OFFBASE_TO_STORED(seq->offBase); 3045 + assert(seq->offBase > 0); 3046 + if (STORED_IS_REPCODE(offCode)) { 3047 + U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); 3048 + U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); 3049 + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace 3050 + * the repcode with the offset it actually references, determined by the compression 3051 + * repcode history. 3052 + */ 3053 + if (dRawOffset != cRawOffset) { 3054 + seq->offBase = cRawOffset + ZSTD_REP_NUM; 3055 + } 3056 + } 3057 + /* Compression repcode history is always updated with values directly from the unmodified seqStore. 3058 + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. 3059 + */ 3060 + ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); 3061 + ZSTD_updateRep(cRepcodes->rep, offCode, ll0); 3062 + } 3063 + } 3064 + 3065 + /* ZSTD_compressSeqStore_singleBlock(): 3066 + * Compresses a seqStore into a block with a block header, into the buffer dst. 3067 + * 3068 + * Returns the total size of that block (including header) or a ZSTD error code. 
3069 + */ 3070 + static size_t 3071 + ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, 3072 + repcodes_t* const dRep, repcodes_t* const cRep, 3073 + void* dst, size_t dstCapacity, 3074 + const void* src, size_t srcSize, 3075 + U32 lastBlock, U32 isPartition) 3076 + { 3077 + const U32 rleMaxLength = 25; 3078 + BYTE* op = (BYTE*)dst; 3079 + const BYTE* ip = (const BYTE*)src; 3080 + size_t cSize; 3081 + size_t cSeqsSize; 3082 + 3083 + /* In case of an RLE or raw block, the simulated decompression repcode history must be reset */ 3084 + repcodes_t const dRepOriginal = *dRep; 3085 + DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock"); 3086 + if (isPartition) 3087 + ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart)); 3088 + 3089 + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "Block header doesn't fit"); 3090 + cSeqsSize = ZSTD_entropyCompressSeqStore(seqStore, 3091 + &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, 3092 + &zc->appliedParams, 3093 + op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, 3094 + srcSize, 3095 + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, 3096 + zc->bmi2); 3097 + FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!"); 3098 + 3099 + if (!zc->isFirstBlock && 3100 + cSeqsSize < rleMaxLength && 3101 + ZSTD_isRLE((BYTE const*)src, srcSize)) { 3102 + /* We don't want to emit our first block as a RLE even if it qualifies because 3103 + * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
3104 + * This is only an issue for zstd <= v1.4.3 3105 + */ 3106 + cSeqsSize = 1; 3107 + } 3108 + 3109 + if (zc->seqCollector.collectSequences) { 3110 + ZSTD_copyBlockSequences(zc); 3111 + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); 3112 + return 0; 3113 + } 3114 + 3115 + if (cSeqsSize == 0) { 3116 + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); 3117 + FORWARD_IF_ERROR(cSize, "Nocompress block failed"); 3118 + DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize); 3119 + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ 3120 + } else if (cSeqsSize == 1) { 3121 + cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock); 3122 + FORWARD_IF_ERROR(cSize, "RLE compress block failed"); 3123 + DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize); 3124 + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ 3125 + } else { 3126 + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); 3127 + writeBlockHeader(op, cSeqsSize, srcSize, lastBlock); 3128 + cSize = ZSTD_blockHeaderSize + cSeqsSize; 3129 + DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize); 3130 + } 3131 + 3132 + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) 3133 + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; 3134 + 3135 + return cSize; 3136 + } 3137 + 3138 + /* Struct to keep track of where we are in our recursive calls. */ 3139 + typedef struct { 3140 + U32* splitLocations; /* Array of split indices */ 3141 + size_t idx; /* The current index within splitLocations being worked on */ 3142 + } seqStoreSplits; 3143 + 3144 + #define MIN_SEQUENCES_BLOCK_SPLITTING 300 3145 + 3146 + /* Helper function to perform the recursive search for block splits. 3147 + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. 
3148 + * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then 3149 + * we do not recurse. 3150 + * 3151 + * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. 3152 + * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). 3153 + * In practice, recursion depth usually doesn't go beyond 4. 3154 + * 3155 + * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize 3156 + * maximum of 128 KB, this value is actually impossible to reach. 3157 + */ 3158 + static void 3159 + ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, 3160 + ZSTD_CCtx* zc, const seqStore_t* origSeqStore) 3161 + { 3162 + seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; 3163 + seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; 3164 + seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; 3165 + size_t estimatedOriginalSize; 3166 + size_t estimatedFirstHalfSize; 3167 + size_t estimatedSecondHalfSize; 3168 + size_t midIdx = (startIdx + endIdx)/2; 3169 + 3170 + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { 3171 + DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); 3172 + return; 3173 + } 3174 + DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); 3175 + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); 3176 + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); 3177 + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); 3178 + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); 3179 + estimatedFirstHalfSize = 
ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); 3180 + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); 3181 + DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", 3182 + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); 3183 + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { 3184 + return; 3185 + } 3186 + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { 3187 + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); 3188 + splits->splitLocations[splits->idx] = (U32)midIdx; 3189 + splits->idx++; 3190 + ZSTD_deriveBlockSplitsHelper(splits, midIdx, endIdx, zc, origSeqStore); 3191 + } 3192 + } 3193 + 3194 + /* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. 3195 + * 3196 + * Returns the number of splits made (which equals the size of the partition table - 1). 3197 + */ 3198 + static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { 3199 + seqStoreSplits splits = {partitions, 0}; 3200 + if (nbSeq <= 4) { 3201 + DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); 3202 + /* Refuse to try and split anything with less than 4 sequences */ 3203 + return 0; 3204 + } 3205 + ZSTD_deriveBlockSplitsHelper(&splits, 0, nbSeq, zc, &zc->seqStore); 3206 + splits.splitLocations[splits.idx] = nbSeq; 3207 + DEBUGLOG(5, "ZSTD_deriveBlockSplits: final nb partitions: %zu", splits.idx+1); 3208 + return splits.idx; 3209 + } 3210 + 3211 + /* ZSTD_compressBlock_splitBlock(): 3212 + * Attempts to split a given block into multiple blocks to improve compression ratio. 3213 + * 3214 + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. 
3215 + */ 3216 + static size_t 3217 + ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, 3218 + const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) 3219 + { 3220 + size_t cSize = 0; 3221 + const BYTE* ip = (const BYTE*)src; 3222 + BYTE* op = (BYTE*)dst; 3223 + size_t i = 0; 3224 + size_t srcBytesTotal = 0; 3225 + U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ 3226 + seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; 3227 + seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; 3228 + size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); 3229 + 3230 + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history 3231 + * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two 3232 + * separate repcode histories that simulate repcode history on compression and decompression side, 3233 + * and use the histories to determine whether we must replace a particular repcode with its raw offset. 3234 + * 3235 + * 1) cRep gets updated for each partition, regardless of whether the block was emitted as uncompressed 3236 + * or RLE. This allows us to retrieve the offset value that an invalid repcode references within 3237 + * a nocompress/RLE block. 3238 + * 2) dRep gets updated only for compressed partitions, and when a repcode gets replaced, will use 3239 + * the replacement offset value rather than the original repcode to update the repcode history. 3240 + * dRep also will be the final repcode history sent to the next block. 3241 + * 3242 + * See ZSTD_seqStore_resolveOffCodes() for more details. 
3243 + */ 3244 + repcodes_t dRep; 3245 + repcodes_t cRep; 3246 + ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); 3247 + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); 3248 + ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); 3249 + 3250 + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", 3251 + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, 3252 + (unsigned)zc->blockState.matchState.nextToUpdate); 3253 + 3254 + if (numSplits == 0) { 3255 + size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, 3256 + &dRep, &cRep, 3257 + op, dstCapacity, 3258 + ip, blockSize, 3259 + lastBlock, 0 /* isPartition */); 3260 + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); 3261 + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); 3262 + assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); 3263 + return cSizeSingleBlock; 3264 + } 3265 + 3266 + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); 3267 + for (i = 0; i <= numSplits; ++i) { 3268 + size_t srcBytes; 3269 + size_t cSizeChunk; 3270 + U32 const lastPartition = (i == numSplits); 3271 + U32 lastBlockEntireSrc = 0; 3272 + 3273 + srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); 3274 + srcBytesTotal += srcBytes; 3275 + if (lastPartition) { 3276 + /* This is the final partition, need to account for possible last literals */ 3277 + srcBytes += blockSize - srcBytesTotal; 3278 + lastBlockEntireSrc = lastBlock; 3279 + } else { 3280 + ZSTD_deriveSeqStoreChunk(nextSeqStore, &zc->seqStore, partitions[i], partitions[i+1]); 3281 + } 3282 + 3283 + cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, currSeqStore, 3284 + &dRep, &cRep, 3285 + op, dstCapacity, 3286 + ip, srcBytes, 3287 + lastBlockEntireSrc, 1 /* isPartition */); 3288 + 
DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); 3289 + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); 3290 + 3291 + ip += srcBytes; 3292 + op += cSizeChunk; 3293 + dstCapacity -= cSizeChunk; 3294 + cSize += cSizeChunk; 3295 + *currSeqStore = *nextSeqStore; 3296 + assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); 3297 + } 3298 + /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes 3299 + * for the next block. 3300 + */ 3301 + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); 3302 + return cSize; 3303 + } 3304 + 3305 + static size_t 3306 + ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, 3307 + void* dst, size_t dstCapacity, 3308 + const void* src, size_t srcSize, U32 lastBlock) 3309 + { 3310 + const BYTE* ip = (const BYTE*)src; 3311 + BYTE* op = (BYTE*)dst; 3312 + U32 nbSeq; 3313 + size_t cSize; 3314 + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); 3315 + assert(zc->appliedParams.useBlockSplitter == ZSTD_ps_enable); 3316 + 3317 + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); 3318 + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); 3319 + if (bss == ZSTDbss_noCompress) { 3320 + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) 3321 + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; 3322 + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); 3323 + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); 3324 + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); 3325 + return cSize; 3326 + } 3327 + nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart); 3328 + } 3329 + 3330 + cSize = ZSTD_compressBlock_splitBlock_internal(zc, dst, dstCapacity, src, srcSize, lastBlock, nbSeq); 3331 + FORWARD_IF_ERROR(cSize, "Splitting blocks failed!"); 3332 + return cSize; 3333 + } 3334 + 3335 + 
static size_t 3336 + ZSTD_compressBlock_internal(ZSTD_CCtx* zc, 3337 + void* dst, size_t dstCapacity, 3338 + const void* src, size_t srcSize, U32 frame) 2951 3339 { 2952 3340 /* This the upper bound for the length of an rle block. 2953 3341 * This isn't the actual upper bound. Finding the real threshold ··· 3692 2632 3693 2633 if (zc->seqCollector.collectSequences) { 3694 2634 ZSTD_copyBlockSequences(zc); 3695 - ZSTD_confirmRepcodesAndEntropyTables(zc); 2635 + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); 3696 2636 return 0; 3697 2637 } 3698 2638 3699 2639 /* encode sequences and literals */ 3700 - cSize = ZSTD_entropyCompressSequences(&zc->seqStore, 2640 + cSize = ZSTD_entropyCompressSeqStore(&zc->seqStore, 3701 2641 &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, 3702 2642 &zc->appliedParams, 3703 2643 dst, dstCapacity, 3704 2644 srcSize, 3705 2645 zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, 3706 2646 zc->bmi2); 3707 - 3708 - if (zc->seqCollector.collectSequences) { 3709 - ZSTD_copyBlockSequences(zc); 3710 - return 0; 3711 - } 3712 - 3713 2647 3714 2648 if (frame && 3715 2649 /* We don't want to emit our first block as a RLE even if it qualifies because ··· 3720 2666 3721 2667 out: 3722 2668 if (!ZSTD_isError(cSize) && cSize > 1) { 3723 - ZSTD_confirmRepcodesAndEntropyTables(zc); 2669 + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); 3724 2670 } 3725 2671 /* We check that dictionaries have offset codes available for the first 3726 2672 * block. 
After the first block, the offcode table might not have large ··· 3773 2719 size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); 3774 2720 FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); 3775 2721 if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { 3776 - ZSTD_confirmRepcodesAndEntropyTables(zc); 2722 + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); 3777 2723 return cSize; 3778 2724 } 3779 2725 } ··· 3813 2759 void const* ip, 3814 2760 void const* iend) 3815 2761 { 3816 - if (ZSTD_window_needOverflowCorrection(ms->window, iend)) { 3817 - U32 const maxDist = (U32)1 << params->cParams.windowLog; 3818 - U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy); 2762 + U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy); 2763 + U32 const maxDist = (U32)1 << params->cParams.windowLog; 2764 + if (ZSTD_window_needOverflowCorrection(ms->window, cycleLog, maxDist, ms->loadedDictEnd, ip, iend)) { 3819 2765 U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip); 3820 2766 ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30); 3821 2767 ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30); ··· 3838 2784 * Frame is supposed already started (header already produced) 3839 2785 * @return : compressed size, or an error code 3840 2786 */ 3841 - static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, 2787 + static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, 3842 2788 void* dst, size_t dstCapacity, 3843 2789 const void* src, size_t srcSize, 3844 2790 U32 lastFrameChunk) ··· 3868 2814 ZSTD_overflowCorrectIfNeeded( 3869 2815 ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize); 3870 2816 ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); 2817 + ZSTD_window_enforceMaxDist(&ms->window, ip, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); 3871 2818 3872 2819 /* 
Ensure hash/chain table insertion resumes no sooner than lowlimit */ 3873 2820 if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit; ··· 3879 2824 FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize failed"); 3880 2825 assert(cSize > 0); 3881 2826 assert(cSize <= blockSize + ZSTD_blockHeaderSize); 2827 + } else if (ZSTD_blockSplitterEnabled(&cctx->appliedParams)) { 2828 + cSize = ZSTD_compressBlock_splitBlock(cctx, op, dstCapacity, ip, blockSize, lastBlock); 2829 + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_splitBlock failed"); 2830 + assert(cSize > 0 || cctx->seqCollector.collectSequences == 1); 3882 2831 } else { 3883 2832 cSize = ZSTD_compressBlock_internal(cctx, 3884 2833 op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize, ··· 4005 2946 { 4006 2947 RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong, 4007 2948 "wrong cctx stage"); 4008 - RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm, 2949 + RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable, 4009 2950 parameter_unsupported, 4010 2951 "incompatible with ldm"); 4011 2952 cctx->externSeqStore.seq = seq; ··· 4042 2983 4043 2984 if (!srcSize) return fhSize; /* do not generate an empty block if no input */ 4044 2985 4045 - if (!ZSTD_window_update(&ms->window, src, srcSize)) { 2986 + if (!ZSTD_window_update(&ms->window, src, srcSize, ms->forceNonContiguous)) { 2987 + ms->forceNonContiguous = 0; 4046 2988 ms->nextToUpdate = ms->window.dictLimit; 4047 2989 } 4048 - if (cctx->appliedParams.ldmParams.enableLdm) { 4049 - ZSTD_window_update(&cctx->ldmState.window, src, srcSize); 2990 + if (cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { 2991 + ZSTD_window_update(&cctx->ldmState.window, src, srcSize, /* forceNonContiguous */ 0); 4050 2992 } 4051 2993 4052 2994 if (!frame) { ··· 4115 3055 { 4116 3056 const BYTE* ip = (const BYTE*) src; 4117 3057 const BYTE* const iend = ip + srcSize; 4118 - 4119 - ZSTD_window_update(&ms->window, 
src, srcSize); 4120 - ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); 4121 - 4122 - if (params->ldmParams.enableLdm && ls != NULL) { 4123 - ZSTD_window_update(&ls->window, src, srcSize); 4124 - ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); 4125 - } 3058 + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; 4126 3059 4127 3060 /* Assert that we the ms params match the params we're being given */ 4128 3061 ZSTD_assertEqualCParams(params->cParams, ms->cParams); 4129 3062 3063 + if (srcSize > ZSTD_CHUNKSIZE_MAX) { 3064 + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. 3065 + * Dictionaries right at the edge will immediately trigger overflow 3066 + * correction, but I don't want to insert extra constraints here. 3067 + */ 3068 + U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; 3069 + /* We must have cleared our windows when our source is this large. */ 3070 + assert(ZSTD_window_isEmpty(ms->window)); 3071 + if (loadLdmDict) 3072 + assert(ZSTD_window_isEmpty(ls->window)); 3073 + /* If the dictionary is too large, only load the suffix of the dictionary. */ 3074 + if (srcSize > maxDictSize) { 3075 + ip = iend - maxDictSize; 3076 + src = ip; 3077 + srcSize = maxDictSize; 3078 + } 3079 + } 3080 + 3081 + DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); 3082 + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); 3083 + ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); 3084 + ms->forceNonContiguous = params->deterministicRefPrefix; 3085 + 3086 + if (loadLdmDict) { 3087 + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); 3088 + ls->loadedDictEnd = params->forceWindow ? 
0 : (U32)(iend - ls->window.base); 3089 + } 3090 + 4130 3091 if (srcSize <= HASH_READ_SIZE) return 0; 4131 3092 4132 - while (iend - ip > HASH_READ_SIZE) { 4133 - size_t const remaining = (size_t)(iend - ip); 4134 - size_t const chunk = MIN(remaining, ZSTD_CHUNKSIZE_MAX); 4135 - const BYTE* const ichunk = ip + chunk; 3093 + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); 4136 3094 4137 - ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, ichunk); 3095 + if (loadLdmDict) 3096 + ZSTD_ldm_fillHashTable(ls, ip, iend, &params->ldmParams); 4138 3097 4139 - if (params->ldmParams.enableLdm && ls != NULL) 4140 - ZSTD_ldm_fillHashTable(ls, (const BYTE*)src, (const BYTE*)src + srcSize, &params->ldmParams); 3098 + switch(params->cParams.strategy) 3099 + { 3100 + case ZSTD_fast: 3101 + ZSTD_fillHashTable(ms, iend, dtlm); 3102 + break; 3103 + case ZSTD_dfast: 3104 + ZSTD_fillDoubleHashTable(ms, iend, dtlm); 3105 + break; 4141 3106 4142 - switch(params->cParams.strategy) 4143 - { 4144 - case ZSTD_fast: 4145 - ZSTD_fillHashTable(ms, ichunk, dtlm); 4146 - break; 4147 - case ZSTD_dfast: 4148 - ZSTD_fillDoubleHashTable(ms, ichunk, dtlm); 4149 - break; 4150 - 4151 - case ZSTD_greedy: 4152 - case ZSTD_lazy: 4153 - case ZSTD_lazy2: 4154 - if (chunk >= HASH_READ_SIZE && ms->dedicatedDictSearch) { 4155 - assert(chunk == remaining); /* must load everything in one go */ 4156 - ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, ichunk-HASH_READ_SIZE); 4157 - } else if (chunk >= HASH_READ_SIZE) { 4158 - ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE); 3107 + case ZSTD_greedy: 3108 + case ZSTD_lazy: 3109 + case ZSTD_lazy2: 3110 + assert(srcSize >= HASH_READ_SIZE); 3111 + if (ms->dedicatedDictSearch) { 3112 + assert(ms->chainTable != NULL); 3113 + ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, iend-HASH_READ_SIZE); 3114 + } else { 3115 + assert(params->useRowMatchFinder != ZSTD_ps_auto); 3116 + if (params->useRowMatchFinder == ZSTD_ps_enable) { 3117 + size_t const tagTableSize = 
((size_t)1 << params->cParams.hashLog) * sizeof(U16); 3118 + ZSTD_memset(ms->tagTable, 0, tagTableSize); 3119 + ZSTD_row_update(ms, iend-HASH_READ_SIZE); 3120 + DEBUGLOG(4, "Using row-based hash table for lazy dict"); 3121 + } else { 3122 + ZSTD_insertAndFindFirstIndex(ms, iend-HASH_READ_SIZE); 3123 + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); 4159 3124 } 4160 - break; 4161 - 4162 - case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ 4163 - case ZSTD_btopt: 4164 - case ZSTD_btultra: 4165 - case ZSTD_btultra2: 4166 - if (chunk >= HASH_READ_SIZE) 4167 - ZSTD_updateTree(ms, ichunk-HASH_READ_SIZE, ichunk); 4168 - break; 4169 - 4170 - default: 4171 - assert(0); /* not possible : not a valid strategy id */ 4172 3125 } 3126 + break; 4173 3127 4174 - ip = ichunk; 3128 + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ 3129 + case ZSTD_btopt: 3130 + case ZSTD_btultra: 3131 + case ZSTD_btultra2: 3132 + assert(srcSize >= HASH_READ_SIZE); 3133 + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); 3134 + break; 3135 + 3136 + default: 3137 + assert(0); /* not possible : not a valid strategy id */ 4175 3138 } 4176 3139 4177 3140 ms->nextToUpdate = (U32)(iend - ms->window.base); ··· 4333 3250 const BYTE* const dictEnd = dictPtr + dictSize; 4334 3251 size_t dictID; 4335 3252 size_t eSize; 4336 - 4337 3253 ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); 4338 3254 assert(dictSize >= 8); 4339 3255 assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY); ··· 4403 3321 const ZSTD_CCtx_params* params, U64 pledgedSrcSize, 4404 3322 ZSTD_buffered_policy_e zbuff) 4405 3323 { 3324 + size_t const dictContentSize = cdict ? 
cdict->dictContentSize : dictSize; 4406 3325 DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params->cParams.windowLog); 4407 3326 /* params are supposed to be fully validated at this point */ 4408 3327 assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); ··· 4418 3335 return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff); 4419 3336 } 4420 3337 4421 - FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, *params, pledgedSrcSize, 3338 + FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, 3339 + dictContentSize, 4422 3340 ZSTDcrp_makeClean, zbuff) , ""); 4423 3341 { size_t const dictID = cdict ? 4424 3342 ZSTD_compress_insertDictionary( ··· 4434 3350 FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); 4435 3351 assert(dictID <= UINT_MAX); 4436 3352 cctx->dictID = (U32)dictID; 4437 - cctx->dictContentSize = cdict ? cdict->dictContentSize : dictSize; 3353 + cctx->dictContentSize = dictContentSize; 4438 3354 } 4439 3355 return 0; 4440 3356 } ··· 4569 3485 const void* dict,size_t dictSize, 4570 3486 ZSTD_parameters params) 4571 3487 { 4572 - ZSTD_CCtx_params cctxParams; 4573 3488 DEBUGLOG(4, "ZSTD_compress_advanced"); 4574 3489 FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); 4575 - ZSTD_CCtxParams_init_internal(&cctxParams, &params, ZSTD_NO_CLEVEL); 3490 + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, &params, ZSTD_NO_CLEVEL); 4576 3491 return ZSTD_compress_advanced_internal(cctx, 4577 3492 dst, dstCapacity, 4578 3493 src, srcSize, 4579 3494 dict, dictSize, 4580 - &cctxParams); 3495 + &cctx->simpleApiParams); 4581 3496 } 4582 3497 4583 3498 /* Internal */ ··· 4600 3517 const void* dict, size_t dictSize, 4601 3518 int compressionLevel) 4602 3519 { 4603 - ZSTD_CCtx_params cctxParams; 4604 3520 { 4605 3521 ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? 
dictSize : 0, ZSTD_cpm_noAttachDict); 4606 3522 assert(params.fParams.contentSizeFlag == 1); 4607 - ZSTD_CCtxParams_init_internal(&cctxParams, &params, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT: compressionLevel); 3523 + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, &params, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT: compressionLevel); 4608 3524 } 4609 3525 DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=%u)", (unsigned)srcSize); 4610 - return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctxParams); 3526 + return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctx->simpleApiParams); 4611 3527 } 4612 3528 4613 3529 size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, ··· 4643 3561 DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict)); 4644 3562 return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) 4645 3563 + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) 4646 - + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) 3564 + /* enableDedicatedDictSearch == 1 ensures that CDict estimation will not be too small 3565 + * in case we are using DDS with row-hash. */ 3566 + + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams), 3567 + /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0) 4647 3568 + (dictLoadMethod == ZSTD_dlm_byRef ? 
0 4648 3569 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); 4649 3570 } ··· 4677 3592 assert(!ZSTD_checkCParams(params.cParams)); 4678 3593 cdict->matchState.cParams = params.cParams; 4679 3594 cdict->matchState.dedicatedDictSearch = params.enableDedicatedDictSearch; 4680 - if (cdict->matchState.dedicatedDictSearch && dictSize > ZSTD_CHUNKSIZE_MAX) { 4681 - cdict->matchState.dedicatedDictSearch = 0; 4682 - } 4683 3595 if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) { 4684 3596 cdict->dictContent = dictBuffer; 4685 3597 } else { ··· 4697 3615 &cdict->matchState, 4698 3616 &cdict->workspace, 4699 3617 &params.cParams, 3618 + params.useRowMatchFinder, 4700 3619 ZSTDcrp_makeClean, 4701 3620 ZSTDirp_reset, 4702 3621 ZSTD_resetTarget_CDict), ""); ··· 4721 3638 4722 3639 static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, 4723 3640 ZSTD_dictLoadMethod_e dictLoadMethod, 4724 - ZSTD_compressionParameters cParams, ZSTD_customMem customMem) 3641 + ZSTD_compressionParameters cParams, 3642 + ZSTD_paramSwitch_e useRowMatchFinder, 3643 + U32 enableDedicatedDictSearch, 3644 + ZSTD_customMem customMem) 4725 3645 { 4726 3646 if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; 4727 3647 4728 3648 { size_t const workspaceSize = 4729 3649 ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + 4730 3650 ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + 4731 - ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + 3651 + ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedicatedDictSearch, /* forCCtx */ 0) + 4732 3652 (dictLoadMethod == ZSTD_dlm_byRef ? 
0 4733 3653 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); 4734 3654 void* const workspace = ZSTD_customMalloc(workspaceSize, customMem); ··· 4750 3664 ZSTD_cwksp_move(&cdict->workspace, &ws); 4751 3665 cdict->customMem = customMem; 4752 3666 cdict->compressionLevel = ZSTD_NO_CLEVEL; /* signals advanced API usage */ 4753 - 3667 + cdict->useRowMatchFinder = useRowMatchFinder; 4754 3668 return cdict; 4755 3669 } 4756 3670 } ··· 4772 3686 &cctxParams, customMem); 4773 3687 } 4774 3688 4775 - ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( 3689 + ZSTD_CDict* ZSTD_createCDict_advanced2( 4776 3690 const void* dict, size_t dictSize, 4777 3691 ZSTD_dictLoadMethod_e dictLoadMethod, 4778 3692 ZSTD_dictContentType_e dictContentType, ··· 4802 3716 &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); 4803 3717 } 4804 3718 3719 + DEBUGLOG(3, "ZSTD_createCDict_advanced2: DDS: %u", cctxParams.enableDedicatedDictSearch); 4805 3720 cctxParams.cParams = cParams; 3721 + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); 4806 3722 4807 3723 cdict = ZSTD_createCDict_advanced_internal(dictSize, 4808 3724 dictLoadMethod, cctxParams.cParams, 3725 + cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, 4809 3726 customMem); 4810 3727 4811 3728 if (ZSTD_isError( ZSTD_initCDict_internal(cdict, ··· 4877 3788 ZSTD_dictContentType_e dictContentType, 4878 3789 ZSTD_compressionParameters cParams) 4879 3790 { 4880 - size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0); 3791 + ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); 3792 + /* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */ 3793 + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0); 4881 3794 size_t 
const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) 4882 3795 + (dictLoadMethod == ZSTD_dlm_byRef ? 0 4883 3796 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))) ··· 4904 3813 4905 3814 ZSTD_CCtxParams_init(&params, 0); 4906 3815 params.cParams = cParams; 3816 + params.useRowMatchFinder = useRowMatchFinder; 3817 + cdict->useRowMatchFinder = useRowMatchFinder; 4907 3818 4908 3819 if (ZSTD_isError( ZSTD_initCDict_internal(cdict, 4909 3820 dict, dictSize, ··· 4932 3839 return cdict->dictID; 4933 3840 } 4934 3841 4935 - 4936 - /* ZSTD_compressBegin_usingCDict_advanced() : 4937 - * cdict must be != NULL */ 4938 - size_t ZSTD_compressBegin_usingCDict_advanced( 3842 + /* ZSTD_compressBegin_usingCDict_internal() : 3843 + * Implementation of various ZSTD_compressBegin_usingCDict* functions. 3844 + */ 3845 + static size_t ZSTD_compressBegin_usingCDict_internal( 4939 3846 ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, 4940 3847 ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) 4941 3848 { 4942 3849 ZSTD_CCtx_params cctxParams; 4943 - DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_advanced"); 3850 + DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_internal"); 4944 3851 RETURN_ERROR_IF(cdict==NULL, dictionary_wrong, "NULL pointer!"); 4945 3852 /* Initialize the cctxParams from the cdict */ 4946 3853 { ··· 4972 3879 ZSTDb_not_buffered); 4973 3880 } 4974 3881 3882 + 3883 + /* ZSTD_compressBegin_usingCDict_advanced() : 3884 + * This function is DEPRECATED. 
3885 + * cdict must be != NULL */ 3886 + size_t ZSTD_compressBegin_usingCDict_advanced( 3887 + ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, 3888 + ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) 3889 + { 3890 + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, pledgedSrcSize); 3891 + } 3892 + 4975 3893 /* ZSTD_compressBegin_usingCDict() : 4976 - * pledgedSrcSize=0 means "unknown" 4977 - * if pledgedSrcSize>0, it will enable contentSizeFlag */ 3894 + * cdict must be != NULL */ 4978 3895 size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) 4979 3896 { 4980 3897 ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; 4981 - DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag == %u", !fParams.noDictIDFlag); 4982 - return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); 3898 + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); 4983 3899 } 4984 3900 3901 + /*! ZSTD_compress_usingCDict_internal(): 3902 + * Implementation of various ZSTD_compress_usingCDict* functions. 3903 + */ 3904 + static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, 3905 + void* dst, size_t dstCapacity, 3906 + const void* src, size_t srcSize, 3907 + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) 3908 + { 3909 + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ 3910 + return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); 3911 + } 3912 + 3913 + /*! ZSTD_compress_usingCDict_advanced(): 3914 + * This function is DEPRECATED. 
3915 + */ 4985 3916 size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, 4986 3917 void* dst, size_t dstCapacity, 4987 3918 const void* src, size_t srcSize, 4988 3919 const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) 4989 3920 { 4990 - FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ 4991 - return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); 3921 + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); 4992 3922 } 4993 3923 4994 3924 /*! ZSTD_compress_usingCDict() : ··· 5025 3909 const ZSTD_CDict* cdict) 5026 3910 { 5027 3911 ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; 5028 - return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); 3912 + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); 5029 3913 } 5030 3914 5031 3915 ··· 5429 4313 FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ 5430 4314 ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* single usage */ 5431 4315 assert(prefixDict.dict==NULL || cctx->cdict==NULL); /* only one can be set */ 5432 - if (cctx->cdict) 5433 - params.compressionLevel = cctx->cdict->compressionLevel; /* let cdict take priority in terms of compression level */ 4316 + if (cctx->cdict && !cctx->localDict.cdict) { 4317 + /* Let the cdict's compression level take priority over the requested params. 4318 + * But do not take the cdict's compression level if the "cdict" is actually a localDict 4319 + * generated from ZSTD_initLocalDict(). 
4320 + */ 4321 + params.compressionLevel = cctx->cdict->compressionLevel; 4322 + } 5434 4323 DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); 5435 4324 if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ 5436 4325 { ··· 5448 4327 dictSize, mode); 5449 4328 } 5450 4329 5451 - if (ZSTD_CParams_shouldEnableLdm(&params.cParams)) { 5452 - /* Enable LDM by default for optimal parser and window size >= 128MB */ 5453 - DEBUGLOG(4, "LDM enabled by default (window size >= 128MB, strategy >= btopt)"); 5454 - params.ldmParams.enableLdm = 1; 5455 - } 4330 + params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, &params.cParams); 4331 + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams); 4332 + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams); 5456 4333 5457 4334 { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; 5458 4335 assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); ··· 5555 4436 size_t posInSrc; /* Number of bytes given by sequences provided so far */ 5556 4437 } ZSTD_sequencePosition; 5557 4438 5558 - /* Returns a ZSTD error code if sequence is not valid */ 5559 - static size_t ZSTD_validateSequence(U32 offCode, U32 matchLength, 5560 - size_t posInSrc, U32 windowLog, size_t dictSize, U32 minMatch) { 5561 - size_t offsetBound; 5562 - U32 windowSize = 1 << windowLog; 5563 - /* posInSrc represents the amount of data the the decoder would decode up to this point. 
4439 + /* ZSTD_validateSequence() : 4440 + * @offCode : is presumed to follow format required by ZSTD_storeSeq() 4441 + * @returns a ZSTD error code if sequence is not valid 4442 + */ 4443 + static size_t 4444 + ZSTD_validateSequence(U32 offCode, U32 matchLength, 4445 + size_t posInSrc, U32 windowLog, size_t dictSize) 4446 + { 4447 + U32 const windowSize = 1 << windowLog; 4448 + /* posInSrc represents the amount of data the decoder would decode up to this point. 5564 4449 * As long as the amount of data decoded is less than or equal to window size, offsets may be 5565 4450 * larger than the total length of output decoded in order to reference the dict, even larger than 5566 4451 * window size. After output surpasses windowSize, we're limited to windowSize offsets again. 5567 4452 */ 5568 - offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; 5569 - RETURN_ERROR_IF(offCode > offsetBound + ZSTD_REP_MOVE, corruption_detected, "Offset too large!"); 5570 - RETURN_ERROR_IF(matchLength < minMatch, corruption_detected, "Matchlength too small"); 4453 + size_t const offsetBound = posInSrc > windowSize ? 
(size_t)windowSize : posInSrc + (size_t)dictSize; 4454 + RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); 4455 + RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); 5571 4456 return 0; 5572 4457 } 5573 4458 5574 4459 /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ 5575 - static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) { 5576 - U32 offCode = rawOffset + ZSTD_REP_MOVE; 5577 - U32 repCode = 0; 4460 + static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) 4461 + { 4462 + U32 offCode = STORE_OFFSET(rawOffset); 5578 4463 5579 4464 if (!ll0 && rawOffset == rep[0]) { 5580 - repCode = 1; 4465 + offCode = STORE_REPCODE_1; 5581 4466 } else if (rawOffset == rep[1]) { 5582 - repCode = 2 - ll0; 4467 + offCode = STORE_REPCODE(2 - ll0); 5583 4468 } else if (rawOffset == rep[2]) { 5584 - repCode = 3 - ll0; 4469 + offCode = STORE_REPCODE(3 - ll0); 5585 4470 } else if (ll0 && rawOffset == rep[0] - 1) { 5586 - repCode = 3; 5587 - } 5588 - if (repCode) { 5589 - /* ZSTD_storeSeq expects a number in the range [0, 2] to represent a repcode */ 5590 - offCode = repCode - 1; 4471 + offCode = STORE_REPCODE_3; 5591 4472 } 5592 4473 return offCode; 5593 4474 } ··· 5595 4476 /* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of 5596 4477 * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. 
5597 4478 */ 5598 - static size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, 5599 - const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, 5600 - const void* src, size_t blockSize) { 4479 + static size_t 4480 + ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, 4481 + ZSTD_sequencePosition* seqPos, 4482 + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, 4483 + const void* src, size_t blockSize) 4484 + { 5601 4485 U32 idx = seqPos->idx; 5602 4486 BYTE const* ip = (BYTE const*)(src); 5603 4487 const BYTE* const iend = ip + blockSize; 5604 4488 repcodes_t updatedRepcodes; 5605 4489 U32 dictSize; 5606 - U32 litLength; 5607 - U32 matchLength; 5608 - U32 ll0; 5609 - U32 offCode; 5610 4490 5611 4491 if (cctx->cdict) { 5612 4492 dictSize = (U32)cctx->cdict->dictContentSize; ··· 5616 4498 } 5617 4499 ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); 5618 4500 for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { 5619 - litLength = inSeqs[idx].litLength; 5620 - matchLength = inSeqs[idx].matchLength; 5621 - ll0 = litLength == 0; 5622 - offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); 5623 - updatedRepcodes = ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); 4501 + U32 const litLength = inSeqs[idx].litLength; 4502 + U32 const ll0 = (litLength == 0); 4503 + U32 const matchLength = inSeqs[idx].matchLength; 4504 + U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); 4505 + ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); 5624 4506 5625 4507 DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); 5626 4508 if (cctx->appliedParams.validateSequences) { 5627 4509 seqPos->posInSrc += litLength + matchLength; 5628 4510 FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, 5629 - 
cctx->appliedParams.cParams.windowLog, dictSize, 5630 - cctx->appliedParams.cParams.minMatch), 4511 + cctx->appliedParams.cParams.windowLog, dictSize), 5631 4512 "Sequence validation failed"); 5632 4513 } 5633 4514 RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, 5634 4515 "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); 5635 - ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength - MINMATCH); 4516 + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); 5636 4517 ip += matchLength + litLength; 5637 4518 } 5638 4519 ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); ··· 5658 4541 * avoid splitting a match, or to avoid splitting a match such that it would produce a match 5659 4542 * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. 5660 4543 */ 5661 - static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, 5662 - const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, 5663 - const void* src, size_t blockSize) { 4544 + static size_t 4545 + ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, 4546 + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, 4547 + const void* src, size_t blockSize) 4548 + { 5664 4549 U32 idx = seqPos->idx; 5665 4550 U32 startPosInSequence = seqPos->posInSequence; 5666 4551 U32 endPosInSequence = seqPos->posInSequence + (U32)blockSize; ··· 5672 4553 repcodes_t updatedRepcodes; 5673 4554 U32 bytesAdjustment = 0; 5674 4555 U32 finalMatchSplit = 0; 5675 - U32 litLength; 5676 - U32 matchLength; 5677 - U32 rawOffset; 5678 - U32 offCode; 5679 4556 5680 4557 if (cctx->cdict) { 5681 4558 dictSize = cctx->cdict->dictContentSize; ··· 5685 4570 ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); 5686 4571 while (endPosInSequence && idx < inSeqsSize && 
!finalMatchSplit) { 5687 4572 const ZSTD_Sequence currSeq = inSeqs[idx]; 5688 - litLength = currSeq.litLength; 5689 - matchLength = currSeq.matchLength; 5690 - rawOffset = currSeq.offset; 4573 + U32 litLength = currSeq.litLength; 4574 + U32 matchLength = currSeq.matchLength; 4575 + U32 const rawOffset = currSeq.offset; 4576 + U32 offCode; 5691 4577 5692 4578 /* Modify the sequence depending on where endPosInSequence lies */ 5693 4579 if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { ··· 5741 4625 } 5742 4626 } 5743 4627 /* Check if this offset can be represented with a repcode */ 5744 - { U32 ll0 = (litLength == 0); 4628 + { U32 const ll0 = (litLength == 0); 5745 4629 offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); 5746 - updatedRepcodes = ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); 4630 + ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); 5747 4631 } 5748 4632 5749 4633 if (cctx->appliedParams.validateSequences) { 5750 4634 seqPos->posInSrc += litLength + matchLength; 5751 4635 FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, 5752 - cctx->appliedParams.cParams.windowLog, dictSize, 5753 - cctx->appliedParams.cParams.minMatch), 4636 + cctx->appliedParams.cParams.windowLog, dictSize), 5754 4637 "Sequence validation failed"); 5755 4638 } 5756 4639 DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); 5757 4640 RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, 5758 4641 "Not enough memory allocated. 
Try adjusting ZSTD_c_minMatch."); 5759 - ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength - MINMATCH); 4642 + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); 5760 4643 ip += matchLength + litLength; 5761 4644 } 5762 4645 DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); ··· 5780 4665 typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, 5781 4666 const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, 5782 4667 const void* src, size_t blockSize); 5783 - static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) { 4668 + static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) 4669 + { 5784 4670 ZSTD_sequenceCopier sequenceCopier = NULL; 5785 4671 assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, mode)); 5786 4672 if (mode == ZSTD_sf_explicitBlockDelimiters) { ··· 5795 4679 5796 4680 /* Compress, block-by-block, all of the sequences given. 5797 4681 * 5798 - * Returns the cumulative size of all compressed blocks (including their headers), otherwise a ZSTD error. 4682 + * Returns the cumulative size of all compressed blocks (including their headers), 4683 + * otherwise a ZSTD error. 
5799 4684 */ 5800 - static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, 5801 - void* dst, size_t dstCapacity, 5802 - const ZSTD_Sequence* inSeqs, size_t inSeqsSize, 5803 - const void* src, size_t srcSize) { 4685 + static size_t 4686 + ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, 4687 + void* dst, size_t dstCapacity, 4688 + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, 4689 + const void* src, size_t srcSize) 4690 + { 5804 4691 size_t cSize = 0; 5805 4692 U32 lastBlock; 5806 4693 size_t blockSize; ··· 5813 4694 5814 4695 BYTE const* ip = (BYTE const*)src; 5815 4696 BYTE* op = (BYTE*)dst; 5816 - ZSTD_sequenceCopier sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); 4697 + ZSTD_sequenceCopier const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); 5817 4698 5818 4699 DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize: %zu", srcSize, inSeqsSize); 5819 4700 /* Special case: empty frame */ ··· 5851 4732 continue; 5852 4733 } 5853 4734 5854 - compressedSeqsSize = ZSTD_entropyCompressSequences(&cctx->seqStore, 4735 + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, 5855 4736 &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, 5856 4737 &cctx->appliedParams, 5857 4738 op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, ··· 5883 4764 } else { 5884 4765 U32 cBlockHeader; 5885 4766 /* Error checking and repcodes update */ 5886 - ZSTD_confirmRepcodesAndEntropyTables(cctx); 4767 + ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); 5887 4768 if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) 5888 4769 cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; 5889 4770 ··· 5913 4794 5914 4795 size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, 5915 4796 const ZSTD_Sequence* inSeqs, size_t 
inSeqsSize, 5916 - const void* src, size_t srcSize) { 4797 + const void* src, size_t srcSize) 4798 + { 5917 4799 BYTE* op = (BYTE*)dst; 5918 4800 size_t cSize = 0; 5919 4801 size_t compressedBlocksSize = 0; ··· 5981 4861 5982 4862 5983 4863 /*-===== Pre-defined compression levels =====-*/ 4864 + #include "clevels.h" 5984 4865 5985 - #define ZSTD_MAX_CLEVEL 22 5986 4866 int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; } 5987 4867 int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; } 5988 - 5989 - static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = { 5990 - { /* "default" - for any srcSize > 256 KB */ 5991 - /* W, C, H, S, L, TL, strat */ 5992 - { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels */ 5993 - { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */ 5994 - { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */ 5995 - { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */ 5996 - { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */ 5997 - { 21, 18, 19, 2, 5, 2, ZSTD_greedy }, /* level 5 */ 5998 - { 21, 19, 19, 3, 5, 4, ZSTD_greedy }, /* level 6 */ 5999 - { 21, 19, 19, 3, 5, 8, ZSTD_lazy }, /* level 7 */ 6000 - { 21, 19, 19, 3, 5, 16, ZSTD_lazy2 }, /* level 8 */ 6001 - { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */ 6002 - { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 10 */ 6003 - { 22, 21, 22, 4, 5, 16, ZSTD_lazy2 }, /* level 11 */ 6004 - { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 12 */ 6005 - { 22, 21, 22, 5, 5, 32, ZSTD_btlazy2 }, /* level 13 */ 6006 - { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */ 6007 - { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */ 6008 - { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */ 6009 - { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */ 6010 - { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */ 6011 - { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */ 6012 - { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */ 6013 - { 26, 26, 24, 7, 3,512, 
ZSTD_btultra2}, /* level 21 */ 6014 - { 27, 27, 25, 9, 3,999, ZSTD_btultra2}, /* level 22 */ 6015 - }, 6016 - { /* for srcSize <= 256 KB */ 6017 - /* W, C, H, S, L, T, strat */ 6018 - { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ 6019 - { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */ 6020 - { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */ 6021 - { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */ 6022 - { 18, 16, 17, 2, 5, 2, ZSTD_greedy }, /* level 4.*/ 6023 - { 18, 18, 18, 3, 5, 2, ZSTD_greedy }, /* level 5.*/ 6024 - { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/ 6025 - { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */ 6026 - { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ 6027 - { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ 6028 - { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ 6029 - { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/ 6030 - { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/ 6031 - { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */ 6032 - { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ 6033 - { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/ 6034 - { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/ 6035 - { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/ 6036 - { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/ 6037 - { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ 6038 - { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/ 6039 - { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/ 6040 - { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/ 6041 - }, 6042 - { /* for srcSize <= 128 KB */ 6043 - /* W, C, H, S, L, T, strat */ 6044 - { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ 6045 - { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */ 6046 - { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */ 6047 - { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */ 6048 - { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */ 6049 - { 17, 16, 17, 3, 4, 2, 
ZSTD_greedy }, /* level 5 */ 6050 - { 17, 17, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */ 6051 - { 17, 17, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */ 6052 - { 17, 17, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ 6053 - { 17, 17, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ 6054 - { 17, 17, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ 6055 - { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */ 6056 - { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */ 6057 - { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/ 6058 - { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ 6059 - { 17, 18, 17, 6, 3,256, ZSTD_btopt }, /* level 15.*/ 6060 - { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/ 6061 - { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/ 6062 - { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/ 6063 - { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/ 6064 - { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/ 6065 - { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ 6066 - { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/ 6067 - }, 6068 - { /* for srcSize <= 16 KB */ 6069 - /* W, C, H, S, L, T, strat */ 6070 - { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */ 6071 - { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */ 6072 - { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */ 6073 - { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */ 6074 - { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */ 6075 - { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/ 6076 - { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */ 6077 - { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */ 6078 - { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/ 6079 - { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/ 6080 - { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/ 6081 - { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/ 6082 - { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/ 6083 - { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/ 6084 - { 14, 15, 
15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/ 6085 - { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/ 6086 - { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/ 6087 - { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/ 6088 - { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/ 6089 - { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ 6090 - { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/ 6091 - { 14, 15, 15, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ 6092 - { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/ 6093 - }, 6094 - }; 4868 + int ZSTD_defaultCLevel(void) { return ZSTD_CLEVEL_DEFAULT; } 6095 4869 6096 4870 static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int const compressionLevel, size_t const dictSize) 6097 4871 { ··· 6013 4999 { 6014 5000 return (cParams->strategy >= ZSTD_greedy) 6015 5001 && (cParams->strategy <= ZSTD_lazy2) 6016 - && (cParams->hashLog >= cParams->chainLog) 5002 + && (cParams->hashLog > cParams->chainLog) 6017 5003 && (cParams->chainLog <= 24); 6018 5004 } 6019 5005 ··· 6032 5018 case ZSTD_lazy: 6033 5019 case ZSTD_lazy2: 6034 5020 cParams->hashLog -= ZSTD_LAZY_DDSS_BUCKET_LOG; 5021 + if (cParams->hashLog < ZSTD_HASHLOG_MIN) { 5022 + cParams->hashLog = ZSTD_HASHLOG_MIN; 5023 + } 6035 5024 break; 6036 5025 case ZSTD_btlazy2: 6037 5026 case ZSTD_btopt: ··· 6083 5066 else row = compressionLevel; 6084 5067 6085 5068 { ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row]; 5069 + DEBUGLOG(5, "ZSTD_getCParams_internal selected tableID: %u row: %u strat: %u", tableID, row, (U32)cp.strategy); 6086 5070 /* acceleration factor */ 6087 5071 if (compressionLevel < 0) { 6088 5072 int const clampedCompressionLevel = MAX(ZSTD_minCLevel(), compressionLevel);
+293 -82
lib/zstd/compress/zstd_compress_internal.h
··· 57 57 } ZSTD_localDict; 58 58 59 59 typedef struct { 60 - HUF_CElt CTable[HUF_CTABLE_SIZE_U32(255)]; 60 + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(255)]; 61 61 HUF_repeat repeatMode; 62 62 } ZSTD_hufCTables_t; 63 63 ··· 75 75 ZSTD_fseCTables_t fse; 76 76 } ZSTD_entropyCTables_t; 77 77 78 + /* ********************************************* 79 + * Entropy buffer statistics structs and funcs * 80 + ***********************************************/ 81 + /* ZSTD_hufCTablesMetadata_t : 82 + * Stores Literals Block Type for a super-block in hType, and 83 + * huffman tree description in hufDesBuffer. 84 + * hufDesSize refers to the size of huffman tree description in bytes. 85 + * This metadata is populated in ZSTD_buildBlockEntropyStats_literals() */ 78 86 typedef struct { 79 - U32 off; /* Offset code (offset + ZSTD_REP_MOVE) for the match */ 87 + symbolEncodingType_e hType; 88 + BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; 89 + size_t hufDesSize; 90 + } ZSTD_hufCTablesMetadata_t; 91 + 92 + /* ZSTD_fseCTablesMetadata_t : 93 + * Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and 94 + * fse tables in fseTablesBuffer. 95 + * fseTablesSize refers to the size of fse tables in bytes. 96 + * This metadata is populated in ZSTD_buildBlockEntropyStats_sequences() */ 97 + typedef struct { 98 + symbolEncodingType_e llType; 99 + symbolEncodingType_e ofType; 100 + symbolEncodingType_e mlType; 101 + BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; 102 + size_t fseTablesSize; 103 + size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ 104 + } ZSTD_fseCTablesMetadata_t; 105 + 106 + typedef struct { 107 + ZSTD_hufCTablesMetadata_t hufMetadata; 108 + ZSTD_fseCTablesMetadata_t fseMetadata; 109 + } ZSTD_entropyCTablesMetadata_t; 110 + 111 + /* ZSTD_buildBlockEntropyStats() : 112 + * Builds entropy for the block. 
113 + * @return : 0 on success or error code */ 114 + size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, 115 + const ZSTD_entropyCTables_t* prevEntropy, 116 + ZSTD_entropyCTables_t* nextEntropy, 117 + const ZSTD_CCtx_params* cctxParams, 118 + ZSTD_entropyCTablesMetadata_t* entropyMetadata, 119 + void* workspace, size_t wkspSize); 120 + 121 + /* ******************************* 122 + * Compression internals structs * 123 + *********************************/ 124 + 125 + typedef struct { 126 + U32 off; /* Offset sumtype code for the match, using ZSTD_storeSeq() format */ 80 127 U32 len; /* Raw length of match */ 81 128 } ZSTD_match_t; 82 129 ··· 173 126 U32 offCodeSumBasePrice; /* to compare to log2(offreq) */ 174 127 ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, or follow a pre-defined cost structure */ 175 128 const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */ 176 - ZSTD_literalCompressionMode_e literalCompressionMode; 129 + ZSTD_paramSwitch_e literalCompressionMode; 177 130 } optState_t; 178 131 179 132 typedef struct { ··· 182 135 } ZSTD_compressedBlockState_t; 183 136 184 137 typedef struct { 185 - BYTE const* nextSrc; /* next block here to continue on current prefix */ 186 - BYTE const* base; /* All regular indexes relative to this position */ 187 - BYTE const* dictBase; /* extDict indexes relative to this position */ 188 - U32 dictLimit; /* below that point, need extDict */ 189 - U32 lowLimit; /* below that point, no more valid data */ 138 + BYTE const* nextSrc; /* next block here to continue on current prefix */ 139 + BYTE const* base; /* All regular indexes relative to this position */ 140 + BYTE const* dictBase; /* extDict indexes relative to this position */ 141 + U32 dictLimit; /* below that point, need extDict */ 142 + U32 lowLimit; /* below that point, no more valid data */ 143 + U32 nbOverflowCorrections; /* Number of times overflow correction has run since 144 + * ZSTD_window_init(). 
Useful for debugging coredumps 145 + * and for ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY. 146 + */ 190 147 } ZSTD_window_t; 191 148 149 + #define ZSTD_WINDOW_START_INDEX 2 150 + 192 151 typedef struct ZSTD_matchState_t ZSTD_matchState_t; 152 + 153 + #define ZSTD_ROW_HASH_CACHE_SIZE 8 /* Size of prefetching hash cache for row-based matchfinder */ 154 + 193 155 struct ZSTD_matchState_t { 194 156 ZSTD_window_t window; /* State for window round buffer management */ 195 157 U32 loadedDictEnd; /* index of end of dictionary, within context's referential. ··· 210 154 */ 211 155 U32 nextToUpdate; /* index from which to continue table update */ 212 156 U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ 157 + 158 + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ 159 + U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ 160 + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ 161 + 213 162 U32* hashTable; 214 163 U32* hashTable3; 215 164 U32* chainTable; 165 + 166 + U32 forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ 167 + 216 168 int dedicatedDictSearch; /* Indicates whether this matchState is using the 217 169 * dedicated dictionary search structure. 218 170 */ ··· 260 196 } ldmState_t; 261 197 262 198 typedef struct { 263 - U32 enableLdm; /* 1 if enable long distance matching */ 199 + ZSTD_paramSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. 
ZSTD_ps_auto by default */ 264 200 U32 hashLog; /* Log size of hashTable */ 265 201 U32 bucketSizeLog; /* Log bucket size for collision resolution, at most 8 */ 266 202 U32 minMatchLength; /* Minimum match length */ ··· 291 227 * There is no guarantee that hint is close to actual source size */ 292 228 293 229 ZSTD_dictAttachPref_e attachDictPref; 294 - ZSTD_literalCompressionMode_e literalCompressionMode; 230 + ZSTD_paramSwitch_e literalCompressionMode; 295 231 296 232 /* Multithreading: used to pass parameters to mtctx */ 297 233 int nbWorkers; ··· 313 249 ZSTD_sequenceFormat_e blockDelimiters; 314 250 int validateSequences; 315 251 252 + /* Block splitting */ 253 + ZSTD_paramSwitch_e useBlockSplitter; 254 + 255 + /* Param for deciding whether to use row-based matchfinder */ 256 + ZSTD_paramSwitch_e useRowMatchFinder; 257 + 258 + /* Always load a dictionary in ext-dict mode (not prefix mode)? */ 259 + int deterministicRefPrefix; 260 + 316 261 /* Internal use, for createCCtxParams() and freeCCtxParams() only */ 317 262 ZSTD_customMem customMem; 318 263 }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ ··· 339 266 ZSTDb_buffered 340 267 } ZSTD_buffered_policy_e; 341 268 269 + /* 270 + * Struct that contains all elements of block splitter that should be allocated 271 + * in a wksp. 272 + */ 273 + #define ZSTD_MAX_NB_BLOCK_SPLITS 196 274 + typedef struct { 275 + seqStore_t fullSeqStoreChunk; 276 + seqStore_t firstHalfSeqStore; 277 + seqStore_t secondHalfSeqStore; 278 + seqStore_t currSeqStore; 279 + seqStore_t nextSeqStore; 280 + 281 + U32 partitions[ZSTD_MAX_NB_BLOCK_SPLITS]; 282 + ZSTD_entropyCTablesMetadata_t entropyMetadata; 283 + } ZSTD_blockSplitCtx; 284 + 342 285 struct ZSTD_CCtx_s { 343 286 ZSTD_compressionStage_e stage; 344 287 int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. 
*/ 345 288 int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ 346 289 ZSTD_CCtx_params requestedParams; 347 290 ZSTD_CCtx_params appliedParams; 291 + ZSTD_CCtx_params simpleApiParams; /* Param storage used by the simple API - not sticky. Must only be used in top-level simple API functions for storage. */ 348 292 U32 dictID; 349 293 size_t dictContentSize; 350 294 ··· 386 296 ZSTD_blockState_t blockState; 387 297 U32* entropyWorkspace; /* entropy workspace of ENTROPY_WORKSPACE_SIZE bytes */ 388 298 389 - /* Wether we are streaming or not */ 299 + /* Whether we are streaming or not */ 390 300 ZSTD_buffered_policy_e bufferedPolicy; 391 301 392 302 /* streaming */ ··· 414 324 /* Multi-threading */ 415 325 416 326 /* Tracing */ 327 + 328 + /* Workspace for block splitter */ 329 + ZSTD_blockSplitCtx blockSplitCtx; 417 330 }; 418 331 419 332 typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; ··· 451 358 typedef size_t (*ZSTD_blockCompressor) ( 452 359 ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 453 360 void const* src, size_t srcSize); 454 - ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode); 361 + ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); 455 362 456 363 457 364 MEM_STATIC U32 ZSTD_LLcode(U32 litLength) ··· 483 390 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 }; 484 391 static const U32 ML_deltaCode = 36; 485 392 return (mlBase > 127) ? 
ZSTD_highbit32(mlBase) + ML_deltaCode : ML_Code[mlBase]; 486 - } 487 - 488 - typedef struct repcodes_s { 489 - U32 rep[3]; 490 - } repcodes_t; 491 - 492 - MEM_STATIC repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0) 493 - { 494 - repcodes_t newReps; 495 - if (offset >= ZSTD_REP_NUM) { /* full offset */ 496 - newReps.rep[2] = rep[1]; 497 - newReps.rep[1] = rep[0]; 498 - newReps.rep[0] = offset - ZSTD_REP_MOVE; 499 - } else { /* repcode */ 500 - U32 const repCode = offset + ll0; 501 - if (repCode > 0) { /* note : if repCode==0, no change */ 502 - U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; 503 - newReps.rep[2] = (repCode >= 2) ? rep[1] : rep[2]; 504 - newReps.rep[1] = rep[0]; 505 - newReps.rep[0] = currentOffset; 506 - } else { /* repCode == 0 */ 507 - ZSTD_memcpy(&newReps, rep, sizeof(newReps)); 508 - } 509 - } 510 - return newReps; 511 393 } 512 394 513 395 /* ZSTD_cParam_withinBounds: ··· 533 465 return (srcSize >> minlog) + 2; 534 466 } 535 467 536 - MEM_STATIC int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParams) 468 + MEM_STATIC int ZSTD_literalsCompressionIsDisabled(const ZSTD_CCtx_params* cctxParams) 537 469 { 538 470 switch (cctxParams->literalCompressionMode) { 539 - case ZSTD_lcm_huffman: 471 + case ZSTD_ps_enable: 540 472 return 0; 541 - case ZSTD_lcm_uncompressed: 473 + case ZSTD_ps_disable: 542 474 return 1; 543 475 default: 544 476 assert(0 /* impossible: pre-validated */); 545 477 ZSTD_FALLTHROUGH; 546 - case ZSTD_lcm_auto: 478 + case ZSTD_ps_auto: 547 479 return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0); 548 480 } 549 481 } ··· 553 485 * Only called when the sequence ends past ilimit_w, so it only needs to be optimized for single 554 486 * large copies. 
555 487 */ 556 - static void ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w) { 488 + static void 489 + ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w) 490 + { 557 491 assert(iend > ilimit_w); 558 492 if (ip <= ilimit_w) { 559 493 ZSTD_wildcopy(op, ip, ilimit_w - ip, ZSTD_no_overlap); ··· 565 495 while (ip < iend) *op++ = *ip++; 566 496 } 567 497 498 + #define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) 499 + #define STORE_REPCODE_1 STORE_REPCODE(1) 500 + #define STORE_REPCODE_2 STORE_REPCODE(2) 501 + #define STORE_REPCODE_3 STORE_REPCODE(3) 502 + #define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) 503 + #define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) 504 + #define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) 505 + #define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) 506 + #define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) 507 + #define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ 508 + #define STORED_TO_OFFBASE(o) ((o)+1) 509 + #define OFFBASE_TO_STORED(o) ((o)-1) 510 + 568 511 /*! ZSTD_storeSeq() : 569 - * Store a sequence (litlen, litPtr, offCode and mlBase) into seqStore_t. 570 - * `offCode` : distance to match + ZSTD_REP_MOVE (values <= ZSTD_REP_MOVE are repCodes). 571 - * `mlBase` : matchLength - MINMATCH 512 + * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. 513 + * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). 514 + * @matchLength : must be >= MINMATCH 572 515 * Allowed to overread literals up to litLimit. 
573 516 */ 574 - HINT_INLINE UNUSED_ATTR 575 - void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* literals, const BYTE* litLimit, U32 offCode, size_t mlBase) 517 + HINT_INLINE UNUSED_ATTR void 518 + ZSTD_storeSeq(seqStore_t* seqStorePtr, 519 + size_t litLength, const BYTE* literals, const BYTE* litLimit, 520 + U32 offBase_minus1, 521 + size_t matchLength) 576 522 { 577 523 BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; 578 524 BYTE const* const litEnd = literals + litLength; ··· 597 511 if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ 598 512 { U32 const pos = (U32)((const BYTE*)literals - g_start); 599 513 DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", 600 - pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offCode); 514 + pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); 601 515 } 602 516 #endif 603 517 assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); ··· 621 535 622 536 /* literal Length */ 623 537 if (litLength>0xFFFF) { 624 - assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ 625 - seqStorePtr->longLengthID = 1; 538 + assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ 539 + seqStorePtr->longLengthType = ZSTD_llt_literalLength; 626 540 seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); 627 541 } 628 542 seqStorePtr->sequences[0].litLength = (U16)litLength; 629 543 630 544 /* match offset */ 631 - seqStorePtr->sequences[0].offset = offCode + 1; 545 + seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); 632 546 633 547 /* match Length */ 634 - if (mlBase>0xFFFF) { 635 - assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ 636 - seqStorePtr->longLengthID = 2; 637 - seqStorePtr->longLengthPos = 
(U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); 548 + assert(matchLength >= MINMATCH); 549 + { size_t const mlBase = matchLength - MINMATCH; 550 + if (mlBase>0xFFFF) { 551 + assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ 552 + seqStorePtr->longLengthType = ZSTD_llt_matchLength; 553 + seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); 554 + } 555 + seqStorePtr->sequences[0].mlBase = (U16)mlBase; 638 556 } 639 - seqStorePtr->sequences[0].matchLength = (U16)mlBase; 640 557 641 558 seqStorePtr->sequences++; 559 + } 560 + 561 + /* ZSTD_updateRep() : 562 + * updates in-place @rep (array of repeat offsets) 563 + * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() 564 + */ 565 + MEM_STATIC void 566 + ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) 567 + { 568 + if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ 569 + rep[2] = rep[1]; 570 + rep[1] = rep[0]; 571 + rep[0] = STORED_OFFSET(offBase_minus1); 572 + } else { /* repcode */ 573 + U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; 574 + if (repCode > 0) { /* note : if repCode==0, no change */ 575 + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; 576 + rep[2] = (repCode >= 2) ? 
rep[1] : rep[2]; 577 + rep[1] = rep[0]; 578 + rep[0] = currentOffset; 579 + } else { /* repCode == 0 */ 580 + /* nothing to do */ 581 + } 582 + } 583 + } 584 + 585 + typedef struct repcodes_s { 586 + U32 rep[3]; 587 + } repcodes_t; 588 + 589 + MEM_STATIC repcodes_t 590 + ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) 591 + { 592 + repcodes_t newReps; 593 + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); 594 + ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); 595 + return newReps; 642 596 } 643 597 644 598 ··· 904 778 window->dictLimit = end; 905 779 } 906 780 781 + MEM_STATIC U32 ZSTD_window_isEmpty(ZSTD_window_t const window) 782 + { 783 + return window.dictLimit == ZSTD_WINDOW_START_INDEX && 784 + window.lowLimit == ZSTD_WINDOW_START_INDEX && 785 + (window.nextSrc - window.base) == ZSTD_WINDOW_START_INDEX; 786 + } 787 + 907 788 /* 908 789 * ZSTD_window_hasExtDict(): 909 790 * Returns non-zero if the window has a non-empty extDict. ··· 934 801 ZSTD_noDict; 935 802 } 936 803 804 + /* Defining this macro to non-zero tells zstd to run the overflow correction 805 + * code much more frequently. This is very inefficient, and should only be 806 + * used for tests and fuzzers. 807 + */ 808 + #ifndef ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 809 + # ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION 810 + # define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 1 811 + # else 812 + # define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 0 813 + # endif 814 + #endif 815 + 816 + /* 817 + * ZSTD_window_canOverflowCorrect(): 818 + * Returns non-zero if the indices are large enough for overflow correction 819 + * to work correctly without impacting compression ratio. 
820 + */ 821 + MEM_STATIC U32 ZSTD_window_canOverflowCorrect(ZSTD_window_t const window, 822 + U32 cycleLog, 823 + U32 maxDist, 824 + U32 loadedDictEnd, 825 + void const* src) 826 + { 827 + U32 const cycleSize = 1u << cycleLog; 828 + U32 const curr = (U32)((BYTE const*)src - window.base); 829 + U32 const minIndexToOverflowCorrect = cycleSize 830 + + MAX(maxDist, cycleSize) 831 + + ZSTD_WINDOW_START_INDEX; 832 + 833 + /* Adjust the min index to backoff the overflow correction frequency, 834 + * so we don't waste too much CPU in overflow correction. If this 835 + * computation overflows we don't really care, we just need to make 836 + * sure it is at least minIndexToOverflowCorrect. 837 + */ 838 + U32 const adjustment = window.nbOverflowCorrections + 1; 839 + U32 const adjustedIndex = MAX(minIndexToOverflowCorrect * adjustment, 840 + minIndexToOverflowCorrect); 841 + U32 const indexLargeEnough = curr > adjustedIndex; 842 + 843 + /* Only overflow correct early if the dictionary is invalidated already, 844 + * so we don't hurt compression ratio. 845 + */ 846 + U32 const dictionaryInvalidated = curr > maxDist + loadedDictEnd; 847 + 848 + return indexLargeEnough && dictionaryInvalidated; 849 + } 850 + 937 851 /* 938 852 * ZSTD_window_needOverflowCorrection(): 939 853 * Returns non-zero if the indices are getting too large and need overflow 940 854 * protection. 
941 855 */ 942 856 MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, 857 + U32 cycleLog, 858 + U32 maxDist, 859 + U32 loadedDictEnd, 860 + void const* src, 943 861 void const* srcEnd) 944 862 { 945 863 U32 const curr = (U32)((BYTE const*)srcEnd - window.base); 864 + if (ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) { 865 + if (ZSTD_window_canOverflowCorrect(window, cycleLog, maxDist, loadedDictEnd, src)) { 866 + return 1; 867 + } 868 + } 946 869 return curr > ZSTD_CURRENT_MAX; 947 870 } 948 871 ··· 1010 821 * 1011 822 * The least significant cycleLog bits of the indices must remain the same, 1012 823 * which may be 0. Every index up to maxDist in the past must be valid. 1013 - * NOTE: (maxDist & cycleMask) must be zero. 1014 824 */ 1015 825 MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, 1016 826 U32 maxDist, void const* src) ··· 1033 845 * 3. (cctx->lowLimit + 1<<windowLog) < 1<<32: 1034 846 * windowLog <= 31 ==> 3<<29 + 1<<windowLog < 7<<29 < 1<<32. 1035 847 */ 1036 - U32 const cycleMask = (1U << cycleLog) - 1; 848 + U32 const cycleSize = 1u << cycleLog; 849 + U32 const cycleMask = cycleSize - 1; 1037 850 U32 const curr = (U32)((BYTE const*)src - window->base); 1038 - U32 const currentCycle0 = curr & cycleMask; 1039 - /* Exclude zero so that newCurrent - maxDist >= 1. */ 1040 - U32 const currentCycle1 = currentCycle0 == 0 ? (1U << cycleLog) : currentCycle0; 1041 - U32 const newCurrent = currentCycle1 + maxDist; 851 + U32 const currentCycle = curr & cycleMask; 852 + /* Ensure newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX. */ 853 + U32 const currentCycleCorrection = currentCycle < ZSTD_WINDOW_START_INDEX 854 + ? 
MAX(cycleSize, ZSTD_WINDOW_START_INDEX) 855 + : 0; 856 + U32 const newCurrent = currentCycle 857 + + currentCycleCorrection 858 + + MAX(maxDist, cycleSize); 1042 859 U32 const correction = curr - newCurrent; 1043 - assert((maxDist & cycleMask) == 0); 860 + /* maxDist must be a power of two so that: 861 + * (newCurrent & cycleMask) == (curr & cycleMask) 862 + * This is required to not corrupt the chains / binary tree. 863 + */ 864 + assert((maxDist & (maxDist - 1)) == 0); 865 + assert((curr & cycleMask) == (newCurrent & cycleMask)); 1044 866 assert(curr > newCurrent); 1045 - /* Loose bound, should be around 1<<29 (see above) */ 1046 - assert(correction > 1<<28); 867 + if (!ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) { 868 + /* Loose bound, should be around 1<<29 (see above) */ 869 + assert(correction > 1<<28); 870 + } 1047 871 1048 872 window->base += correction; 1049 873 window->dictBase += correction; 1050 - if (window->lowLimit <= correction) window->lowLimit = 1; 1051 - else window->lowLimit -= correction; 1052 - if (window->dictLimit <= correction) window->dictLimit = 1; 1053 - else window->dictLimit -= correction; 874 + if (window->lowLimit < correction + ZSTD_WINDOW_START_INDEX) { 875 + window->lowLimit = ZSTD_WINDOW_START_INDEX; 876 + } else { 877 + window->lowLimit -= correction; 878 + } 879 + if (window->dictLimit < correction + ZSTD_WINDOW_START_INDEX) { 880 + window->dictLimit = ZSTD_WINDOW_START_INDEX; 881 + } else { 882 + window->dictLimit -= correction; 883 + } 1054 884 1055 885 /* Ensure we can still reference the full window. */ 1056 886 assert(newCurrent >= maxDist); 1057 - assert(newCurrent - maxDist >= 1); 887 + assert(newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX); 1058 888 /* Ensure that lowLimit and dictLimit didn't underflow. 
*/ 1059 889 assert(window->lowLimit <= newCurrent); 1060 890 assert(window->dictLimit <= newCurrent); 891 + 892 + ++window->nbOverflowCorrections; 1061 893 1062 894 DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction, 1063 895 window->lowLimit); ··· 1183 975 1184 976 MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { 1185 977 ZSTD_memset(window, 0, sizeof(*window)); 1186 - window->base = (BYTE const*)""; 1187 - window->dictBase = (BYTE const*)""; 1188 - window->dictLimit = 1; /* start from 1, so that 1st position is valid */ 1189 - window->lowLimit = 1; /* it ensures first and later CCtx usages compress the same */ 1190 - window->nextSrc = window->base + 1; /* see issue #1241 */ 978 + window->base = (BYTE const*)" "; 979 + window->dictBase = (BYTE const*)" "; 980 + ZSTD_STATIC_ASSERT(ZSTD_DUBT_UNSORTED_MARK < ZSTD_WINDOW_START_INDEX); /* Start above ZSTD_DUBT_UNSORTED_MARK */ 981 + window->dictLimit = ZSTD_WINDOW_START_INDEX; /* start from >0, so that 1st position is valid */ 982 + window->lowLimit = ZSTD_WINDOW_START_INDEX; /* it ensures first and later CCtx usages compress the same */ 983 + window->nextSrc = window->base + ZSTD_WINDOW_START_INDEX; /* see issue #1241 */ 984 + window->nbOverflowCorrections = 0; 1191 985 } 1192 986 1193 987 /* ··· 1200 990 * Returns non-zero if the segment is contiguous. 
1201 991 */ 1202 992 MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, 1203 - void const* src, size_t srcSize) 993 + void const* src, size_t srcSize, 994 + int forceNonContiguous) 1204 995 { 1205 996 BYTE const* const ip = (BYTE const*)src; 1206 997 U32 contiguous = 1; ··· 1211 1000 assert(window->base != NULL); 1212 1001 assert(window->dictBase != NULL); 1213 1002 /* Check if blocks follow each other */ 1214 - if (src != window->nextSrc) { 1003 + if (src != window->nextSrc || forceNonContiguous) { 1215 1004 /* not contiguous */ 1216 1005 size_t const distanceFromBase = (size_t)(window->nextSrc - window->base); 1217 1006 DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit); ··· 1241 1030 */ 1242 1031 MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) 1243 1032 { 1244 - U32 const maxDistance = 1U << windowLog; 1245 - U32 const lowestValid = ms->window.lowLimit; 1246 - U32 const withinWindow = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; 1247 - U32 const isDictionary = (ms->loadedDictEnd != 0); 1033 + U32 const maxDistance = 1U << windowLog; 1034 + U32 const lowestValid = ms->window.lowLimit; 1035 + U32 const withinWindow = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; 1036 + U32 const isDictionary = (ms->loadedDictEnd != 0); 1248 1037 /* When using a dictionary the entire dictionary is valid if a single byte of the dictionary 1249 1038 * is within the window. We invalidate the dictionary (and set loadedDictEnd to 0) when it isn't 1250 1039 * valid for the entire block. So this check is sufficient to find the lowest valid match index. 1251 1040 */ 1252 - U32 const matchLowest = isDictionary ? lowestValid : withinWindow; 1041 + U32 const matchLowest = isDictionary ? lowestValid : withinWindow; 1253 1042 return matchLowest; 1254 1043 } 1255 1044
+5 -4
lib/zstd/compress/zstd_compress_literals.c
··· 73 73 void* dst, size_t dstCapacity, 74 74 const void* src, size_t srcSize, 75 75 void* entropyWorkspace, size_t entropyWorkspaceSize, 76 - const int bmi2) 76 + const int bmi2, 77 + unsigned suspectUncompressible) 77 78 { 78 79 size_t const minGain = ZSTD_minGain(srcSize, strategy); 79 80 size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); ··· 106 105 HUF_compress1X_repeat( 107 106 ostart+lhSize, dstCapacity-lhSize, src, srcSize, 108 107 HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, 109 - (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) : 108 + (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : 110 109 HUF_compress4X_repeat( 111 110 ostart+lhSize, dstCapacity-lhSize, src, srcSize, 112 111 HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, 113 - (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2); 112 + (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); 114 113 if (repeat != HUF_repeat_none) { 115 114 /* reused the existing table */ 116 115 DEBUGLOG(5, "Reusing previous huffman table"); ··· 118 117 } 119 118 } 120 119 121 - if ((cLitSize==0) | (cLitSize >= srcSize - minGain) | ERR_isError(cLitSize)) { 120 + if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { 122 121 ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); 123 122 return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); 124 123 }
+3 -1
lib/zstd/compress/zstd_compress_literals.h
··· 18 18 19 19 size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); 20 20 21 + /* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ 21 22 size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, 22 23 ZSTD_hufCTables_t* nextHuf, 23 24 ZSTD_strategy strategy, int disableLiteralCompression, 24 25 void* dst, size_t dstCapacity, 25 26 const void* src, size_t srcSize, 26 27 void* entropyWorkspace, size_t entropyWorkspaceSize, 27 - const int bmi2); 28 + const int bmi2, 29 + unsigned suspectUncompressible); 28 30 29 31 #endif /* ZSTD_COMPRESS_LITERALS_H */
+17 -14
lib/zstd/compress/zstd_compress_sequences.c
··· 85 85 { 86 86 unsigned cost = 0; 87 87 unsigned s; 88 + 89 + assert(total > 0); 88 90 for (s = 0; s <= max; ++s) { 89 91 unsigned norm = (unsigned)((256 * count[s]) / total); 90 92 if (count[s] != 0 && norm == 0) ··· 275 273 assert(nbSeq_1 > 1); 276 274 assert(entropyWorkspaceSize >= sizeof(ZSTD_BuildCTableWksp)); 277 275 (void)entropyWorkspaceSize; 278 - FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), ""); 279 - { size_t const NCountSize = FSE_writeNCount(op, oend - op, wksp->norm, max, tableLog); /* overflow protected */ 276 + FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), "FSE_normalizeCount failed"); 277 + assert(oend >= op); 278 + { size_t const NCountSize = FSE_writeNCount(op, (size_t)(oend - op), wksp->norm, max, tableLog); /* overflow protected */ 280 279 FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed"); 281 - FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, max, tableLog, wksp->wksp, sizeof(wksp->wksp)), ""); 280 + FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, max, tableLog, wksp->wksp, sizeof(wksp->wksp)), "FSE_buildCTable_wksp failed"); 282 281 return NCountSize; 283 282 } 284 283 } ··· 313 310 FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]); 314 311 BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]); 315 312 if (MEM_32bits()) BIT_flushBits(&blockStream); 316 - BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]); 313 + BIT_addBits(&blockStream, sequences[nbSeq-1].mlBase, ML_bits[mlCodeTable[nbSeq-1]]); 317 314 if (MEM_32bits()) BIT_flushBits(&blockStream); 318 315 if (longOffsets) { 319 316 U32 const ofBits = ofCodeTable[nbSeq-1]; 320 317 unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); 321 318 if (extraBits) { 322 - BIT_addBits(&blockStream, sequences[nbSeq-1].offset, 
extraBits); 319 + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, extraBits); 323 320 BIT_flushBits(&blockStream); 324 321 } 325 - BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits, 322 + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase >> extraBits, 326 323 ofBits - extraBits); 327 324 } else { 328 - BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]); 325 + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, ofCodeTable[nbSeq-1]); 329 326 } 330 327 BIT_flushBits(&blockStream); 331 328 ··· 339 336 U32 const mlBits = ML_bits[mlCode]; 340 337 DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u", 341 338 (unsigned)sequences[n].litLength, 342 - (unsigned)sequences[n].matchLength + MINMATCH, 343 - (unsigned)sequences[n].offset); 339 + (unsigned)sequences[n].mlBase + MINMATCH, 340 + (unsigned)sequences[n].offBase); 344 341 /* 32b*/ /* 64b*/ 345 342 /* (7)*/ /* (7)*/ 346 343 FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); /* 15 */ /* 15 */ ··· 351 348 BIT_flushBits(&blockStream); /* (7)*/ 352 349 BIT_addBits(&blockStream, sequences[n].litLength, llBits); 353 350 if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream); 354 - BIT_addBits(&blockStream, sequences[n].matchLength, mlBits); 351 + BIT_addBits(&blockStream, sequences[n].mlBase, mlBits); 355 352 if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream); 356 353 if (longOffsets) { 357 354 unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); 358 355 if (extraBits) { 359 - BIT_addBits(&blockStream, sequences[n].offset, extraBits); 356 + BIT_addBits(&blockStream, sequences[n].offBase, extraBits); 360 357 BIT_flushBits(&blockStream); /* (7)*/ 361 358 } 362 - BIT_addBits(&blockStream, sequences[n].offset >> extraBits, 359 + BIT_addBits(&blockStream, sequences[n].offBase >> extraBits, 363 360 ofBits - extraBits); /* 31 */ 364 361 } else { 365 - BIT_addBits(&blockStream, sequences[n].offset, 
ofBits); /* 31 */ 362 + BIT_addBits(&blockStream, sequences[n].offBase, ofBits); /* 31 */ 366 363 } 367 364 BIT_flushBits(&blockStream); /* (7)*/ 368 365 DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr - blockStream.ptr)); ··· 399 396 400 397 #if DYNAMIC_BMI2 401 398 402 - static TARGET_ATTRIBUTE("bmi2") size_t 399 + static BMI2_TARGET_ATTRIBUTE size_t 403 400 ZSTD_encodeSequences_bmi2( 404 401 void* dst, size_t dstCapacity, 405 402 FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+8 -287
lib/zstd/compress/zstd_compress_superblock.c
··· 15 15 16 16 #include "../common/zstd_internal.h" /* ZSTD_getSequenceLength */ 17 17 #include "hist.h" /* HIST_countFast_wksp */ 18 - #include "zstd_compress_internal.h" 18 + #include "zstd_compress_internal.h" /* ZSTD_[huf|fse|entropy]CTablesMetadata_t */ 19 19 #include "zstd_compress_sequences.h" 20 20 #include "zstd_compress_literals.h" 21 - 22 - /*-************************************* 23 - * Superblock entropy buffer structs 24 - ***************************************/ 25 - /* ZSTD_hufCTablesMetadata_t : 26 - * Stores Literals Block Type for a super-block in hType, and 27 - * huffman tree description in hufDesBuffer. 28 - * hufDesSize refers to the size of huffman tree description in bytes. 29 - * This metadata is populated in ZSTD_buildSuperBlockEntropy_literal() */ 30 - typedef struct { 31 - symbolEncodingType_e hType; 32 - BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; 33 - size_t hufDesSize; 34 - } ZSTD_hufCTablesMetadata_t; 35 - 36 - /* ZSTD_fseCTablesMetadata_t : 37 - * Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and 38 - * fse tables in fseTablesBuffer. 39 - * fseTablesSize refers to the size of fse tables in bytes. 40 - * This metadata is populated in ZSTD_buildSuperBlockEntropy_sequences() */ 41 - typedef struct { 42 - symbolEncodingType_e llType; 43 - symbolEncodingType_e ofType; 44 - symbolEncodingType_e mlType; 45 - BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; 46 - size_t fseTablesSize; 47 - size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_compressSubBlock_sequences() */ 48 - } ZSTD_fseCTablesMetadata_t; 49 - 50 - typedef struct { 51 - ZSTD_hufCTablesMetadata_t hufMetadata; 52 - ZSTD_fseCTablesMetadata_t fseMetadata; 53 - } ZSTD_entropyCTablesMetadata_t; 54 - 55 - 56 - /* ZSTD_buildSuperBlockEntropy_literal() : 57 - * Builds entropy for the super-block literals. 58 - * Stores literals block type (raw, rle, compressed, repeat) and 59 - * huffman description table to hufMetadata. 
60 - * @return : size of huffman description table or error code */ 61 - static size_t ZSTD_buildSuperBlockEntropy_literal(void* const src, size_t srcSize, 62 - const ZSTD_hufCTables_t* prevHuf, 63 - ZSTD_hufCTables_t* nextHuf, 64 - ZSTD_hufCTablesMetadata_t* hufMetadata, 65 - const int disableLiteralsCompression, 66 - void* workspace, size_t wkspSize) 67 - { 68 - BYTE* const wkspStart = (BYTE*)workspace; 69 - BYTE* const wkspEnd = wkspStart + wkspSize; 70 - BYTE* const countWkspStart = wkspStart; 71 - unsigned* const countWksp = (unsigned*)workspace; 72 - const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); 73 - BYTE* const nodeWksp = countWkspStart + countWkspSize; 74 - const size_t nodeWkspSize = wkspEnd-nodeWksp; 75 - unsigned maxSymbolValue = 255; 76 - unsigned huffLog = HUF_TABLELOG_DEFAULT; 77 - HUF_repeat repeat = prevHuf->repeatMode; 78 - 79 - DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_literal (srcSize=%zu)", srcSize); 80 - 81 - /* Prepare nextEntropy assuming reusing the existing table */ 82 - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); 83 - 84 - if (disableLiteralsCompression) { 85 - DEBUGLOG(5, "set_basic - disabled"); 86 - hufMetadata->hType = set_basic; 87 - return 0; 88 - } 89 - 90 - /* small ? don't even attempt compression (speed opt) */ 91 - # define COMPRESS_LITERALS_SIZE_MIN 63 92 - { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; 93 - if (srcSize <= minLitSize) { 94 - DEBUGLOG(5, "set_basic - too small"); 95 - hufMetadata->hType = set_basic; 96 - return 0; 97 - } 98 - } 99 - 100 - /* Scan input and build symbol stats */ 101 - { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); 102 - FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); 103 - if (largest == srcSize) { 104 - DEBUGLOG(5, "set_rle"); 105 - hufMetadata->hType = set_rle; 106 - return 0; 107 - } 108 - if (largest <= (srcSize >> 7)+4) { 109 - DEBUGLOG(5, "set_basic - no gain"); 110 - hufMetadata->hType = set_basic; 111 - return 0; 112 - } 113 - } 114 - 115 - /* Validate the previous Huffman table */ 116 - if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { 117 - repeat = HUF_repeat_none; 118 - } 119 - 120 - /* Build Huffman Tree */ 121 - ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); 122 - huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); 123 - { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, 124 - maxSymbolValue, huffLog, 125 - nodeWksp, nodeWkspSize); 126 - FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); 127 - huffLog = (U32)maxBits; 128 - { /* Build and write the CTable */ 129 - size_t const newCSize = HUF_estimateCompressedSize( 130 - (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); 131 - size_t const hSize = HUF_writeCTable_wksp( 132 - hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), 133 - (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, 134 - nodeWksp, nodeWkspSize); 135 - /* Check against repeating the previous CTable */ 136 - if (repeat != HUF_repeat_none) { 137 - size_t const oldCSize = HUF_estimateCompressedSize( 138 - (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); 139 - if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { 140 
- DEBUGLOG(5, "set_repeat - smaller"); 141 - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); 142 - hufMetadata->hType = set_repeat; 143 - return 0; 144 - } 145 - } 146 - if (newCSize + hSize >= srcSize) { 147 - DEBUGLOG(5, "set_basic - no gains"); 148 - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); 149 - hufMetadata->hType = set_basic; 150 - return 0; 151 - } 152 - DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); 153 - hufMetadata->hType = set_compressed; 154 - nextHuf->repeatMode = HUF_repeat_check; 155 - return hSize; 156 - } 157 - } 158 - } 159 - 160 - /* ZSTD_buildSuperBlockEntropy_sequences() : 161 - * Builds entropy for the super-block sequences. 162 - * Stores symbol compression modes and fse table to fseMetadata. 163 - * @return : size of fse tables or error code */ 164 - static size_t ZSTD_buildSuperBlockEntropy_sequences(seqStore_t* seqStorePtr, 165 - const ZSTD_fseCTables_t* prevEntropy, 166 - ZSTD_fseCTables_t* nextEntropy, 167 - const ZSTD_CCtx_params* cctxParams, 168 - ZSTD_fseCTablesMetadata_t* fseMetadata, 169 - void* workspace, size_t wkspSize) 170 - { 171 - BYTE* const wkspStart = (BYTE*)workspace; 172 - BYTE* const wkspEnd = wkspStart + wkspSize; 173 - BYTE* const countWkspStart = wkspStart; 174 - unsigned* const countWksp = (unsigned*)workspace; 175 - const size_t countWkspSize = (MaxSeq + 1) * sizeof(unsigned); 176 - BYTE* const cTableWksp = countWkspStart + countWkspSize; 177 - const size_t cTableWkspSize = wkspEnd-cTableWksp; 178 - ZSTD_strategy const strategy = cctxParams->cParams.strategy; 179 - FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable; 180 - FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable; 181 - FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable; 182 - const BYTE* const ofCodeTable = seqStorePtr->ofCode; 183 - const BYTE* const llCodeTable = seqStorePtr->llCode; 184 - const BYTE* const mlCodeTable = seqStorePtr->mlCode; 185 - size_t const nbSeq = seqStorePtr->sequences - 
seqStorePtr->sequencesStart; 186 - BYTE* const ostart = fseMetadata->fseTablesBuffer; 187 - BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); 188 - BYTE* op = ostart; 189 - 190 - assert(cTableWkspSize >= (1 << MaxFSELog) * sizeof(FSE_FUNCTION_TYPE)); 191 - DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_sequences (nbSeq=%zu)", nbSeq); 192 - ZSTD_memset(workspace, 0, wkspSize); 193 - 194 - fseMetadata->lastCountSize = 0; 195 - /* convert length/distances into codes */ 196 - ZSTD_seqToCodes(seqStorePtr); 197 - /* build CTable for Literal Lengths */ 198 - { U32 LLtype; 199 - unsigned max = MaxLL; 200 - size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, llCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ 201 - DEBUGLOG(5, "Building LL table"); 202 - nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; 203 - LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, 204 - countWksp, max, mostFrequent, nbSeq, 205 - LLFSELog, prevEntropy->litlengthCTable, 206 - LL_defaultNorm, LL_defaultNormLog, 207 - ZSTD_defaultAllowed, strategy); 208 - assert(set_basic < set_compressed && set_rle < set_compressed); 209 - assert(!(LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ 210 - { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, 211 - countWksp, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL, 212 - prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable), 213 - cTableWksp, cTableWkspSize); 214 - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed"); 215 - if (LLtype == set_compressed) 216 - fseMetadata->lastCountSize = countSize; 217 - op += countSize; 218 - fseMetadata->llType = (symbolEncodingType_e) LLtype; 219 - } } 220 - /* build CTable for Offsets */ 221 - { U32 Offtype; 222 - unsigned max = MaxOff; 223 - size_t const mostFrequent = 
HIST_countFast_wksp(countWksp, &max, ofCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ 224 - /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ 225 - ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; 226 - DEBUGLOG(5, "Building OF table"); 227 - nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; 228 - Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, 229 - countWksp, max, mostFrequent, nbSeq, 230 - OffFSELog, prevEntropy->offcodeCTable, 231 - OF_defaultNorm, OF_defaultNormLog, 232 - defaultPolicy, strategy); 233 - assert(!(Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ 234 - { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, 235 - countWksp, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, 236 - prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable), 237 - cTableWksp, cTableWkspSize); 238 - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed"); 239 - if (Offtype == set_compressed) 240 - fseMetadata->lastCountSize = countSize; 241 - op += countSize; 242 - fseMetadata->ofType = (symbolEncodingType_e) Offtype; 243 - } } 244 - /* build CTable for MatchLengths */ 245 - { U32 MLtype; 246 - unsigned max = MaxML; 247 - size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, mlCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ 248 - DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); 249 - nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; 250 - MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, 251 - countWksp, max, mostFrequent, nbSeq, 252 - MLFSELog, prevEntropy->matchlengthCTable, 253 - ML_defaultNorm, ML_defaultNormLog, 254 - ZSTD_defaultAllowed, strategy); 255 - 
assert(!(MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ 256 - { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, 257 - countWksp, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML, 258 - prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable), 259 - cTableWksp, cTableWkspSize); 260 - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed"); 261 - if (MLtype == set_compressed) 262 - fseMetadata->lastCountSize = countSize; 263 - op += countSize; 264 - fseMetadata->mlType = (symbolEncodingType_e) MLtype; 265 - } } 266 - assert((size_t) (op-ostart) <= sizeof(fseMetadata->fseTablesBuffer)); 267 - return op-ostart; 268 - } 269 - 270 - 271 - /* ZSTD_buildSuperBlockEntropy() : 272 - * Builds entropy for the super-block. 273 - * @return : 0 on success or error code */ 274 - static size_t 275 - ZSTD_buildSuperBlockEntropy(seqStore_t* seqStorePtr, 276 - const ZSTD_entropyCTables_t* prevEntropy, 277 - ZSTD_entropyCTables_t* nextEntropy, 278 - const ZSTD_CCtx_params* cctxParams, 279 - ZSTD_entropyCTablesMetadata_t* entropyMetadata, 280 - void* workspace, size_t wkspSize) 281 - { 282 - size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; 283 - DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy"); 284 - entropyMetadata->hufMetadata.hufDesSize = 285 - ZSTD_buildSuperBlockEntropy_literal(seqStorePtr->litStart, litSize, 286 - &prevEntropy->huf, &nextEntropy->huf, 287 - &entropyMetadata->hufMetadata, 288 - ZSTD_disableLiteralsCompression(cctxParams), 289 - workspace, wkspSize); 290 - FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildSuperBlockEntropy_literal failed"); 291 - entropyMetadata->fseMetadata.fseTablesSize = 292 - ZSTD_buildSuperBlockEntropy_sequences(seqStorePtr, 293 - &prevEntropy->fse, &nextEntropy->fse, 294 - cctxParams, 295 - &entropyMetadata->fseMetadata, 296 - workspace, 
wkspSize); 297 - FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildSuperBlockEntropy_sequences failed"); 298 - return 0; 299 - } 300 21 301 22 /* ZSTD_compressSubBlock_literal() : 302 23 * Compresses literals section for a sub-block. ··· 132 411 const seqDef* sp = sstart; 133 412 size_t matchLengthSum = 0; 134 413 size_t litLengthSum = 0; 135 - /* Only used by assert(), suppress unused variable warnings in production. */ 136 - (void)litLengthSum; 414 + (void)(litLengthSum); /* suppress unused variable warning on some environments */ 137 415 while (send-sp > 0) { 138 416 ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); 139 417 litLengthSum += seqLen.litLength; ··· 325 605 static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type, 326 606 const BYTE* codeTable, unsigned maxCode, 327 607 size_t nbSeq, const FSE_CTable* fseCTable, 328 - const U32* additionalBits, 608 + const U8* additionalBits, 329 609 short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, 330 610 void* workspace, size_t wkspSize) 331 611 { ··· 366 646 void* workspace, size_t wkspSize, 367 647 int writeEntropy) 368 648 { 369 - size_t sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */ 649 + size_t const sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */ 370 650 size_t cSeqSizeEstimate = 0; 651 + if (nbSeq == 0) return sequencesSectionHeaderSize; 371 652 cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, MaxOff, 372 653 nbSeq, fseTables->offcodeCTable, NULL, 373 654 OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, ··· 475 754 /* I think there is an optimization opportunity here. 476 755 * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful 477 756 * since it recalculates estimate from scratch. 478 - * For example, it would recount literal distribution and symbol codes everytime. 
757 + * For example, it would recount literal distribution and symbol codes every time. 479 758 */ 480 759 cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, 481 760 &nextCBlock->entropy, entropyMetadata, ··· 539 818 repcodes_t rep; 540 819 ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); 541 820 for (seq = sstart; seq < sp; ++seq) { 542 - rep = ZSTD_updateRep(rep.rep, seq->offset - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); 821 + ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); 543 822 } 544 823 ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); 545 824 } ··· 554 833 unsigned lastBlock) { 555 834 ZSTD_entropyCTablesMetadata_t entropyMetadata; 556 835 557 - FORWARD_IF_ERROR(ZSTD_buildSuperBlockEntropy(&zc->seqStore, 836 + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, 558 837 &zc->blockState.prevCBlock->entropy, 559 838 &zc->blockState.nextCBlock->entropy, 560 839 &zc->appliedParams,
+173 -60
lib/zstd/compress/zstd_cwksp.h
··· 32 32 #define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128 33 33 #endif 34 34 35 + 36 + /* Set our tables and aligneds to align by 64 bytes */ 37 + #define ZSTD_CWKSP_ALIGNMENT_BYTES 64 38 + 35 39 /*-************************************* 36 40 * Structures 37 41 ***************************************/ ··· 118 114 * - Tables: these are any of several different datastructures (hash tables, 119 115 * chain tables, binary trees) that all respect a common format: they are 120 116 * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). 121 - * Their sizes depend on the cparams. 117 + * Their sizes depend on the cparams. These tables are 64-byte aligned. 122 118 * 123 119 * - Aligned: these buffers are used for various purposes that require 4 byte 124 - * alignment, but don't require any initialization before they're used. 120 + * alignment, but don't require any initialization before they're used. These 121 + * buffers are each aligned to 64 bytes. 125 122 * 126 123 * - Buffers: these buffers are used for various purposes that don't require 127 124 * any alignment or initialization before they're used. This means they can ··· 135 130 * 136 131 * 1. Objects 137 132 * 2. Buffers 138 - * 3. Aligned 139 - * 4. Tables 133 + * 3. Aligned/Tables 140 134 * 141 135 * Attempts to reserve objects of different types out of order will fail. 142 136 */ ··· 188 184 * Since tables aren't currently redzoned, you don't need to call through this 189 185 * to figure out how much space you need for the matchState tables. Everything 190 186 * else is though. 187 + * 188 + * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_alloc_size(). 
191 189 */ 192 190 MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { 193 191 if (size == 0) ··· 197 191 return size; 198 192 } 199 193 200 - MEM_STATIC void ZSTD_cwksp_internal_advance_phase( 201 - ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) { 202 - assert(phase >= ws->phase); 203 - if (phase > ws->phase) { 204 - if (ws->phase < ZSTD_cwksp_alloc_buffers && 205 - phase >= ZSTD_cwksp_alloc_buffers) { 206 - ws->tableValidEnd = ws->objectEnd; 207 - } 208 - if (ws->phase < ZSTD_cwksp_alloc_aligned && 209 - phase >= ZSTD_cwksp_alloc_aligned) { 210 - /* If unaligned allocations down from a too-large top have left us 211 - * unaligned, we need to realign our alloc ptr. Technically, this 212 - * can consume space that is unaccounted for in the neededSpace 213 - * calculation. However, I believe this can only happen when the 214 - * workspace is too large, and specifically when it is too large 215 - * by a larger margin than the space that will be consumed. */ 216 - /* TODO: cleaner, compiler warning friendly way to do this??? */ 217 - ws->allocStart = (BYTE*)ws->allocStart - ((size_t)ws->allocStart & (sizeof(U32)-1)); 218 - if (ws->allocStart < ws->tableValidEnd) { 219 - ws->tableValidEnd = ws->allocStart; 220 - } 221 - } 222 - ws->phase = phase; 223 - } 194 + /* 195 + * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes. 196 + * Used to determine the number of bytes required for a given "aligned". 197 + */ 198 + MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { 199 + return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNMENT_BYTES)); 224 200 } 225 201 226 202 /* 227 - * Returns whether this object/buffer/etc was allocated in this workspace. 203 + * Returns the amount of additional space the cwksp must allocate 204 + * for internal purposes (currently only alignment). 
228 205 */ 229 - MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) { 230 - return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); 206 + MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { 207 + /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes 208 + * to align the beginning of tables section, as well as another n_2=[0, 63] bytes 209 + * to align the beginning of the aligned section. 210 + * 211 + * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and 212 + * aligneds being sized in multiples of 64 bytes. 213 + */ 214 + size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; 215 + return slackSpace; 216 + } 217 + 218 + 219 + /* 220 + * Return the number of additional bytes required to align a pointer to the given number of bytes. 221 + * alignBytes must be a power of two. 222 + */ 223 + MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) { 224 + size_t const alignBytesMask = alignBytes - 1; 225 + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; 226 + assert((alignBytes & alignBytesMask) == 0); 227 + assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); 228 + return bytes; 231 229 } 232 230 233 231 /* 234 232 * Internal function. Do not use directly. 233 + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, 234 + * which counts from the end of the wksp (as opposed to the object/table segment). 235 + * 236 + * Returns a pointer to the beginning of that space. 
235 237 */ 236 - MEM_STATIC void* ZSTD_cwksp_reserve_internal( 237 - ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) { 238 - void* alloc; 239 - void* bottom = ws->tableEnd; 240 - ZSTD_cwksp_internal_advance_phase(ws, phase); 241 - alloc = (BYTE *)ws->allocStart - bytes; 242 - 243 - if (bytes == 0) 244 - return NULL; 245 - 246 - 238 + MEM_STATIC void* 239 + ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes) 240 + { 241 + void* const alloc = (BYTE*)ws->allocStart - bytes; 242 + void* const bottom = ws->tableEnd; 247 243 DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", 248 244 alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); 249 245 ZSTD_cwksp_assert_internal_consistency(ws); ··· 255 247 ws->allocFailed = 1; 256 248 return NULL; 257 249 } 250 + /* the area is reserved from the end of wksp. 251 + * If it overlaps with tableValidEnd, it voids guarantees on values' range */ 258 252 if (alloc < ws->tableValidEnd) { 259 253 ws->tableValidEnd = alloc; 260 254 } 261 255 ws->allocStart = alloc; 256 + return alloc; 257 + } 258 + 259 + /* 260 + * Moves the cwksp to the next phase, and does any necessary allocations. 261 + * cwksp initialization must necessarily go through each phase in order. 262 + * Returns a 0 on success, or zstd error 263 + */ 264 + MEM_STATIC size_t 265 + ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) 266 + { 267 + assert(phase >= ws->phase); 268 + if (phase > ws->phase) { 269 + /* Going from allocating objects to allocating buffers */ 270 + if (ws->phase < ZSTD_cwksp_alloc_buffers && 271 + phase >= ZSTD_cwksp_alloc_buffers) { 272 + ws->tableValidEnd = ws->objectEnd; 273 + } 274 + 275 + /* Going from allocating buffers to allocating aligneds/tables */ 276 + if (ws->phase < ZSTD_cwksp_alloc_aligned && 277 + phase >= ZSTD_cwksp_alloc_aligned) { 278 + { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. 
*/ 279 + size_t const bytesToAlign = 280 + ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); 281 + DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); 282 + ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ 283 + RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), 284 + memory_allocation, "aligned phase - alignment initial allocation failed!"); 285 + } 286 + { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ 287 + void* const alloc = ws->objectEnd; 288 + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); 289 + void* const objectEnd = (BYTE*)alloc + bytesToAlign; 290 + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); 291 + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, 292 + "table phase - alignment initial allocation failed!"); 293 + ws->objectEnd = objectEnd; 294 + ws->tableEnd = objectEnd; /* table area starts being empty */ 295 + if (ws->tableValidEnd < ws->tableEnd) { 296 + ws->tableValidEnd = ws->tableEnd; 297 + } } } 298 + ws->phase = phase; 299 + ZSTD_cwksp_assert_internal_consistency(ws); 300 + } 301 + return 0; 302 + } 303 + 304 + /* 305 + * Returns whether this object/buffer/etc was allocated in this workspace. 306 + */ 307 + MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) 308 + { 309 + return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); 310 + } 311 + 312 + /* 313 + * Internal function. Do not use directly. 
314 + */ 315 + MEM_STATIC void* 316 + ZSTD_cwksp_reserve_internal(ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) 317 + { 318 + void* alloc; 319 + if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase)) || bytes == 0) { 320 + return NULL; 321 + } 322 + 323 + 324 + alloc = ZSTD_cwksp_reserve_internal_buffer_space(ws, bytes); 262 325 263 326 264 327 return alloc; ··· 338 259 /* 339 260 * Reserves and returns unaligned memory. 340 261 */ 341 - MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) { 262 + MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) 263 + { 342 264 return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); 343 265 } 344 266 345 267 /* 346 - * Reserves and returns memory sized on and aligned on sizeof(unsigned). 268 + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). 347 269 */ 348 - MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) { 349 - assert((bytes & (sizeof(U32)-1)) == 0); 350 - return ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, sizeof(U32)), ZSTD_cwksp_alloc_aligned); 270 + MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) 271 + { 272 + void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES), 273 + ZSTD_cwksp_alloc_aligned); 274 + assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); 275 + return ptr; 351 276 } 352 277 353 278 /* 354 - * Aligned on sizeof(unsigned). These buffers have the special property that 279 + * Aligned on 64 bytes. These buffers have the special property that 355 280 * their values remain constrained, allowing us to re-use them without 356 281 * memset()-ing them. 
357 282 */ 358 - MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) { 283 + MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) 284 + { 359 285 const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; 360 - void* alloc = ws->tableEnd; 361 - void* end = (BYTE *)alloc + bytes; 362 - void* top = ws->allocStart; 286 + void* alloc; 287 + void* end; 288 + void* top; 289 + 290 + if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { 291 + return NULL; 292 + } 293 + alloc = ws->tableEnd; 294 + end = (BYTE *)alloc + bytes; 295 + top = ws->allocStart; 363 296 364 297 DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining", 365 298 alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); 366 299 assert((bytes & (sizeof(U32)-1)) == 0); 367 - ZSTD_cwksp_internal_advance_phase(ws, phase); 368 300 ZSTD_cwksp_assert_internal_consistency(ws); 369 301 assert(end <= top); 370 302 if (end > top) { ··· 386 296 ws->tableEnd = end; 387 297 388 298 299 + assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); 300 + assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); 389 301 return alloc; 390 302 } 391 303 392 304 /* 393 305 * Aligned on sizeof(void*). 
306 + * Note : should happen only once, at workspace first initialization 394 307 */ 395 - MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) { 396 - size_t roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*)); 308 + MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) 309 + { 310 + size_t const roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*)); 397 311 void* alloc = ws->objectEnd; 398 312 void* end = (BYTE*)alloc + roundedBytes; 399 313 400 314 401 - DEBUGLOG(5, 315 + DEBUGLOG(4, 402 316 "cwksp: reserving %p object %zd bytes (rounded to %zd), %zd bytes remaining", 403 317 alloc, bytes, roundedBytes, ZSTD_cwksp_available_space(ws) - roundedBytes); 404 - assert(((size_t)alloc & (sizeof(void*)-1)) == 0); 405 - assert((bytes & (sizeof(void*)-1)) == 0); 318 + assert((size_t)alloc % ZSTD_ALIGNOF(void*) == 0); 319 + assert(bytes % ZSTD_ALIGNOF(void*) == 0); 406 320 ZSTD_cwksp_assert_internal_consistency(ws); 407 321 /* we must be in the first phase, no advance is possible */ 408 322 if (ws->phase != ZSTD_cwksp_alloc_objects || end > ws->workspaceEnd) { 409 - DEBUGLOG(4, "cwksp: object alloc failed!"); 323 + DEBUGLOG(3, "cwksp: object alloc failed!"); 410 324 ws->allocFailed = 1; 411 325 return NULL; 412 326 } ··· 422 328 return alloc; 423 329 } 424 330 425 - MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) { 331 + MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) 332 + { 426 333 DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty"); 427 334 428 335 ··· 545 450 /*-************************************* 546 451 * Functions Checking Free Space 547 452 ***************************************/ 453 + 454 + /* ZSTD_alignmentSpaceWithinBounds() : 455 + * Returns if the estimated space needed for a wksp is within an acceptable limit of the 456 + * actual amount of space used. 
457 + */ 458 + MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, 459 + size_t const estimatedSpace, int resizedWorkspace) { 460 + if (resizedWorkspace) { 461 + /* Resized/newly allocated wksp should have exact bounds */ 462 + return ZSTD_cwksp_used(ws) == estimatedSpace; 463 + } else { 464 + /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes 465 + * than estimatedSpace. See the comments in zstd_cwksp.h for details. 466 + */ 467 + return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); 468 + } 469 + } 470 + 548 471 549 472 MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) { 550 473 return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd);
+295 -118
lib/zstd/compress/zstd_double_fast.c
··· 48 48 49 49 50 50 FORCE_INLINE_TEMPLATE 51 - size_t ZSTD_compressBlock_doubleFast_generic( 51 + size_t ZSTD_compressBlock_doubleFast_noDict_generic( 52 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 53 + void const* src, size_t srcSize, U32 const mls /* template */) 54 + { 55 + ZSTD_compressionParameters const* cParams = &ms->cParams; 56 + U32* const hashLong = ms->hashTable; 57 + const U32 hBitsL = cParams->hashLog; 58 + U32* const hashSmall = ms->chainTable; 59 + const U32 hBitsS = cParams->chainLog; 60 + const BYTE* const base = ms->window.base; 61 + const BYTE* const istart = (const BYTE*)src; 62 + const BYTE* anchor = istart; 63 + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); 64 + /* presumes that, if there is a dictionary, it must be using Attach mode */ 65 + const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); 66 + const BYTE* const prefixLowest = base + prefixLowestIndex; 67 + const BYTE* const iend = istart + srcSize; 68 + const BYTE* const ilimit = iend - HASH_READ_SIZE; 69 + U32 offset_1=rep[0], offset_2=rep[1]; 70 + U32 offsetSaved = 0; 71 + 72 + size_t mLength; 73 + U32 offset; 74 + U32 curr; 75 + 76 + /* how many positions to search before increasing step size */ 77 + const size_t kStepIncr = 1 << kSearchStrength; 78 + /* the position at which to increment the step size if no match is found */ 79 + const BYTE* nextStep; 80 + size_t step; /* the current step size */ 81 + 82 + size_t hl0; /* the long hash at ip */ 83 + size_t hl1; /* the long hash at ip1 */ 84 + 85 + U32 idxl0; /* the long match index for ip */ 86 + U32 idxl1; /* the long match index for ip1 */ 87 + 88 + const BYTE* matchl0; /* the long match for ip */ 89 + const BYTE* matchs0; /* the short match for ip */ 90 + const BYTE* matchl1; /* the long match for ip1 */ 91 + 92 + const BYTE* ip = istart; /* the current position */ 93 + const BYTE* ip1; /* the next position */ 94 + 95 + DEBUGLOG(5, 
"ZSTD_compressBlock_doubleFast_noDict_generic"); 96 + 97 + /* init */ 98 + ip += ((ip - prefixLowest) == 0); 99 + { 100 + U32 const current = (U32)(ip - base); 101 + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); 102 + U32 const maxRep = current - windowLow; 103 + if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; 104 + if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; 105 + } 106 + 107 + /* Outer Loop: one iteration per match found and stored */ 108 + while (1) { 109 + step = 1; 110 + nextStep = ip + kStepIncr; 111 + ip1 = ip + step; 112 + 113 + if (ip1 > ilimit) { 114 + goto _cleanup; 115 + } 116 + 117 + hl0 = ZSTD_hashPtr(ip, hBitsL, 8); 118 + idxl0 = hashLong[hl0]; 119 + matchl0 = base + idxl0; 120 + 121 + /* Inner Loop: one iteration per search / position */ 122 + do { 123 + const size_t hs0 = ZSTD_hashPtr(ip, hBitsS, mls); 124 + const U32 idxs0 = hashSmall[hs0]; 125 + curr = (U32)(ip-base); 126 + matchs0 = base + idxs0; 127 + 128 + hashLong[hl0] = hashSmall[hs0] = curr; /* update hash tables */ 129 + 130 + /* check noDict repcode */ 131 + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { 132 + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; 133 + ip++; 134 + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); 135 + goto _match_stored; 136 + } 137 + 138 + hl1 = ZSTD_hashPtr(ip1, hBitsL, 8); 139 + 140 + if (idxl0 > prefixLowestIndex) { 141 + /* check prefix long match */ 142 + if (MEM_read64(matchl0) == MEM_read64(ip)) { 143 + mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8; 144 + offset = (U32)(ip-matchl0); 145 + while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ 146 + goto _match_found; 147 + } 148 + } 149 + 150 + idxl1 = hashLong[hl1]; 151 + matchl1 = base + idxl1; 152 + 153 + if (idxs0 > prefixLowestIndex) { 154 + /* check prefix short match */ 155 + if 
(MEM_read32(matchs0) == MEM_read32(ip)) { 156 + goto _search_next_long; 157 + } 158 + } 159 + 160 + if (ip1 >= nextStep) { 161 + PREFETCH_L1(ip1 + 64); 162 + PREFETCH_L1(ip1 + 128); 163 + step++; 164 + nextStep += kStepIncr; 165 + } 166 + ip = ip1; 167 + ip1 += step; 168 + 169 + hl0 = hl1; 170 + idxl0 = idxl1; 171 + matchl0 = matchl1; 172 + #if defined(__aarch64__) 173 + PREFETCH_L1(ip+256); 174 + #endif 175 + } while (ip1 <= ilimit); 176 + 177 + _cleanup: 178 + /* save reps for next block */ 179 + rep[0] = offset_1 ? offset_1 : offsetSaved; 180 + rep[1] = offset_2 ? offset_2 : offsetSaved; 181 + 182 + /* Return the last literals size */ 183 + return (size_t)(iend - anchor); 184 + 185 + _search_next_long: 186 + 187 + /* check prefix long +1 match */ 188 + if (idxl1 > prefixLowestIndex) { 189 + if (MEM_read64(matchl1) == MEM_read64(ip1)) { 190 + ip = ip1; 191 + mLength = ZSTD_count(ip+8, matchl1+8, iend) + 8; 192 + offset = (U32)(ip-matchl1); 193 + while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] == matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */ 194 + goto _match_found; 195 + } 196 + } 197 + 198 + /* if no long +1 match, explore the short match we found */ 199 + mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4; 200 + offset = (U32)(ip - matchs0); 201 + while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* catch up */ 202 + 203 + /* fall-through */ 204 + 205 + _match_found: /* requires ip, offset, mLength */ 206 + offset_2 = offset_1; 207 + offset_1 = offset; 208 + 209 + if (step < 4) { 210 + /* It is unsafe to write this value back to the hashtable when ip1 is 211 + * greater than or equal to the new ip we will have after we're done 212 + * processing this match. Rather than perform that test directly 213 + * (ip1 >= ip + mLength), which costs speed in practice, we do a simpler 214 + * more predictable test. 
The minmatch even if we take a short match is 215 + * 4 bytes, so as long as step, the distance between ip and ip1 216 + * (initially) is less than 4, we know ip1 < new ip. */ 217 + hashLong[hl1] = (U32)(ip1 - base); 218 + } 219 + 220 + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); 221 + 222 + _match_stored: 223 + /* match found */ 224 + ip += mLength; 225 + anchor = ip; 226 + 227 + if (ip <= ilimit) { 228 + /* Complementary insertion */ 229 + /* done after iLimit test, as candidates could be > iend-8 */ 230 + { U32 const indexToInsert = curr+2; 231 + hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert; 232 + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); 233 + hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert; 234 + hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base); 235 + } 236 + 237 + /* check immediate repcode */ 238 + while ( (ip <= ilimit) 239 + && ( (offset_2>0) 240 + & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { 241 + /* store sequence */ 242 + size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; 243 + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ 244 + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); 245 + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); 246 + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); 247 + ip += rLength; 248 + anchor = ip; 249 + continue; /* faster when present ... (?) 
*/ 250 + } 251 + } 252 + } 253 + } 254 + 255 + 256 + FORCE_INLINE_TEMPLATE 257 + size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( 52 258 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 53 259 void const* src, size_t srcSize, 54 - U32 const mls /* template */, ZSTD_dictMode_e const dictMode) 260 + U32 const mls /* template */) 55 261 { 56 262 ZSTD_compressionParameters const* cParams = &ms->cParams; 57 263 U32* const hashLong = ms->hashTable; ··· 278 72 U32 offsetSaved = 0; 279 73 280 74 const ZSTD_matchState_t* const dms = ms->dictMatchState; 281 - const ZSTD_compressionParameters* const dictCParams = 282 - dictMode == ZSTD_dictMatchState ? 283 - &dms->cParams : NULL; 284 - const U32* const dictHashLong = dictMode == ZSTD_dictMatchState ? 285 - dms->hashTable : NULL; 286 - const U32* const dictHashSmall = dictMode == ZSTD_dictMatchState ? 287 - dms->chainTable : NULL; 288 - const U32 dictStartIndex = dictMode == ZSTD_dictMatchState ? 289 - dms->window.dictLimit : 0; 290 - const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ? 291 - dms->window.base : NULL; 292 - const BYTE* const dictStart = dictMode == ZSTD_dictMatchState ? 293 - dictBase + dictStartIndex : NULL; 294 - const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ? 295 - dms->window.nextSrc : NULL; 296 - const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ? 297 - prefixLowestIndex - (U32)(dictEnd - dictBase) : 298 - 0; 299 - const U32 dictHBitsL = dictMode == ZSTD_dictMatchState ? 300 - dictCParams->hashLog : hBitsL; 301 - const U32 dictHBitsS = dictMode == ZSTD_dictMatchState ? 
302 - dictCParams->chainLog : hBitsS; 75 + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; 76 + const U32* const dictHashLong = dms->hashTable; 77 + const U32* const dictHashSmall = dms->chainTable; 78 + const U32 dictStartIndex = dms->window.dictLimit; 79 + const BYTE* const dictBase = dms->window.base; 80 + const BYTE* const dictStart = dictBase + dictStartIndex; 81 + const BYTE* const dictEnd = dms->window.nextSrc; 82 + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); 83 + const U32 dictHBitsL = dictCParams->hashLog; 84 + const U32 dictHBitsS = dictCParams->chainLog; 303 85 const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); 304 86 305 - DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_generic"); 306 - 307 - assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState); 87 + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); 308 88 309 89 /* if a dictionary is attached, it must be within window range */ 310 - if (dictMode == ZSTD_dictMatchState) { 311 - assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); 312 - } 90 + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); 313 91 314 92 /* init */ 315 93 ip += (dictAndPrefixLength == 0); 316 - if (dictMode == ZSTD_noDict) { 317 - U32 const curr = (U32)(ip - base); 318 - U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); 319 - U32 const maxRep = curr - windowLow; 320 - if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; 321 - if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; 322 - } 323 - if (dictMode == ZSTD_dictMatchState) { 324 - /* dictMatchState repCode checks don't currently handle repCode == 0 325 - * disabling. */ 326 - assert(offset_1 <= dictAndPrefixLength); 327 - assert(offset_2 <= dictAndPrefixLength); 328 - } 94 + 95 + /* dictMatchState repCode checks don't currently handle repCode == 0 96 + * disabling. 
*/ 97 + assert(offset_1 <= dictAndPrefixLength); 98 + assert(offset_2 <= dictAndPrefixLength); 329 99 330 100 /* Main Search Loop */ 331 101 while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ ··· 317 135 const BYTE* matchLong = base + matchIndexL; 318 136 const BYTE* match = base + matchIndexS; 319 137 const U32 repIndex = curr + 1 - offset_1; 320 - const BYTE* repMatch = (dictMode == ZSTD_dictMatchState 321 - && repIndex < prefixLowestIndex) ? 138 + const BYTE* repMatch = (repIndex < prefixLowestIndex) ? 322 139 dictBase + (repIndex - dictIndexDelta) : 323 140 base + repIndex; 324 141 hashLong[h2] = hashSmall[h] = curr; /* update hash tables */ 325 142 326 - /* check dictMatchState repcode */ 327 - if (dictMode == ZSTD_dictMatchState 328 - && ((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) 143 + /* check repcode */ 144 + if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) 329 145 && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { 330 146 const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; 331 147 mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; 332 148 ip++; 333 - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); 334 - goto _match_stored; 335 - } 336 - 337 - /* check noDict repcode */ 338 - if ( dictMode == ZSTD_noDict 339 - && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) { 340 - mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; 341 - ip++; 342 - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); 149 + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); 343 150 goto _match_stored; 344 151 } 345 152 ··· 340 169 while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ 341 170 goto _match_found; 342 171 } 343 - } else if (dictMode == ZSTD_dictMatchState) { 172 + } else { 344 173 /* check dictMatchState long match */ 345 174 U32 const dictMatchIndexL = dictHashLong[dictHL]; 346 175 const BYTE* dictMatchL = dictBase + dictMatchIndexL; ··· 358 187 if (MEM_read32(match) == MEM_read32(ip)) { 359 188 goto _search_next_long; 360 189 } 361 - } else if (dictMode == ZSTD_dictMatchState) { 190 + } else { 362 191 /* check dictMatchState short match */ 363 192 U32 const dictMatchIndexS = dictHashSmall[dictHS]; 364 193 match = dictBase + dictMatchIndexS; ··· 391 220 while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ 392 221 goto _match_found; 393 222 } 394 - } else if (dictMode == ZSTD_dictMatchState) { 223 + } else { 395 224 /* check dict long +1 match */ 396 225 U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; 397 226 const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; ··· 405 234 } } } 406 235 407 236 /* if no long +1 match, explore the short match we found */ 408 - if (dictMode == ZSTD_dictMatchState && matchIndexS < prefixLowestIndex) { 
237 + if (matchIndexS < prefixLowestIndex) { 409 238 mLength = ZSTD_count_2segments(ip+4, match+4, iend, dictEnd, prefixLowest) + 4; 410 239 offset = (U32)(curr - matchIndexS); 411 240 while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ ··· 419 248 offset_2 = offset_1; 420 249 offset_1 = offset; 421 250 422 - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); 251 + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); 423 252 424 253 _match_stored: 425 254 /* match found */ ··· 437 266 } 438 267 439 268 /* check immediate repcode */ 440 - if (dictMode == ZSTD_dictMatchState) { 441 - while (ip <= ilimit) { 442 - U32 const current2 = (U32)(ip-base); 443 - U32 const repIndex2 = current2 - offset_2; 444 - const BYTE* repMatch2 = dictMode == ZSTD_dictMatchState 445 - && repIndex2 < prefixLowestIndex ? 446 - dictBase + repIndex2 - dictIndexDelta : 447 - base + repIndex2; 448 - if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) 449 - && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { 450 - const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; 451 - size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; 452 - U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ 453 - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); 454 - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; 455 - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; 456 - ip += repLength2; 457 - anchor = ip; 458 - continue; 459 - } 460 - break; 461 - } } 462 - 463 - if (dictMode == ZSTD_noDict) { 464 - while ( (ip <= ilimit) 465 - && ( (offset_2>0) 466 - & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { 467 - /* store sequence */ 468 - size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; 469 - U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ 470 - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); 471 - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); 472 - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, rLength-MINMATCH); 473 - ip += rLength; 269 + while (ip <= ilimit) { 270 + U32 const current2 = (U32)(ip-base); 271 + U32 const repIndex2 = current2 - offset_2; 272 + const BYTE* repMatch2 = repIndex2 < prefixLowestIndex ? 273 + dictBase + repIndex2 - dictIndexDelta : 274 + base + repIndex2; 275 + if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) 276 + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { 277 + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; 278 + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; 279 + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ 280 + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); 281 + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; 282 + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; 283 + ip += repLength2; 474 284 anchor = ip; 475 - continue; /* faster when present ... (?) */ 476 - } } } 285 + continue; 286 + } 287 + break; 288 + } 289 + } 477 290 } /* while (ip < ilimit) */ 478 291 479 292 /* save reps for next block */ ··· 467 312 /* Return the last literals size */ 468 313 return (size_t)(iend - anchor); 469 314 } 315 + 316 + #define ZSTD_GEN_DFAST_FN(dictMode, mls) \ 317 + static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( \ 318 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ 319 + void const* src, size_t srcSize) \ 320 + { \ 321 + return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \ 322 + } 323 + 324 + ZSTD_GEN_DFAST_FN(noDict, 4) 325 + ZSTD_GEN_DFAST_FN(noDict, 5) 326 + ZSTD_GEN_DFAST_FN(noDict, 6) 327 + ZSTD_GEN_DFAST_FN(noDict, 7) 328 + 329 + ZSTD_GEN_DFAST_FN(dictMatchState, 4) 330 + ZSTD_GEN_DFAST_FN(dictMatchState, 5) 331 + ZSTD_GEN_DFAST_FN(dictMatchState, 6) 332 + ZSTD_GEN_DFAST_FN(dictMatchState, 7) 470 333 471 334 472 335 size_t ZSTD_compressBlock_doubleFast( ··· 496 323 { 497 324 default: /* includes case 3 */ 498 325 case 4 : 499 - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_noDict); 326 + return ZSTD_compressBlock_doubleFast_noDict_4(ms, seqStore, rep, src, srcSize); 500 327 case 5 : 501 - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_noDict); 328 + return ZSTD_compressBlock_doubleFast_noDict_5(ms, seqStore, rep, src, srcSize); 502 329 case 6 : 503 - 
return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_noDict); 330 + return ZSTD_compressBlock_doubleFast_noDict_6(ms, seqStore, rep, src, srcSize); 504 331 case 7 : 505 - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_noDict); 332 + return ZSTD_compressBlock_doubleFast_noDict_7(ms, seqStore, rep, src, srcSize); 506 333 } 507 334 } 508 335 ··· 516 343 { 517 344 default: /* includes case 3 */ 518 345 case 4 : 519 - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_dictMatchState); 346 + return ZSTD_compressBlock_doubleFast_dictMatchState_4(ms, seqStore, rep, src, srcSize); 520 347 case 5 : 521 - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_dictMatchState); 348 + return ZSTD_compressBlock_doubleFast_dictMatchState_5(ms, seqStore, rep, src, srcSize); 522 349 case 6 : 523 - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_dictMatchState); 350 + return ZSTD_compressBlock_doubleFast_dictMatchState_6(ms, seqStore, rep, src, srcSize); 524 351 case 7 : 525 - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_dictMatchState); 352 + return ZSTD_compressBlock_doubleFast_dictMatchState_7(ms, seqStore, rep, src, srcSize); 526 353 } 527 354 } 528 355 ··· 558 385 559 386 /* if extDict is invalidated due to maxDistance, switch to "regular" variant */ 560 387 if (prefixStartIndex == dictStartIndex) 561 - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, mls, ZSTD_noDict); 388 + return ZSTD_compressBlock_doubleFast(ms, seqStore, rep, src, srcSize); 562 389 563 390 /* Search Loop */ 564 391 while (ip < ilimit) { /* < instead of <=, because (ip+1) */ ··· 580 407 hashSmall[hSmall] = hashLong[hLong] = curr; /* update hash table */ 581 408 582 409 if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex 
doesn't overlap dict + prefix */ 583 - & (repIndex > dictStartIndex)) 410 + & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */ 584 411 && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { 585 412 const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; 586 413 mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; 587 414 ip++; 588 - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); 415 + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); 589 416 } else { 590 417 if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { 591 418 const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; ··· 596 423 while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ 597 424 offset_2 = offset_1; 598 425 offset_1 = offset; 599 - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); 426 + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); 600 427 601 428 } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { 602 429 size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); ··· 621 448 } 622 449 offset_2 = offset_1; 623 450 offset_1 = offset; 624 - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); 451 + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); 625 452 626 453 } else { 627 454 ip += ((ip-anchor) >> kSearchStrength) + 1; ··· 648 475 U32 const repIndex2 = current2 - offset_2; 649 476 const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? 
dictBase + repIndex2 : base + repIndex2; 650 477 if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ 651 - & (repIndex2 > dictStartIndex)) 478 + & (offset_2 <= current2 - dictStartIndex)) 652 479 && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { 653 480 const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; 654 481 size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; 655 482 U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ 656 - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); 483 + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); 657 484 hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; 658 485 hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; 659 486 ip += repLength2; ··· 671 498 return (size_t)(iend - anchor); 672 499 } 673 500 501 + ZSTD_GEN_DFAST_FN(extDict, 4) 502 + ZSTD_GEN_DFAST_FN(extDict, 5) 503 + ZSTD_GEN_DFAST_FN(extDict, 6) 504 + ZSTD_GEN_DFAST_FN(extDict, 7) 674 505 675 506 size_t ZSTD_compressBlock_doubleFast_extDict( 676 507 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ··· 685 508 { 686 509 default: /* includes case 3 */ 687 510 case 4 : 688 - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); 511 + return ZSTD_compressBlock_doubleFast_extDict_4(ms, seqStore, rep, src, srcSize); 689 512 case 5 : 690 - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); 513 + return ZSTD_compressBlock_doubleFast_extDict_5(ms, seqStore, rep, src, srcSize); 691 514 case 6 : 692 - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); 515 + return ZSTD_compressBlock_doubleFast_extDict_6(ms, seqStore, rep, src, srcSize); 693 516 case 7 : 694 - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, 
seqStore, rep, src, srcSize, 7); 517 + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); 695 518 } 696 519 }
+306 -127
lib/zstd/compress/zstd_fast.c
··· 43 43 } 44 44 45 45 46 + /* 47 + * If you squint hard enough (and ignore repcodes), the search operation at any 48 + * given position is broken into 4 stages: 49 + * 50 + * 1. Hash (map position to hash value via input read) 51 + * 2. Lookup (map hash val to index via hashtable read) 52 + * 3. Load (map index to value at that position via input read) 53 + * 4. Compare 54 + * 55 + * Each of these steps involves a memory read at an address which is computed 56 + * from the previous step. This means these steps must be sequenced and their 57 + * latencies are cumulative. 58 + * 59 + * Rather than do 1->2->3->4 sequentially for a single position before moving 60 + * onto the next, this implementation interleaves these operations across the 61 + * next few positions: 62 + * 63 + * R = Repcode Read & Compare 64 + * H = Hash 65 + * T = Table Lookup 66 + * M = Match Read & Compare 67 + * 68 + * Pos | Time --> 69 + * ----+------------------- 70 + * N | ... M 71 + * N+1 | ... TM 72 + * N+2 | R H T M 73 + * N+3 | H TM 74 + * N+4 | R H T M 75 + * N+5 | H ... 76 + * N+6 | R ... 77 + * 78 + * This is very much analogous to the pipelining of execution in a CPU. And just 79 + * like a CPU, we have to dump the pipeline when we find a match (i.e., take a 80 + * branch). 81 + * 82 + * When this happens, we throw away our current state, and do the following prep 83 + * to re-enter the loop: 84 + * 85 + * Pos | Time --> 86 + * ----+------------------- 87 + * N | H T 88 + * N+1 | H 89 + * 90 + * This is also the work we do at the beginning to enter the loop initially. 
91 + */ 46 92 FORCE_INLINE_TEMPLATE size_t 47 - ZSTD_compressBlock_fast_generic( 93 + ZSTD_compressBlock_fast_noDict_generic( 48 94 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 49 95 void const* src, size_t srcSize, 50 - U32 const mls) 96 + U32 const mls, U32 const hasStep) 51 97 { 52 98 const ZSTD_compressionParameters* const cParams = &ms->cParams; 53 99 U32* const hashTable = ms->hashTable; 54 100 U32 const hlog = cParams->hashLog; 55 101 /* support stepSize of 0 */ 56 - size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; 102 + size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2; 57 103 const BYTE* const base = ms->window.base; 58 104 const BYTE* const istart = (const BYTE*)src; 59 - /* We check ip0 (ip + 0) and ip1 (ip + 1) each loop */ 60 - const BYTE* ip0 = istart; 61 - const BYTE* ip1; 62 - const BYTE* anchor = istart; 63 105 const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); 64 106 const U32 prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); 65 107 const BYTE* const prefixStart = base + prefixStartIndex; 66 108 const BYTE* const iend = istart + srcSize; 67 109 const BYTE* const ilimit = iend - HASH_READ_SIZE; 68 - U32 offset_1=rep[0], offset_2=rep[1]; 110 + 111 + const BYTE* anchor = istart; 112 + const BYTE* ip0 = istart; 113 + const BYTE* ip1; 114 + const BYTE* ip2; 115 + const BYTE* ip3; 116 + U32 current0; 117 + 118 + U32 rep_offset1 = rep[0]; 119 + U32 rep_offset2 = rep[1]; 69 120 U32 offsetSaved = 0; 70 121 71 - /* init */ 122 + size_t hash0; /* hash for ip0 */ 123 + size_t hash1; /* hash for ip1 */ 124 + U32 idx; /* match idx for ip0 */ 125 + U32 mval; /* src value at match idx */ 126 + 127 + U32 offcode; 128 + const BYTE* match0; 129 + size_t mLength; 130 + 131 + /* ip0 and ip1 are always adjacent. 
The targetLength skipping and 132 + * uncompressibility acceleration is applied to every other position, 133 + * matching the behavior of #1562. step therefore represents the gap 134 + * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */ 135 + size_t step; 136 + const BYTE* nextStep; 137 + const size_t kStepIncr = (1 << (kSearchStrength - 1)); 138 + 72 139 DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); 73 140 ip0 += (ip0 == prefixStart); 74 - ip1 = ip0 + 1; 75 141 { U32 const curr = (U32)(ip0 - base); 76 142 U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); 77 143 U32 const maxRep = curr - windowLow; 78 - if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; 79 - if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; 144 + if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; 145 + if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; 80 146 } 81 147 82 - /* Main Search Loop */ 83 - #ifdef __INTEL_COMPILER 84 - /* From intel 'The vector pragma indicates that the loop should be 85 - * vectorized if it is legal to do so'. 
Can be used together with 86 - * #pragma ivdep (but have opted to exclude that because intel 87 - * warns against using it).*/ 88 - #pragma vector always 89 - #endif 90 - while (ip1 < ilimit) { /* < instead of <=, because check at ip0+2 */ 91 - size_t mLength; 92 - BYTE const* ip2 = ip0 + 2; 93 - size_t const h0 = ZSTD_hashPtr(ip0, hlog, mls); 94 - U32 const val0 = MEM_read32(ip0); 95 - size_t const h1 = ZSTD_hashPtr(ip1, hlog, mls); 96 - U32 const val1 = MEM_read32(ip1); 97 - U32 const current0 = (U32)(ip0-base); 98 - U32 const current1 = (U32)(ip1-base); 99 - U32 const matchIndex0 = hashTable[h0]; 100 - U32 const matchIndex1 = hashTable[h1]; 101 - BYTE const* repMatch = ip2 - offset_1; 102 - const BYTE* match0 = base + matchIndex0; 103 - const BYTE* match1 = base + matchIndex1; 104 - U32 offcode; 148 + /* start each op */ 149 + _start: /* Requires: ip0 */ 105 150 106 - #if defined(__aarch64__) 107 - PREFETCH_L1(ip0+256); 108 - #endif 151 + step = stepSize; 152 + nextStep = ip0 + kStepIncr; 109 153 110 - hashTable[h0] = current0; /* update hash table */ 111 - hashTable[h1] = current1; /* update hash table */ 154 + /* calculate positions, ip0 - anchor == 0, so we skip step calc */ 155 + ip1 = ip0 + 1; 156 + ip2 = ip0 + step; 157 + ip3 = ip2 + 1; 112 158 113 - assert(ip0 + 1 == ip1); 159 + if (ip3 >= ilimit) { 160 + goto _cleanup; 161 + } 114 162 115 - if ((offset_1 > 0) & (MEM_read32(repMatch) == MEM_read32(ip2))) { 116 - mLength = (ip2[-1] == repMatch[-1]) ? 
1 : 0; 117 - ip0 = ip2 - mLength; 118 - match0 = repMatch - mLength; 163 + hash0 = ZSTD_hashPtr(ip0, hlog, mls); 164 + hash1 = ZSTD_hashPtr(ip1, hlog, mls); 165 + 166 + idx = hashTable[hash0]; 167 + 168 + do { 169 + /* load repcode match for ip[2]*/ 170 + const U32 rval = MEM_read32(ip2 - rep_offset1); 171 + 172 + /* write back hash table entry */ 173 + current0 = (U32)(ip0 - base); 174 + hashTable[hash0] = current0; 175 + 176 + /* check repcode at ip[2] */ 177 + if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) { 178 + ip0 = ip2; 179 + match0 = ip0 - rep_offset1; 180 + mLength = ip0[-1] == match0[-1]; 181 + ip0 -= mLength; 182 + match0 -= mLength; 183 + offcode = STORE_REPCODE_1; 119 184 mLength += 4; 120 - offcode = 0; 121 185 goto _match; 122 186 } 123 - if ((matchIndex0 > prefixStartIndex) && MEM_read32(match0) == val0) { 124 - /* found a regular match */ 187 + 188 + /* load match for ip[0] */ 189 + if (idx >= prefixStartIndex) { 190 + mval = MEM_read32(base + idx); 191 + } else { 192 + mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ 193 + } 194 + 195 + /* check match at ip[0] */ 196 + if (MEM_read32(ip0) == mval) { 197 + /* found a match! */ 125 198 goto _offset; 126 199 } 127 - if ((matchIndex1 > prefixStartIndex) && MEM_read32(match1) == val1) { 128 - /* found a regular match after one literal */ 129 - ip0 = ip1; 130 - match0 = match1; 200 + 201 + /* lookup ip[1] */ 202 + idx = hashTable[hash1]; 203 + 204 + /* hash ip[2] */ 205 + hash0 = hash1; 206 + hash1 = ZSTD_hashPtr(ip2, hlog, mls); 207 + 208 + /* advance to next positions */ 209 + ip0 = ip1; 210 + ip1 = ip2; 211 + ip2 = ip3; 212 + 213 + /* write back hash table entry */ 214 + current0 = (U32)(ip0 - base); 215 + hashTable[hash0] = current0; 216 + 217 + /* load match for ip[0] */ 218 + if (idx >= prefixStartIndex) { 219 + mval = MEM_read32(base + idx); 220 + } else { 221 + mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. 
*/ 222 + } 223 + 224 + /* check match at ip[0] */ 225 + if (MEM_read32(ip0) == mval) { 226 + /* found a match! */ 131 227 goto _offset; 132 228 } 133 - { size_t const step = ((size_t)(ip0-anchor) >> (kSearchStrength - 1)) + stepSize; 134 - assert(step >= 2); 135 - ip0 += step; 136 - ip1 += step; 137 - continue; 229 + 230 + /* lookup ip[1] */ 231 + idx = hashTable[hash1]; 232 + 233 + /* hash ip[2] */ 234 + hash0 = hash1; 235 + hash1 = ZSTD_hashPtr(ip2, hlog, mls); 236 + 237 + /* advance to next positions */ 238 + ip0 = ip1; 239 + ip1 = ip2; 240 + ip2 = ip0 + step; 241 + ip3 = ip1 + step; 242 + 243 + /* calculate step */ 244 + if (ip2 >= nextStep) { 245 + step++; 246 + PREFETCH_L1(ip1 + 64); 247 + PREFETCH_L1(ip1 + 128); 248 + nextStep += kStepIncr; 138 249 } 139 - _offset: /* Requires: ip0, match0 */ 140 - /* Compute the offset code */ 141 - offset_2 = offset_1; 142 - offset_1 = (U32)(ip0-match0); 143 - offcode = offset_1 + ZSTD_REP_MOVE; 144 - mLength = 4; 145 - /* Count the backwards match length */ 146 - while (((ip0>anchor) & (match0>prefixStart)) 147 - && (ip0[-1] == match0[-1])) { ip0--; match0--; mLength++; } /* catch up */ 250 + } while (ip3 < ilimit); 148 251 149 - _match: /* Requires: ip0, match0, offcode */ 150 - /* Count the forward length */ 151 - mLength += ZSTD_count(ip0+mLength, match0+mLength, iend); 152 - ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, offcode, mLength-MINMATCH); 153 - /* match found */ 154 - ip0 += mLength; 155 - anchor = ip0; 156 - 157 - if (ip0 <= ilimit) { 158 - /* Fill Table */ 159 - assert(base+current0+2 > istart); /* check base overflow */ 160 - hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ 161 - hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); 162 - 163 - if (offset_2 > 0) { /* offset_2==0 means offset_2 is invalidated */ 164 - while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - offset_2)) ) { 165 - /* store sequence */ 
166 - size_t const rLength = ZSTD_count(ip0+4, ip0+4-offset_2, iend) + 4; 167 - { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */ 168 - hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); 169 - ip0 += rLength; 170 - ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, 0 /*offCode*/, rLength-MINMATCH); 171 - anchor = ip0; 172 - continue; /* faster when present (confirmed on gcc-8) ... (?) */ 173 - } } } 174 - ip1 = ip0 + 1; 175 - } 252 + _cleanup: 253 + /* Note that there are probably still a couple positions we could search. 254 + * However, it seems to be a meaningful performance hit to try to search 255 + * them. So let's not. */ 176 256 177 257 /* save reps for next block */ 178 - rep[0] = offset_1 ? offset_1 : offsetSaved; 179 - rep[1] = offset_2 ? offset_2 : offsetSaved; 258 + rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; 259 + rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; 180 260 181 261 /* Return the last literals size */ 182 262 return (size_t)(iend - anchor); 263 + 264 + _offset: /* Requires: ip0, idx */ 265 + 266 + /* Compute the offset code. */ 267 + match0 = base + idx; 268 + rep_offset2 = rep_offset1; 269 + rep_offset1 = (U32)(ip0-match0); 270 + offcode = STORE_OFFSET(rep_offset1); 271 + mLength = 4; 272 + 273 + /* Count the backwards match length. */ 274 + while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) { 275 + ip0--; 276 + match0--; 277 + mLength++; 278 + } 279 + 280 + _match: /* Requires: ip0, match0, offcode */ 281 + 282 + /* Count the forward length. */ 283 + mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend); 284 + 285 + ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); 286 + 287 + ip0 += mLength; 288 + anchor = ip0; 289 + 290 + /* write next hash table entry */ 291 + if (ip1 < ip0) { 292 + hashTable[hash1] = (U32)(ip1 - base); 293 + } 294 + 295 + /* Fill table and check for immediate repcode. 
*/ 296 + if (ip0 <= ilimit) { 297 + /* Fill Table */ 298 + assert(base+current0+2 > istart); /* check base overflow */ 299 + hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ 300 + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); 301 + 302 + if (rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */ 303 + while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) { 304 + /* store sequence */ 305 + size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4; 306 + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ 307 + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); 308 + ip0 += rLength; 309 + ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); 310 + anchor = ip0; 311 + continue; /* faster when present (confirmed on gcc-8) ... (?) */ 312 + } } } 313 + 314 + goto _start; 183 315 } 184 316 317 + #define ZSTD_GEN_FAST_FN(dictMode, mls, step) \ 318 + static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( \ 319 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ 320 + void const* src, size_t srcSize) \ 321 + { \ 322 + return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \ 323 + } 324 + 325 + ZSTD_GEN_FAST_FN(noDict, 4, 1) 326 + ZSTD_GEN_FAST_FN(noDict, 5, 1) 327 + ZSTD_GEN_FAST_FN(noDict, 6, 1) 328 + ZSTD_GEN_FAST_FN(noDict, 7, 1) 329 + 330 + ZSTD_GEN_FAST_FN(noDict, 4, 0) 331 + ZSTD_GEN_FAST_FN(noDict, 5, 0) 332 + ZSTD_GEN_FAST_FN(noDict, 6, 0) 333 + ZSTD_GEN_FAST_FN(noDict, 7, 0) 185 334 186 335 size_t ZSTD_compressBlock_fast( 187 336 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ··· 338 189 { 339 190 U32 const mls = ms->cParams.minMatch; 340 191 assert(ms->dictMatchState == NULL); 341 - switch(mls) 342 - { 343 - default: /* includes case 
3 */ 344 - case 4 : 345 - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4); 346 - case 5 : 347 - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5); 348 - case 6 : 349 - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6); 350 - case 7 : 351 - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7); 192 + if (ms->cParams.targetLength > 1) { 193 + switch(mls) 194 + { 195 + default: /* includes case 3 */ 196 + case 4 : 197 + return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize); 198 + case 5 : 199 + return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize); 200 + case 6 : 201 + return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize); 202 + case 7 : 203 + return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize); 204 + } 205 + } else { 206 + switch(mls) 207 + { 208 + default: /* includes case 3 */ 209 + case 4 : 210 + return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize); 211 + case 5 : 212 + return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize); 213 + case 6 : 214 + return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize); 215 + case 7 : 216 + return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize); 217 + } 218 + 352 219 } 353 220 } 354 221 355 222 FORCE_INLINE_TEMPLATE 356 223 size_t ZSTD_compressBlock_fast_dictMatchState_generic( 357 224 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 358 - void const* src, size_t srcSize, U32 const mls) 225 + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) 359 226 { 360 227 const ZSTD_compressionParameters* const cParams = &ms->cParams; 361 228 U32* const hashTable = ms->hashTable; ··· 407 242 assert(endIndex - prefixStartIndex <= maxDistance); 408 243 (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ 409 
244 245 + (void)hasStep; /* not currently specialized on whether it's accelerated */ 246 + 410 247 /* ensure there will be no underflow 411 248 * when translating a dict index into a local index */ 412 249 assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); ··· 439 272 const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; 440 273 mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; 441 274 ip++; 442 - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); 275 + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); 443 276 } else if ( (matchIndex <= prefixStartIndex) ) { 444 277 size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); 445 278 U32 const dictMatchIndex = dictHashTable[dictHash]; ··· 459 292 } /* catch up */ 460 293 offset_2 = offset_1; 461 294 offset_1 = offset; 462 - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); 295 + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); 463 296 } 464 297 } else if (MEM_read32(match) != MEM_read32(ip)) { 465 298 /* it's not a match, and we're not going to check the dictionary */ ··· 474 307 && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ 475 308 offset_2 = offset_1; 476 309 offset_1 = offset; 477 - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); 310 + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); 478 311 } 479 312 480 313 /* match found */ ··· 499 332 const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; 500 333 size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; 501 334 U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ 502 - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); 335 + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); 503 336 hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; 504 337 ip += repLength2; 505 338 anchor = ip; ··· 518 351 return (size_t)(iend - anchor); 519 352 } 520 353 354 + 355 + ZSTD_GEN_FAST_FN(dictMatchState, 4, 0) 356 + ZSTD_GEN_FAST_FN(dictMatchState, 5, 0) 357 + ZSTD_GEN_FAST_FN(dictMatchState, 6, 0) 358 + ZSTD_GEN_FAST_FN(dictMatchState, 7, 0) 359 + 521 360 size_t ZSTD_compressBlock_fast_dictMatchState( 522 361 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 523 362 void const* src, size_t srcSize) ··· 534 361 { 535 362 default: /* includes case 3 */ 536 363 case 4 : 537 - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4); 364 + return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize); 538 365 case 5 : 539 - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5); 366 + return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize); 540 367 case 6 : 541 - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6); 368 + return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize); 542 369 case 7 : 543 - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7); 370 + return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize); 544 371 } 545 372 } 546 373 547 374 548 375 static size_t ZSTD_compressBlock_fast_extDict_generic( 549 376 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 550 - void const* src, size_t srcSize, 
U32 const mls) 377 + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) 551 378 { 552 379 const ZSTD_compressionParameters* const cParams = &ms->cParams; 553 380 U32* const hashTable = ms->hashTable; ··· 571 398 const BYTE* const ilimit = iend - 8; 572 399 U32 offset_1=rep[0], offset_2=rep[1]; 573 400 401 + (void)hasStep; /* not currently specialized on whether it's accelerated */ 402 + 574 403 DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1); 575 404 576 405 /* switch to "regular" variant if extDict is invalidated due to maxDistance */ 577 406 if (prefixStartIndex == dictStartIndex) 578 - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, mls); 407 + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); 579 408 580 409 /* Search Loop */ 581 410 while (ip < ilimit) { /* < instead of <=, because (ip+1) */ ··· 591 416 const BYTE* const repMatch = repBase + repIndex; 592 417 hashTable[h] = curr; /* update hash table */ 593 418 DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); 594 - assert(offset_1 <= curr +1); /* check repIndex */ 595 419 596 - if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > dictStartIndex)) 420 + if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ 421 + & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ 597 422 && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { 598 423 const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; 599 424 size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; 600 425 ip++; 601 - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, rLength-MINMATCH); 426 + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); 602 427 ip += rLength; 603 428 anchor = ip; 604 429 } else { ··· 614 439 size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; 615 440 while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ 616 441 offset_2 = offset_1; offset_1 = offset; /* update offset history */ 617 - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); 442 + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); 618 443 ip += mLength; 619 444 anchor = ip; 620 445 } } ··· 628 453 U32 const current2 = (U32)(ip-base); 629 454 U32 const repIndex2 = current2 - offset_2; 630 455 const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; 631 - if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (repIndex2 > dictStartIndex)) /* intentional overflow */ 456 + if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ 632 457 && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { 633 458 const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; 634 459 size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; 635 460 { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ 636 - ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, 0 /*offcode*/, repLength2-MINMATCH); 461 + ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); 637 462 hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; 638 463 ip += repLength2; 639 464 anchor = ip; ··· 650 475 return (size_t)(iend - anchor); 651 476 } 652 477 478 + ZSTD_GEN_FAST_FN(extDict, 4, 0) 479 + ZSTD_GEN_FAST_FN(extDict, 5, 0) 480 + ZSTD_GEN_FAST_FN(extDict, 6, 0) 481 + ZSTD_GEN_FAST_FN(extDict, 7, 0) 653 482 654 483 size_t ZSTD_compressBlock_fast_extDict( 655 484 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ··· 664 485 { 665 486 default: /* includes case 3 */ 666 487 case 4 : 667 - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); 488 + return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize); 668 489 case 5 : 669 - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); 490 + return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize); 670 491 case 6 : 671 - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); 492 + return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize); 672 493 case 7 : 673 - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); 494 + return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize); 674 495 } 675 496 }
+1026 -338
lib/zstd/compress/zstd_lazy.c
··· 61 61 * assumption : curr >= btlow == (curr - btmask) 62 62 * doesn't fail */ 63 63 static void 64 - ZSTD_insertDUBT1(ZSTD_matchState_t* ms, 64 + ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, 65 65 U32 curr, const BYTE* inputEnd, 66 66 U32 nbCompares, U32 btLow, 67 67 const ZSTD_dictMode_e dictMode) ··· 151 151 152 152 static size_t 153 153 ZSTD_DUBT_findBetterDictMatch ( 154 - ZSTD_matchState_t* ms, 154 + const ZSTD_matchState_t* ms, 155 155 const BYTE* const ip, const BYTE* const iend, 156 156 size_t* offsetPtr, 157 157 size_t bestLength, ··· 197 197 U32 matchIndex = dictMatchIndex + dictIndexDelta; 198 198 if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { 199 199 DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", 200 - curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex); 201 - bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex; 200 + curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); 201 + bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); 202 202 } 203 203 if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ 204 204 break; /* drop, to guarantee consistency (miss a little bit of compression) */ ··· 218 218 } 219 219 220 220 if (bestLength >= MINMATCH) { 221 - U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; 221 + U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; 222 222 DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", 223 223 curr, (U32)bestLength, (U32)*offsetPtr, mIndex); 224 224 } ··· 328 328 if (matchLength > matchEndIdx - 
matchIndex) 329 329 matchEndIdx = matchIndex + (U32)matchLength; 330 330 if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) 331 - bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex; 331 + bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); 332 332 if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ 333 333 if (dictMode == ZSTD_dictMatchState) { 334 334 nbCompares = 0; /* in addition to avoiding checking any ··· 368 368 assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ 369 369 ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ 370 370 if (bestLength >= MINMATCH) { 371 - U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; 371 + U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; 372 372 DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", 373 373 curr, (U32)bestLength, (U32)*offsetPtr, mIndex); 374 374 } ··· 391 391 return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); 392 392 } 393 393 394 - 395 - static size_t 396 - ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms, 397 - const BYTE* ip, const BYTE* const iLimit, 398 - size_t* offsetPtr) 399 - { 400 - switch(ms->cParams.minMatch) 401 - { 402 - default : /* includes case 3 */ 403 - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); 404 - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); 405 - case 7 : 406 - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); 407 - } 408 - } 409 - 410 - 411 - static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS ( 412 - ZSTD_matchState_t* ms, 413 - const BYTE* ip, const BYTE* const iLimit, 414 - size_t* offsetPtr) 415 - { 416 - switch(ms->cParams.minMatch) 417 - { 418 - default : /* includes case 3 */ 419 - case 4 
: return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); 420 - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); 421 - case 7 : 422 - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); 423 - } 424 - } 425 - 426 - 427 - static size_t ZSTD_BtFindBestMatch_extDict_selectMLS ( 428 - ZSTD_matchState_t* ms, 429 - const BYTE* ip, const BYTE* const iLimit, 430 - size_t* offsetPtr) 431 - { 432 - switch(ms->cParams.minMatch) 433 - { 434 - default : /* includes case 3 */ 435 - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); 436 - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); 437 - case 7 : 438 - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); 439 - } 440 - } 441 - 442 - 443 - 444 394 /* ********************************* 445 - * Hash Chain 395 + * Dedicated dict search 446 396 ***********************************/ 447 - #define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)] 448 - 449 - /* Update chains up to ip (excluded) 450 - Assumption : always within prefix (i.e. 
not within extDict) */ 451 - FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( 452 - ZSTD_matchState_t* ms, 453 - const ZSTD_compressionParameters* const cParams, 454 - const BYTE* ip, U32 const mls) 455 - { 456 - U32* const hashTable = ms->hashTable; 457 - const U32 hashLog = cParams->hashLog; 458 - U32* const chainTable = ms->chainTable; 459 - const U32 chainMask = (1 << cParams->chainLog) - 1; 460 - const BYTE* const base = ms->window.base; 461 - const U32 target = (U32)(ip - base); 462 - U32 idx = ms->nextToUpdate; 463 - 464 - while(idx < target) { /* catch up */ 465 - size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls); 466 - NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; 467 - hashTable[h] = idx; 468 - idx++; 469 - } 470 - 471 - ms->nextToUpdate = target; 472 - return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; 473 - } 474 - 475 - U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { 476 - const ZSTD_compressionParameters* const cParams = &ms->cParams; 477 - return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); 478 - } 479 397 480 398 void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip) 481 399 { ··· 403 485 U32* const chainTable = ms->chainTable; 404 486 U32 const chainSize = 1 << ms->cParams.chainLog; 405 487 U32 idx = ms->nextToUpdate; 406 - U32 const minChain = chainSize < target ? target - chainSize : idx; 488 + U32 const minChain = chainSize < target - idx ? 
target - chainSize : idx; 407 489 U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG; 408 490 U32 const cacheSize = bucketSize - 1; 409 491 U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize; ··· 417 499 U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; 418 500 U32* const tmpHashTable = hashTable; 419 501 U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog); 420 - U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog; 502 + U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog; 421 503 U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx; 422 - 423 504 U32 hashIdx; 424 505 425 506 assert(ms->cParams.chainLog <= 24); 426 - assert(ms->cParams.hashLog >= ms->cParams.chainLog); 507 + assert(ms->cParams.hashLog > ms->cParams.chainLog); 427 508 assert(idx != 0); 428 509 assert(tmpMinChain <= minChain); 429 510 ··· 453 536 if (count == cacheSize) { 454 537 for (count = 0; count < chainLimit;) { 455 538 if (i < minChain) { 456 - if (!i || countBeyondMinChain++ > cacheSize) { 539 + if (!i || ++countBeyondMinChain > cacheSize) { 457 540 /* only allow pulling `cacheSize` number of entries 458 541 * into the cache or chainTable beyond `minChain`, 459 542 * to replace the entries pulled out of the ··· 509 592 ms->nextToUpdate = target; 510 593 } 511 594 595 + /* Returns the longest match length found in the dedicated dict search structure. 596 + * If none are longer than the argument ml, then ml will be returned. 
597 + */ 598 + FORCE_INLINE_TEMPLATE 599 + size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts, 600 + const ZSTD_matchState_t* const dms, 601 + const BYTE* const ip, const BYTE* const iLimit, 602 + const BYTE* const prefixStart, const U32 curr, 603 + const U32 dictLimit, const size_t ddsIdx) { 604 + const U32 ddsLowestIndex = dms->window.dictLimit; 605 + const BYTE* const ddsBase = dms->window.base; 606 + const BYTE* const ddsEnd = dms->window.nextSrc; 607 + const U32 ddsSize = (U32)(ddsEnd - ddsBase); 608 + const U32 ddsIndexDelta = dictLimit - ddsSize; 609 + const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); 610 + const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1; 611 + U32 ddsAttempt; 612 + U32 matchIndex; 613 + 614 + for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { 615 + PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); 616 + } 617 + 618 + { 619 + U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; 620 + U32 const chainIndex = chainPackedPointer >> 8; 621 + 622 + PREFETCH_L1(&dms->chainTable[chainIndex]); 623 + } 624 + 625 + for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) { 626 + size_t currentMl=0; 627 + const BYTE* match; 628 + matchIndex = dms->hashTable[ddsIdx + ddsAttempt]; 629 + match = ddsBase + matchIndex; 630 + 631 + if (!matchIndex) { 632 + return ml; 633 + } 634 + 635 + /* guaranteed by table construction */ 636 + (void)ddsLowestIndex; 637 + assert(matchIndex >= ddsLowestIndex); 638 + assert(match+4 <= ddsEnd); 639 + if (MEM_read32(match) == MEM_read32(ip)) { 640 + /* assumption : matchIndex <= dictLimit-4 (by table construction) */ 641 + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; 642 + } 643 + 644 + /* save best solution */ 645 + if (currentMl > ml) { 646 + ml = currentMl; 647 + *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); 648 + if (ip+currentMl == iLimit) { 
649 + /* best possible, avoids read overflow on next attempt */ 650 + return ml; 651 + } 652 + } 653 + } 654 + 655 + { 656 + U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; 657 + U32 chainIndex = chainPackedPointer >> 8; 658 + U32 const chainLength = chainPackedPointer & 0xFF; 659 + U32 const chainAttempts = nbAttempts - ddsAttempt; 660 + U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts; 661 + U32 chainAttempt; 662 + 663 + for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) { 664 + PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]); 665 + } 666 + 667 + for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) { 668 + size_t currentMl=0; 669 + const BYTE* match; 670 + matchIndex = dms->chainTable[chainIndex]; 671 + match = ddsBase + matchIndex; 672 + 673 + /* guaranteed by table construction */ 674 + assert(matchIndex >= ddsLowestIndex); 675 + assert(match+4 <= ddsEnd); 676 + if (MEM_read32(match) == MEM_read32(ip)) { 677 + /* assumption : matchIndex <= dictLimit-4 (by table construction) */ 678 + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; 679 + } 680 + 681 + /* save best solution */ 682 + if (currentMl > ml) { 683 + ml = currentMl; 684 + *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); 685 + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ 686 + } 687 + } 688 + } 689 + return ml; 690 + } 691 + 692 + 693 + /* ********************************* 694 + * Hash Chain 695 + ***********************************/ 696 + #define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)] 697 + 698 + /* Update chains up to ip (excluded) 699 + Assumption : always within prefix (i.e. 
not within extDict) */ 700 + FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( 701 + ZSTD_matchState_t* ms, 702 + const ZSTD_compressionParameters* const cParams, 703 + const BYTE* ip, U32 const mls) 704 + { 705 + U32* const hashTable = ms->hashTable; 706 + const U32 hashLog = cParams->hashLog; 707 + U32* const chainTable = ms->chainTable; 708 + const U32 chainMask = (1 << cParams->chainLog) - 1; 709 + const BYTE* const base = ms->window.base; 710 + const U32 target = (U32)(ip - base); 711 + U32 idx = ms->nextToUpdate; 712 + 713 + while(idx < target) { /* catch up */ 714 + size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls); 715 + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; 716 + hashTable[h] = idx; 717 + idx++; 718 + } 719 + 720 + ms->nextToUpdate = target; 721 + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; 722 + } 723 + 724 + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { 725 + const ZSTD_compressionParameters* const cParams = &ms->cParams; 726 + return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); 727 + } 512 728 513 729 /* inlining is important to hardwire a hot branch (template emulation) */ 514 730 FORCE_INLINE_TEMPLATE 515 - size_t ZSTD_HcFindBestMatch_generic ( 731 + size_t ZSTD_HcFindBestMatch( 516 732 ZSTD_matchState_t* ms, 517 733 const BYTE* const ip, const BYTE* const iLimit, 518 734 size_t* offsetPtr, ··· 703 653 /* save best solution */ 704 654 if (currentMl > ml) { 705 655 ml = currentMl; 706 - *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE; 656 + *offsetPtr = STORE_OFFSET(curr - matchIndex); 707 657 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ 708 658 } 709 659 ··· 713 663 714 664 assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. 
*/ 715 665 if (dictMode == ZSTD_dedicatedDictSearch) { 716 - const U32 ddsLowestIndex = dms->window.dictLimit; 717 - const BYTE* const ddsBase = dms->window.base; 718 - const BYTE* const ddsEnd = dms->window.nextSrc; 719 - const U32 ddsSize = (U32)(ddsEnd - ddsBase); 720 - const U32 ddsIndexDelta = dictLimit - ddsSize; 721 - const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); 722 - const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1; 723 - U32 ddsAttempt; 724 - 725 - for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { 726 - PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); 727 - } 728 - 729 - { 730 - U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; 731 - U32 const chainIndex = chainPackedPointer >> 8; 732 - 733 - PREFETCH_L1(&dms->chainTable[chainIndex]); 734 - } 735 - 736 - for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) { 737 - size_t currentMl=0; 738 - const BYTE* match; 739 - matchIndex = dms->hashTable[ddsIdx + ddsAttempt]; 740 - match = ddsBase + matchIndex; 741 - 742 - if (!matchIndex) { 743 - return ml; 744 - } 745 - 746 - /* guaranteed by table construction */ 747 - (void)ddsLowestIndex; 748 - assert(matchIndex >= ddsLowestIndex); 749 - assert(match+4 <= ddsEnd); 750 - if (MEM_read32(match) == MEM_read32(ip)) { 751 - /* assumption : matchIndex <= dictLimit-4 (by table construction) */ 752 - currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; 753 - } 754 - 755 - /* save best solution */ 756 - if (currentMl > ml) { 757 - ml = currentMl; 758 - *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE; 759 - if (ip+currentMl == iLimit) { 760 - /* best possible, avoids read overflow on next attempt */ 761 - return ml; 762 - } 763 - } 764 - } 765 - 766 - { 767 - U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; 768 - U32 chainIndex = chainPackedPointer >> 8; 769 - U32 const chainLength = chainPackedPointer & 
0xFF; 770 - U32 const chainAttempts = nbAttempts - ddsAttempt; 771 - U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts; 772 - U32 chainAttempt; 773 - 774 - for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) { 775 - PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]); 776 - } 777 - 778 - for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) { 779 - size_t currentMl=0; 780 - const BYTE* match; 781 - matchIndex = dms->chainTable[chainIndex]; 782 - match = ddsBase + matchIndex; 783 - 784 - /* guaranteed by table construction */ 785 - assert(matchIndex >= ddsLowestIndex); 786 - assert(match+4 <= ddsEnd); 787 - if (MEM_read32(match) == MEM_read32(ip)) { 788 - /* assumption : matchIndex <= dictLimit-4 (by table construction) */ 789 - currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; 790 - } 791 - 792 - /* save best solution */ 793 - if (currentMl > ml) { 794 - ml = currentMl; 795 - *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE; 796 - if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ 797 - } 798 - } 799 - } 666 + ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms, 667 + ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); 800 668 } else if (dictMode == ZSTD_dictMatchState) { 801 669 const U32* const dmsChainTable = dms->chainTable; 802 670 const U32 dmsChainSize = (1 << dms->cParams.chainLog); ··· 738 770 /* save best solution */ 739 771 if (currentMl > ml) { 740 772 ml = currentMl; 741 - *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE; 773 + assert(curr > matchIndex + dmsIndexDelta); 774 + *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); 742 775 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ 743 776 } 744 777 ··· 752 783 return ml; 753 784 } 754 785 786 + /* ********************************* 787 + 
* (SIMD) Row-based matchfinder 788 + ***********************************/ 789 + /* Constants for row-based hash */ 790 + #define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ 791 + #define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ 792 + #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) 793 + #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ 755 794 756 - FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS ( 757 - ZSTD_matchState_t* ms, 758 - const BYTE* ip, const BYTE* const iLimit, 759 - size_t* offsetPtr) 795 + #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1) 796 + 797 + typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */ 798 + 799 + /* ZSTD_VecMask_next(): 800 + * Starting from the LSB, returns the idx of the next non-zero bit. 801 + * Basically counting the nb of trailing zeroes. 
802 + */ 803 + static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { 804 + assert(val != 0); 805 + # if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) 806 + if (sizeof(size_t) == 4) { 807 + U32 mostSignificantWord = (U32)(val >> 32); 808 + U32 leastSignificantWord = (U32)val; 809 + if (leastSignificantWord == 0) { 810 + return 32 + (U32)__builtin_ctz(mostSignificantWord); 811 + } else { 812 + return (U32)__builtin_ctz(leastSignificantWord); 813 + } 814 + } else { 815 + return (U32)__builtin_ctzll(val); 816 + } 817 + # else 818 + /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count 819 + * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer 820 + */ 821 + val = ~val & (val - 1ULL); /* Lowest set bit mask */ 822 + val = val - ((val >> 1) & 0x5555555555555555); 823 + val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); 824 + return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); 825 + # endif 826 + } 827 + 828 + /* ZSTD_rotateRight_*(): 829 + * Rotates a bitfield to the right by "count" bits. 
830 + * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts 831 + */ 832 + FORCE_INLINE_TEMPLATE 833 + U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { 834 + assert(count < 64); 835 + count &= 0x3F; /* for fickle pattern recognition */ 836 + return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); 837 + } 838 + 839 + FORCE_INLINE_TEMPLATE 840 + U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { 841 + assert(count < 32); 842 + count &= 0x1F; /* for fickle pattern recognition */ 843 + return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); 844 + } 845 + 846 + FORCE_INLINE_TEMPLATE 847 + U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { 848 + assert(count < 16); 849 + count &= 0x0F; /* for fickle pattern recognition */ 850 + return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); 851 + } 852 + 853 + /* ZSTD_row_nextIndex(): 854 + * Returns the next index to insert at within a tagTable row, and updates the "head" 855 + * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) 856 + */ 857 + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { 858 + U32 const next = (*tagRow - 1) & rowMask; 859 + *tagRow = (BYTE)next; 860 + return next; 861 + } 862 + 863 + /* ZSTD_isAligned(): 864 + * Checks that a pointer is aligned to "align" bytes which must be a power of 2. 865 + */ 866 + MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { 867 + assert((align & (align - 1)) == 0); 868 + return (((size_t)ptr) & (align - 1)) == 0; 869 + } 870 + 871 + /* ZSTD_row_prefetch(): 872 + * Performs prefetching for the hashTable and tagTable at a given row. 
873 + */ 874 + FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { 875 + PREFETCH_L1(hashTable + relRow); 876 + if (rowLog >= 5) { 877 + PREFETCH_L1(hashTable + relRow + 16); 878 + /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */ 879 + } 880 + PREFETCH_L1(tagTable + relRow); 881 + if (rowLog == 6) { 882 + PREFETCH_L1(tagTable + relRow + 32); 883 + } 884 + assert(rowLog == 4 || rowLog == 5 || rowLog == 6); 885 + assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */ 886 + assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */ 887 + } 888 + 889 + /* ZSTD_row_fillHashCache(): 890 + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, 891 + * but not beyond iLimit. 892 + */ 893 + FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, 894 + U32 const rowLog, U32 const mls, 895 + U32 idx, const BYTE* const iLimit) 760 896 { 761 - switch(ms->cParams.minMatch) 762 - { 763 - default : /* includes case 3 */ 764 - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); 765 - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); 766 - case 7 : 767 - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); 897 + U32 const* const hashTable = ms->hashTable; 898 + U16 const* const tagTable = ms->tagTable; 899 + U32 const hashLog = ms->rowHashLog; 900 + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 
0 : (U32)(iLimit - (base + idx) + 1); 901 + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); 902 + 903 + for (; idx < lim; ++idx) { 904 + U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); 905 + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; 906 + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); 907 + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; 908 + } 909 + 910 + DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1], 911 + ms->hashCache[2], ms->hashCache[3], ms->hashCache[4], 912 + ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]); 913 + } 914 + 915 + /* ZSTD_row_nextCachedHash(): 916 + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at 917 + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. 918 + */ 919 + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, 920 + U16 const* tagTable, BYTE const* base, 921 + U32 idx, U32 const hashLog, 922 + U32 const rowLog, U32 const mls) 923 + { 924 + U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); 925 + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; 926 + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); 927 + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; 928 + cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash; 929 + return hash; 768 930 } 769 931 } 770 932 771 - 772 - static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS ( 773 - ZSTD_matchState_t* ms, 774 - const BYTE* ip, const BYTE* const iLimit, 775 - size_t* offsetPtr) 933 + /* ZSTD_row_update_internalImpl(): 934 + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. 
935 + */ 936 + FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, 937 + U32 updateStartIdx, U32 const updateEndIdx, 938 + U32 const mls, U32 const rowLog, 939 + U32 const rowMask, U32 const useCache) 776 940 { 777 - switch(ms->cParams.minMatch) 778 - { 779 - default : /* includes case 3 */ 780 - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); 781 - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); 782 - case 7 : 783 - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); 941 + U32* const hashTable = ms->hashTable; 942 + U16* const tagTable = ms->tagTable; 943 + U32 const hashLog = ms->rowHashLog; 944 + const BYTE* const base = ms->window.base; 945 + 946 + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); 947 + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { 948 + U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) 949 + : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); 950 + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; 951 + U32* const row = hashTable + relRow; 952 + BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. 
953 + Explicit cast allows us to get exact desired position within each row */ 954 + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); 955 + 956 + assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); 957 + ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; 958 + row[pos] = updateStartIdx; 784 959 } 785 960 } 786 961 787 - 788 - static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS ( 789 - ZSTD_matchState_t* ms, 790 - const BYTE* ip, const BYTE* const iLimit, 791 - size_t* offsetPtr) 962 + /* ZSTD_row_update_internal(): 963 + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. 964 + * Skips sections of long matches as is necessary. 965 + */ 966 + FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, 967 + U32 const mls, U32 const rowLog, 968 + U32 const rowMask, U32 const useCache) 792 969 { 793 - switch(ms->cParams.minMatch) 794 - { 795 - default : /* includes case 3 */ 796 - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch); 797 - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch); 798 - case 7 : 799 - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch); 970 + U32 idx = ms->nextToUpdate; 971 + const BYTE* const base = ms->window.base; 972 + const U32 target = (U32)(ip - base); 973 + const U32 kSkipThreshold = 384; 974 + const U32 kMaxMatchStartPositionsToUpdate = 96; 975 + const U32 kMaxMatchEndPositionsToUpdate = 32; 976 + 977 + if (useCache) { 978 + /* Only skip positions when using hash cache, i.e. 979 + * if we are loading a dict, don't skip anything. 980 + * If we decide to skip, then we only update a set number 981 + * of positions at the beginning and end of the match. 
982 + */ 983 + if (UNLIKELY(target - idx > kSkipThreshold)) { 984 + U32 const bound = idx + kMaxMatchStartPositionsToUpdate; 985 + ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache); 986 + idx = target - kMaxMatchEndPositionsToUpdate; 987 + ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1); 988 + } 800 989 } 990 + assert(target >= idx); 991 + ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache); 992 + ms->nextToUpdate = target; 993 + } 994 + 995 + /* ZSTD_row_update(): 996 + * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary 997 + * processing. 998 + */ 999 + void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { 1000 + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); 1001 + const U32 rowMask = (1u << rowLog) - 1; 1002 + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); 1003 + 1004 + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); 1005 + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); 1006 + } 1007 + 1008 + #if defined(ZSTD_ARCH_X86_SSE2) 1009 + FORCE_INLINE_TEMPLATE ZSTD_VecMask 1010 + ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head) 1011 + { 1012 + const __m128i comparisonMask = _mm_set1_epi8((char)tag); 1013 + int matches[4] = {0}; 1014 + int i; 1015 + assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4); 1016 + for (i=0; i<nbChunks; i++) { 1017 + const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i)); 1018 + const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask); 1019 + matches[i] = _mm_movemask_epi8(equalMask); 1020 + } 1021 + if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head); 1022 + if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head); 1023 + assert(nbChunks == 4); 1024 + return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 
32 | (U64)matches[1] << 16 | (U64)matches[0], head); 1025 + } 1026 + #endif 1027 + 1028 + /* Returns a ZSTD_VecMask (U64) that has the nth bit set to 1 if the newly-computed "tag" matches 1029 + * the hash at the nth position in a row of the tagTable. 1030 + * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield 1031 + * to match up with the actual layout of the entries within the hashTable */ 1032 + FORCE_INLINE_TEMPLATE ZSTD_VecMask 1033 + ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) 1034 + { 1035 + const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; 1036 + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); 1037 + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); 1038 + 1039 + #if defined(ZSTD_ARCH_X86_SSE2) 1040 + 1041 + return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); 1042 + 1043 + #else /* SW or NEON-LE */ 1044 + 1045 + # if defined(ZSTD_ARCH_ARM_NEON) 1046 + /* This NEON path only works for little endian - otherwise use SWAR below */ 1047 + if (MEM_isLittleEndian()) { 1048 + if (rowEntries == 16) { 1049 + const uint8x16_t chunk = vld1q_u8(src); 1050 + const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); 1051 + const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); 1052 + const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); 1053 + const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); 1054 + const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); 1055 + const U16 hi = (U16)vgetq_lane_u8(t3, 8); 1056 + const U16 lo = (U16)vgetq_lane_u8(t3, 0); 1057 + return ZSTD_rotateRight_U16((hi << 8) | lo, head); 1058 + } else if (rowEntries == 32) { 1059 + const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); 1060 + const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); 1061 + const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); 1062 
+ const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag)); 1063 + const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); 1064 + const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); 1065 + const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); 1066 + const uint8x8_t t0 = vreinterpret_u8_s8(pack0); 1067 + const uint8x8_t t1 = vreinterpret_u8_s8(pack1); 1068 + const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); 1069 + const uint8x8x2_t t3 = vuzp_u8(t2, t0); 1070 + const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); 1071 + const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); 1072 + return ZSTD_rotateRight_U32(matches, head); 1073 + } else { /* rowEntries == 64 */ 1074 + const uint8x16x4_t chunk = vld4q_u8(src); 1075 + const uint8x16_t dup = vdupq_n_u8(tag); 1076 + const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); 1077 + const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); 1078 + const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); 1079 + const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); 1080 + 1081 + const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); 1082 + const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); 1083 + const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); 1084 + const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); 1085 + const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); 1086 + const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); 1087 + return ZSTD_rotateRight_U64(matches, head); 1088 + } 1089 + } 1090 + # endif /* ZSTD_ARCH_ARM_NEON */ 1091 + /* SWAR */ 1092 + { const size_t chunkSize = sizeof(size_t); 1093 + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); 1094 + const size_t xFF = ~((size_t)0); 1095 + const size_t x01 = xFF / 0xFF; 1096 + const size_t x80 = x01 << 7; 1097 + const size_t splatChar = tag * x01; 1098 + ZSTD_VecMask matches = 0; 1099 + int i = rowEntries - chunkSize; 1100 + assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8)); 1101 + if (MEM_isLittleEndian()) { /* 
runtime check so have two loops */ 1102 + const size_t extractMagic = (xFF / 0x7F) >> chunkSize; 1103 + do { 1104 + size_t chunk = MEM_readST(&src[i]); 1105 + chunk ^= splatChar; 1106 + chunk = (((chunk | x80) - x01) | chunk) & x80; 1107 + matches <<= chunkSize; 1108 + matches |= (chunk * extractMagic) >> shiftAmount; 1109 + i -= chunkSize; 1110 + } while (i >= 0); 1111 + } else { /* big endian: reverse bits during extraction */ 1112 + const size_t msb = xFF ^ (xFF >> 1); 1113 + const size_t extractMagic = (msb / 0x1FF) | msb; 1114 + do { 1115 + size_t chunk = MEM_readST(&src[i]); 1116 + chunk ^= splatChar; 1117 + chunk = (((chunk | x80) - x01) | chunk) & x80; 1118 + matches <<= chunkSize; 1119 + matches |= ((chunk >> 7) * extractMagic) >> shiftAmount; 1120 + i -= chunkSize; 1121 + } while (i >= 0); 1122 + } 1123 + matches = ~matches; 1124 + if (rowEntries == 16) { 1125 + return ZSTD_rotateRight_U16((U16)matches, head); 1126 + } else if (rowEntries == 32) { 1127 + return ZSTD_rotateRight_U32((U32)matches, head); 1128 + } else { 1129 + return ZSTD_rotateRight_U64((U64)matches, head); 1130 + } 1131 + } 1132 + #endif 1133 + } 1134 + 1135 + /* The high-level approach of the SIMD row based match finder is as follows: 1136 + * - Figure out where to insert the new entry: 1137 + * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" 1138 + * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines 1139 + * which row to insert into. 1140 + * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can 1141 + * be considered as a circular buffer with a "head" index that resides in the tagTable. 1142 + * - Also insert the "tag" into the equivalent row and position in the tagTable. 1143 + * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. 
1144 + * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, 1145 + * for alignment/performance reasons, leaving some bytes unused. 1146 + * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and 1147 + * generate a bitfield that we can cycle through to check the collisions in the hash table. 1148 + * - Pick the longest match. 1149 + */ 1150 + FORCE_INLINE_TEMPLATE 1151 + size_t ZSTD_RowFindBestMatch( 1152 + ZSTD_matchState_t* ms, 1153 + const BYTE* const ip, const BYTE* const iLimit, 1154 + size_t* offsetPtr, 1155 + const U32 mls, const ZSTD_dictMode_e dictMode, 1156 + const U32 rowLog) 1157 + { 1158 + U32* const hashTable = ms->hashTable; 1159 + U16* const tagTable = ms->tagTable; 1160 + U32* const hashCache = ms->hashCache; 1161 + const U32 hashLog = ms->rowHashLog; 1162 + const ZSTD_compressionParameters* const cParams = &ms->cParams; 1163 + const BYTE* const base = ms->window.base; 1164 + const BYTE* const dictBase = ms->window.dictBase; 1165 + const U32 dictLimit = ms->window.dictLimit; 1166 + const BYTE* const prefixStart = base + dictLimit; 1167 + const BYTE* const dictEnd = dictBase + dictLimit; 1168 + const U32 curr = (U32)(ip-base); 1169 + const U32 maxDistance = 1U << cParams->windowLog; 1170 + const U32 lowestValid = ms->window.lowLimit; 1171 + const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; 1172 + const U32 isDictionary = (ms->loadedDictEnd != 0); 1173 + const U32 lowLimit = isDictionary ? 
lowestValid : withinMaxDistance; 1174 + const U32 rowEntries = (1U << rowLog); 1175 + const U32 rowMask = rowEntries - 1; 1176 + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ 1177 + U32 nbAttempts = 1U << cappedSearchLog; 1178 + size_t ml=4-1; 1179 + 1180 + /* DMS/DDS variables that may be referenced later */ 1181 + const ZSTD_matchState_t* const dms = ms->dictMatchState; 1182 + 1183 + /* Initialize the following variables to satisfy static analyzer */ 1184 + size_t ddsIdx = 0; 1185 + U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */ 1186 + U32 dmsTag = 0; 1187 + U32* dmsRow = NULL; 1188 + BYTE* dmsTagRow = NULL; 1189 + 1190 + if (dictMode == ZSTD_dedicatedDictSearch) { 1191 + const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; 1192 + { /* Prefetch DDS hashtable entry */ 1193 + ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG; 1194 + PREFETCH_L1(&dms->hashTable[ddsIdx]); 1195 + } 1196 + ddsExtraAttempts = cParams->searchLog > rowLog ? 
1U << (cParams->searchLog - rowLog) : 0; 1197 + } 1198 + 1199 + if (dictMode == ZSTD_dictMatchState) { 1200 + /* Prefetch DMS rows */ 1201 + U32* const dmsHashTable = dms->hashTable; 1202 + U16* const dmsTagTable = dms->tagTable; 1203 + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); 1204 + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; 1205 + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; 1206 + dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow); 1207 + dmsRow = dmsHashTable + dmsRelRow; 1208 + ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog); 1209 + } 1210 + 1211 + /* Update the hashTable and tagTable up to (but not including) ip */ 1212 + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); 1213 + { /* Get the hash for ip, compute the appropriate row */ 1214 + U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); 1215 + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; 1216 + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; 1217 + U32* const row = hashTable + relRow; 1218 + BYTE* tagRow = (BYTE*)(tagTable + relRow); 1219 + U32 const head = *tagRow & rowMask; 1220 + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; 1221 + size_t numMatches = 0; 1222 + size_t currMatch = 0; 1223 + ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); 1224 + 1225 + /* Cycle through the matches and prefetch */ 1226 + for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { 1227 + U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; 1228 + U32 const matchIndex = row[matchPos]; 1229 + assert(numMatches < rowEntries); 1230 + if (matchIndex < lowLimit) 1231 + break; 1232 + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { 1233 + PREFETCH_L1(base + matchIndex); 1234 + } else { 1235 + PREFETCH_L1(dictBase + matchIndex); 1236 + } 1237 + matchBuffer[numMatches++] 
= matchIndex; 1238 + } 1239 + 1240 + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop 1241 + in ZSTD_row_update_internal() at the next search. */ 1242 + { 1243 + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); 1244 + tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; 1245 + row[pos] = ms->nextToUpdate++; 1246 + } 1247 + 1248 + /* Return the longest match */ 1249 + for (; currMatch < numMatches; ++currMatch) { 1250 + U32 const matchIndex = matchBuffer[currMatch]; 1251 + size_t currentMl=0; 1252 + assert(matchIndex < curr); 1253 + assert(matchIndex >= lowLimit); 1254 + 1255 + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { 1256 + const BYTE* const match = base + matchIndex; 1257 + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ 1258 + if (match[ml] == ip[ml]) /* potentially better */ 1259 + currentMl = ZSTD_count(ip, match, iLimit); 1260 + } else { 1261 + const BYTE* const match = dictBase + matchIndex; 1262 + assert(match+4 <= dictEnd); 1263 + if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ 1264 + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4; 1265 + } 1266 + 1267 + /* Save best solution */ 1268 + if (currentMl > ml) { 1269 + ml = currentMl; 1270 + *offsetPtr = STORE_OFFSET(curr - matchIndex); 1271 + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ 1272 + } 1273 + } 1274 + } 1275 + 1276 + assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. 
*/ 1277 + if (dictMode == ZSTD_dedicatedDictSearch) { 1278 + ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms, 1279 + ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); 1280 + } else if (dictMode == ZSTD_dictMatchState) { 1281 + /* TODO: Measure and potentially add prefetching to DMS */ 1282 + const U32 dmsLowestIndex = dms->window.dictLimit; 1283 + const BYTE* const dmsBase = dms->window.base; 1284 + const BYTE* const dmsEnd = dms->window.nextSrc; 1285 + const U32 dmsSize = (U32)(dmsEnd - dmsBase); 1286 + const U32 dmsIndexDelta = dictLimit - dmsSize; 1287 + 1288 + { U32 const head = *dmsTagRow & rowMask; 1289 + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; 1290 + size_t numMatches = 0; 1291 + size_t currMatch = 0; 1292 + ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); 1293 + 1294 + for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { 1295 + U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; 1296 + U32 const matchIndex = dmsRow[matchPos]; 1297 + if (matchIndex < dmsLowestIndex) 1298 + break; 1299 + PREFETCH_L1(dmsBase + matchIndex); 1300 + matchBuffer[numMatches++] = matchIndex; 1301 + } 1302 + 1303 + /* Return the longest match */ 1304 + for (; currMatch < numMatches; ++currMatch) { 1305 + U32 const matchIndex = matchBuffer[currMatch]; 1306 + size_t currentMl=0; 1307 + assert(matchIndex >= dmsLowestIndex); 1308 + assert(matchIndex < curr); 1309 + 1310 + { const BYTE* const match = dmsBase + matchIndex; 1311 + assert(match+4 <= dmsEnd); 1312 + if (MEM_read32(match) == MEM_read32(ip)) 1313 + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4; 1314 + } 1315 + 1316 + if (currentMl > ml) { 1317 + ml = currentMl; 1318 + assert(curr > matchIndex + dmsIndexDelta); 1319 + *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); 1320 + if (ip+currentMl == iLimit) break; 1321 + } 1322 + } 1323 + } 1324 + } 
1325 + return ml; 801 1326 } 802 1327 803 1328 804 - FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( 805 - ZSTD_matchState_t* ms, 806 - const BYTE* ip, const BYTE* const iLimit, 807 - size_t* offsetPtr) 808 - { 809 - switch(ms->cParams.minMatch) 810 - { 811 - default : /* includes case 3 */ 812 - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); 813 - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); 814 - case 7 : 815 - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); 816 - } 817 - } 1329 + /* 1330 + * Generate search functions templated on (dictMode, mls, rowLog). 1331 + * These functions are outlined for code size & compilation time. 1332 + * ZSTD_searchMax() dispatches to the correct implementation function. 1333 + * 1334 + * TODO: The start of the search function involves loading and calculating a 1335 + * bunch of constants from the ZSTD_matchState_t. These computations could be 1336 + * done in an initialization function, and saved somewhere in the match state. 1337 + * Then we could pass a pointer to the saved state instead of the match state, 1338 + * and avoid duplicate computations. 1339 + * 1340 + * TODO: Move the match re-winding into searchMax. This improves compression 1341 + * ratio, and unlocks further simplifications with the next TODO. 1342 + * 1343 + * TODO: Try moving the repcode search into searchMax. After the re-winding 1344 + * and repcode search are in searchMax, there is no more logic in the match 1345 + * finder loop that requires knowledge about the dictMode. So we should be 1346 + * able to avoid force inlining it, and we can join the extDict loop with 1347 + * the single segment loop. It should go in searchMax instead of its own 1348 + * function to avoid having multiple virtual function calls per search. 
1349 + */ 818 1350 1351 + #define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls 1352 + #define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls 1353 + #define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog 1354 + 1355 + #define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE 1356 + 1357 + #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \ 1358 + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \ 1359 + ZSTD_matchState_t* ms, \ 1360 + const BYTE* ip, const BYTE* const iLimit, \ 1361 + size_t* offBasePtr) \ 1362 + { \ 1363 + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ 1364 + return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \ 1365 + } \ 1366 + 1367 + #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \ 1368 + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \ 1369 + ZSTD_matchState_t* ms, \ 1370 + const BYTE* ip, const BYTE* const iLimit, \ 1371 + size_t* offsetPtr) \ 1372 + { \ 1373 + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ 1374 + return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ 1375 + } \ 1376 + 1377 + #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \ 1378 + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \ 1379 + ZSTD_matchState_t* ms, \ 1380 + const BYTE* ip, const BYTE* const iLimit, \ 1381 + size_t* offsetPtr) \ 1382 + { \ 1383 + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ 1384 + assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \ 1385 + return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \ 1386 + } \ 1387 + 1388 + #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \ 1389 + X(dictMode, mls, 4) \ 1390 + X(dictMode, mls, 5) \ 1391 + X(dictMode, mls, 6) 1392 + 1393 + #define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \ 1394 + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \ 1395 + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) 
\ 1396 + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6) 1397 + 1398 + #define ZSTD_FOR_EACH_MLS(X, dictMode) \ 1399 + X(dictMode, 4) \ 1400 + X(dictMode, 5) \ 1401 + X(dictMode, 6) 1402 + 1403 + #define ZSTD_FOR_EACH_DICT_MODE(X, ...) \ 1404 + X(__VA_ARGS__, noDict) \ 1405 + X(__VA_ARGS__, extDict) \ 1406 + X(__VA_ARGS__, dictMatchState) \ 1407 + X(__VA_ARGS__, dedicatedDictSearch) 1408 + 1409 + /* Generate row search fns for each combination of (dictMode, mls, rowLog) */ 1410 + ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN) 1411 + /* Generate binary Tree search fns for each combination of (dictMode, mls) */ 1412 + ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN) 1413 + /* Generate hash chain search fns for each combination of (dictMode, mls) */ 1414 + ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN) 1415 + 1416 + typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e; 1417 + 1418 + #define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \ 1419 + case mls: \ 1420 + return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); 1421 + #define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \ 1422 + case mls: \ 1423 + return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); 1424 + #define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \ 1425 + case rowLog: \ 1426 + return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr); 1427 + 1428 + #define ZSTD_SWITCH_MLS(X, dictMode) \ 1429 + switch (mls) { \ 1430 + ZSTD_FOR_EACH_MLS(X, dictMode) \ 1431 + } 1432 + 1433 + #define ZSTD_SWITCH_ROWLOG(dictMode, mls) \ 1434 + case mls: \ 1435 + switch (rowLog) { \ 1436 + ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \ 1437 + } \ 1438 + ZSTD_UNREACHABLE; \ 1439 + break; 1440 + 1441 + #define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \ 1442 + switch (searchMethod) { \ 1443 + case search_hashChain: \ 1444 + ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \ 1445 + 
break; \ 1446 + case search_binaryTree: \ 1447 + ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \ 1448 + break; \ 1449 + case search_rowHash: \ 1450 + ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \ 1451 + break; \ 1452 + } \ 1453 + ZSTD_UNREACHABLE; 1454 + 1455 + /* 1456 + * Searches for the longest match at @p ip. 1457 + * Dispatches to the correct implementation function based on the 1458 + * (searchMethod, dictMode, mls, rowLog). We use switch statements 1459 + * here instead of using an indirect function call through a function 1460 + * pointer because after Spectre and Meltdown mitigations, indirect 1461 + * function calls can be very costly, especially in the kernel. 1462 + * 1463 + * NOTE: dictMode and searchMethod should be templated, so those switch 1464 + * statements should be optimized out. Only the mls & rowLog switches 1465 + * should be left. 1466 + * 1467 + * @param ms The match state. 1468 + * @param ip The position to search at. 1469 + * @param iend The end of the input data. 1470 + * @param[out] offsetPtr Stores the match offset into this pointer. 1471 + * @param mls The minimum search length, in the range [4, 6]. 1472 + * @param rowLog The row log (if applicable), in the range [4, 6]. 1473 + * @param searchMethod The search method to use (templated). 1474 + * @param dictMode The dictMode (templated). 1475 + * 1476 + * @returns The length of the longest match found, or < mls if no match is found. 1477 + * If a match is found its offset is stored in @p offsetPtr. 
1478 + */ 1479 + FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( 1480 + ZSTD_matchState_t* ms, 1481 + const BYTE* ip, 1482 + const BYTE* iend, 1483 + size_t* offsetPtr, 1484 + U32 const mls, 1485 + U32 const rowLog, 1486 + searchMethod_e const searchMethod, 1487 + ZSTD_dictMode_e const dictMode) 1488 + { 1489 + if (dictMode == ZSTD_noDict) { 1490 + ZSTD_SWITCH_SEARCH_METHOD(noDict) 1491 + } else if (dictMode == ZSTD_extDict) { 1492 + ZSTD_SWITCH_SEARCH_METHOD(extDict) 1493 + } else if (dictMode == ZSTD_dictMatchState) { 1494 + ZSTD_SWITCH_SEARCH_METHOD(dictMatchState) 1495 + } else if (dictMode == ZSTD_dedicatedDictSearch) { 1496 + ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch) 1497 + } 1498 + ZSTD_UNREACHABLE; 1499 + return 0; 1500 + } 819 1501 820 1502 /* ******************************* 821 1503 * Common parser - lazy strategy 822 1504 *********************************/ 823 - typedef enum { search_hashChain, search_binaryTree } searchMethod_e; 824 1505 825 1506 FORCE_INLINE_TEMPLATE size_t 826 1507 ZSTD_compressBlock_lazy_generic( ··· 1484 865 const BYTE* ip = istart; 1485 866 const BYTE* anchor = istart; 1486 867 const BYTE* const iend = istart + srcSize; 1487 - const BYTE* const ilimit = iend - 8; 868 + const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; 1488 869 const BYTE* const base = ms->window.base; 1489 870 const U32 prefixLowestIndex = ms->window.dictLimit; 1490 871 const BYTE* const prefixLowest = base + prefixLowestIndex; 872 + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); 873 + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); 1491 874 1492 - typedef size_t (*searchMax_f)( 1493 - ZSTD_matchState_t* ms, 1494 - const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); 1495 - 1496 - /* 1497 - * This table is indexed first by the four ZSTD_dictMode_e values, and then 1498 - * by the two searchMethod_e values. 
NULLs are placed for configurations 1499 - * that should never occur (extDict modes go to the other implementation 1500 - * below and there is no DDSS for binary tree search yet). 1501 - */ 1502 - const searchMax_f searchFuncs[4][2] = { 1503 - { 1504 - ZSTD_HcFindBestMatch_selectMLS, 1505 - ZSTD_BtFindBestMatch_selectMLS 1506 - }, 1507 - { 1508 - NULL, 1509 - NULL 1510 - }, 1511 - { 1512 - ZSTD_HcFindBestMatch_dictMatchState_selectMLS, 1513 - ZSTD_BtFindBestMatch_dictMatchState_selectMLS 1514 - }, 1515 - { 1516 - ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS, 1517 - NULL 1518 - } 1519 - }; 1520 - 1521 - searchMax_f const searchMax = searchFuncs[dictMode][searchMethod == search_binaryTree]; 1522 875 U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; 1523 876 1524 877 const int isDMS = dictMode == ZSTD_dictMatchState; ··· 1506 915 0; 1507 916 const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest)); 1508 917 1509 - assert(searchMax != NULL); 1510 - 1511 - DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u)", (U32)dictMode); 1512 - 1513 - /* init */ 918 + DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod); 1514 919 ip += (dictAndPrefixLength == 0); 1515 920 if (dictMode == ZSTD_noDict) { 1516 921 U32 const curr = (U32)(ip - base); ··· 1522 935 assert(offset_2 <= dictAndPrefixLength); 1523 936 } 1524 937 938 + if (searchMethod == search_rowHash) { 939 + ZSTD_row_fillHashCache(ms, base, rowLog, 940 + MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), 941 + ms->nextToUpdate, ilimit); 942 + } 943 + 1525 944 /* Match Loop */ 1526 945 #if defined(__x86_64__) 1527 946 /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the ··· 1537 944 #endif 1538 945 while (ip < ilimit) { 1539 946 size_t matchLength=0; 1540 - size_t offset=0; 947 + size_t offcode=STORE_REPCODE_1; 1541 948 const BYTE* start=ip+1; 949 + DEBUGLOG(7, "search baseline (depth 0)"); 1542 
950 1543 951 /* check repCode */ 1544 952 if (isDxS) { ··· 1563 969 1564 970 /* first search (depth 0) */ 1565 971 { size_t offsetFound = 999999999; 1566 - size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); 972 + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); 1567 973 if (ml2 > matchLength) 1568 - matchLength = ml2, start = ip, offset=offsetFound; 974 + matchLength = ml2, start = ip, offcode=offsetFound; 1569 975 } 1570 976 1571 977 if (matchLength < 4) { ··· 1576 982 /* let's try to find a better solution */ 1577 983 if (depth>=1) 1578 984 while (ip<ilimit) { 985 + DEBUGLOG(7, "search depth 1"); 1579 986 ip ++; 1580 987 if ( (dictMode == ZSTD_noDict) 1581 - && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { 988 + && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { 1582 989 size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; 1583 990 int const gain2 = (int)(mlRep * 3); 1584 - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); 991 + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); 1585 992 if ((mlRep >= 4) && (gain2 > gain1)) 1586 - matchLength = mlRep, offset = 0, start = ip; 993 + matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; 1587 994 } 1588 995 if (isDxS) { 1589 996 const U32 repIndex = (U32)(ip - base) - offset_1; ··· 1596 1001 const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; 1597 1002 size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; 1598 1003 int const gain2 = (int)(mlRep * 3); 1599 - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); 1004 + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); 1600 1005 if ((mlRep >= 4) && (gain2 > gain1)) 1601 - matchLength = mlRep, offset = 0, start = ip; 1006 + matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; 1602 1007 } 1603 1008 } 1604 1009 { size_t offset2=999999999; 1605 - size_t const ml2 = searchMax(ms, ip, iend, &offset2); 1606 - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ 1607 - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); 1010 + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); 1011 + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ 1012 + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); 1608 1013 if ((ml2 >= 4) && (gain2 > gain1)) { 1609 - matchLength = ml2, offset = offset2, start = ip; 1014 + matchLength = ml2, offcode = offset2, start = ip; 1610 1015 continue; /* search a better one */ 1611 1016 } } 1612 1017 1613 1018 /* let's find an even better one */ 1614 1019 if ((depth==2) && (ip<ilimit)) { 1020 + DEBUGLOG(7, "search depth 2"); 1615 1021 ip ++; 1616 1022 if ( (dictMode == ZSTD_noDict) 1617 - && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { 1023 + && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { 1618 1024 size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; 1619 1025 int const gain2 = (int)(mlRep * 4); 1620 - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); 1026 + int const gain1 = (int)(matchLength*4 - 
ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); 1621 1027 if ((mlRep >= 4) && (gain2 > gain1)) 1622 - matchLength = mlRep, offset = 0, start = ip; 1028 + matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; 1623 1029 } 1624 1030 if (isDxS) { 1625 1031 const U32 repIndex = (U32)(ip - base) - offset_1; ··· 1632 1036 const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; 1633 1037 size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; 1634 1038 int const gain2 = (int)(mlRep * 4); 1635 - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); 1039 + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); 1636 1040 if ((mlRep >= 4) && (gain2 > gain1)) 1637 - matchLength = mlRep, offset = 0, start = ip; 1041 + matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; 1638 1042 } 1639 1043 } 1640 1044 { size_t offset2=999999999; 1641 - size_t const ml2 = searchMax(ms, ip, iend, &offset2); 1642 - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ 1643 - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); 1045 + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); 1046 + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ 1047 + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); 1644 1048 if ((ml2 >= 4) && (gain2 > gain1)) { 1645 - matchLength = ml2, offset = offset2, start = ip; 1049 + matchLength = ml2, offcode = offset2, start = ip; 1646 1050 continue; 1647 1051 } } } 1648 1052 break; /* nothing found : store previous solution */ 1649 1053 } 1650 1054 1651 1055 /* NOTE: 1652 - * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior. 
1653 - * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which 1654 - * overflows the pointer, which is undefined behavior. 1056 + * Pay attention that `start[-value]` can lead to strange undefined behavior 1057 + * notably if `value` is unsigned, resulting in a large positive `-value`. 1655 1058 */ 1656 1059 /* catch up */ 1657 - if (offset) { 1060 + if (STORED_IS_OFFSET(offcode)) { 1658 1061 if (dictMode == ZSTD_noDict) { 1659 - while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest)) 1660 - && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */ 1062 + while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) 1063 + && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ 1661 1064 { start--; matchLength++; } 1662 1065 } 1663 1066 if (isDxS) { 1664 - U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); 1067 + U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); 1665 1068 const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; 1666 1069 const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? 
dictLowest : prefixLowest; 1667 1070 while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ 1668 1071 } 1669 - offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); 1072 + offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); 1670 1073 } 1671 1074 /* store sequence */ 1672 1075 _storeSequence: 1673 - { size_t const litLength = start - anchor; 1674 - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH); 1076 + { size_t const litLength = (size_t)(start - anchor); 1077 + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); 1675 1078 anchor = ip = start + matchLength; 1676 1079 } 1677 1080 ··· 1686 1091 && (MEM_read32(repMatch) == MEM_read32(ip)) ) { 1687 1092 const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; 1688 1093 matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; 1689 - offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */ 1690 - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); 1094 + offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ 1095 + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); 1691 1096 ip += matchLength; 1692 1097 anchor = ip; 1693 1098 continue; ··· 1701 1106 && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { 1702 1107 /* store sequence */ 1703 1108 matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; 1704 - offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */ 1705 - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); 1109 + offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ 1110 + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); 1706 1111 ip += matchLength; 1707 1112 anchor = ip; 1708 1113 continue; /* faster when 
present ... (?) */ ··· 1795 1200 return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); 1796 1201 } 1797 1202 1203 + /* Row-based matchfinder */ 1204 + size_t ZSTD_compressBlock_lazy2_row( 1205 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1206 + void const* src, size_t srcSize) 1207 + { 1208 + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); 1209 + } 1210 + 1211 + size_t ZSTD_compressBlock_lazy_row( 1212 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1213 + void const* src, size_t srcSize) 1214 + { 1215 + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); 1216 + } 1217 + 1218 + size_t ZSTD_compressBlock_greedy_row( 1219 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1220 + void const* src, size_t srcSize) 1221 + { 1222 + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); 1223 + } 1224 + 1225 + size_t ZSTD_compressBlock_lazy2_dictMatchState_row( 1226 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1227 + void const* src, size_t srcSize) 1228 + { 1229 + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); 1230 + } 1231 + 1232 + size_t ZSTD_compressBlock_lazy_dictMatchState_row( 1233 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1234 + void const* src, size_t srcSize) 1235 + { 1236 + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); 1237 + } 1238 + 1239 + size_t ZSTD_compressBlock_greedy_dictMatchState_row( 1240 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1241 + void const* src, size_t srcSize) 1242 + { 1243 + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, 
search_rowHash, 0, ZSTD_dictMatchState); 1244 + } 1245 + 1246 + 1247 + size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( 1248 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1249 + void const* src, size_t srcSize) 1250 + { 1251 + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); 1252 + } 1253 + 1254 + size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( 1255 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1256 + void const* src, size_t srcSize) 1257 + { 1258 + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); 1259 + } 1260 + 1261 + size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( 1262 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1263 + void const* src, size_t srcSize) 1264 + { 1265 + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); 1266 + } 1798 1267 1799 1268 FORCE_INLINE_TEMPLATE 1800 1269 size_t ZSTD_compressBlock_lazy_extDict_generic( ··· 1871 1212 const BYTE* ip = istart; 1872 1213 const BYTE* anchor = istart; 1873 1214 const BYTE* const iend = istart + srcSize; 1874 - const BYTE* const ilimit = iend - 8; 1215 + const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; 1875 1216 const BYTE* const base = ms->window.base; 1876 1217 const U32 dictLimit = ms->window.dictLimit; 1877 1218 const BYTE* const prefixStart = base + dictLimit; ··· 1879 1220 const BYTE* const dictEnd = dictBase + dictLimit; 1880 1221 const BYTE* const dictStart = dictBase + ms->window.lowLimit; 1881 1222 const U32 windowLog = ms->cParams.windowLog; 1882 - 1883 - typedef size_t (*searchMax_f)( 1884 - ZSTD_matchState_t* ms, 1885 - const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); 1886 - searchMax_f searchMax = searchMethod==search_binaryTree ? 
ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS; 1223 + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); 1224 + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); 1887 1225 1888 1226 U32 offset_1 = rep[0], offset_2 = rep[1]; 1889 1227 1890 - DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic"); 1228 + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); 1891 1229 1892 1230 /* init */ 1893 1231 ip += (ip == prefixStart); 1232 + if (searchMethod == search_rowHash) { 1233 + ZSTD_row_fillHashCache(ms, base, rowLog, 1234 + MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), 1235 + ms->nextToUpdate, ilimit); 1236 + } 1894 1237 1895 1238 /* Match Loop */ 1896 1239 #if defined(__x86_64__) ··· 1903 1242 #endif 1904 1243 while (ip < ilimit) { 1905 1244 size_t matchLength=0; 1906 - size_t offset=0; 1245 + size_t offcode=STORE_REPCODE_1; 1907 1246 const BYTE* start=ip+1; 1908 1247 U32 curr = (U32)(ip-base); 1909 1248 ··· 1912 1251 const U32 repIndex = (U32)(curr+1 - offset_1); 1913 1252 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; 1914 1253 const BYTE* const repMatch = repBase + repIndex; 1915 - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ 1254 + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */ 1255 + & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */ 1916 1256 if (MEM_read32(ip+1) == MEM_read32(repMatch)) { 1917 1257 /* repcode detected we should take it */ 1918 1258 const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; ··· 1923 1261 1924 1262 /* first search (depth 0) */ 1925 1263 { size_t offsetFound = 999999999; 1926 - size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); 1264 + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); 1927 1265 if (ml2 > matchLength) 1928 - matchLength = ml2, start = ip, offset=offsetFound; 1266 + matchLength = ml2, start = ip, offcode=offsetFound; 1929 1267 } 1930 1268 1931 1269 if (matchLength < 4) { ··· 1939 1277 ip ++; 1940 1278 curr++; 1941 1279 /* check repCode */ 1942 - if (offset) { 1280 + if (offcode) { 1943 1281 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); 1944 1282 const U32 repIndex = (U32)(curr - offset_1); 1945 1283 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; 1946 1284 const BYTE* const repMatch = repBase + repIndex; 1947 - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ 1285 + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ 1286 + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ 1948 1287 if (MEM_read32(ip) == MEM_read32(repMatch)) { 1949 1288 /* repcode detected */ 1950 1289 const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; 1951 1290 size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; 1952 1291 int const gain2 = (int)(repLength * 3); 1953 - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); 1292 + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); 1954 1293 if ((repLength >= 4) && (gain2 > gain1)) 1955 - matchLength = repLength, offset = 0, start = ip; 1294 + matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; 1956 1295 } } 1957 1296 1958 1297 /* search match, depth 1 */ 1959 1298 { size_t offset2=999999999; 1960 - size_t const ml2 = searchMax(ms, ip, iend, &offset2); 1961 - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ 1962 - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); 1299 + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); 1300 + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ 1301 + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); 1963 1302 if ((ml2 >= 4) && (gain2 > gain1)) { 1964 - matchLength = ml2, offset = offset2, start = ip; 1303 + matchLength = ml2, offcode = offset2, start = ip; 1965 1304 continue; /* search a better one */ 1966 1305 } } 1967 1306 ··· 1971 1308 ip ++; 1972 1309 curr++; 1973 1310 /* check repCode */ 1974 - if (offset) { 1311 + if (offcode) { 1975 1312 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); 1976 1313 const U32 repIndex = (U32)(curr - offset_1); 1977 1314 const BYTE* const repBase = repIndex < dictLimit ? 
dictBase : base; 1978 1315 const BYTE* const repMatch = repBase + repIndex; 1979 - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ 1316 + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ 1317 + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ 1980 1318 if (MEM_read32(ip) == MEM_read32(repMatch)) { 1981 1319 /* repcode detected */ 1982 1320 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; 1983 1321 size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; 1984 1322 int const gain2 = (int)(repLength * 4); 1985 - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); 1323 + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); 1986 1324 if ((repLength >= 4) && (gain2 > gain1)) 1987 - matchLength = repLength, offset = 0, start = ip; 1325 + matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; 1988 1326 } } 1989 1327 1990 1328 /* search match, depth 2 */ 1991 1329 { size_t offset2=999999999; 1992 - size_t const ml2 = searchMax(ms, ip, iend, &offset2); 1993 - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ 1994 - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); 1330 + size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); 1331 + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ 1332 + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); 1995 1333 if ((ml2 >= 4) && (gain2 > gain1)) { 1996 - matchLength = ml2, offset = offset2, start = ip; 1334 + matchLength = ml2, offcode = offset2, start = ip; 1997 1335 continue; 1998 1336 } } } 1999 1337 break; /* nothing found : store previous solution */ 2000 
1338 } 2001 1339 2002 1340 /* catch up */ 2003 - if (offset) { 2004 - U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); 1341 + if (STORED_IS_OFFSET(offcode)) { 1342 + U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); 2005 1343 const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; 2006 1344 const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; 2007 1345 while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ 2008 - offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); 1346 + offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); 2009 1347 } 2010 1348 2011 1349 /* store sequence */ 2012 1350 _storeSequence: 2013 - { size_t const litLength = start - anchor; 2014 - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH); 1351 + { size_t const litLength = (size_t)(start - anchor); 1352 + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); 2015 1353 anchor = ip = start + matchLength; 2016 1354 } 2017 1355 ··· 2023 1359 const U32 repIndex = repCurrent - offset_2; 2024 1360 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; 2025 1361 const BYTE* const repMatch = repBase + repIndex; 2026 - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ 1362 + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ 1363 + & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ 2027 1364 if (MEM_read32(ip) == MEM_read32(repMatch)) { 2028 1365 /* repcode detected we should take it */ 2029 1366 const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; 2030 1367 matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; 2031 - offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */ 2032 - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); 1368 + offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ 1369 + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); 2033 1370 ip += matchLength; 2034 1371 anchor = ip; 2035 1372 continue; /* faster when present ... (?) */ ··· 2076 1411 2077 1412 { 2078 1413 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); 1414 + } 1415 + 1416 + size_t ZSTD_compressBlock_greedy_extDict_row( 1417 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1418 + void const* src, size_t srcSize) 1419 + { 1420 + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); 1421 + } 1422 + 1423 + size_t ZSTD_compressBlock_lazy_extDict_row( 1424 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1425 + void const* src, size_t srcSize) 1426 + 1427 + { 1428 + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); 1429 + } 1430 + 1431 + size_t ZSTD_compressBlock_lazy2_extDict_row( 1432 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1433 + void const* src, size_t srcSize) 1434 + 1435 + { 1436 + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); 2079 1437 }
+38
lib/zstd/compress/zstd_lazy.h
··· 23 23 #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 24 24 25 25 U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); 26 + void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); 26 27 27 28 void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); 28 29 ··· 41 40 size_t ZSTD_compressBlock_greedy( 42 41 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 43 42 void const* src, size_t srcSize); 43 + size_t ZSTD_compressBlock_lazy2_row( 44 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 45 + void const* src, size_t srcSize); 46 + size_t ZSTD_compressBlock_lazy_row( 47 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 48 + void const* src, size_t srcSize); 49 + size_t ZSTD_compressBlock_greedy_row( 50 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 51 + void const* src, size_t srcSize); 44 52 45 53 size_t ZSTD_compressBlock_btlazy2_dictMatchState( 46 54 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ··· 63 53 size_t ZSTD_compressBlock_greedy_dictMatchState( 64 54 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 65 55 void const* src, size_t srcSize); 56 + size_t ZSTD_compressBlock_lazy2_dictMatchState_row( 57 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 58 + void const* src, size_t srcSize); 59 + size_t ZSTD_compressBlock_lazy_dictMatchState_row( 60 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 61 + void const* src, size_t srcSize); 62 + size_t ZSTD_compressBlock_greedy_dictMatchState_row( 63 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 64 + void const* src, size_t srcSize); 66 65 67 66 size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( 68 67 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ··· 80 61 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 81 62 void const* src, size_t 
srcSize); 82 63 size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( 64 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 65 + void const* src, size_t srcSize); 66 + size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( 67 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 68 + void const* src, size_t srcSize); 69 + size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( 70 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 71 + void const* src, size_t srcSize); 72 + size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( 83 73 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 84 74 void const* src, size_t srcSize); 85 75 ··· 101 73 size_t ZSTD_compressBlock_lazy2_extDict( 102 74 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 103 75 void const* src, size_t srcSize); 76 + size_t ZSTD_compressBlock_greedy_extDict_row( 77 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 78 + void const* src, size_t srcSize); 79 + size_t ZSTD_compressBlock_lazy_extDict_row( 80 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 81 + void const* src, size_t srcSize); 82 + size_t ZSTD_compressBlock_lazy2_extDict_row( 83 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 84 + void const* src, size_t srcSize); 104 85 size_t ZSTD_compressBlock_btlazy2_extDict( 105 86 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 106 87 void const* src, size_t srcSize); 88 + 107 89 108 90 109 91 #endif /* ZSTD_LAZY_H */
+57 -19
lib/zstd/compress/zstd_ldm.c
··· 57 57 } 58 58 } 59 59 60 + /* ZSTD_ldm_gear_reset() 61 + * Feeds [data, data + minMatchLength) into the hash without registering any 62 + * splits. This effectively resets the hash state. This is used when skipping 63 + * over data, either at the beginning of a block, or skipping sections. 64 + */ 65 + static void ZSTD_ldm_gear_reset(ldmRollingHashState_t* state, 66 + BYTE const* data, size_t minMatchLength) 67 + { 68 + U64 hash = state->rolling; 69 + size_t n = 0; 70 + 71 + #define GEAR_ITER_ONCE() do { \ 72 + hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \ 73 + n += 1; \ 74 + } while (0) 75 + while (n + 3 < minMatchLength) { 76 + GEAR_ITER_ONCE(); 77 + GEAR_ITER_ONCE(); 78 + GEAR_ITER_ONCE(); 79 + GEAR_ITER_ONCE(); 80 + } 81 + while (n < minMatchLength) { 82 + GEAR_ITER_ONCE(); 83 + } 84 + #undef GEAR_ITER_ONCE 85 + } 86 + 60 87 /* ZSTD_ldm_gear_feed(): 61 88 * 62 89 * Registers in the splits array all the split points found in the first ··· 159 132 size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog); 160 133 size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize) 161 134 + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t)); 162 - return params.enableLdm ? totalSize : 0; 135 + return params.enableLdm == ZSTD_ps_enable ? totalSize : 0; 163 136 } 164 137 165 138 size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) 166 139 { 167 - return params.enableLdm ? (maxChunkSize / params.minMatchLength) : 0; 140 + return params.enableLdm == ZSTD_ps_enable ? 
(maxChunkSize / params.minMatchLength) : 0; 168 141 } 169 142 170 143 /* ZSTD_ldm_getBucket() : ··· 282 255 while (ip < iend) { 283 256 size_t hashed; 284 257 unsigned n; 285 - 258 + 286 259 numSplits = 0; 287 260 hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits); 288 261 ··· 354 327 355 328 /* Initialize the rolling hash state with the first minMatchLength bytes */ 356 329 ZSTD_ldm_gear_init(&hashState, params); 357 - { 358 - size_t n = 0; 359 - 360 - while (n < minMatchLength) { 361 - numSplits = 0; 362 - n += ZSTD_ldm_gear_feed(&hashState, ip + n, minMatchLength - n, 363 - splits, &numSplits); 364 - } 365 - ip += minMatchLength; 366 - } 330 + ZSTD_ldm_gear_reset(&hashState, ip, minMatchLength); 331 + ip += minMatchLength; 367 332 368 333 while (ip < ilimit) { 369 334 size_t hashed; ··· 380 361 for (n = 0; n < numSplits; n++) { 381 362 size_t forwardMatchLength = 0, backwardMatchLength = 0, 382 363 bestMatchLength = 0, mLength; 364 + U32 offset; 383 365 BYTE const* const split = candidates[n].split; 384 366 U32 const checksum = candidates[n].checksum; 385 367 U32 const hash = candidates[n].hash; ··· 448 428 } 449 429 450 430 /* Match found */ 431 + offset = (U32)(split - base) - bestEntry->offset; 451 432 mLength = forwardMatchLength + backwardMatchLength; 452 433 { 453 - U32 const offset = (U32)(split - base) - bestEntry->offset; 454 434 rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size; 455 435 456 436 /* Out of sequence storage */ ··· 467 447 ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); 468 448 469 449 anchor = split + forwardMatchLength; 450 + 451 + /* If we find a match that ends after the data that we've hashed 452 + * then we have a repeating, overlapping, pattern. E.g. all zeros. 453 + * If one repetition of the pattern matches our `stopMask` then all 454 + * repetitions will. We don't need to insert them all into our table, 455 + * only the first one. So skip over overlapping matches. 
456 + * This is a major speed boost (20x) for compressing a single byte 457 + * repeated, when that byte ends up in the table. 458 + */ 459 + if (anchor > ip + hashed) { 460 + ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, minMatchLength); 461 + /* Continue the outer loop at anchor (ip + hashed == anchor). */ 462 + ip = anchor - hashed; 463 + break; 464 + } 470 465 } 471 466 472 467 ip += hashed; ··· 535 500 536 501 assert(chunkStart < iend); 537 502 /* 1. Perform overflow correction if necessary. */ 538 - if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)) { 503 + if (ZSTD_window_needOverflowCorrection(ldmState->window, 0, maxDist, ldmState->loadedDictEnd, chunkStart, chunkEnd)) { 539 504 U32 const ldmHSize = 1U << params->hashLog; 540 505 U32 const correction = ZSTD_window_correctOverflow( 541 506 &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart); ··· 579 544 return 0; 580 545 } 581 546 582 - void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) { 547 + void 548 + ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) 549 + { 583 550 while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { 584 551 rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos; 585 552 if (srcSize <= seq->litLength) { ··· 659 622 660 623 size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, 661 624 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 625 + ZSTD_paramSwitch_e useRowMatchFinder, 662 626 void const* src, size_t srcSize) 663 627 { 664 628 const ZSTD_compressionParameters* const cParams = &ms->cParams; 665 629 unsigned const minMatch = cParams->minMatch; 666 630 ZSTD_blockCompressor const blockCompressor = 667 - ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dictMode(ms)); 631 + ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms)); 668 632 /* Input bounds */ 669 633 BYTE const* const istart = (BYTE 
const*)src; 670 634 BYTE const* const iend = istart + srcSize; ··· 711 673 rep[0] = sequence.offset; 712 674 /* Store the sequence */ 713 675 ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, 714 - sequence.offset + ZSTD_REP_MOVE, 715 - sequence.matchLength - MINMATCH); 676 + STORE_OFFSET(sequence.offset), 677 + sequence.matchLength); 716 678 ip += sequence.matchLength; 717 679 } 718 680 }
+1
lib/zstd/compress/zstd_ldm.h
··· 63 63 */ 64 64 size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, 65 65 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 66 + ZSTD_paramSwitch_e useRowMatchFinder, 66 67 void const* src, size_t srcSize); 67 68 68 69 /*
+4 -1
lib/zstd/compress/zstd_ldm_geartab.h
··· 11 11 #ifndef ZSTD_LDM_GEARTAB_H 12 12 #define ZSTD_LDM_GEARTAB_H 13 13 14 - static U64 ZSTD_ldm_gearTab[256] = { 14 + #include "../common/compiler.h" /* UNUSED_ATTR */ 15 + #include "../common/mem.h" /* U64 */ 16 + 17 + static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] = { 15 18 0xf5b8f72c5f77775c, 0x84935f266b7ac412, 0xb647ada9ca730ccc, 16 19 0xb065bb4b114fb1de, 0x34584e7e8c3a9fd0, 0x4e97e17c6ae26b05, 17 20 0x3a03d743bc99a604, 0xcecd042422c4044f, 0x76de76c58524259e,
+244 -156
lib/zstd/compress/zstd_opt.c
··· 8 8 * You may select, at your option, one of the above-listed licenses. 9 9 */ 10 10 11 - /* 12 - * Disable inlining for the optimal parser for the kernel build. 13 - * It is unlikely to be used in the kernel, and where it is used 14 - * latency shouldn't matter because it is very slow to begin with. 15 - * We prefer a ~180KB binary size win over faster optimal parsing. 16 - * 17 - * TODO(https://github.com/facebook/zstd/issues/2862): 18 - * Improve the code size of the optimal parser in general, so we 19 - * don't need this hack for the kernel build. 20 - */ 21 - #define ZSTD_NO_INLINE 1 22 - 23 11 #include "zstd_compress_internal.h" 24 12 #include "hist.h" 25 13 #include "zstd_opt.h" 26 14 27 15 28 16 #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ 29 - #define ZSTD_FREQ_DIV 4 /* log factor when using previous stats to init next stats */ 30 17 #define ZSTD_MAX_PRICE (1<<30) 31 18 32 19 #define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ ··· 23 36 * Price functions for optimal parser 24 37 ***************************************/ 25 38 26 - #if 0 /* approximation at bit level */ 39 + #if 0 /* approximation at bit level (for tests) */ 27 40 # define BITCOST_ACCURACY 0 28 41 # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) 29 - # define WEIGHT(stat) ((void)opt, ZSTD_bitWeight(stat)) 30 - #elif 0 /* fractional bit accuracy */ 42 + # define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) 43 + #elif 0 /* fractional bit accuracy (for tests) */ 31 44 # define BITCOST_ACCURACY 8 32 45 # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) 33 46 # define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) ··· 65 78 66 79 static int ZSTD_compressedLiterals(optState_t const* const optPtr) 67 80 { 68 - return optPtr->literalCompressionMode != ZSTD_lcm_uncompressed; 81 + return 
optPtr->literalCompressionMode != ZSTD_ps_disable; 69 82 } 70 83 71 84 static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel) ··· 78 91 } 79 92 80 93 81 - /* ZSTD_downscaleStat() : 82 - * reduce all elements in table by a factor 2^(ZSTD_FREQ_DIV+malus) 83 - * return the resulting sum of elements */ 84 - static U32 ZSTD_downscaleStat(unsigned* table, U32 lastEltIndex, int malus) 94 + static U32 sum_u32(const unsigned table[], size_t nbElts) 95 + { 96 + size_t n; 97 + U32 total = 0; 98 + for (n=0; n<nbElts; n++) { 99 + total += table[n]; 100 + } 101 + return total; 102 + } 103 + 104 + static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) 85 105 { 86 106 U32 s, sum=0; 87 - DEBUGLOG(5, "ZSTD_downscaleStat (nbElts=%u)", (unsigned)lastEltIndex+1); 88 - assert(ZSTD_FREQ_DIV+malus > 0 && ZSTD_FREQ_DIV+malus < 31); 107 + DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); 108 + assert(shift < 30); 89 109 for (s=0; s<lastEltIndex+1; s++) { 90 - table[s] = 1 + (table[s] >> (ZSTD_FREQ_DIV+malus)); 110 + table[s] = 1 + (table[s] >> shift); 91 111 sum += table[s]; 92 112 } 93 113 return sum; 94 114 } 95 115 116 + /* ZSTD_scaleStats() : 117 + * reduce all elements in table if sum too large 118 + * return the resulting sum of elements */ 119 + static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) 120 + { 121 + U32 const prevsum = sum_u32(table, lastEltIndex+1); 122 + U32 const factor = prevsum >> logTarget; 123 + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); 124 + assert(logTarget < 30); 125 + if (factor <= 1) return prevsum; 126 + return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); 127 + } 128 + 96 129 /* ZSTD_rescaleFreqs() : 97 130 * if first block (detected by optPtr->litLengthSum == 0) : init statistics 98 131 * take hints from dictionary if there is one 99 - * or init from zero, using src for 
literals stats, or flat 1 for match symbols 132 + * and init from zero if there is none, 133 + * using src for literals stats, and baseline stats for sequence symbols 100 134 * otherwise downscale existing stats, to be used as seed for next block. 101 135 */ 102 136 static void ··· 146 138 optPtr->litSum = 0; 147 139 for (lit=0; lit<=MaxLit; lit++) { 148 140 U32 const scaleLog = 11; /* scale to 2K */ 149 - U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->huf.CTable, lit); 141 + U32 const bitCost = HUF_getNbBitsFromCTable(optPtr->symbolCosts->huf.CTable, lit); 150 142 assert(bitCost <= scaleLog); 151 143 optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; 152 144 optPtr->litSum += optPtr->litFreq[lit]; ··· 194 186 if (compressedLiterals) { 195 187 unsigned lit = MaxLit; 196 188 HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ 197 - optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1); 189 + optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); 198 190 } 199 191 200 - { unsigned ll; 201 - for (ll=0; ll<=MaxLL; ll++) 202 - optPtr->litLengthFreq[ll] = 1; 192 + { unsigned const baseLLfreqs[MaxLL+1] = { 193 + 4, 2, 1, 1, 1, 1, 1, 1, 194 + 1, 1, 1, 1, 1, 1, 1, 1, 195 + 1, 1, 1, 1, 1, 1, 1, 1, 196 + 1, 1, 1, 1, 1, 1, 1, 1, 197 + 1, 1, 1, 1 198 + }; 199 + ZSTD_memcpy(optPtr->litLengthFreq, baseLLfreqs, sizeof(baseLLfreqs)); 200 + optPtr->litLengthSum = sum_u32(baseLLfreqs, MaxLL+1); 203 201 } 204 - optPtr->litLengthSum = MaxLL+1; 205 202 206 203 { unsigned ml; 207 204 for (ml=0; ml<=MaxML; ml++) ··· 214 201 } 215 202 optPtr->matchLengthSum = MaxML+1; 216 203 217 - { unsigned of; 218 - for (of=0; of<=MaxOff; of++) 219 - optPtr->offCodeFreq[of] = 1; 204 + { unsigned const baseOFCfreqs[MaxOff+1] = { 205 + 6, 2, 1, 1, 2, 3, 4, 4, 206 + 4, 3, 2, 1, 1, 1, 1, 1, 207 + 1, 1, 1, 1, 1, 1, 1, 1, 208 + 1, 1, 1, 1, 1, 1, 1, 1 209 + }; 210 + 
ZSTD_memcpy(optPtr->offCodeFreq, baseOFCfreqs, sizeof(baseOFCfreqs)); 211 + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); 220 212 } 221 - optPtr->offCodeSum = MaxOff+1; 213 + 222 214 223 215 } 224 216 225 217 } else { /* new block : re-use previous statistics, scaled down */ 226 218 227 219 if (compressedLiterals) 228 - optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1); 229 - optPtr->litLengthSum = ZSTD_downscaleStat(optPtr->litLengthFreq, MaxLL, 0); 230 - optPtr->matchLengthSum = ZSTD_downscaleStat(optPtr->matchLengthFreq, MaxML, 0); 231 - optPtr->offCodeSum = ZSTD_downscaleStat(optPtr->offCodeFreq, MaxOff, 0); 220 + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); 221 + optPtr->litLengthSum = ZSTD_scaleStats(optPtr->litLengthFreq, MaxLL, 11); 222 + optPtr->matchLengthSum = ZSTD_scaleStats(optPtr->matchLengthFreq, MaxML, 11); 223 + optPtr->offCodeSum = ZSTD_scaleStats(optPtr->offCodeFreq, MaxOff, 11); 232 224 } 233 225 234 226 ZSTD_setBasePrices(optPtr, optLevel); ··· 269 251 * cost of literalLength symbol */ 270 252 static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel) 271 253 { 272 - if (optPtr->priceType == zop_predef) return WEIGHT(litLength, optLevel); 254 + assert(litLength <= ZSTD_BLOCKSIZE_MAX); 255 + if (optPtr->priceType == zop_predef) 256 + return WEIGHT(litLength, optLevel); 257 + /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX 258 + * because it isn't representable in the zstd format. So instead just 259 + * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block 260 + * would be all literals. 
261 + */ 262 + if (litLength == ZSTD_BLOCKSIZE_MAX) 263 + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); 273 264 274 265 /* dynamic statistics */ 275 266 { U32 const llCode = ZSTD_LLcode(litLength); ··· 291 264 /* ZSTD_getMatchPrice() : 292 265 * Provides the cost of the match part (offset + matchLength) of a sequence 293 266 * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. 294 - * optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */ 267 + * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 268 + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) 269 + */ 295 270 FORCE_INLINE_TEMPLATE U32 296 - ZSTD_getMatchPrice(U32 const offset, 271 + ZSTD_getMatchPrice(U32 const offcode, 297 272 U32 const matchLength, 298 273 const optState_t* const optPtr, 299 274 int const optLevel) 300 275 { 301 276 U32 price; 302 - U32 const offCode = ZSTD_highbit32(offset+1); 277 + U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); 303 278 U32 const mlBase = matchLength - MINMATCH; 304 279 assert(matchLength >= MINMATCH); 305 280 ··· 344 315 optPtr->litLengthSum++; 345 316 } 346 317 347 - /* match offset code (0-2=>repCode; 3+=>offset+2) */ 348 - { U32 const offCode = ZSTD_highbit32(offsetCode+1); 318 + /* offset code : expected to follow storeSeq() numeric representation */ 319 + { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); 349 320 assert(offCode <= MaxOff); 350 321 optPtr->offCodeFreq[offCode]++; 351 322 optPtr->offCodeSum++; ··· 379 350 380 351 /* Update hashTable3 up to ip (excluded) 381 352 Assumption : always within prefix (i.e. 
not within extDict) */ 382 - static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, 353 + static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, 383 354 U32* nextToUpdate3, 384 355 const BYTE* const ip) 385 356 { ··· 405 376 * Binary Tree search 406 377 ***************************************/ 407 378 /* ZSTD_insertBt1() : add one or multiple positions to tree. 408 - * ip : assumed <= iend-8 . 379 + * @param ip assumed <= iend-8 . 380 + * @param target The target of ZSTD_updateTree_internal() - we are filling to this position 409 381 * @return : nb of positions added */ 410 382 static U32 ZSTD_insertBt1( 411 - ZSTD_matchState_t* ms, 383 + const ZSTD_matchState_t* ms, 412 384 const BYTE* const ip, const BYTE* const iend, 385 + U32 const target, 413 386 U32 const mls, const int extDict) 414 387 { 415 388 const ZSTD_compressionParameters* const cParams = &ms->cParams; ··· 434 403 U32* smallerPtr = bt + 2*(curr&btMask); 435 404 U32* largerPtr = smallerPtr + 1; 436 405 U32 dummy32; /* to be nullified at the end */ 437 - U32 const windowLow = ms->window.lowLimit; 406 + /* windowLow is based on target because 407 + * we only need positions that will be in the window at the end of the tree update. 
408 + */ 409 + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, target, cParams->windowLog); 438 410 U32 matchEndIdx = curr+8+1; 439 411 size_t bestLength = 8; 440 412 U32 nbCompares = 1U << cParams->searchLog; ··· 450 416 451 417 DEBUGLOG(8, "ZSTD_insertBt1 (%u)", curr); 452 418 419 + assert(curr <= target); 453 420 assert(ip <= iend-8); /* required for h calculation */ 454 421 hashTable[h] = curr; /* Update Hash Table */ 455 422 ··· 539 504 idx, target, dictMode); 540 505 541 506 while(idx < target) { 542 - U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, mls, dictMode == ZSTD_extDict); 507 + U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, target, mls, dictMode == ZSTD_extDict); 543 508 assert(idx < (U32)(idx + forward)); 544 509 idx += forward; 545 510 } ··· 644 609 DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", 645 610 repCode, ll0, repOffset, repLen); 646 611 bestLength = repLen; 647 - matches[mnum].off = repCode - ll0; 612 + matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ 648 613 matches[mnum].len = (U32)repLen; 649 614 mnum++; 650 615 if ( (repLen > sufficient_len) ··· 673 638 bestLength = mlen; 674 639 assert(curr > matchIndex3); 675 640 assert(mnum==0); /* no prior solution */ 676 - matches[0].off = (curr - matchIndex3) + ZSTD_REP_MOVE; 641 + matches[0].off = STORE_OFFSET(curr - matchIndex3); 677 642 matches[0].len = (U32)mlen; 678 643 mnum = 1; 679 644 if ( (mlen > sufficient_len) | ··· 682 647 return 1; 683 648 } } } 684 649 /* no dictMatchState lookup: dicts don't have a populated HC3 table */ 685 - } 650 + } /* if (mls == 3) */ 686 651 687 652 hashTable[h] = curr; /* Update Hash Table */ 688 653 ··· 707 672 708 673 if (matchLength > bestLength) { 709 674 DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", 710 - (U32)matchLength, curr - matchIndex, curr - matchIndex + ZSTD_REP_MOVE); 675 + (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); 711 
676 assert(matchEndIdx > matchIndex); 712 677 if (matchLength > matchEndIdx - matchIndex) 713 678 matchEndIdx = matchIndex + (U32)matchLength; 714 679 bestLength = matchLength; 715 - matches[mnum].off = (curr - matchIndex) + ZSTD_REP_MOVE; 680 + matches[mnum].off = STORE_OFFSET(curr - matchIndex); 716 681 matches[mnum].len = (U32)matchLength; 717 682 mnum++; 718 683 if ( (matchLength > ZSTD_OPT_NUM) 719 684 | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { 720 685 if (dictMode == ZSTD_dictMatchState) nbCompares = 0; /* break should also skip searching dms */ 721 686 break; /* drop, to preserve bt consistency (miss a little bit of compression) */ 722 - } 723 - } 687 + } } 724 688 725 689 if (match[matchLength] < ip[matchLength]) { 726 690 /* match smaller than current */ ··· 755 721 if (matchLength > bestLength) { 756 722 matchIndex = dictMatchIndex + dmsIndexDelta; 757 723 DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", 758 - (U32)matchLength, curr - matchIndex, curr - matchIndex + ZSTD_REP_MOVE); 724 + (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); 759 725 if (matchLength > matchEndIdx - matchIndex) 760 726 matchEndIdx = matchIndex + (U32)matchLength; 761 727 bestLength = matchLength; 762 - matches[mnum].off = (curr - matchIndex) + ZSTD_REP_MOVE; 728 + matches[mnum].off = STORE_OFFSET(curr - matchIndex); 763 729 matches[mnum].len = (U32)matchLength; 764 730 mnum++; 765 731 if ( (matchLength > ZSTD_OPT_NUM) 766 732 | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { 767 733 break; /* drop, to guarantee consistency (miss a little bit of compression) */ 768 - } 769 - } 734 + } } 770 735 771 736 if (dictMatchIndex <= dmsBtLow) { break; } /* beyond tree size, stop the search */ 772 737 if (match[matchLength] < ip[matchLength]) { ··· 775 742 /* match is larger than current */ 776 743 commonLengthLarger = matchLength; 777 744 dictMatchIndex = nextPtr[0]; 778 - } 779 - } 780 - 
} 745 + } } } /* if (dictMode == ZSTD_dictMatchState) */ 781 746 782 747 assert(matchEndIdx > curr+8); 783 748 ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ 784 749 return mnum; 785 750 } 786 751 752 + typedef U32 (*ZSTD_getAllMatchesFn)( 753 + ZSTD_match_t*, 754 + ZSTD_matchState_t*, 755 + U32*, 756 + const BYTE*, 757 + const BYTE*, 758 + const U32 rep[ZSTD_REP_NUM], 759 + U32 const ll0, 760 + U32 const lengthToBeat); 787 761 788 - FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches ( 789 - ZSTD_match_t* matches, /* store result (match found, increasing size) in this table */ 790 - ZSTD_matchState_t* ms, 791 - U32* nextToUpdate3, 792 - const BYTE* ip, const BYTE* const iHighLimit, const ZSTD_dictMode_e dictMode, 793 - const U32 rep[ZSTD_REP_NUM], 794 - U32 const ll0, 795 - U32 const lengthToBeat) 762 + FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( 763 + ZSTD_match_t* matches, 764 + ZSTD_matchState_t* ms, 765 + U32* nextToUpdate3, 766 + const BYTE* ip, 767 + const BYTE* const iHighLimit, 768 + const U32 rep[ZSTD_REP_NUM], 769 + U32 const ll0, 770 + U32 const lengthToBeat, 771 + const ZSTD_dictMode_e dictMode, 772 + const U32 mls) 796 773 { 797 - const ZSTD_compressionParameters* const cParams = &ms->cParams; 798 - U32 const matchLengthSearch = cParams->minMatch; 799 - DEBUGLOG(8, "ZSTD_BtGetAllMatches"); 800 - if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ 801 - ZSTD_updateTree_internal(ms, ip, iHighLimit, matchLengthSearch, dictMode); 802 - switch(matchLengthSearch) 803 - { 804 - case 3 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 3); 805 - default : 806 - case 4 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 4); 807 - case 5 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 5); 808 - case 7 : 809 - case 6 
: return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 6); 774 + assert(BOUNDED(3, ms->cParams.minMatch, 6) == mls); 775 + DEBUGLOG(8, "ZSTD_BtGetAllMatches(dictMode=%d, mls=%u)", (int)dictMode, mls); 776 + if (ip < ms->window.base + ms->nextToUpdate) 777 + return 0; /* skipped area */ 778 + ZSTD_updateTree_internal(ms, ip, iHighLimit, mls, dictMode); 779 + return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, mls); 780 + } 781 + 782 + #define ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls) ZSTD_btGetAllMatches_##dictMode##_##mls 783 + 784 + #define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls) \ 785 + static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)( \ 786 + ZSTD_match_t* matches, \ 787 + ZSTD_matchState_t* ms, \ 788 + U32* nextToUpdate3, \ 789 + const BYTE* ip, \ 790 + const BYTE* const iHighLimit, \ 791 + const U32 rep[ZSTD_REP_NUM], \ 792 + U32 const ll0, \ 793 + U32 const lengthToBeat) \ 794 + { \ 795 + return ZSTD_btGetAllMatches_internal( \ 796 + matches, ms, nextToUpdate3, ip, iHighLimit, \ 797 + rep, ll0, lengthToBeat, ZSTD_##dictMode, mls); \ 810 798 } 799 + 800 + #define GEN_ZSTD_BT_GET_ALL_MATCHES(dictMode) \ 801 + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 3) \ 802 + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 4) \ 803 + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 5) \ 804 + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 6) 805 + 806 + GEN_ZSTD_BT_GET_ALL_MATCHES(noDict) 807 + GEN_ZSTD_BT_GET_ALL_MATCHES(extDict) 808 + GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState) 809 + 810 + #define ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMode) \ 811 + { \ 812 + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 3), \ 813 + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 4), \ 814 + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 5), \ 815 + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 6) \ 816 + } 817 + 818 + static ZSTD_getAllMatchesFn 819 + ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const 
dictMode) 820 + { 821 + ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] = { 822 + ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict), 823 + ZSTD_BT_GET_ALL_MATCHES_ARRAY(extDict), 824 + ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMatchState) 825 + }; 826 + U32 const mls = BOUNDED(3, ms->cParams.minMatch, 6); 827 + assert((U32)dictMode < 3); 828 + assert(mls - 3 < 4); 829 + return getAllMatchesFns[(int)dictMode][mls - 3]; 811 830 } 812 831 813 832 /* *********************** ··· 868 783 869 784 /* Struct containing info needed to make decision about ldm inclusion */ 870 785 typedef struct { 871 - rawSeqStore_t seqStore; /* External match candidates store for this block */ 872 - U32 startPosInBlock; /* Start position of the current match candidate */ 873 - U32 endPosInBlock; /* End position of the current match candidate */ 874 - U32 offset; /* Offset of the match candidate */ 786 + rawSeqStore_t seqStore; /* External match candidates store for this block */ 787 + U32 startPosInBlock; /* Start position of the current match candidate */ 788 + U32 endPosInBlock; /* End position of the current match candidate */ 789 + U32 offset; /* Offset of the match candidate */ 875 790 } ZSTD_optLdm_t; 876 791 877 792 /* ZSTD_optLdm_skipRawSeqStoreBytes(): 878 - * Moves forward in rawSeqStore by nbBytes, which will update the fields 'pos' and 'posInSequence'. 793 + * Moves forward in @rawSeqStore by @nbBytes, 794 + * which will update the fields 'pos' and 'posInSequence'. 879 795 */ 880 - static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) { 796 + static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) 797 + { 881 798 U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); 882 799 while (currPos && rawSeqStore->pos < rawSeqStore->size) { 883 800 rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos]; ··· 900 813 * Calculates the beginning and end of the next match in the current block. 
901 814 * Updates 'pos' and 'posInSequence' of the ldmSeqStore. 902 815 */ 903 - static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock, 904 - U32 blockBytesRemaining) { 816 + static void 817 + ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock, 818 + U32 blockBytesRemaining) 819 + { 905 820 rawSeq currSeq; 906 821 U32 currBlockEndPos; 907 822 U32 literalsBytesRemaining; ··· 915 826 optLdm->endPosInBlock = UINT_MAX; 916 827 return; 917 828 } 918 - /* Calculate appropriate bytes left in matchLength and litLength after adjusting 919 - based on ldmSeqStore->posInSequence */ 829 + /* Calculate appropriate bytes left in matchLength and litLength 830 + * after adjusting based on ldmSeqStore->posInSequence */ 920 831 currSeq = optLdm->seqStore.seq[optLdm->seqStore.pos]; 921 832 assert(optLdm->seqStore.posInSequence <= currSeq.litLength + currSeq.matchLength); 922 833 currBlockEndPos = currPosInBlock + blockBytesRemaining; ··· 952 863 } 953 864 954 865 /* ZSTD_optLdm_maybeAddMatch(): 955 - * Adds a match if it's long enough, based on it's 'matchStartPosInBlock' 956 - * and 'matchEndPosInBlock', into 'matches'. Maintains the correct ordering of 'matches' 866 + * Adds a match if it's long enough, 867 + * based on its 'matchStartPosInBlock' and 'matchEndPosInBlock', 868 + * into 'matches'. Maintains the correct ordering of 'matches'. 
957 869 */ 958 870 static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, 959 - ZSTD_optLdm_t* optLdm, U32 currPosInBlock) { 960 - U32 posDiff = currPosInBlock - optLdm->startPosInBlock; 871 + const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) 872 + { 873 + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; 961 874 /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ 962 - U32 candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; 963 - U32 candidateOffCode = optLdm->offset + ZSTD_REP_MOVE; 875 + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; 964 876 965 877 /* Ensure that current block position is not outside of the match */ 966 878 if (currPosInBlock < optLdm->startPosInBlock ··· 971 881 } 972 882 973 883 if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { 884 + U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); 974 885 DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", 975 886 candidateOffCode, candidateMatchLength, currPosInBlock); 976 887 matches[*nbMatches].len = candidateMatchLength; ··· 983 892 /* ZSTD_optLdm_processMatchCandidate(): 984 893 * Wrapper function to update ldm seq store and call ldm functions as necessary. 
985 894 */ 986 - static void ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, ZSTD_match_t* matches, U32* nbMatches, 987 - U32 currPosInBlock, U32 remainingBytes) { 895 + static void 896 + ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, 897 + ZSTD_match_t* matches, U32* nbMatches, 898 + U32 currPosInBlock, U32 remainingBytes) 899 + { 988 900 if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) { 989 901 return; 990 902 } ··· 998 904 * at the end of a match from the ldm seq store, and will often be some bytes 999 905 * over beyond matchEndPosInBlock. As such, we need to correct for these "overshoots" 1000 906 */ 1001 - U32 posOvershoot = currPosInBlock - optLdm->endPosInBlock; 907 + U32 const posOvershoot = currPosInBlock - optLdm->endPosInBlock; 1002 908 ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, posOvershoot); 1003 - } 909 + } 1004 910 ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes); 1005 911 } 1006 912 ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock); 1007 913 } 1008 914 915 + 1009 916 /*-******************************* 1010 917 * Optimal parser 1011 918 *********************************/ 1012 - 1013 919 1014 920 static U32 ZSTD_totalLen(ZSTD_optimal_t sol) 1015 921 { ··· 1051 957 const BYTE* const prefixStart = base + ms->window.dictLimit; 1052 958 const ZSTD_compressionParameters* const cParams = &ms->cParams; 1053 959 960 + ZSTD_getAllMatchesFn getAllMatches = ZSTD_selectBtGetAllMatches(ms, dictMode); 961 + 1054 962 U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); 1055 963 U32 const minMatch = (cParams->minMatch == 3) ? 
3 : 4; 1056 964 U32 nextToUpdate3 = ms->nextToUpdate; ··· 1080 984 /* find first match */ 1081 985 { U32 const litlen = (U32)(ip - anchor); 1082 986 U32 const ll0 = !litlen; 1083 - U32 nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, ip, iend, dictMode, rep, ll0, minMatch); 987 + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); 1084 988 ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, 1085 989 (U32)(ip-istart), (U32)(iend - ip)); 1086 990 if (!nbMatches) { ip++; continue; } ··· 1094 998 * in every price. We include the literal length to avoid negative 1095 999 * prices when we subtract the previous literal length. 1096 1000 */ 1097 - opt[0].price = ZSTD_litLengthPrice(litlen, optStatePtr, optLevel); 1001 + opt[0].price = (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel); 1098 1002 1099 1003 /* large match -> immediate encoding */ 1100 1004 { U32 const maxML = matches[nbMatches-1].len; 1101 - U32 const maxOffset = matches[nbMatches-1].off; 1005 + U32 const maxOffcode = matches[nbMatches-1].off; 1102 1006 DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", 1103 - nbMatches, maxML, maxOffset, (U32)(ip-prefixStart)); 1007 + nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); 1104 1008 1105 1009 if (maxML > sufficient_len) { 1106 1010 lastSequence.litlen = litlen; 1107 1011 lastSequence.mlen = maxML; 1108 - lastSequence.off = maxOffset; 1012 + lastSequence.off = maxOffcode; 1109 1013 DEBUGLOG(6, "large match (%u>%u), immediate encoding", 1110 1014 maxML, sufficient_len); 1111 1015 cur = 0; ··· 1114 1018 } } 1115 1019 1116 1020 /* set prices for first matches starting position == 0 */ 1117 - { U32 const literalsPrice = opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); 1021 + assert(opt[0].price >= 0); 1022 + { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); 1118 1023 U32 pos; 1119 1024 U32 
matchNb; 1120 1025 for (pos = 1; pos < minMatch; pos++) { 1121 1026 opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ 1122 1027 } 1123 1028 for (matchNb = 0; matchNb < nbMatches; matchNb++) { 1124 - U32 const offset = matches[matchNb].off; 1029 + U32 const offcode = matches[matchNb].off; 1125 1030 U32 const end = matches[matchNb].len; 1126 1031 for ( ; pos <= end ; pos++ ) { 1127 - U32 const matchPrice = ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel); 1032 + U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); 1128 1033 U32 const sequencePrice = literalsPrice + matchPrice; 1129 1034 DEBUGLOG(7, "rPos:%u => set initial price : %.2f", 1130 1035 pos, ZSTD_fCost(sequencePrice)); 1131 1036 opt[pos].mlen = pos; 1132 - opt[pos].off = offset; 1037 + opt[pos].off = offcode; 1133 1038 opt[pos].litlen = litlen; 1134 - opt[pos].price = sequencePrice; 1039 + opt[pos].price = (int)sequencePrice; 1135 1040 } } 1136 1041 last_pos = pos-1; 1137 1042 } ··· 1147 1050 /* Fix current position with one literal if cheaper */ 1148 1051 { U32 const litlen = (opt[cur-1].mlen == 0) ? 
opt[cur-1].litlen + 1 : 1; 1149 1052 int const price = opt[cur-1].price 1150 - + ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) 1151 - + ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) 1152 - - ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); 1053 + + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) 1054 + + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) 1055 + - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); 1153 1056 assert(price < 1000000000); /* overflow check */ 1154 1057 if (price <= opt[cur].price) { 1155 1058 DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", ··· 1175 1078 assert(cur >= opt[cur].mlen); 1176 1079 if (opt[cur].mlen != 0) { 1177 1080 U32 const prev = cur - opt[cur].mlen; 1178 - repcodes_t newReps = ZSTD_updateRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); 1081 + repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); 1179 1082 ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); 1180 1083 } else { 1181 1084 ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); ··· 1192 1095 continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ 1193 1096 } 1194 1097 1098 + assert(opt[cur].price >= 0); 1195 1099 { U32 const ll0 = (opt[cur].mlen != 0); 1196 1100 U32 const litlen = (opt[cur].mlen == 0) ? 
opt[cur].litlen : 0; 1197 - U32 const previousPrice = opt[cur].price; 1101 + U32 const previousPrice = (U32)opt[cur].price; 1198 1102 U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); 1199 - U32 nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, inr, iend, dictMode, opt[cur].rep, ll0, minMatch); 1103 + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); 1200 1104 U32 matchNb; 1201 1105 1202 1106 ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, ··· 1235 1137 1236 1138 for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ 1237 1139 U32 const pos = cur + mlen; 1238 - int const price = basePrice + ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); 1140 + int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); 1239 1141 1240 1142 if ((pos > last_pos) || (price < opt[pos].price)) { 1241 1143 DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", ··· 1265 1167 * update them while traversing the sequences. 
1266 1168 */ 1267 1169 if (lastSequence.mlen != 0) { 1268 - repcodes_t reps = ZSTD_updateRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); 1170 + repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); 1269 1171 ZSTD_memcpy(rep, &reps, sizeof(reps)); 1270 1172 } else { 1271 1173 ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); ··· 1309 1211 1310 1212 assert(anchor + llen <= iend); 1311 1213 ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); 1312 - ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen-MINMATCH); 1214 + ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); 1313 1215 anchor += advance; 1314 1216 ip = anchor; 1315 1217 } } ··· 1321 1223 return (size_t)(iend - anchor); 1322 1224 } 1323 1225 1226 + static size_t ZSTD_compressBlock_opt0( 1227 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1228 + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) 1229 + { 1230 + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); 1231 + } 1232 + 1233 + static size_t ZSTD_compressBlock_opt2( 1234 + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1235 + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) 1236 + { 1237 + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); 1238 + } 1324 1239 1325 1240 size_t ZSTD_compressBlock_btopt( 1326 1241 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1327 1242 const void* src, size_t srcSize) 1328 1243 { 1329 1244 DEBUGLOG(5, "ZSTD_compressBlock_btopt"); 1330 - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_noDict); 1245 + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); 1331 1246 } 1332 1247 1333 1248 1334 - /* used in 2-pass strategy */ 1335 - static U32 ZSTD_upscaleStat(unsigned* table, U32 lastEltIndex, int 
bonus) 1336 - { 1337 - U32 s, sum=0; 1338 - assert(ZSTD_FREQ_DIV+bonus >= 0); 1339 - for (s=0; s<lastEltIndex+1; s++) { 1340 - table[s] <<= ZSTD_FREQ_DIV+bonus; 1341 - table[s]--; 1342 - sum += table[s]; 1343 - } 1344 - return sum; 1345 - } 1346 1249 1347 - /* used in 2-pass strategy */ 1348 - MEM_STATIC void ZSTD_upscaleStats(optState_t* optPtr) 1349 - { 1350 - if (ZSTD_compressedLiterals(optPtr)) 1351 - optPtr->litSum = ZSTD_upscaleStat(optPtr->litFreq, MaxLit, 0); 1352 - optPtr->litLengthSum = ZSTD_upscaleStat(optPtr->litLengthFreq, MaxLL, 0); 1353 - optPtr->matchLengthSum = ZSTD_upscaleStat(optPtr->matchLengthFreq, MaxML, 0); 1354 - optPtr->offCodeSum = ZSTD_upscaleStat(optPtr->offCodeFreq, MaxOff, 0); 1355 - } 1356 1250 1357 1251 /* ZSTD_initStats_ultra(): 1358 1252 * make a first compression pass, just to seed stats with more accurate starting values. ··· 1366 1276 assert(ms->window.dictLimit == ms->window.lowLimit); /* no dictionary */ 1367 1277 assert(ms->window.dictLimit - ms->nextToUpdate <= 1); /* no prefix (note: intentional overflow, defined as 2-complement) */ 1368 1278 1369 - ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); /* generate stats into ms->opt*/ 1279 + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ 1370 1280 1371 1281 /* invalidate first scan from history */ 1372 1282 ZSTD_resetSeqStore(seqStore); ··· 1375 1285 ms->window.lowLimit = ms->window.dictLimit; 1376 1286 ms->nextToUpdate = ms->window.dictLimit; 1377 1287 1378 - /* re-inforce weight of collected statistics */ 1379 - ZSTD_upscaleStats(&ms->opt); 1380 1288 } 1381 1289 1382 1290 size_t ZSTD_compressBlock_btultra( ··· 1382 1294 const void* src, size_t srcSize) 1383 1295 { 1384 1296 DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize); 1385 - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); 1297 + return 
ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); 1386 1298 } 1387 1299 1388 1300 size_t ZSTD_compressBlock_btultra2( ··· 1410 1322 ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); 1411 1323 } 1412 1324 1413 - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); 1325 + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); 1414 1326 } 1415 1327 1416 1328 size_t ZSTD_compressBlock_btopt_dictMatchState( 1417 1329 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1418 1330 const void* src, size_t srcSize) 1419 1331 { 1420 - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_dictMatchState); 1332 + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); 1421 1333 } 1422 1334 1423 1335 size_t ZSTD_compressBlock_btultra_dictMatchState( 1424 1336 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1425 1337 const void* src, size_t srcSize) 1426 1338 { 1427 - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_dictMatchState); 1339 + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); 1428 1340 } 1429 1341 1430 1342 size_t ZSTD_compressBlock_btopt_extDict( 1431 1343 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1432 1344 const void* src, size_t srcSize) 1433 1345 { 1434 - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_extDict); 1346 + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); 1435 1347 } 1436 1348 1437 1349 size_t ZSTD_compressBlock_btultra_extDict( 1438 1350 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], 1439 1351 const void* src, size_t srcSize) 1440 1352 { 1441 - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_extDict); 1353 + 
return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); 1442 1354 } 1443 1355 1444 1356 /* note : no btultra2 variant for extDict nor dictMatchState,
+722 -188
lib/zstd/decompress/huf_decompress.c
··· 22 22 #define HUF_STATIC_LINKING_ONLY 23 23 #include "../common/huf.h" 24 24 #include "../common/error_private.h" 25 + #include "../common/zstd_internal.h" 26 + 27 + /* ************************************************************** 28 + * Constants 29 + ****************************************************************/ 30 + 31 + #define HUF_DECODER_FAST_TABLELOG 11 25 32 26 33 /* ************************************************************** 27 34 * Macros ··· 43 36 #error "Cannot force the use of the X1 and X2 decoders at the same time!" 44 37 #endif 45 38 39 + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 40 + # define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE 41 + #else 42 + # define HUF_ASM_X86_64_BMI2_ATTRS 43 + #endif 44 + 45 + #define HUF_EXTERN_C 46 + #define HUF_ASM_DECL HUF_EXTERN_C 47 + 48 + #if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) 49 + # define HUF_NEED_BMI2_FUNCTION 1 50 + #else 51 + # define HUF_NEED_BMI2_FUNCTION 0 52 + #endif 53 + 54 + #if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) 55 + # define HUF_NEED_DEFAULT_FUNCTION 1 56 + #else 57 + # define HUF_NEED_DEFAULT_FUNCTION 0 58 + #endif 46 59 47 60 /* ************************************************************** 48 61 * Error Management ··· 92 65 return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ 93 66 } \ 94 67 \ 95 - static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \ 68 + static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \ 96 69 void* dst, size_t dstSize, \ 97 70 const void* cSrc, size_t cSrcSize, \ 98 71 const HUF_DTable* DTable) \ ··· 134 107 return dtd; 135 108 } 136 109 110 + #if ZSTD_ENABLE_ASM_X86_64_BMI2 111 + 112 + static size_t HUF_initDStream(BYTE const* ip) { 113 + BYTE const lastByte = ip[7]; 114 + size_t const bitsConsumed = lastByte ? 
8 - BIT_highbit32(lastByte) : 0; 115 + size_t const value = MEM_readLEST(ip) | 1; 116 + assert(bitsConsumed <= 8); 117 + return value << bitsConsumed; 118 + } 119 + typedef struct { 120 + BYTE const* ip[4]; 121 + BYTE* op[4]; 122 + U64 bits[4]; 123 + void const* dt; 124 + BYTE const* ilimit; 125 + BYTE* oend; 126 + BYTE const* iend[4]; 127 + } HUF_DecompressAsmArgs; 128 + 129 + /* 130 + * Initializes args for the asm decoding loop. 131 + * @returns 0 on success 132 + * 1 if the fallback implementation should be used. 133 + * Or an error code on failure. 134 + */ 135 + static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) 136 + { 137 + void const* dt = DTable + 1; 138 + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; 139 + 140 + const BYTE* const ilimit = (const BYTE*)src + 6 + 8; 141 + 142 + BYTE* const oend = (BYTE*)dst + dstSize; 143 + 144 + /* The following condition is false on x32 platform, 145 + * but HUF_asm is not compatible with this ABI */ 146 + if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; 147 + 148 + /* strict minimum : jump table + 1 byte per stream */ 149 + if (srcSize < 10) 150 + return ERROR(corruption_detected); 151 + 152 + /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers. 153 + * If table log is not correct at this point, fallback to the old decoder. 154 + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. 155 + */ 156 + if (dtLog != HUF_DECODER_FAST_TABLELOG) 157 + return 1; 158 + 159 + /* Read the jump table. 
*/ 160 + { 161 + const BYTE* const istart = (const BYTE*)src; 162 + size_t const length1 = MEM_readLE16(istart); 163 + size_t const length2 = MEM_readLE16(istart+2); 164 + size_t const length3 = MEM_readLE16(istart+4); 165 + size_t const length4 = srcSize - (length1 + length2 + length3 + 6); 166 + args->iend[0] = istart + 6; /* jumpTable */ 167 + args->iend[1] = args->iend[0] + length1; 168 + args->iend[2] = args->iend[1] + length2; 169 + args->iend[3] = args->iend[2] + length3; 170 + 171 + /* HUF_initDStream() requires this, and this small of an input 172 + * won't benefit from the ASM loop anyways. 173 + * length1 must be >= 16 so that ip[0] >= ilimit before the loop 174 + * starts. 175 + */ 176 + if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) 177 + return 1; 178 + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ 179 + } 180 + /* ip[] contains the position that is currently loaded into bits[]. */ 181 + args->ip[0] = args->iend[1] - sizeof(U64); 182 + args->ip[1] = args->iend[2] - sizeof(U64); 183 + args->ip[2] = args->iend[3] - sizeof(U64); 184 + args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64); 185 + 186 + /* op[] contains the output pointers. */ 187 + args->op[0] = (BYTE*)dst; 188 + args->op[1] = args->op[0] + (dstSize+3)/4; 189 + args->op[2] = args->op[1] + (dstSize+3)/4; 190 + args->op[3] = args->op[2] + (dstSize+3)/4; 191 + 192 + /* No point to call the ASM loop for tiny outputs. */ 193 + if (args->op[3] >= oend) 194 + return 1; 195 + 196 + /* bits[] is the bit container. 197 + * It is read from the MSB down to the LSB. 198 + * It is shifted left as it is read, and zeros are 199 + * shifted in. After the lowest valid bit a 1 is 200 + * set, so that CountTrailingZeros(bits[]) can be used 201 + * to count how many bits we've consumed. 
202 + */ 203 + args->bits[0] = HUF_initDStream(args->ip[0]); 204 + args->bits[1] = HUF_initDStream(args->ip[1]); 205 + args->bits[2] = HUF_initDStream(args->ip[2]); 206 + args->bits[3] = HUF_initDStream(args->ip[3]); 207 + 208 + /* If ip[] >= ilimit, it is guaranteed to be safe to 209 + * reload bits[]. It may be beyond its section, but is 210 + * guaranteed to be valid (>= istart). 211 + */ 212 + args->ilimit = ilimit; 213 + 214 + args->oend = oend; 215 + args->dt = dt; 216 + 217 + return 0; 218 + } 219 + 220 + static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) 221 + { 222 + /* Validate that we haven't overwritten. */ 223 + if (args->op[stream] > segmentEnd) 224 + return ERROR(corruption_detected); 225 + /* Validate that we haven't read beyond iend[]. 226 + * Note that ip[] may be < iend[] because the MSB is 227 + * the next bit to read, and we may have consumed 100% 228 + * of the stream, so down to iend[i] - 8 is valid. 229 + */ 230 + if (args->ip[stream] < args->iend[stream] - 8) 231 + return ERROR(corruption_detected); 232 + 233 + /* Construct the BIT_DStream_t. */ 234 + bit->bitContainer = MEM_readLE64(args->ip[stream]); 235 + bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); 236 + bit->start = (const char*)args->iend[0]; 237 + bit->limitPtr = bit->start + sizeof(size_t); 238 + bit->ptr = (const char*)args->ip[stream]; 239 + 240 + return 0; 241 + } 242 + #endif 243 + 137 244 138 245 #ifndef HUF_FORCE_DECOMPRESS_X2 139 246 140 247 /*-***************************/ 141 248 /* single-symbol decoding */ 142 249 /*-***************************/ 143 - typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */ 250 + typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */ 144 251 145 252 /* 146 253 * Packs 4 HUF_DEltX1 structs into a U64. 
This is used to lay down 4 entries at ··· 283 122 static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { 284 123 U64 D4; 285 124 if (MEM_isLittleEndian()) { 286 - D4 = symbol + (nbBits << 8); 287 - } else { 288 125 D4 = (symbol << 8) + nbBits; 126 + } else { 127 + D4 = symbol + (nbBits << 8); 289 128 } 290 129 D4 *= 0x0001000100010001ULL; 291 130 return D4; 131 + } 132 + 133 + /* 134 + * Increase the tableLog to targetTableLog and rescales the stats. 135 + * If tableLog > targetTableLog this is a no-op. 136 + * @returns New tableLog 137 + */ 138 + static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog) 139 + { 140 + if (tableLog > targetTableLog) 141 + return tableLog; 142 + if (tableLog < targetTableLog) { 143 + U32 const scale = targetTableLog - tableLog; 144 + U32 s; 145 + /* Increase the weight for all non-zero probability symbols by scale. */ 146 + for (s = 0; s < nbSymbols; ++s) { 147 + huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale); 148 + } 149 + /* Update rankVal to reflect the new weights. 150 + * All weights except 0 get moved to weight + scale. 151 + * Weights [1, scale] are empty. 
152 + */ 153 + for (s = targetTableLog; s > scale; --s) { 154 + rankVal[s] = rankVal[s - scale]; 155 + } 156 + for (s = scale; s > 0; --s) { 157 + rankVal[s] = 0; 158 + } 159 + } 160 + return targetTableLog; 292 161 } 293 162 294 163 typedef struct { ··· 353 162 iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); 354 163 if (HUF_isError(iSize)) return iSize; 355 164 165 + 356 166 /* Table header */ 357 167 { DTableDesc dtd = HUF_getDTableDesc(DTable); 168 + U32 const maxTableLog = dtd.maxTableLog + 1; 169 + U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG); 170 + tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog); 358 171 if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */ 359 172 dtd.tableType = 0; 360 173 dtd.tableLog = (BYTE)tableLog; ··· 402 207 403 208 /* fill DTable 404 209 * We fill all entries of each weight in order. 405 - * That way length is a constant for each iteration of the outter loop. 210 + * That way length is a constant for each iteration of the outer loop. 406 211 * We can switch based on the length to a different inner loop which is 407 212 * optimized for that particular case. 
408 213 */ ··· 499 304 BYTE* const pStart = p; 500 305 501 306 /* up to 4 symbols at a time */ 502 - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { 503 - HUF_DECODE_SYMBOLX1_2(p, bitDPtr); 504 - HUF_DECODE_SYMBOLX1_1(p, bitDPtr); 505 - HUF_DECODE_SYMBOLX1_2(p, bitDPtr); 506 - HUF_DECODE_SYMBOLX1_0(p, bitDPtr); 307 + if ((pEnd - p) > 3) { 308 + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { 309 + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); 310 + HUF_DECODE_SYMBOLX1_1(p, bitDPtr); 311 + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); 312 + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); 313 + } 314 + } else { 315 + BIT_reloadDStream(bitDPtr); 507 316 } 508 317 509 318 /* [0-3] symbols remaining */ ··· 587 388 U32 endSignal = 1; 588 389 589 390 if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ 391 + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ 590 392 CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); 591 393 CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); 592 394 CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); 593 395 CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); 594 396 595 397 /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */ 596 - for ( ; (endSignal) & (op4 < olimit) ; ) { 597 - HUF_DECODE_SYMBOLX1_2(op1, &bitD1); 598 - HUF_DECODE_SYMBOLX1_2(op2, &bitD2); 599 - HUF_DECODE_SYMBOLX1_2(op3, &bitD3); 600 - HUF_DECODE_SYMBOLX1_2(op4, &bitD4); 601 - HUF_DECODE_SYMBOLX1_1(op1, &bitD1); 602 - HUF_DECODE_SYMBOLX1_1(op2, &bitD2); 603 - HUF_DECODE_SYMBOLX1_1(op3, &bitD3); 604 - HUF_DECODE_SYMBOLX1_1(op4, &bitD4); 605 - HUF_DECODE_SYMBOLX1_2(op1, &bitD1); 606 - HUF_DECODE_SYMBOLX1_2(op2, &bitD2); 607 - HUF_DECODE_SYMBOLX1_2(op3, &bitD3); 608 - HUF_DECODE_SYMBOLX1_2(op4, &bitD4); 609 - HUF_DECODE_SYMBOLX1_0(op1, &bitD1); 610 - HUF_DECODE_SYMBOLX1_0(op2, &bitD2); 611 - HUF_DECODE_SYMBOLX1_0(op3, &bitD3); 612 - HUF_DECODE_SYMBOLX1_0(op4, &bitD4); 613 - 
endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; 614 - endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; 615 - endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; 616 - endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; 398 + if ((size_t)(oend - op4) >= sizeof(size_t)) { 399 + for ( ; (endSignal) & (op4 < olimit) ; ) { 400 + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); 401 + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); 402 + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); 403 + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); 404 + HUF_DECODE_SYMBOLX1_1(op1, &bitD1); 405 + HUF_DECODE_SYMBOLX1_1(op2, &bitD2); 406 + HUF_DECODE_SYMBOLX1_1(op3, &bitD3); 407 + HUF_DECODE_SYMBOLX1_1(op4, &bitD4); 408 + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); 409 + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); 410 + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); 411 + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); 412 + HUF_DECODE_SYMBOLX1_0(op1, &bitD1); 413 + HUF_DECODE_SYMBOLX1_0(op2, &bitD2); 414 + HUF_DECODE_SYMBOLX1_0(op3, &bitD3); 415 + HUF_DECODE_SYMBOLX1_0(op4, &bitD4); 416 + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; 417 + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; 418 + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; 419 + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; 420 + } 617 421 } 618 422 619 423 /* check corruption */ ··· 642 440 } 643 441 } 644 442 443 + #if HUF_NEED_BMI2_FUNCTION 444 + static BMI2_TARGET_ATTRIBUTE 445 + size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, 446 + size_t cSrcSize, HUF_DTable const* DTable) { 447 + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); 448 + } 449 + #endif 450 + 451 + #if HUF_NEED_DEFAULT_FUNCTION 452 + static 453 + size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, 454 + size_t cSrcSize, HUF_DTable 
const* DTable) { 455 + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); 456 + } 457 + #endif 458 + 459 + #if ZSTD_ENABLE_ASM_X86_64_BMI2 460 + 461 + HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; 462 + 463 + static HUF_ASM_X86_64_BMI2_ATTRS 464 + size_t 465 + HUF_decompress4X1_usingDTable_internal_bmi2_asm( 466 + void* dst, size_t dstSize, 467 + const void* cSrc, size_t cSrcSize, 468 + const HUF_DTable* DTable) 469 + { 470 + void const* dt = DTable + 1; 471 + const BYTE* const iend = (const BYTE*)cSrc + 6; 472 + BYTE* const oend = (BYTE*)dst + dstSize; 473 + HUF_DecompressAsmArgs args; 474 + { 475 + size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); 476 + FORWARD_IF_ERROR(ret, "Failed to init asm args"); 477 + if (ret != 0) 478 + return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); 479 + } 480 + 481 + assert(args.ip[0] >= args.ilimit); 482 + HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); 483 + 484 + /* Our loop guarantees that ip[] >= ilimit and that we haven't 485 + * overwritten any op[]. 486 + */ 487 + assert(args.ip[0] >= iend); 488 + assert(args.ip[1] >= iend); 489 + assert(args.ip[2] >= iend); 490 + assert(args.ip[3] >= iend); 491 + assert(args.op[3] <= oend); 492 + (void)iend; 493 + 494 + /* finish bit streams one by one. */ 495 + { 496 + size_t const segmentSize = (dstSize+3) / 4; 497 + BYTE* segmentEnd = (BYTE*)dst; 498 + int i; 499 + for (i = 0; i < 4; ++i) { 500 + BIT_DStream_t bit; 501 + if (segmentSize <= (size_t)(oend - segmentEnd)) 502 + segmentEnd += segmentSize; 503 + else 504 + segmentEnd = oend; 505 + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); 506 + /* Decompress and validate that we've produced exactly the expected length. 
*/ 507 + args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG); 508 + if (args.op[i] != segmentEnd) return ERROR(corruption_detected); 509 + } 510 + } 511 + 512 + /* decoded size */ 513 + return dstSize; 514 + } 515 + #endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ 645 516 646 517 typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, 647 518 const void *cSrc, ··· 722 447 const HUF_DTable *DTable); 723 448 724 449 HUF_DGEN(HUF_decompress1X1_usingDTable_internal) 725 - HUF_DGEN(HUF_decompress4X1_usingDTable_internal) 726 450 451 + static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, 452 + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) 453 + { 454 + #if DYNAMIC_BMI2 455 + if (bmi2) { 456 + # if ZSTD_ENABLE_ASM_X86_64_BMI2 457 + return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); 458 + # else 459 + return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); 460 + # endif 461 + } 462 + #else 463 + (void)bmi2; 464 + #endif 465 + 466 + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) 467 + return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); 468 + #else 469 + return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); 470 + #endif 471 + } 727 472 728 473 729 474 size_t HUF_decompress1X1_usingDTable( ··· 813 518 /* *************************/ 814 519 815 520 typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */ 816 - typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t; 521 + typedef struct { BYTE symbol; } sortedSymbol_t; 817 522 typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; 818 523 typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; 819 524 525 + /* 526 + * Constructs a HUF_DEltX2 in a U32. 
527 + */ 528 + static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level) 529 + { 530 + U32 seq; 531 + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0); 532 + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2); 533 + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3); 534 + DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32)); 535 + if (MEM_isLittleEndian()) { 536 + seq = level == 1 ? symbol : (baseSeq + (symbol << 8)); 537 + return seq + (nbBits << 16) + ((U32)level << 24); 538 + } else { 539 + seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol); 540 + return (seq << 16) + (nbBits << 8) + (U32)level; 541 + } 542 + } 543 + 544 + /* 545 + * Constructs a HUF_DEltX2. 546 + */ 547 + static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level) 548 + { 549 + HUF_DEltX2 DElt; 550 + U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); 551 + DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val)); 552 + ZSTD_memcpy(&DElt, &val, sizeof(val)); 553 + return DElt; 554 + } 555 + 556 + /* 557 + * Constructs 2 HUF_DEltX2s and packs them into a U64. 558 + */ 559 + static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level) 560 + { 561 + U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); 562 + return (U64)DElt + ((U64)DElt << 32); 563 + } 564 + 565 + /* 566 + * Fills the DTable rank with all the symbols from [begin, end) that are each 567 + * nbBits long. 568 + * 569 + * @param DTableRank The start of the rank in the DTable. 570 + * @param begin The first symbol to fill (inclusive). 571 + * @param end The last symbol to fill (exclusive). 572 + * @param nbBits Each symbol is nbBits long. 573 + * @param tableLog The table log. 574 + * @param baseSeq If level == 1 { 0 } else { the first level symbol } 575 + * @param level The level in the table. Must be 1 or 2. 
576 + */ 577 + static void HUF_fillDTableX2ForWeight( 578 + HUF_DEltX2* DTableRank, 579 + sortedSymbol_t const* begin, sortedSymbol_t const* end, 580 + U32 nbBits, U32 tableLog, 581 + U16 baseSeq, int const level) 582 + { 583 + U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */); 584 + const sortedSymbol_t* ptr; 585 + assert(level >= 1 && level <= 2); 586 + switch (length) { 587 + case 1: 588 + for (ptr = begin; ptr != end; ++ptr) { 589 + HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); 590 + *DTableRank++ = DElt; 591 + } 592 + break; 593 + case 2: 594 + for (ptr = begin; ptr != end; ++ptr) { 595 + HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); 596 + DTableRank[0] = DElt; 597 + DTableRank[1] = DElt; 598 + DTableRank += 2; 599 + } 600 + break; 601 + case 4: 602 + for (ptr = begin; ptr != end; ++ptr) { 603 + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); 604 + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); 605 + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); 606 + DTableRank += 4; 607 + } 608 + break; 609 + case 8: 610 + for (ptr = begin; ptr != end; ++ptr) { 611 + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); 612 + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); 613 + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); 614 + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); 615 + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); 616 + DTableRank += 8; 617 + } 618 + break; 619 + default: 620 + for (ptr = begin; ptr != end; ++ptr) { 621 + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); 622 + HUF_DEltX2* const DTableRankEnd = DTableRank + length; 623 + for (; DTableRank != DTableRankEnd; DTableRank += 8) { 624 + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); 625 + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); 626 + ZSTD_memcpy(DTableRank + 4, &DEltX2, 
sizeof(DEltX2)); 627 + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); 628 + } 629 + } 630 + break; 631 + } 632 + } 820 633 821 634 /* HUF_fillDTableX2Level2() : 822 635 * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ 823 - static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed, 824 - const U32* rankValOrigin, const int minWeight, 825 - const sortedSymbol_t* sortedSymbols, const U32 sortedListSize, 826 - U32 nbBitsBaseline, U16 baseSeq, U32* wksp, size_t wkspSize) 636 + static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits, 637 + const U32* rankVal, const int minWeight, const int maxWeight1, 638 + const sortedSymbol_t* sortedSymbols, U32 const* rankStart, 639 + U32 nbBitsBaseline, U16 baseSeq) 827 640 { 828 - HUF_DEltX2 DElt; 829 - U32* rankVal = wksp; 830 - 831 - assert(wkspSize >= HUF_TABLELOG_MAX + 1); 832 - (void)wkspSize; 833 - /* get pre-calculated rankVal */ 834 - ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + 1)); 835 - 836 - /* fill skipped values */ 641 + /* Fill skipped values (all positions up to rankVal[minWeight]). 642 + * These are positions only get a single symbol because the combined weight 643 + * is too large. 
644 + */ 837 645 if (minWeight>1) { 838 - U32 i, skipSize = rankVal[minWeight]; 839 - MEM_writeLE16(&(DElt.sequence), baseSeq); 840 - DElt.nbBits = (BYTE)(consumed); 841 - DElt.length = 1; 842 - for (i = 0; i < skipSize; i++) 843 - DTable[i] = DElt; 646 + U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */); 647 + U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1); 648 + int const skipSize = rankVal[minWeight]; 649 + assert(length > 1); 650 + assert((U32)skipSize < length); 651 + switch (length) { 652 + case 2: 653 + assert(skipSize == 1); 654 + ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2)); 655 + break; 656 + case 4: 657 + assert(skipSize <= 4); 658 + ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2)); 659 + ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2)); 660 + break; 661 + default: 662 + { 663 + int i; 664 + for (i = 0; i < skipSize; i += 8) { 665 + ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2)); 666 + ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2)); 667 + ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2)); 668 + ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2)); 669 + } 670 + } 671 + } 844 672 } 845 673 846 - /* fill DTable */ 847 - { U32 s; for (s=0; s<sortedListSize; s++) { /* note : sortedSymbols already skipped */ 848 - const U32 symbol = sortedSymbols[s].symbol; 849 - const U32 weight = sortedSymbols[s].weight; 850 - const U32 nbBits = nbBitsBaseline - weight; 851 - const U32 length = 1 << (sizeLog-nbBits); 852 - const U32 start = rankVal[weight]; 853 - U32 i = start; 854 - const U32 end = start + length; 855 - 856 - MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8))); 857 - DElt.nbBits = (BYTE)(nbBits + consumed); 858 - DElt.length = 2; 859 - do { DTable[i++] = DElt; } while (i<end); /* since length >= 1 */ 860 - 861 - rankVal[weight] += length; 862 - } } 674 + /* Fill each of the second level symbols by weight. 
*/ 675 + { 676 + int w; 677 + for (w = minWeight; w < maxWeight1; ++w) { 678 + int const begin = rankStart[w]; 679 + int const end = rankStart[w+1]; 680 + U32 const nbBits = nbBitsBaseline - w; 681 + U32 const totalBits = nbBits + consumedBits; 682 + HUF_fillDTableX2ForWeight( 683 + DTable + rankVal[w], 684 + sortedSymbols + begin, sortedSymbols + end, 685 + totalBits, targetLog, 686 + baseSeq, /* level */ 2); 687 + } 688 + } 863 689 } 864 690 865 - 866 691 static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, 867 - const sortedSymbol_t* sortedList, const U32 sortedListSize, 692 + const sortedSymbol_t* sortedList, 868 693 const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, 869 - const U32 nbBitsBaseline, U32* wksp, size_t wkspSize) 694 + const U32 nbBitsBaseline) 870 695 { 871 - U32* rankVal = wksp; 696 + U32* const rankVal = rankValOrigin[0]; 872 697 const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */ 873 698 const U32 minBits = nbBitsBaseline - maxWeight; 874 - U32 s; 699 + int w; 700 + int const wEnd = (int)maxWeight + 1; 875 701 876 - assert(wkspSize >= HUF_TABLELOG_MAX + 1); 877 - wksp += HUF_TABLELOG_MAX + 1; 878 - wkspSize -= HUF_TABLELOG_MAX + 1; 702 + /* Fill DTable in order of weight. 
*/ 703 + for (w = 1; w < wEnd; ++w) { 704 + int const begin = (int)rankStart[w]; 705 + int const end = (int)rankStart[w+1]; 706 + U32 const nbBits = nbBitsBaseline - w; 879 707 880 - ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + 1)); 881 - 882 - /* fill DTable */ 883 - for (s=0; s<sortedListSize; s++) { 884 - const U16 symbol = sortedList[s].symbol; 885 - const U32 weight = sortedList[s].weight; 886 - const U32 nbBits = nbBitsBaseline - weight; 887 - const U32 start = rankVal[weight]; 888 - const U32 length = 1 << (targetLog-nbBits); 889 - 890 - if (targetLog-nbBits >= minBits) { /* enough room for a second symbol */ 891 - U32 sortedRank; 708 + if (targetLog-nbBits >= minBits) { 709 + /* Enough room for a second symbol. */ 710 + int start = rankVal[w]; 711 + U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */); 892 712 int minWeight = nbBits + scaleLog; 713 + int s; 893 714 if (minWeight < 1) minWeight = 1; 894 - sortedRank = rankStart[minWeight]; 895 - HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits, 896 - rankValOrigin[nbBits], minWeight, 897 - sortedList+sortedRank, sortedListSize-sortedRank, 898 - nbBitsBaseline, symbol, wksp, wkspSize); 715 + /* Fill the DTable for every symbol of weight w. 716 + * These symbols get at least 1 second symbol. 717 + */ 718 + for (s = begin; s != end; ++s) { 719 + HUF_fillDTableX2Level2( 720 + DTable + start, targetLog, nbBits, 721 + rankValOrigin[nbBits], minWeight, wEnd, 722 + sortedList, rankStart, 723 + nbBitsBaseline, sortedList[s].symbol); 724 + start += length; 725 + } 899 726 } else { 900 - HUF_DEltX2 DElt; 901 - MEM_writeLE16(&(DElt.sequence), symbol); 902 - DElt.nbBits = (BYTE)(nbBits); 903 - DElt.length = 1; 904 - { U32 const end = start + length; 905 - U32 u; 906 - for (u = start; u < end; u++) DTable[u] = DElt; 907 - } } 908 - rankVal[weight] += length; 727 + /* Only a single symbol. 
*/ 728 + HUF_fillDTableX2ForWeight( 729 + DTable + rankVal[w], 730 + sortedList + begin, sortedList + end, 731 + nbBits, targetLog, 732 + /* baseSeq */ 0, /* level */ 1); 733 + } 909 734 } 910 735 } 911 736 912 737 typedef struct { 913 738 rankValCol_t rankVal[HUF_TABLELOG_MAX]; 914 739 U32 rankStats[HUF_TABLELOG_MAX + 1]; 915 - U32 rankStart0[HUF_TABLELOG_MAX + 2]; 740 + U32 rankStart0[HUF_TABLELOG_MAX + 3]; 916 741 sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1]; 917 742 BYTE weightList[HUF_SYMBOLVALUE_MAX + 1]; 918 743 U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; ··· 1042 627 const void* src, size_t srcSize, 1043 628 void* workSpace, size_t wkspSize) 1044 629 { 1045 - U32 tableLog, maxW, sizeOfSort, nbSymbols; 630 + return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); 631 + } 632 + 633 + size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, 634 + const void* src, size_t srcSize, 635 + void* workSpace, size_t wkspSize, int bmi2) 636 + { 637 + U32 tableLog, maxW, nbSymbols; 1046 638 DTableDesc dtd = HUF_getDTableDesc(DTable); 1047 - U32 const maxTableLog = dtd.maxTableLog; 639 + U32 maxTableLog = dtd.maxTableLog; 1048 640 size_t iSize; 1049 641 void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */ 1050 642 HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr; ··· 1069 647 if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); 1070 648 /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ 1071 649 1072 - iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), /* bmi2 */ 0); 650 + iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); 1073 651 if (HUF_isError(iSize)) return iSize; 1074 652 1075 653 /* check result */ 1076 654 if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */ 655 + if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG; 1077 656 1078 657 /* find maxWeight */ 1079 658 for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */ ··· 1087 664 rankStart[w] = curr; 1088 665 } 1089 666 rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/ 1090 - sizeOfSort = nextRankStart; 667 + rankStart[maxW+1] = nextRankStart; 1091 668 } 1092 669 1093 670 /* sort symbols by weight */ ··· 1096 673 U32 const w = wksp->weightList[s]; 1097 674 U32 const r = rankStart[w]++; 1098 675 wksp->sortedSymbol[r].symbol = (BYTE)s; 1099 - wksp->sortedSymbol[r].weight = (BYTE)w; 1100 676 } 1101 677 rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */ 1102 678 } ··· 1120 698 } } } } 1121 699 1122 700 HUF_fillDTableX2(dt, maxTableLog, 1123 - wksp->sortedSymbol, sizeOfSort, 701 + wksp->sortedSymbol, 1124 702 wksp->rankStart0, wksp->rankVal, maxW, 1125 - tableLog+1, 1126 - wksp->calleeWksp, sizeof(wksp->calleeWksp) / sizeof(U32)); 703 + tableLog+1); 1127 704 1128 705 dtd.tableLog = (BYTE)maxTableLog; 1129 706 dtd.tableType = 1; ··· 1135 714 HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) 1136 715 { 1137 716 size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ 1138 - 
ZSTD_memcpy(op, dt+val, 2); 717 + ZSTD_memcpy(op, &dt[val].sequence, 2); 1139 718 BIT_skipBits(DStream, dt[val].nbBits); 1140 719 return dt[val].length; 1141 720 } ··· 1144 723 HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) 1145 724 { 1146 725 size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ 1147 - ZSTD_memcpy(op, dt+val, 1); 1148 - if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits); 1149 - else { 726 + ZSTD_memcpy(op, &dt[val].sequence, 1); 727 + if (dt[val].length==1) { 728 + BIT_skipBits(DStream, dt[val].nbBits); 729 + } else { 1150 730 if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { 1151 731 BIT_skipBits(DStream, dt[val].nbBits); 1152 732 if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) 1153 733 /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */ 1154 734 DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); 1155 - } } 735 + } 736 + } 1156 737 return 1; 1157 738 } 1158 739 ··· 1176 753 BYTE* const pStart = p; 1177 754 1178 755 /* up to 8 symbols at a time */ 1179 - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { 1180 - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); 1181 - HUF_DECODE_SYMBOLX2_1(p, bitDPtr); 1182 - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); 1183 - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 756 + if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) { 757 + if (dtLog <= 11 && MEM_64bits()) { 758 + /* up to 10 symbols at a time */ 759 + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) { 760 + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 761 + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 762 + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 763 + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 764 + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 765 + } 766 + } else { 767 + /* up to 8 symbols at a time */ 768 + while ((BIT_reloadDStream(bitDPtr) == 
BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { 769 + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); 770 + HUF_DECODE_SYMBOLX2_1(p, bitDPtr); 771 + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); 772 + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 773 + } 774 + } 775 + } else { 776 + BIT_reloadDStream(bitDPtr); 1184 777 } 1185 778 1186 779 /* closer to end : up to 2 symbols at a time */ 1187 - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) 1188 - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 780 + if ((size_t)(pEnd - p) >= 2) { 781 + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) 782 + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); 1189 783 1190 - while (p <= pEnd-2) 1191 - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ 784 + while (p <= pEnd-2) 785 + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ 786 + } 1192 787 1193 788 if (p < pEnd) 1194 789 p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog); ··· 1240 799 /* decoded size */ 1241 800 return dstSize; 1242 801 } 1243 - 1244 802 FORCE_INLINE_TEMPLATE size_t 1245 803 HUF_decompress4X2_usingDTable_internal_body( 1246 804 void* dst, size_t dstSize, ··· 1281 841 U32 const dtLog = dtd.tableLog; 1282 842 1283 843 if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ 844 + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ 1284 845 CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); 1285 846 CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); 1286 847 CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); 1287 848 CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); 1288 849 1289 850 /* 16-32 symbols per loop (4-8 symbols per stream) */ 1290 - for ( ; (endSignal) & (op4 < olimit); ) { 851 + if ((size_t)(oend - op4) >= sizeof(size_t)) { 852 + for ( ; (endSignal) & (op4 < olimit); ) { 1291 853 #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) 1292 - 
HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 1293 - HUF_DECODE_SYMBOLX2_1(op1, &bitD1); 1294 - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 1295 - HUF_DECODE_SYMBOLX2_0(op1, &bitD1); 1296 - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 1297 - HUF_DECODE_SYMBOLX2_1(op2, &bitD2); 1298 - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 1299 - HUF_DECODE_SYMBOLX2_0(op2, &bitD2); 1300 - endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; 1301 - endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; 1302 - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 1303 - HUF_DECODE_SYMBOLX2_1(op3, &bitD3); 1304 - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 1305 - HUF_DECODE_SYMBOLX2_0(op3, &bitD3); 1306 - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 1307 - HUF_DECODE_SYMBOLX2_1(op4, &bitD4); 1308 - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 1309 - HUF_DECODE_SYMBOLX2_0(op4, &bitD4); 1310 - endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; 1311 - endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; 854 + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 855 + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); 856 + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 857 + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); 858 + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 859 + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); 860 + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 861 + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); 862 + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; 863 + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; 864 + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 865 + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); 866 + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 867 + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); 868 + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 869 + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); 870 + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 871 + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); 872 + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; 873 + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; 1312 874 #else 1313 - 
HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 1314 - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 1315 - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 1316 - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 1317 - HUF_DECODE_SYMBOLX2_1(op1, &bitD1); 1318 - HUF_DECODE_SYMBOLX2_1(op2, &bitD2); 1319 - HUF_DECODE_SYMBOLX2_1(op3, &bitD3); 1320 - HUF_DECODE_SYMBOLX2_1(op4, &bitD4); 1321 - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 1322 - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 1323 - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 1324 - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 1325 - HUF_DECODE_SYMBOLX2_0(op1, &bitD1); 1326 - HUF_DECODE_SYMBOLX2_0(op2, &bitD2); 1327 - HUF_DECODE_SYMBOLX2_0(op3, &bitD3); 1328 - HUF_DECODE_SYMBOLX2_0(op4, &bitD4); 1329 - endSignal = (U32)LIKELY((U32) 1330 - (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) 1331 - & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) 1332 - & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) 1333 - & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished)); 875 + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 876 + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 877 + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 878 + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 879 + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); 880 + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); 881 + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); 882 + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); 883 + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); 884 + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); 885 + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); 886 + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); 887 + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); 888 + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); 889 + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); 890 + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); 891 + endSignal = (U32)LIKELY((U32) 892 + (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) 893 + & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) 894 + & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) 895 + & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished)); 1334 896 #endif 897 + } 1335 898 } 1336 
899 1337 900 /* check corruption */ ··· 1358 915 } 1359 916 } 1360 917 918 + #if HUF_NEED_BMI2_FUNCTION 919 + static BMI2_TARGET_ATTRIBUTE 920 + size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, 921 + size_t cSrcSize, HUF_DTable const* DTable) { 922 + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); 923 + } 924 + #endif 925 + 926 + #if HUF_NEED_DEFAULT_FUNCTION 927 + static 928 + size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, 929 + size_t cSrcSize, HUF_DTable const* DTable) { 930 + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); 931 + } 932 + #endif 933 + 934 + #if ZSTD_ENABLE_ASM_X86_64_BMI2 935 + 936 + HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; 937 + 938 + static HUF_ASM_X86_64_BMI2_ATTRS size_t 939 + HUF_decompress4X2_usingDTable_internal_bmi2_asm( 940 + void* dst, size_t dstSize, 941 + const void* cSrc, size_t cSrcSize, 942 + const HUF_DTable* DTable) { 943 + void const* dt = DTable + 1; 944 + const BYTE* const iend = (const BYTE*)cSrc + 6; 945 + BYTE* const oend = (BYTE*)dst + dstSize; 946 + HUF_DecompressAsmArgs args; 947 + { 948 + size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); 949 + FORWARD_IF_ERROR(ret, "Failed to init asm args"); 950 + if (ret != 0) 951 + return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); 952 + } 953 + 954 + assert(args.ip[0] >= args.ilimit); 955 + HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); 956 + 957 + /* note : op4 already verified within main loop */ 958 + assert(args.ip[0] >= iend); 959 + assert(args.ip[1] >= iend); 960 + assert(args.ip[2] >= iend); 961 + assert(args.ip[3] >= iend); 962 + assert(args.op[3] <= oend); 963 + (void)iend; 964 + 965 + /* finish bitStreams one by one */ 966 + { 
967 + size_t const segmentSize = (dstSize+3) / 4; 968 + BYTE* segmentEnd = (BYTE*)dst; 969 + int i; 970 + for (i = 0; i < 4; ++i) { 971 + BIT_DStream_t bit; 972 + if (segmentSize <= (size_t)(oend - segmentEnd)) 973 + segmentEnd += segmentSize; 974 + else 975 + segmentEnd = oend; 976 + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); 977 + args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG); 978 + if (args.op[i] != segmentEnd) 979 + return ERROR(corruption_detected); 980 + } 981 + } 982 + 983 + /* decoded size */ 984 + return dstSize; 985 + } 986 + #endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ 987 + 988 + static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, 989 + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) 990 + { 991 + #if DYNAMIC_BMI2 992 + if (bmi2) { 993 + # if ZSTD_ENABLE_ASM_X86_64_BMI2 994 + return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); 995 + # else 996 + return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); 997 + # endif 998 + } 999 + #else 1000 + (void)bmi2; 1001 + #endif 1002 + 1003 + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) 1004 + return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); 1005 + #else 1006 + return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); 1007 + #endif 1008 + } 1009 + 1361 1010 HUF_DGEN(HUF_decompress1X2_usingDTable_internal) 1362 - HUF_DGEN(HUF_decompress4X2_usingDTable_internal) 1363 1011 1364 1012 size_t HUF_decompress1X2_usingDTable( 1365 1013 void* dst, size_t dstSize, ··· 1559 1025 1560 1026 #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) 1561 1027 typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; 1562 - static const algo_time_t algoTime[16 /* Quantization */][3 /* single, 
double, quad */] = 1028 + static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] = 1563 1029 { 1564 1030 /* single, double, quad */ 1565 - {{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */ 1566 - {{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */ 1567 - {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */ 1568 - {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */ 1569 - {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */ 1570 - {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */ 1571 - {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */ 1572 - {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */ 1573 - {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */ 1574 - {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */ 1575 - {{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */ 1576 - {{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */ 1577 - {{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */ 1578 - {{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */ 1579 - {{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */ 1580 - {{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */ 1031 + {{0,0}, {1,1}}, /* Q==0 : impossible */ 1032 + {{0,0}, {1,1}}, /* Q==1 : impossible */ 1033 + {{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */ 1034 + {{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */ 1035 + {{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */ 1036 + {{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */ 1037 + {{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */ 1038 + {{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */ 1039 + {{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */ 1040 + {{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */ 1041 + {{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */ 1042 + {{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */ 1043 + {{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */ 1044 + {{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */ 1045 + {{1377,185}, {1731,202}}, /* Q ==14 
: 87-93% */ 1046 + {{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */ 1581 1047 }; 1582 1048 #endif 1583 1049 ··· 1604 1070 U32 const D256 = (U32)(dstSize >> 8); 1605 1071 U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); 1606 1072 U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); 1607 - DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */ 1073 + DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */ 1608 1074 return DTime1 < DTime0; 1609 1075 } 1610 1076 #endif
+63 -17
lib/zstd/decompress/zstd_decompress.c
··· 53 53 * Dependencies 54 54 *********************************************************/ 55 55 #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ 56 - #include "../common/cpu.h" /* bmi2 */ 57 56 #include "../common/mem.h" /* low level memory routines */ 58 57 #define FSE_STATIC_LINKING_ONLY 59 58 #include "../common/fse.h" ··· 251 252 dctx->inBuffSize = 0; 252 253 dctx->outBuffSize = 0; 253 254 dctx->streamStage = zdss_init; 254 - dctx->legacyContext = NULL; 255 - dctx->previousLegacyVersion = 0; 256 255 dctx->noForwardProgress = 0; 257 256 dctx->oversizedDuration = 0; 258 - dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); 257 + #if DYNAMIC_BMI2 258 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); 259 + #endif 259 260 dctx->ddictSet = NULL; 260 261 ZSTD_DCtx_resetParameters(dctx); 261 262 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION ··· 276 277 return dctx; 277 278 } 278 279 279 - ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) 280 - { 280 + static ZSTD_DCtx* ZSTD_createDCtx_internal(ZSTD_customMem customMem) { 281 281 if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; 282 282 283 283 { ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_customMalloc(sizeof(*dctx), customMem); ··· 287 289 } 288 290 } 289 291 292 + ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) 293 + { 294 + return ZSTD_createDCtx_internal(customMem); 295 + } 296 + 290 297 ZSTD_DCtx* ZSTD_createDCtx(void) 291 298 { 292 299 DEBUGLOG(3, "ZSTD_createDCtx"); 293 - return ZSTD_createDCtx_advanced(ZSTD_defaultCMem); 300 + return ZSTD_createDCtx_internal(ZSTD_defaultCMem); 294 301 } 295 302 296 303 static void ZSTD_clearDict(ZSTD_DCtx* dctx) ··· 368 365 if (size < ZSTD_FRAMEIDSIZE) return 0; 369 366 { U32 const magic = MEM_readLE32(buffer); 370 367 if (magic == ZSTD_MAGICNUMBER) return 1; 368 + if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1; 369 + } 370 + return 0; 371 + } 372 + 373 + /*! 
ZSTD_isSkippableFrame() : 374 + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. 375 + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. 376 + */ 377 + unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size) 378 + { 379 + if (size < ZSTD_FRAMEIDSIZE) return 0; 380 + { U32 const magic = MEM_readLE32(buffer); 371 381 if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1; 372 382 } 373 383 return 0; ··· 513 497 return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1); 514 498 } 515 499 516 - 517 500 /* ZSTD_getFrameContentSize() : 518 501 * compatible with legacy mode 519 502 * @return : decompressed size of the single frame pointed to be `src` if known, otherwise ··· 545 530 RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); 546 531 return skippableSize; 547 532 } 533 + } 534 + 535 + /*! ZSTD_readSkippableFrame() : 536 + * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. 537 + * 538 + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, 539 + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested 540 + * in the magicVariant. 541 + * 542 + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. 543 + * 544 + * @return : number of bytes written or a ZSTD error. 
545 + */ 546 + ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, 547 + const void* src, size_t srcSize) 548 + { 549 + U32 const magicNumber = MEM_readLE32(src); 550 + size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); 551 + size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; 552 + 553 + /* check input validity */ 554 + RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); 555 + RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); 556 + RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); 557 + 558 + /* deliver payload */ 559 + if (skippableContentSize > 0 && dst != NULL) 560 + ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); 561 + if (magicVariant != NULL) 562 + *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; 563 + return skippableContentSize; 548 564 } 549 565 550 566 /* ZSTD_findDecompressedSize() : ··· 870 824 switch(blockProperties.blockType) 871 825 { 872 826 case bt_compressed: 873 - decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1); 827 + decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1, not_streaming); 874 828 break; 875 829 case bt_raw : 876 830 decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, cBlockSize); ··· 1022 976 { 1023 977 #if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=1) 1024 978 size_t regenSize; 1025 - ZSTD_DCtx* const dctx = ZSTD_createDCtx(); 979 + ZSTD_DCtx* const dctx = ZSTD_createDCtx_internal(ZSTD_defaultCMem); 1026 980 RETURN_ERROR_IF(dctx==NULL, memory_allocation, "NULL pointer!"); 1027 981 regenSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize); 1028 982 ZSTD_freeDCtx(dctx); ··· 1042 996 size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return 
dctx->expected; } 1043 997 1044 998 /* 1045 - * Similar to ZSTD_nextSrcSizeToDecompress(), but when when a block input can be streamed, 999 + * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, 1046 1000 * we allow taking a partial block as the input. Currently only raw uncompressed blocks can 1047 1001 * be streamed. 1048 1002 * ··· 1056 1010 return dctx->expected; 1057 1011 if (dctx->bType != bt_raw) 1058 1012 return dctx->expected; 1059 - return MIN(MAX(inputSize, 1), dctx->expected); 1013 + return BOUNDED(1, inputSize, dctx->expected); 1060 1014 } 1061 1015 1062 1016 ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { ··· 1162 1116 { 1163 1117 case bt_compressed: 1164 1118 DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); 1165 - rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1); 1119 + rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); 1166 1120 dctx->expected = 0; /* Streaming not supported */ 1167 1121 break; 1168 1122 case bt_raw : ··· 1484 1438 ZSTD_DStream* ZSTD_createDStream(void) 1485 1439 { 1486 1440 DEBUGLOG(3, "ZSTD_createDStream"); 1487 - return ZSTD_createDStream_advanced(ZSTD_defaultCMem); 1441 + return ZSTD_createDCtx_internal(ZSTD_defaultCMem); 1488 1442 } 1489 1443 1490 1444 ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize) ··· 1494 1448 1495 1449 ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem) 1496 1450 { 1497 - return ZSTD_createDCtx_advanced(customMem); 1451 + return ZSTD_createDCtx_internal(customMem); 1498 1452 } 1499 1453 1500 1454 size_t ZSTD_freeDStream(ZSTD_DStream* zds) ··· 1754 1708 size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) 1755 1709 { 1756 1710 size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); 1757 - unsigned long long const neededRBSize = windowSize + blockSize + 
(WILDCOPY_OVERLENGTH * 2); 1711 + /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ 1712 + unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); 1758 1713 unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); 1759 1714 size_t const minRBSize = (size_t) neededSize; 1760 1715 RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, ··· 1889 1842 DEBUGLOG(5, "stage zdss_init => transparent reset "); 1890 1843 zds->streamStage = zdss_loadHeader; 1891 1844 zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; 1892 - zds->legacyVersion = 0; 1893 1845 zds->hostageByte = 0; 1894 1846 zds->expectedOutBuffer = *output; 1895 1847 ZSTD_FALLTHROUGH;
+776 -244
lib/zstd/decompress/zstd_decompress_block.c
··· 69 69 } 70 70 } 71 71 72 + /* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */ 73 + static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, 74 + const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) 75 + { 76 + if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) 77 + { 78 + /* room for litbuffer to fit without read faulting */ 79 + dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; 80 + dctx->litBufferEnd = dctx->litBuffer + litSize; 81 + dctx->litBufferLocation = ZSTD_in_dst; 82 + } 83 + else if (litSize > ZSTD_LITBUFFEREXTRASIZE) 84 + { 85 + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ 86 + if (splitImmediately) { 87 + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ 88 + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; 89 + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; 90 + } 91 + else { 92 + /* initially this will be stored entirely in dst during huffman decoding, it will be partially shifted to litExtraBuffer after */ 93 + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; 94 + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; 95 + } 96 + dctx->litBufferLocation = ZSTD_split; 97 + } 98 + else 99 + { 100 + /* fits entirely within litExtraBuffer, so no split is necessary */ 101 + dctx->litBuffer = dctx->litExtraBuffer; 102 + dctx->litBufferEnd = dctx->litBuffer + litSize; 103 + dctx->litBufferLocation = ZSTD_not_in_dst; 104 + } 105 + } 72 106 73 107 /* Hidden declaration for fullbench */ 74 108 size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, 75 - const void* src, size_t
srcSize); 109 + const void* src, size_t srcSize, 110 + void* dst, size_t dstCapacity, const streaming_operation streaming); 76 111 /*! ZSTD_decodeLiteralsBlock() : 112 + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored 113 + * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current 114 + * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being 115 + * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write. 116 + * 77 117 * @return : nb of bytes read from src (< srcSize ) 78 118 * note : symbol not declared but exposed for fullbench */ 79 119 size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, 80 - const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */ 120 + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ 121 + void* dst, size_t dstCapacity, const streaming_operation streaming) 81 122 { 82 123 DEBUGLOG(5, "ZSTD_decodeLiteralsBlock"); 83 124 RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); ··· 140 99 U32 const lhlCode = (istart[0] >> 2) & 3; 141 100 U32 const lhc = MEM_readLE32(istart); 142 101 size_t hufSuccess; 102 + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); 143 103 switch(lhlCode) 144 104 { 145 105 case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ ··· 163 121 litCSize = (lhc >> 22) + ((size_t)istart[4] << 10); 164 122 break; 165 123 } 124 + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); 166 125 RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); 167 126 RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); 127 + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); 128 + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, 
expectedWriteSize, 0); 168 129 169 130 /* prefetch huffman table if cold */ 170 131 if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) { ··· 178 133 if (singleStream) { 179 134 hufSuccess = HUF_decompress1X_usingDTable_bmi2( 180 135 dctx->litBuffer, litSize, istart+lhSize, litCSize, 181 - dctx->HUFptr, dctx->bmi2); 136 + dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); 182 137 } else { 183 138 hufSuccess = HUF_decompress4X_usingDTable_bmi2( 184 139 dctx->litBuffer, litSize, istart+lhSize, litCSize, 185 - dctx->HUFptr, dctx->bmi2); 140 + dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); 186 141 } 187 142 } else { 188 143 if (singleStream) { ··· 195 150 hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( 196 151 dctx->entropy.hufTable, dctx->litBuffer, litSize, 197 152 istart+lhSize, litCSize, dctx->workspace, 198 - sizeof(dctx->workspace), dctx->bmi2); 153 + sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); 199 154 #endif 200 155 } else { 201 156 hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( 202 157 dctx->entropy.hufTable, dctx->litBuffer, litSize, 203 158 istart+lhSize, litCSize, dctx->workspace, 204 - sizeof(dctx->workspace), dctx->bmi2); 159 + sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); 205 160 } 161 + } 162 + if (dctx->litBufferLocation == ZSTD_split) 163 + { 164 + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); 165 + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); 166 + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; 167 + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; 206 168 } 207 169 208 170 RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); ··· 218 166 dctx->litSize = litSize; 219 167 dctx->litEntropy = 1; 220 168 if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable; 221 - ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); 222 169 return litCSize + lhSize; 
223 170 } 224 171 225 172 case set_basic: 226 173 { size_t litSize, lhSize; 227 174 U32 const lhlCode = ((istart[0]) >> 2) & 3; 175 + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); 228 176 switch(lhlCode) 229 177 { 230 178 case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ ··· 241 189 break; 242 190 } 243 191 192 + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); 193 + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); 194 + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); 244 195 if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ 245 196 RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, ""); 246 - ZSTD_memcpy(dctx->litBuffer, istart+lhSize, litSize); 197 + if (dctx->litBufferLocation == ZSTD_split) 198 + { 199 + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE); 200 + ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); 201 + } 202 + else 203 + { 204 + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize); 205 + } 247 206 dctx->litPtr = dctx->litBuffer; 248 207 dctx->litSize = litSize; 249 - ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); 250 208 return lhSize+litSize; 251 209 } 252 210 /* direct reference into compressed stream */ 253 211 dctx->litPtr = istart+lhSize; 254 212 dctx->litSize = litSize; 213 + dctx->litBufferEnd = dctx->litPtr + litSize; 214 + dctx->litBufferLocation = ZSTD_not_in_dst; 255 215 return lhSize+litSize; 256 216 } 257 217 258 218 case set_rle: 259 219 { U32 const lhlCode = ((istart[0]) >> 2) & 3; 260 220 size_t litSize, lhSize; 221 + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); 261 222 switch(lhlCode) 262 223 { 263 224 case 0: case 2: default: /* note : default is impossible, since lhlCode into 
[0..3] */ ··· 287 222 RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); 288 223 break; 289 224 } 225 + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); 290 226 RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); 291 - ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH); 227 + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); 228 + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); 229 + if (dctx->litBufferLocation == ZSTD_split) 230 + { 231 + ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE); 232 + ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE); 233 + } 234 + else 235 + { 236 + ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize); 237 + } 292 238 dctx->litPtr = dctx->litBuffer; 293 239 dctx->litSize = litSize; 294 240 return lhSize+1; ··· 419 343 }; /* ML_defaultDTable */ 420 344 421 345 422 - static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits) 346 + static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits) 423 347 { 424 348 void* ptr = dt; 425 349 ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr; ··· 431 355 cell->nbBits = 0; 432 356 cell->nextState = 0; 433 357 assert(nbAddBits < 255); 434 - cell->nbAdditionalBits = (BYTE)nbAddBits; 358 + cell->nbAdditionalBits = nbAddBits; 435 359 cell->baseValue = baseValue; 436 360 } 437 361 ··· 443 367 FORCE_INLINE_TEMPLATE 444 368 void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, 445 369 const short* normalizedCounter, unsigned maxSymbolValue, 446 - const U32* baseValue, const U32* nbAdditionalBits, 370 + const U32* baseValue, const U8* nbAdditionalBits, 447 371 unsigned tableLog, void* wksp, size_t wkspSize) 448 372 { 449 373 ZSTD_seqSymbol* const tableDecode = dt+1; ··· 554 478 
tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) ); 555 479 tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize); 556 480 assert(nbAdditionalBits[symbol] < 255); 557 - tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol]; 481 + tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol]; 558 482 tableDecode[u].baseValue = baseValue[symbol]; 559 483 } 560 484 } ··· 563 487 /* Avoids the FORCE_INLINE of the _body() function. */ 564 488 static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt, 565 489 const short* normalizedCounter, unsigned maxSymbolValue, 566 - const U32* baseValue, const U32* nbAdditionalBits, 490 + const U32* baseValue, const U8* nbAdditionalBits, 567 491 unsigned tableLog, void* wksp, size_t wkspSize) 568 492 { 569 493 ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, ··· 571 495 } 572 496 573 497 #if DYNAMIC_BMI2 574 - TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt, 498 + BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt, 575 499 const short* normalizedCounter, unsigned maxSymbolValue, 576 - const U32* baseValue, const U32* nbAdditionalBits, 500 + const U32* baseValue, const U8* nbAdditionalBits, 577 501 unsigned tableLog, void* wksp, size_t wkspSize) 578 502 { 579 503 ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, ··· 583 507 584 508 void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, 585 509 const short* normalizedCounter, unsigned maxSymbolValue, 586 - const U32* baseValue, const U32* nbAdditionalBits, 510 + const U32* baseValue, const U8* nbAdditionalBits, 587 511 unsigned tableLog, void* wksp, size_t wkspSize, int bmi2) 588 512 { 589 513 #if DYNAMIC_BMI2 ··· 605 529 static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr, 606 530 symbolEncodingType_e type, unsigned max, U32 maxLog, 607 531 const void* src, size_t srcSize, 608 - const U32* 
baseValue, const U32* nbAdditionalBits, 532 + const U32* baseValue, const U8* nbAdditionalBits, 609 533 const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable, 610 534 int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize, 611 535 int bmi2) ··· 617 541 RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, ""); 618 542 { U32 const symbol = *(const BYTE*)src; 619 543 U32 const baseline = baseValue[symbol]; 620 - U32 const nbBits = nbAdditionalBits[symbol]; 544 + U8 const nbBits = nbAdditionalBits[symbol]; 621 545 ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits); 622 546 } 623 547 *DTablePtr = DTableSpace; ··· 696 620 LL_defaultDTable, dctx->fseEntropy, 697 621 dctx->ddictIsCold, nbSeq, 698 622 dctx->workspace, sizeof(dctx->workspace), 699 - dctx->bmi2); 623 + ZSTD_DCtx_get_bmi2(dctx)); 700 624 RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed"); 701 625 ip += llhSize; 702 626 } ··· 708 632 OF_defaultDTable, dctx->fseEntropy, 709 633 dctx->ddictIsCold, nbSeq, 710 634 dctx->workspace, sizeof(dctx->workspace), 711 - dctx->bmi2); 635 + ZSTD_DCtx_get_bmi2(dctx)); 712 636 RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed"); 713 637 ip += ofhSize; 714 638 } ··· 720 644 ML_defaultDTable, dctx->fseEntropy, 721 645 dctx->ddictIsCold, nbSeq, 722 646 dctx->workspace, sizeof(dctx->workspace), 723 - dctx->bmi2); 647 + ZSTD_DCtx_get_bmi2(dctx)); 724 648 RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed"); 725 649 ip += mlhSize; 726 650 } ··· 734 658 size_t litLength; 735 659 size_t matchLength; 736 660 size_t offset; 737 - const BYTE* match; 738 661 } seq_t; 739 662 740 663 typedef struct { ··· 747 672 ZSTD_fseState stateOffb; 748 673 ZSTD_fseState stateML; 749 674 size_t prevOffset[ZSTD_REP_NUM]; 750 - const BYTE* prefixStart; 751 - const BYTE* dictEnd; 752 - size_t pos; 753 675 } seqState_t; 754 676 755 677 /*! 
ZSTD_overlapCopy8() : ··· 789 717 * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart. 790 718 * The src buffer must be before the dst buffer. 791 719 */ 792 - static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) { 720 + static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) { 793 721 ptrdiff_t const diff = op - ip; 794 722 BYTE* const oend = op + length; 795 723 ··· 805 733 /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */ 806 734 assert(length >= 8); 807 735 ZSTD_overlapCopy8(&op, &ip, diff); 736 + length -= 8; 808 737 assert(op - ip >= 8); 809 738 assert(op <= oend); 810 739 } ··· 820 747 assert(oend > oend_w); 821 748 ZSTD_wildcopy(op, ip, oend_w - op, ovtype); 822 749 ip += oend_w - op; 823 - op = oend_w; 750 + op += oend_w - op; 824 751 } 752 + /* Handle the leftovers. */ 753 + while (op < oend) *op++ = *ip++; 754 + } 755 + 756 + /* ZSTD_safecopyDstBeforeSrc(): 757 + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src 758 + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ 759 + static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { 760 + ptrdiff_t const diff = op - ip; 761 + BYTE* const oend = op + length; 762 + 763 + if (length < 8 || diff > -8) { 764 + /* Handle short lengths, close overlaps, and dst not before src. */ 765 + while (op < oend) *op++ = *ip++; 766 + return; 767 + } 768 + 769 + if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) { 770 + ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap); 771 + ip += oend - WILDCOPY_OVERLENGTH - op; 772 + op += oend - WILDCOPY_OVERLENGTH - op; 773 + } 774 + 825 775 /* Handle the leftovers. 
*/ 826 776 while (op < oend) *op++ = *ip++; 827 777 } ··· 859 763 */ 860 764 FORCE_NOINLINE 861 765 size_t ZSTD_execSequenceEnd(BYTE* op, 862 - BYTE* const oend, seq_t sequence, 863 - const BYTE** litPtr, const BYTE* const litLimit, 864 - const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 766 + BYTE* const oend, seq_t sequence, 767 + const BYTE** litPtr, const BYTE* const litLimit, 768 + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 865 769 { 866 770 BYTE* const oLitEnd = op + sequence.litLength; 867 771 size_t const sequenceLength = sequence.litLength + sequence.matchLength; ··· 884 788 if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { 885 789 /* offset beyond prefix */ 886 790 RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); 887 - match = dictEnd - (prefixStart-match); 791 + match = dictEnd - (prefixStart - match); 888 792 if (match + sequence.matchLength <= dictEnd) { 889 793 ZSTD_memmove(oLitEnd, match, sequence.matchLength); 890 794 return sequenceLength; 891 795 } 892 796 /* span extDict & currentPrefixSegment */ 893 797 { size_t const length1 = dictEnd - match; 894 - ZSTD_memmove(oLitEnd, match, length1); 895 - op = oLitEnd + length1; 896 - sequence.matchLength -= length1; 897 - match = prefixStart; 898 - } } 798 + ZSTD_memmove(oLitEnd, match, length1); 799 + op = oLitEnd + length1; 800 + sequence.matchLength -= length1; 801 + match = prefixStart; 802 + } 803 + } 804 + ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); 805 + return sequenceLength; 806 + } 807 + 808 + /* ZSTD_execSequenceEndSplitLitBuffer(): 809 + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. 
810 + */ 811 + FORCE_NOINLINE 812 + size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, 813 + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, 814 + const BYTE** litPtr, const BYTE* const litLimit, 815 + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 816 + { 817 + BYTE* const oLitEnd = op + sequence.litLength; 818 + size_t const sequenceLength = sequence.litLength + sequence.matchLength; 819 + const BYTE* const iLitEnd = *litPtr + sequence.litLength; 820 + const BYTE* match = oLitEnd - sequence.offset; 821 + 822 + 823 + /* bounds checks : careful of address space overflow in 32-bit mode */ 824 + RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); 825 + RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); 826 + assert(op < op + sequenceLength); 827 + assert(oLitEnd < op + sequenceLength); 828 + 829 + /* copy literals */ 830 + RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer"); 831 + ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength); 832 + op = oLitEnd; 833 + *litPtr = iLitEnd; 834 + 835 + /* copy Match */ 836 + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { 837 + /* offset beyond prefix */ 838 + RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); 839 + match = dictEnd - (prefixStart - match); 840 + if (match + sequence.matchLength <= dictEnd) { 841 + ZSTD_memmove(oLitEnd, match, sequence.matchLength); 842 + return sequenceLength; 843 + } 844 + /* span extDict & currentPrefixSegment */ 845 + { size_t const length1 = dictEnd - match; 846 + ZSTD_memmove(oLitEnd, match, length1); 847 + op = oLitEnd + length1; 848 + sequence.matchLength -= length1; 849 + match = prefixStart; 850 + } 851 + } 899 852 ZSTD_safecopy(op, oend_w, match, 
sequence.matchLength, ZSTD_overlap_src_before_dst); 900 853 return sequenceLength; 901 854 } 902 855 903 856 HINT_INLINE 904 857 size_t ZSTD_execSequence(BYTE* op, 905 - BYTE* const oend, seq_t sequence, 906 - const BYTE** litPtr, const BYTE* const litLimit, 907 - const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 858 + BYTE* const oend, seq_t sequence, 859 + const BYTE** litPtr, const BYTE* const litLimit, 860 + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 908 861 { 909 862 BYTE* const oLitEnd = op + sequence.litLength; 910 863 size_t const sequenceLength = sequence.litLength + sequence.matchLength; ··· 970 825 * - 32-bit mode and the match length overflows 971 826 */ 972 827 if (UNLIKELY( 828 + iLitEnd > litLimit || 829 + oMatchEnd > oend_w || 830 + (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) 831 + return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); 832 + 833 + /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ 834 + assert(op <= oLitEnd /* No overflow */); 835 + assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); 836 + assert(oMatchEnd <= oend /* No underflow */); 837 + assert(iLitEnd <= litLimit /* Literal length is in bounds */); 838 + assert(oLitEnd <= oend_w /* Can wildcopy literals */); 839 + assert(oMatchEnd <= oend_w /* Can wildcopy matches */); 840 + 841 + /* Copy Literals: 842 + * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. 843 + * We likely don't need the full 32-byte wildcopy. 
844 + */ 845 + assert(WILDCOPY_OVERLENGTH >= 16); 846 + ZSTD_copy16(op, (*litPtr)); 847 + if (UNLIKELY(sequence.litLength > 16)) { 848 + ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap); 849 + } 850 + op = oLitEnd; 851 + *litPtr = iLitEnd; /* update for next sequence */ 852 + 853 + /* Copy Match */ 854 + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { 855 + /* offset beyond prefix -> go into extDict */ 856 + RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); 857 + match = dictEnd + (match - prefixStart); 858 + if (match + sequence.matchLength <= dictEnd) { 859 + ZSTD_memmove(oLitEnd, match, sequence.matchLength); 860 + return sequenceLength; 861 + } 862 + /* span extDict & currentPrefixSegment */ 863 + { size_t const length1 = dictEnd - match; 864 + ZSTD_memmove(oLitEnd, match, length1); 865 + op = oLitEnd + length1; 866 + sequence.matchLength -= length1; 867 + match = prefixStart; 868 + } 869 + } 870 + /* Match within prefix of 1 or more bytes */ 871 + assert(op <= oMatchEnd); 872 + assert(oMatchEnd <= oend_w); 873 + assert(match >= prefixStart); 874 + assert(sequence.matchLength >= 1); 875 + 876 + /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy 877 + * without overlap checking. 878 + */ 879 + if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { 880 + /* We bet on a full wildcopy for matches, since we expect matches to be 881 + * longer than literals (in general). In silesia, ~10% of matches are longer 882 + * than 16 bytes. 883 + */ 884 + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); 885 + return sequenceLength; 886 + } 887 + assert(sequence.offset < WILDCOPY_VECLEN); 888 + 889 + /* Copy 8 bytes and spread the offset to be >= 8. */ 890 + ZSTD_overlapCopy8(&op, &match, sequence.offset); 891 + 892 + /* If the match length is > 8 bytes, then continue with the wildcopy. 
*/ 893 + if (sequence.matchLength > 8) { 894 + assert(op < oMatchEnd); 895 + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst); 896 + } 897 + return sequenceLength; 898 + } 899 + 900 + HINT_INLINE 901 + size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, 902 + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, 903 + const BYTE** litPtr, const BYTE* const litLimit, 904 + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) 905 + { 906 + BYTE* const oLitEnd = op + sequence.litLength; 907 + size_t const sequenceLength = sequence.litLength + sequence.matchLength; 908 + BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ 909 + const BYTE* const iLitEnd = *litPtr + sequence.litLength; 910 + const BYTE* match = oLitEnd - sequence.offset; 911 + 912 + assert(op != NULL /* Precondition */); 913 + assert(oend_w < oend /* No underflow */); 914 + /* Handle edge cases in a slow path: 915 + * - Read beyond end of literals 916 + * - Match end is beyond oend_w 917 + * - 32-bit mode and the match length overflows 918 + */ 919 + if (UNLIKELY( 973 920 iLitEnd > litLimit || 974 921 oMatchEnd > oend_w || 975 922 (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) 976 - return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); 923 + return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); 977 924 978 925 /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ 979 926 assert(op <= oLitEnd /* No overflow */); ··· 1133 896 return sequenceLength; 1134 897 } 1135 898 899 + 1136 900 static void 1137 901 ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt) 1138 902 { ··· 1147 909 } 1148 910 1149 911 FORCE_INLINE_TEMPLATE void 1150 - ZSTD_updateFseState(ZSTD_fseState* 
DStatePtr, BIT_DStream_t* bitD) 912 + ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits) 1151 913 { 1152 - ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state]; 1153 - U32 const nbBits = DInfo.nbBits; 1154 914 size_t const lowBits = BIT_readBits(bitD, nbBits); 1155 - DStatePtr->state = DInfo.nextState + lowBits; 1156 - } 1157 - 1158 - FORCE_INLINE_TEMPLATE void 1159 - ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo) 1160 - { 1161 - U32 const nbBits = DInfo.nbBits; 1162 - size_t const lowBits = BIT_readBits(bitD, nbBits); 1163 - DStatePtr->state = DInfo.nextState + lowBits; 915 + DStatePtr->state = nextState + lowBits; 1164 916 } 1165 917 1166 918 /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum ··· 1164 936 : 0) 1165 937 1166 938 typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; 1167 - typedef enum { ZSTD_p_noPrefetch=0, ZSTD_p_prefetch=1 } ZSTD_prefetch_e; 1168 939 1169 940 FORCE_INLINE_TEMPLATE seq_t 1170 - ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const ZSTD_prefetch_e prefetch) 941 + ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) 1171 942 { 1172 943 seq_t seq; 1173 - ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state]; 1174 - ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state]; 1175 - ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state]; 1176 - U32 const llBase = llDInfo.baseValue; 1177 - U32 const mlBase = mlDInfo.baseValue; 1178 - U32 const ofBase = ofDInfo.baseValue; 1179 - BYTE const llBits = llDInfo.nbAdditionalBits; 1180 - BYTE const mlBits = mlDInfo.nbAdditionalBits; 1181 - BYTE const ofBits = ofDInfo.nbAdditionalBits; 1182 - BYTE const totalBits = llBits+mlBits+ofBits; 944 + const ZSTD_seqSymbol* const llDInfo = 
seqState->stateLL.table + seqState->stateLL.state; 945 + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; 946 + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; 947 + seq.matchLength = mlDInfo->baseValue; 948 + seq.litLength = llDInfo->baseValue; 949 + { U32 const ofBase = ofDInfo->baseValue; 950 + BYTE const llBits = llDInfo->nbAdditionalBits; 951 + BYTE const mlBits = mlDInfo->nbAdditionalBits; 952 + BYTE const ofBits = ofDInfo->nbAdditionalBits; 953 + BYTE const totalBits = llBits+mlBits+ofBits; 1183 954 1184 - /* sequence */ 1185 - { size_t offset; 1186 - if (ofBits > 1) { 1187 - ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); 1188 - ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); 1189 - assert(ofBits <= MaxOff); 1190 - if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { 1191 - U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); 1192 - offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); 1193 - BIT_reloadDStream(&seqState->DStream); 1194 - if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); 1195 - assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ 1196 - } else { 1197 - offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ 1198 - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); 1199 - } 1200 - seqState->prevOffset[2] = seqState->prevOffset[1]; 1201 - seqState->prevOffset[1] = seqState->prevOffset[0]; 1202 - seqState->prevOffset[0] = offset; 1203 - } else { 1204 - U32 const ll0 = (llBase == 0); 1205 - if (LIKELY((ofBits == 0))) { 1206 - if (LIKELY(!ll0)) 1207 - offset = seqState->prevOffset[0]; 1208 - else { 1209 - offset = seqState->prevOffset[1]; 1210 - seqState->prevOffset[1] = seqState->prevOffset[0]; 1211 - seqState->prevOffset[0] = offset; 955 + U16 const llNext = 
llDInfo->nextState; 956 + U16 const mlNext = mlDInfo->nextState; 957 + U16 const ofNext = ofDInfo->nextState; 958 + U32 const llnbBits = llDInfo->nbBits; 959 + U32 const mlnbBits = mlDInfo->nbBits; 960 + U32 const ofnbBits = ofDInfo->nbBits; 961 + /* 962 + * As gcc has better branch and block analyzers, sometimes it is only 963 + * valuable to mark likeliness for clang, where it gives around 3-4% of 964 + * performance. 965 + */ 966 + 967 + /* sequence */ 968 + { size_t offset; 969 + #if defined(__clang__) 970 + if (LIKELY(ofBits > 1)) { 971 + #else 972 + if (ofBits > 1) { 973 + #endif 974 + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); 975 + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); 976 + assert(ofBits <= MaxOff); 977 + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { 978 + U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); 979 + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); 980 + BIT_reloadDStream(&seqState->DStream); 981 + if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); 982 + assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ 983 + } else { 984 + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ 985 + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); 1212 986 } 987 + seqState->prevOffset[2] = seqState->prevOffset[1]; 988 + seqState->prevOffset[1] = seqState->prevOffset[0]; 989 + seqState->prevOffset[0] = offset; 1213 990 } else { 1214 - offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); 1215 - { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; 1216 - temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ 1217 - if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; 1218 - seqState->prevOffset[1] = seqState->prevOffset[0]; 1219 - seqState->prevOffset[0] = offset = temp; 1220 - } } } 1221 - seq.offset = offset; 1222 - } 1223 - 1224 - seq.matchLength = mlBase; 1225 - if (mlBits > 0) 1226 - seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); 1227 - 1228 - if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) 1229 - BIT_reloadDStream(&seqState->DStream); 1230 - if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) 1231 - BIT_reloadDStream(&seqState->DStream); 1232 - /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ 1233 - ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); 1234 - 1235 - seq.litLength = llBase; 1236 - if (llBits > 0) 1237 - seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); 1238 - 1239 - if (MEM_32bits()) 1240 - BIT_reloadDStream(&seqState->DStream); 1241 - 1242 - DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", 1243 - (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); 1244 - 1245 - if (prefetch == ZSTD_p_prefetch) { 1246 - size_t const pos = seqState->pos + seq.litLength; 1247 - const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart; 1248 - seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. 1249 - * No consequence though : no memory access will occur, offset is only used for prefetching */ 1250 - seqState->pos = pos + seq.matchLength; 1251 - } 1252 - 1253 - /* ANS state update 1254 - * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo(). 
1255 - * clang-9.2.0 does 7% worse with ZSTD_updateFseState(). 1256 - * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the 1257 - * better option, so it is the default for other compilers. But, if you 1258 - * measure that it is worse, please put up a pull request. 1259 - */ 1260 - { 1261 - #if !defined(__clang__) 1262 - const int kUseUpdateFseState = 1; 1263 - #else 1264 - const int kUseUpdateFseState = 0; 1265 - #endif 1266 - if (kUseUpdateFseState) { 1267 - ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ 1268 - ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ 1269 - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ 1270 - ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ 1271 - } else { 1272 - ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */ 1273 - ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */ 1274 - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ 1275 - ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */ 991 + U32 const ll0 = (llDInfo->baseValue == 0); 992 + if (LIKELY((ofBits == 0))) { 993 + offset = seqState->prevOffset[ll0]; 994 + seqState->prevOffset[1] = seqState->prevOffset[!ll0]; 995 + seqState->prevOffset[0] = offset; 996 + } else { 997 + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); 998 + { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; 999 + temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ 1000 + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; 1001 + seqState->prevOffset[1] = seqState->prevOffset[0]; 1002 + seqState->prevOffset[0] = offset = temp; 1003 + } } } 1004 + seq.offset = offset; 1276 1005 } 1006 + 1007 + #if defined(__clang__) 1008 + if (UNLIKELY(mlBits > 0)) 1009 + #else 1010 + if (mlBits > 0) 1011 + #endif 1012 + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); 1013 + 1014 + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) 1015 + BIT_reloadDStream(&seqState->DStream); 1016 + if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) 1017 + BIT_reloadDStream(&seqState->DStream); 1018 + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ 1019 + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); 1020 + 1021 + #if defined(__clang__) 1022 + if (UNLIKELY(llBits > 0)) 1023 + #else 1024 + if (llBits > 0) 1025 + #endif 1026 + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); 1027 + 1028 + if (MEM_32bits()) 1029 + BIT_reloadDStream(&seqState->DStream); 1030 + 1031 + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", 1032 + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); 1033 + 1034 + ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ 1035 + ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ 1036 + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ 1037 + ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ 1277 1038 } 1278 1039 1279 1040 return seq; ··· 1315 1098 #endif 1316 1099 1317 1100 #ifndef 
ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 1101 + 1102 + 1318 1103 FORCE_INLINE_TEMPLATE size_t 1319 1104 DONT_VECTORIZE 1320 - ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, 1105 + ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, 1321 1106 void* dst, size_t maxDstSize, 1322 1107 const void* seqStart, size_t seqSize, int nbSeq, 1323 1108 const ZSTD_longOffset_e isLongOffset, ··· 1331 1112 BYTE* const oend = ostart + maxDstSize; 1332 1113 BYTE* op = ostart; 1333 1114 const BYTE* litPtr = dctx->litPtr; 1334 - const BYTE* const litEnd = litPtr + dctx->litSize; 1115 + const BYTE* litBufferEnd = dctx->litBufferEnd; 1335 1116 const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); 1336 1117 const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); 1337 1118 const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); 1338 - DEBUGLOG(5, "ZSTD_decompressSequences_body"); 1119 + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); 1339 1120 (void)frame; 1340 1121 1341 1122 /* Regen sequences */ 1342 1123 if (nbSeq) { 1343 1124 seqState_t seqState; 1344 - size_t error = 0; 1345 1125 dctx->fseEntropy = 1; 1346 1126 { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } 1347 1127 RETURN_ERROR_IF( ··· 1356 1138 BIT_DStream_endOfBuffer < BIT_DStream_completed && 1357 1139 BIT_DStream_completed < BIT_DStream_overflow); 1358 1140 1141 + /* decompress without overrunning litPtr begins */ 1142 + { 1143 + seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); 1144 + /* Align the decompression loop to 32 + 16 bytes. 1145 + * 1146 + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression 1147 + * speed swings based on the alignment of the decompression loop. This 1148 + * performance swing is caused by parts of the decompression loop falling 1149 + * out of the DSB. The entire decompression loop should fit in the DSB, 1150 + * when it can't we get much worse performance. 
You can measure if you've 1151 + * hit the good case or the bad case with this perf command for some 1152 + * compressed file test.zst: 1153 + * 1154 + * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ 1155 + * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst 1156 + * 1157 + * If you see most cycles served out of the MITE you've hit the bad case. 1158 + * If you see most cycles served out of the DSB you've hit the good case. 1159 + * If it is pretty even then you may be in an okay case. 1160 + * 1161 + * This issue has been reproduced on the following CPUs: 1162 + * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 1163 + * Use Instruments->Counters to get DSB/MITE cycles. 1164 + * I never got performance swings, but I was able to 1165 + * go from the good case of mostly DSB to half of the 1166 + * cycles served from MITE. 1167 + * - Coffeelake: Intel i9-9900k 1168 + * - Coffeelake: Intel i7-9700k 1169 + * 1170 + * I haven't been able to reproduce the instability or DSB misses on any 1171 + * of the following CPUS: 1172 + * - Haswell 1173 + * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH 1174 + * - Skylake 1175 + * 1176 + * Alignment is done for each of the three major decompression loops: 1177 + * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer 1178 + * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer 1179 + * - ZSTD_decompressSequences_body 1180 + * Alignment choices are made to minimize large swings on bad cases and influence on performance 1181 + * from changes external to this code, rather than to overoptimize on the current commit. 1182 + * 1183 + * If you are seeing performance stability this script can help test. 1184 + * It tests on 4 commits in zstd where I saw performance change. 
1185 + * 1186 + * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 1187 + */ 1359 1188 #if defined(__x86_64__) 1360 - /* Align the decompression loop to 32 + 16 bytes. 1361 - * 1362 - * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression 1363 - * speed swings based on the alignment of the decompression loop. This 1364 - * performance swing is caused by parts of the decompression loop falling 1365 - * out of the DSB. The entire decompression loop should fit in the DSB, 1366 - * when it can't we get much worse performance. You can measure if you've 1367 - * hit the good case or the bad case with this perf command for some 1368 - * compressed file test.zst: 1369 - * 1370 - * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ 1371 - * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst 1372 - * 1373 - * If you see most cycles served out of the MITE you've hit the bad case. 1374 - * If you see most cycles served out of the DSB you've hit the good case. 1375 - * If it is pretty even then you may be in an okay case. 1376 - * 1377 - * I've been able to reproduce this issue on the following CPUs: 1378 - * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 1379 - * Use Instruments->Counters to get DSB/MITE cycles. 1380 - * I never got performance swings, but I was able to 1381 - * go from the good case of mostly DSB to half of the 1382 - * cycles served from MITE. 1383 - * - Coffeelake: Intel i9-9900k 1384 - * 1385 - * I haven't been able to reproduce the instability or DSB misses on any 1386 - * of the following CPUS: 1387 - * - Haswell 1388 - * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH 1389 - * - Skylake 1390 - * 1391 - * If you are seeing performance stability this script can help test. 1392 - * It tests on 4 commits in zstd where I saw performance change. 
1393 - * 1394 - * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 1395 - */ 1396 - __asm__(".p2align 5"); 1397 - __asm__("nop"); 1398 - __asm__(".p2align 4"); 1189 + __asm__(".p2align 6"); 1190 + # if __GNUC__ >= 7 1191 + /* good for gcc-7, gcc-9, and gcc-11 */ 1192 + __asm__("nop"); 1193 + __asm__(".p2align 5"); 1194 + __asm__("nop"); 1195 + __asm__(".p2align 4"); 1196 + # if __GNUC__ == 8 || __GNUC__ == 10 1197 + /* good for gcc-8 and gcc-10 */ 1198 + __asm__("nop"); 1199 + __asm__(".p2align 3"); 1200 + # endif 1201 + # endif 1399 1202 #endif 1203 + 1204 + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ 1205 + for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { 1206 + size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); 1207 + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1208 + assert(!ZSTD_isError(oneSeqSize)); 1209 + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1210 + #endif 1211 + if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1212 + return oneSeqSize; 1213 + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1214 + op += oneSeqSize; 1215 + if (UNLIKELY(!--nbSeq)) 1216 + break; 1217 + BIT_reloadDStream(&(seqState.DStream)); 1218 + sequence = ZSTD_decodeSequence(&seqState, isLongOffset); 1219 + } 1220 + 1221 + /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ 1222 + if (nbSeq > 0) { 1223 + const size_t leftoverLit = dctx->litBufferEnd - litPtr; 1224 + if (leftoverLit) 1225 + { 1226 + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); 1227 + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); 1228 + 
sequence.litLength -= leftoverLit; 1229 + op += leftoverLit; 1230 + } 1231 + litPtr = dctx->litExtraBuffer; 1232 + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1233 + dctx->litBufferLocation = ZSTD_not_in_dst; 1234 + { 1235 + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); 1236 + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1237 + assert(!ZSTD_isError(oneSeqSize)); 1238 + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1239 + #endif 1240 + if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1241 + return oneSeqSize; 1242 + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1243 + op += oneSeqSize; 1244 + if (--nbSeq) 1245 + BIT_reloadDStream(&(seqState.DStream)); 1246 + } 1247 + } 1248 + } 1249 + 1250 + if (nbSeq > 0) /* there is remaining lit from extra buffer */ 1251 + { 1252 + 1253 + #if defined(__x86_64__) 1254 + __asm__(".p2align 6"); 1255 + __asm__("nop"); 1256 + # if __GNUC__ != 7 1257 + /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */ 1258 + __asm__(".p2align 4"); 1259 + __asm__("nop"); 1260 + __asm__(".p2align 3"); 1261 + # elif __GNUC__ >= 11 1262 + __asm__(".p2align 3"); 1263 + # else 1264 + __asm__(".p2align 5"); 1265 + __asm__("nop"); 1266 + __asm__(".p2align 3"); 1267 + # endif 1268 + #endif 1269 + 1270 + for (; ; ) { 1271 + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); 1272 + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); 1273 + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1274 + assert(!ZSTD_isError(oneSeqSize)); 1275 + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1276 + #endif 1277 + if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1278 + return oneSeqSize; 1279 + DEBUGLOG(6, "regenerated 
sequence size : %u", (U32)oneSeqSize); 1280 + op += oneSeqSize; 1281 + if (UNLIKELY(!--nbSeq)) 1282 + break; 1283 + BIT_reloadDStream(&(seqState.DStream)); 1284 + } 1285 + } 1286 + 1287 + /* check if reached exact end */ 1288 + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); 1289 + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); 1290 + RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); 1291 + /* save reps for next block */ 1292 + { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } 1293 + } 1294 + 1295 + /* last literal segment */ 1296 + if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ 1297 + { 1298 + size_t const lastLLSize = litBufferEnd - litPtr; 1299 + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); 1300 + if (op != NULL) { 1301 + ZSTD_memmove(op, litPtr, lastLLSize); 1302 + op += lastLLSize; 1303 + } 1304 + litPtr = dctx->litExtraBuffer; 1305 + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1306 + dctx->litBufferLocation = ZSTD_not_in_dst; 1307 + } 1308 + { size_t const lastLLSize = litBufferEnd - litPtr; 1309 + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); 1310 + if (op != NULL) { 1311 + ZSTD_memcpy(op, litPtr, lastLLSize); 1312 + op += lastLLSize; 1313 + } 1314 + } 1315 + 1316 + return op-ostart; 1317 + } 1318 + 1319 + FORCE_INLINE_TEMPLATE size_t 1320 + DONT_VECTORIZE 1321 + ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, 1322 + void* dst, size_t maxDstSize, 1323 + const void* seqStart, size_t seqSize, int nbSeq, 1324 + const ZSTD_longOffset_e isLongOffset, 1325 + const int frame) 1326 + { 1327 + const BYTE* ip = (const BYTE*)seqStart; 1328 + const BYTE* const iend = ip + seqSize; 1329 + BYTE* const ostart = (BYTE*)dst; 1330 + BYTE* const oend = dctx->litBufferLocation 
== ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; 1331 + BYTE* op = ostart; 1332 + const BYTE* litPtr = dctx->litPtr; 1333 + const BYTE* const litEnd = litPtr + dctx->litSize; 1334 + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); 1335 + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); 1336 + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); 1337 + DEBUGLOG(5, "ZSTD_decompressSequences_body"); 1338 + (void)frame; 1339 + 1340 + /* Regen sequences */ 1341 + if (nbSeq) { 1342 + seqState_t seqState; 1343 + dctx->fseEntropy = 1; 1344 + { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } 1345 + RETURN_ERROR_IF( 1346 + ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)), 1347 + corruption_detected, ""); 1348 + ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); 1349 + ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); 1350 + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); 1351 + assert(dst != NULL); 1352 + 1353 + ZSTD_STATIC_ASSERT( 1354 + BIT_DStream_unfinished < BIT_DStream_completed && 1355 + BIT_DStream_endOfBuffer < BIT_DStream_completed && 1356 + BIT_DStream_completed < BIT_DStream_overflow); 1357 + 1358 + #if defined(__x86_64__) 1359 + __asm__(".p2align 6"); 1360 + __asm__("nop"); 1361 + # if __GNUC__ >= 7 1362 + __asm__(".p2align 5"); 1363 + __asm__("nop"); 1364 + __asm__(".p2align 3"); 1365 + # else 1366 + __asm__(".p2align 4"); 1367 + __asm__("nop"); 1368 + __asm__(".p2align 3"); 1369 + # endif 1370 + #endif 1371 + 1400 1372 for ( ; ; ) { 1401 - seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_noPrefetch); 1373 + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); 1402 1374 size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); 1403 1375 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && 
defined(FUZZING_ASSERT_VALID_SEQUENCE) 1404 1376 assert(!ZSTD_isError(oneSeqSize)); 1405 1377 if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); 1406 1378 #endif 1379 + if (UNLIKELY(ZSTD_isError(oneSeqSize))) 1380 + return oneSeqSize; 1407 1381 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); 1408 - BIT_reloadDStream(&(seqState.DStream)); 1409 1382 op += oneSeqSize; 1410 - /* gcc and clang both don't like early returns in this loop. 1411 - * Instead break and check for an error at the end of the loop. 1412 - */ 1413 - if (UNLIKELY(ZSTD_isError(oneSeqSize))) { 1414 - error = oneSeqSize; 1383 + if (UNLIKELY(!--nbSeq)) 1415 1384 break; 1416 - } 1417 - if (UNLIKELY(!--nbSeq)) break; 1385 + BIT_reloadDStream(&(seqState.DStream)); 1418 1386 } 1419 1387 1420 1388 /* check if reached exact end */ 1421 1389 DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); 1422 - if (ZSTD_isError(error)) return error; 1423 1390 RETURN_ERROR_IF(nbSeq, corruption_detected, ""); 1424 1391 RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); 1425 1392 /* save reps for next block */ ··· 1632 1229 { 1633 1230 return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1634 1231 } 1232 + 1233 + static size_t 1234 + ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, 1235 + void* dst, size_t maxDstSize, 1236 + const void* seqStart, size_t seqSize, int nbSeq, 1237 + const ZSTD_longOffset_e isLongOffset, 1238 + const int frame) 1239 + { 1240 + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1241 + } 1635 1242 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ 1636 1243 1637 1244 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 1245 + 1246 + FORCE_INLINE_TEMPLATE size_t 1247 + ZSTD_prefetchMatch(size_t prefetchPos, seq_t const 
sequence, 1248 + const BYTE* const prefixStart, const BYTE* const dictEnd) 1249 + { 1250 + prefetchPos += sequence.litLength; 1251 + { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; 1252 + const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. 1253 + * No consequence though : memory address is only used for prefetching, not for dereferencing */ 1254 + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ 1255 + } 1256 + return prefetchPos + sequence.matchLength; 1257 + } 1258 + 1259 + /* This decoding function employs prefetching 1260 + * to reduce latency impact of cache misses. 1261 + * It's generally employed when block contains a significant portion of long-distance matches 1262 + * or when coupled with a "cold" dictionary */ 1638 1263 FORCE_INLINE_TEMPLATE size_t 1639 1264 ZSTD_decompressSequencesLong_body( 1640 1265 ZSTD_DCtx* dctx, ··· 1674 1243 const BYTE* ip = (const BYTE*)seqStart; 1675 1244 const BYTE* const iend = ip + seqSize; 1676 1245 BYTE* const ostart = (BYTE*)dst; 1677 - BYTE* const oend = ostart + maxDstSize; 1246 + BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? 
dctx->litBuffer : ostart + maxDstSize; 1678 1247 BYTE* op = ostart; 1679 1248 const BYTE* litPtr = dctx->litPtr; 1680 - const BYTE* const litEnd = litPtr + dctx->litSize; 1249 + const BYTE* litBufferEnd = dctx->litBufferEnd; 1681 1250 const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); 1682 1251 const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); 1683 1252 const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); ··· 1685 1254 1686 1255 /* Regen sequences */ 1687 1256 if (nbSeq) { 1688 - #define STORED_SEQS 4 1257 + #define STORED_SEQS 8 1689 1258 #define STORED_SEQS_MASK (STORED_SEQS-1) 1690 - #define ADVANCED_SEQS 4 1259 + #define ADVANCED_SEQS STORED_SEQS 1691 1260 seq_t sequences[STORED_SEQS]; 1692 1261 int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); 1693 1262 seqState_t seqState; 1694 1263 int seqNb; 1264 + size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */ 1265 + 1695 1266 dctx->fseEntropy = 1; 1696 1267 { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } 1697 - seqState.prefixStart = prefixStart; 1698 - seqState.pos = (size_t)(op-prefixStart); 1699 - seqState.dictEnd = dictEnd; 1700 1268 assert(dst != NULL); 1701 1269 assert(iend >= ip); 1702 1270 RETURN_ERROR_IF( ··· 1707 1277 1708 1278 /* prepare in advance */ 1709 1279 for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) { 1710 - sequences[seqNb] = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch); 1711 - PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ 1280 + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); 1281 + prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); 1282 + sequences[seqNb] = sequence; 1712 1283 } 1713 1284 
RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, ""); 1714 1285 1715 - /* decode and decompress */ 1716 - for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) { 1717 - seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch); 1718 - size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd); 1286 + /* decompress without stomping litBuffer */ 1287 + for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) { 1288 + seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); 1289 + size_t oneSeqSize; 1290 + 1291 + if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) 1292 + { 1293 + /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ 1294 + const size_t leftoverLit = dctx->litBufferEnd - litPtr; 1295 + if (leftoverLit) 1296 + { 1297 + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); 1298 + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); 1299 + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit; 1300 + op += leftoverLit; 1301 + } 1302 + litPtr = dctx->litExtraBuffer; 1303 + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1304 + dctx->litBufferLocation = ZSTD_not_in_dst; 1305 + oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1719 1306 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1720 - assert(!ZSTD_isError(oneSeqSize)); 1721 - if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, 
dictStart); 1307 + assert(!ZSTD_isError(oneSeqSize)); 1308 + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); 1722 1309 #endif 1723 - if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1724 - PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ 1725 - sequences[seqNb & STORED_SEQS_MASK] = sequence; 1726 - op += oneSeqSize; 1310 + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1311 + 1312 + prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); 1313 + sequences[seqNb & STORED_SEQS_MASK] = sequence; 1314 + op += oneSeqSize; 1315 + } 1316 + else 1317 + { 1318 + /* lit buffer is either wholly contained in first or second split, or not split at all*/ 1319 + oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 1320 + ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : 1321 + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1322 + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1323 + assert(!ZSTD_isError(oneSeqSize)); 1324 + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); 1325 + #endif 1326 + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1327 + 1328 + prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); 1329 + sequences[seqNb & STORED_SEQS_MASK] = sequence; 1330 + op += oneSeqSize; 1331 + } 1727 1332 } 1728 1333 RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, ""); 1729 1334 1730 1335 /* finish queue */ 
1731 1336 seqNb -= seqAdvance; 1732 1337 for ( ; seqNb<nbSeq ; seqNb++) { 1733 - size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd); 1338 + seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]); 1339 + if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) 1340 + { 1341 + const size_t leftoverLit = dctx->litBufferEnd - litPtr; 1342 + if (leftoverLit) 1343 + { 1344 + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); 1345 + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); 1346 + sequence->litLength -= leftoverLit; 1347 + op += leftoverLit; 1348 + } 1349 + litPtr = dctx->litExtraBuffer; 1350 + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1351 + dctx->litBufferLocation = ZSTD_not_in_dst; 1352 + { 1353 + size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1734 1354 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1735 - assert(!ZSTD_isError(oneSeqSize)); 1736 - if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); 1355 + assert(!ZSTD_isError(oneSeqSize)); 1356 + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); 1737 1357 #endif 1738 - if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1739 - op += oneSeqSize; 1358 + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1359 + op += oneSeqSize; 1360 + } 1361 + } 1362 + else 1363 + { 1364 + size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
1365 + ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : 1366 + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); 1367 + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) 1368 + assert(!ZSTD_isError(oneSeqSize)); 1369 + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); 1370 + #endif 1371 + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; 1372 + op += oneSeqSize; 1373 + } 1740 1374 } 1741 1375 1742 1376 /* save reps for next block */ ··· 1808 1314 } 1809 1315 1810 1316 /* last literal segment */ 1811 - { size_t const lastLLSize = litEnd - litPtr; 1317 + if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ 1318 + { 1319 + size_t const lastLLSize = litBufferEnd - litPtr; 1320 + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); 1321 + if (op != NULL) { 1322 + ZSTD_memmove(op, litPtr, lastLLSize); 1323 + op += lastLLSize; 1324 + } 1325 + litPtr = dctx->litExtraBuffer; 1326 + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; 1327 + } 1328 + { size_t const lastLLSize = litBufferEnd - litPtr; 1812 1329 RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); 1813 1330 if (op != NULL) { 1814 - ZSTD_memcpy(op, litPtr, lastLLSize); 1331 + ZSTD_memmove(op, litPtr, lastLLSize); 1815 1332 op += lastLLSize; 1816 1333 } 1817 1334 } ··· 1846 1341 #if DYNAMIC_BMI2 1847 1342 1848 1343 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 1849 - static TARGET_ATTRIBUTE("bmi2") size_t 1344 + static BMI2_TARGET_ATTRIBUTE size_t 1850 1345 DONT_VECTORIZE 1851 1346 ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, 1852 1347 void* dst, size_t maxDstSize, ··· 1856 1351 { 1857 1352 return ZSTD_decompressSequences_body(dctx, dst, 
maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1858 1353 } 1354 + static BMI2_TARGET_ATTRIBUTE size_t 1355 + DONT_VECTORIZE 1356 + ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, 1357 + void* dst, size_t maxDstSize, 1358 + const void* seqStart, size_t seqSize, int nbSeq, 1359 + const ZSTD_longOffset_e isLongOffset, 1360 + const int frame) 1361 + { 1362 + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1363 + } 1859 1364 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ 1860 1365 1861 1366 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT 1862 - static TARGET_ATTRIBUTE("bmi2") size_t 1367 + static BMI2_TARGET_ATTRIBUTE size_t 1863 1368 ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, 1864 1369 void* dst, size_t maxDstSize, 1865 1370 const void* seqStart, size_t seqSize, int nbSeq, ··· 1898 1383 { 1899 1384 DEBUGLOG(5, "ZSTD_decompressSequences"); 1900 1385 #if DYNAMIC_BMI2 1901 - if (dctx->bmi2) { 1386 + if (ZSTD_DCtx_get_bmi2(dctx)) { 1902 1387 return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1903 1388 } 1904 1389 #endif 1905 - return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1390 + return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1391 + } 1392 + static size_t 1393 + ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, 1394 + const void* seqStart, size_t seqSize, int nbSeq, 1395 + const ZSTD_longOffset_e isLongOffset, 1396 + const int frame) 1397 + { 1398 + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); 1399 + #if DYNAMIC_BMI2 1400 + if (ZSTD_DCtx_get_bmi2(dctx)) { 1401 + return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1402 + } 1403 + #endif 1404 + return 
ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1906 1405 } 1907 1406 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ 1908 1407 ··· 1936 1407 { 1937 1408 DEBUGLOG(5, "ZSTD_decompressSequencesLong"); 1938 1409 #if DYNAMIC_BMI2 1939 - if (dctx->bmi2) { 1410 + if (ZSTD_DCtx_get_bmi2(dctx)) { 1940 1411 return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); 1941 1412 } 1942 1413 #endif ··· 1977 1448 size_t 1978 1449 ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, 1979 1450 void* dst, size_t dstCapacity, 1980 - const void* src, size_t srcSize, const int frame) 1451 + const void* src, size_t srcSize, const int frame, const streaming_operation streaming) 1981 1452 { /* blockType == blockCompressed */ 1982 1453 const BYTE* ip = (const BYTE*)src; 1983 1454 /* isLongOffset must be true if there are long offsets. ··· 1992 1463 RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); 1993 1464 1994 1465 /* Decode literals section */ 1995 - { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); 1466 + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); 1996 1467 DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); 1997 1468 if (ZSTD_isError(litCSize)) return litCSize; 1998 1469 ip += litCSize; ··· 2040 1511 2041 1512 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG 2042 1513 /* else */ 2043 - return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); 1514 + if (dctx->litBufferLocation == ZSTD_split) 1515 + return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); 1516 + else 1517 + return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); 2044 1518 #endif 2045 1519 } 2046 1520 } ··· 2066 1534 { 2067 1535 size_t dSize; 2068 1536 
ZSTD_checkContinuity(dctx, dst, dstCapacity); 2069 - dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0); 1537 + dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); 2070 1538 dctx->previousDstEnd = (char*)dst + dSize; 2071 1539 return dSize; 2072 1540 }
+8 -2
lib/zstd/decompress/zstd_decompress_block.h
··· 33 33 */ 34 34 35 35 36 + /* Streaming state is used to inform allocation of the literal buffer */ 37 + typedef enum { 38 + not_streaming = 0, 39 + is_streaming = 1 40 + } streaming_operation; 41 + 36 42 /* ZSTD_decompressBlock_internal() : 37 43 * decompress block, starting at `src`, 38 44 * into destination buffer `dst`. ··· 47 41 */ 48 42 size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, 49 43 void* dst, size_t dstCapacity, 50 - const void* src, size_t srcSize, const int frame); 44 + const void* src, size_t srcSize, const int frame, const streaming_operation streaming); 51 45 52 46 /* ZSTD_buildFSETable() : 53 47 * generate FSE decoding table for one symbol (ll, ml or off) ··· 60 54 */ 61 55 void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, 62 56 const short* normalizedCounter, unsigned maxSymbolValue, 63 - const U32* baseValue, const U32* nbAdditionalBits, 57 + const U32* baseValue, const U8* nbAdditionalBits, 64 58 unsigned tableLog, void* wksp, size_t wkspSize, 65 59 int bmi2); 66 60
+32 -6
lib/zstd/decompress/zstd_decompress_internal.h
··· 20 20 * Dependencies 21 21 *********************************************************/ 22 22 #include "../common/mem.h" /* BYTE, U16, U32 */ 23 - #include "../common/zstd_internal.h" /* ZSTD_seqSymbol */ 23 + #include "../common/zstd_internal.h" /* constants : MaxLL, MaxML, MaxOff, LLFSELog, etc. */ 24 24 25 25 26 26 ··· 40 40 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, 41 41 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD }; 42 42 43 - static UNUSED_ATTR const U32 OF_bits[MaxOff+1] = { 43 + static UNUSED_ATTR const U8 OF_bits[MaxOff+1] = { 44 44 0, 1, 2, 3, 4, 5, 6, 7, 45 45 8, 9, 10, 11, 12, 13, 14, 15, 46 46 16, 17, 18, 19, 20, 21, 22, 23, ··· 106 106 size_t ddictPtrCount; 107 107 } ZSTD_DDictHashSet; 108 108 109 + #ifndef ZSTD_DECODER_INTERNAL_BUFFER 110 + # define ZSTD_DECODER_INTERNAL_BUFFER (1 << 16) 111 + #endif 112 + 113 + #define ZSTD_LBMIN 64 114 + #define ZSTD_LBMAX (128 << 10) 115 + 116 + /* extra buffer, compensates when dst is not large enough to store litBuffer */ 117 + #define ZSTD_LITBUFFEREXTRASIZE BOUNDED(ZSTD_LBMIN, ZSTD_DECODER_INTERNAL_BUFFER, ZSTD_LBMAX) 118 + 119 + typedef enum { 120 + ZSTD_not_in_dst = 0, /* Stored entirely within litExtraBuffer */ 121 + ZSTD_in_dst = 1, /* Stored entirely within dst (in memory after current output write) */ 122 + ZSTD_split = 2 /* Split between litExtraBuffer and dst */ 123 + } ZSTD_litLocation_e; 124 + 109 125 struct ZSTD_DCtx_s 110 126 { 111 127 const ZSTD_seqSymbol* LLTptr; ··· 152 136 size_t litSize; 153 137 size_t rleSize; 154 138 size_t staticSize; 139 + #if DYNAMIC_BMI2 != 0 155 140 int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. 
*/ 141 + #endif 156 142 157 143 /* dictionary */ 158 144 ZSTD_DDict* ddictLocal; ··· 176 158 size_t outStart; 177 159 size_t outEnd; 178 160 size_t lhSize; 179 - void* legacyContext; 180 - U32 previousLegacyVersion; 181 - U32 legacyVersion; 182 161 U32 hostageByte; 183 162 int noForwardProgress; 184 163 ZSTD_bufferMode_e outBufferMode; 185 164 ZSTD_outBuffer expectedOutBuffer; 186 165 187 166 /* workspace */ 188 - BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH]; 167 + BYTE* litBuffer; 168 + const BYTE* litBufferEnd; 169 + ZSTD_litLocation_e litBufferLocation; 170 + BYTE litExtraBuffer[ZSTD_LITBUFFEREXTRASIZE + WILDCOPY_OVERLENGTH]; /* literal buffer can be split between storage within dst and within this scratch buffer */ 189 171 BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX]; 190 172 191 173 size_t oversizedDuration; ··· 198 180 /* Tracing */ 199 181 }; /* typedef'd to ZSTD_DCtx within "zstd.h" */ 200 182 183 + MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s *dctx) { 184 + #if DYNAMIC_BMI2 != 0 185 + return dctx->bmi2; 186 + #else 187 + (void)dctx; 188 + return 0; 189 + #endif 190 + } 201 191 202 192 /*-******************************************************* 203 193 * Shared internal functions
+6
lib/zstd/decompress_sources.h
··· 16 16 * decompression. 17 17 */ 18 18 19 + /* 20 + * Disable the ASM Huffman implementation because we need to 21 + * include all the sources. 22 + */ 23 + #define ZSTD_DISABLE_ASM 1 24 + 19 25 #include "common/debug.c" 20 26 #include "common/entropy_common.c" 21 27 #include "common/error_private.c"
+32
lib/zstd/zstd_common_module.c
··· 1 + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause 2 + /* 3 + * Copyright (c) Facebook, Inc. 4 + * All rights reserved. 5 + * 6 + * This source code is licensed under both the BSD-style license (found in the 7 + * LICENSE file in the root directory of this source tree) and the GPLv2 (found 8 + * in the COPYING file in the root directory of this source tree). 9 + * You may select, at your option, one of the above-listed licenses. 10 + */ 11 + 12 + #include <linux/module.h> 13 + 14 + #include "common/huf.h" 15 + #include "common/fse.h" 16 + #include "common/zstd_internal.h" 17 + 18 + // Export symbols shared by compress and decompress into a common module 19 + 20 + #undef ZSTD_isError /* defined within zstd_internal.h */ 21 + EXPORT_SYMBOL_GPL(FSE_readNCount); 22 + EXPORT_SYMBOL_GPL(HUF_readStats); 23 + EXPORT_SYMBOL_GPL(HUF_readStats_wksp); 24 + EXPORT_SYMBOL_GPL(ZSTD_isError); 25 + EXPORT_SYMBOL_GPL(ZSTD_getErrorName); 26 + EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); 27 + EXPORT_SYMBOL_GPL(ZSTD_customMalloc); 28 + EXPORT_SYMBOL_GPL(ZSTD_customCalloc); 29 + EXPORT_SYMBOL_GPL(ZSTD_customFree); 30 + 31 + MODULE_LICENSE("Dual BSD/GPL"); 32 + MODULE_DESCRIPTION("Zstd Common");
+5 -1
lib/zstd/zstd_compress_module.c
··· 133 133 size_t zstd_reset_cstream(zstd_cstream *cstream, 134 134 unsigned long long pledged_src_size) 135 135 { 136 - return ZSTD_resetCStream(cstream, pledged_src_size); 136 + if (pledged_src_size == 0) 137 + pledged_src_size = ZSTD_CONTENTSIZE_UNKNOWN; 138 + ZSTD_FORWARD_IF_ERR( ZSTD_CCtx_reset(cstream, ZSTD_reset_session_only) ); 139 + ZSTD_FORWARD_IF_ERR( ZSTD_CCtx_setPledgedSrcSize(cstream, pledged_src_size) ); 140 + return 0; 137 141 } 138 142 EXPORT_SYMBOL(zstd_reset_cstream); 139 143