[PATCH 1/2] staging: Add Snappy compression library (v3)
Zeev Tarantov
zeev.tarantov at gmail.com
Fri Apr 15 23:52:29 UTC 2011
From: Zeev Tarantov <zeev.tarantov at gmail.com>
Google's Snappy data compression library is a faster alternative to LZO,
optimized for x86-64. On compressible input it compresses ~2.5x faster than LZO
and decompresses ~1.5-2x faster than LZO. On incompressible input, it skips the
input at 100x faster than LZO and decompresses ~4x faster than LZO.
It is released under BSD license.
This is a kernel port from user space C++ code.
The current intended use is with zram (see next patch in series).
Signed-off-by: Zeev Tarantov <zeev.tarantov at gmail.com>
---
Changelog:
v3: added forgotten staging/snappy/Kconfig
second part of patch is the same:
http://driverdev.linuxdriverproject.org/pipermail/devel/2011-April/015114.html
v2: move to staging, run checkpatch
http://driverdev.linuxdriverproject.org/pipermail/devel/2011-April/015113.html
drivers/staging/Kconfig | 2 +
drivers/staging/Makefile | 2 +
drivers/staging/snappy/Kconfig | 5 +
drivers/staging/snappy/Makefile | 5 +
drivers/staging/snappy/csnappy.h | 125 +++++++
drivers/staging/snappy/csnappy_compress.c | 497 +++++++++++++++++++++++++++
drivers/staging/snappy/csnappy_decompress.c | 321 +++++++++++++++++
drivers/staging/snappy/csnappy_internal.h | 83 +++++
8 files changed, 1040 insertions(+), 0 deletions(-)
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index dca4a0b..6c222f0 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -123,6 +123,8 @@ source "drivers/staging/iio/Kconfig"
source "drivers/staging/cs5535_gpio/Kconfig"
+source "drivers/staging/snappy/Kconfig"
+
source "drivers/staging/zram/Kconfig"
source "drivers/staging/zcache/Kconfig"
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index eb93012..3b216a6 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -71,3 +71,5 @@ obj-$(CONFIG_ALTERA_STAPL) +=altera-stapl/
obj-$(CONFIG_TOUCHSCREEN_CLEARPAD_TM1217) += cptm1217/
obj-$(CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI4) += ste_rmi4/
obj-$(CONFIG_DRM_PSB) += gma500/
+obj-$(CONFIG_SNAPPY_COMPRESS) += snappy/
+obj-$(CONFIG_SNAPPY_DECOMPRESS) += snappy/
diff --git a/drivers/staging/snappy/Kconfig b/drivers/staging/snappy/Kconfig
new file mode 100644
index 0000000..24f6908
--- /dev/null
+++ b/drivers/staging/snappy/Kconfig
@@ -0,0 +1,5 @@
+config SNAPPY_COMPRESS
+ tristate "Google Snappy Compression"
+
+config SNAPPY_DECOMPRESS
+ tristate "Google Snappy Decompression"
diff --git a/drivers/staging/snappy/Makefile b/drivers/staging/snappy/Makefile
new file mode 100644
index 0000000..399d070
--- /dev/null
+++ b/drivers/staging/snappy/Makefile
@@ -0,0 +1,5 @@
+snappy_compress-objs := csnappy_compress.o
+snappy_decompress-objs := csnappy_decompress.o
+
+obj-$(CONFIG_SNAPPY_COMPRESS) += csnappy_compress.o
+obj-$(CONFIG_SNAPPY_DECOMPRESS) += csnappy_decompress.o
diff --git a/drivers/staging/snappy/csnappy.h b/drivers/staging/snappy/csnappy.h
new file mode 100644
index 0000000..af68232
--- /dev/null
+++ b/drivers/staging/snappy/csnappy.h
@@ -0,0 +1,125 @@
+#ifndef __CSNAPPY_H__
+#define __CSNAPPY_H__
+/*
+File modified for the Linux Kernel by
+Zeev Tarantov <zeev.tarantov at gmail.com>
+*/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CSNAPPY_VERSION 4
+
+#define CSNAPPY_WORKMEM_BYTES_POWER_OF_TWO 15
+#define CSNAPPY_WORKMEM_BYTES (1 << CSNAPPY_WORKMEM_BYTES_POWER_OF_TWO)
+
+/*
+ * Returns the maximal size of the compressed representation of
+ * input data that is "source_len" bytes in length;
+ */
+uint32_t
+csnappy_max_compressed_length(uint32_t source_len) __attribute__((const));
+
+/*
+ * Flat array compression that does not emit the "uncompressed length"
+ * prefix. Compresses "input" array to the "output" array.
+ *
+ * REQUIRES: "input" is at most 32KiB long.
+ * REQUIRES: "output" points to an array of memory that is at least
+ * "csnappy_max_compressed_length(input_length)" in size.
+ * REQUIRES: working_memory has (1 << workmem_bytes_power_of_two) bytes.
+ * REQUIRES: 9 <= workmem_bytes_power_of_two <= 15.
+ *
+ * Returns an "end" pointer into "output" buffer.
+ * "end - output" is the compressed size of "input".
+ */
+char*
+csnappy_compress_fragment(
+ const char *input,
+ const uint32_t input_length,
+ char *output,
+ void *working_memory,
+ const int workmem_bytes_power_of_two);
+
+/*
+ * REQUIRES: "compressed" must point to an area of memory that is at
+ * least "csnappy_max_compressed_length(input_length)" bytes in length.
+ * REQUIRES: working_memory has (1 << workmem_bytes_power_of_two) bytes.
+ * REQUIRES: 9 <= workmem_bytes_power_of_two <= 15.
+ *
+ * Takes the data stored in "input[0..input_length]" and stores
+ * it in the array pointed to by "compressed".
+ *
+ * "*out_compressed_length" is set to the length of the compressed output.
+ */
+void
+csnappy_compress(
+ const char *input,
+ uint32_t input_length,
+ char *compressed,
+ uint32_t *out_compressed_length,
+ void *working_memory,
+ const int workmem_bytes_power_of_two);
+
+/*
+ * Reads header of compressed data to get stored length of uncompressed data.
+ * REQUIRES: start points to compressed data.
+ * REQUIRES: n is length of available compressed data.
+ *
+ * Returns SNAPPY_E_HEADER_BAD on error.
+ * Returns number of bytes read from input on success.
+ * Stores decoded length into *result.
+ */
+int
+csnappy_get_uncompressed_length(
+ const char *start,
+ uint32_t n,
+ uint32_t *result);
+
+/*
+ * Safely decompresses all data from array "src" of length "src_len" containing
+ * entire compressed stream (with header) into array "dst" of size "dst_len".
+ * REQUIRES: dst_len is at least csnappy_get_uncompressed_length(...).
+ *
+ * Iff sucessful, returns CSNAPPY_E_OK.
+ * If recorded length in header is greater than dst_len, returns
+ * CSNAPPY_E_OUTPUT_INSUF.
+ * If compressed data is malformed, does not write more than dst_len into dst.
+ */
+int
+csnappy_decompress(
+ const char *src,
+ uint32_t src_len,
+ char *dst,
+ uint32_t dst_len);
+
+/*
+ * Safely decompresses stream src_len bytes long read from src to dst.
+ * Amount of available space at dst must be provided in *dst_len by caller.
+ * If compressed stream needs more space, it will not overflow and return
+ * CSNAPPY_E_OUTPUT_OVERRUN.
+ * On success, sets *dst_len to actal number of bytes decompressed.
+ * Iff sucessful, returns CSNAPPY_E_OK.
+ */
+int
+csnappy_decompress_noheader(
+ const char *src,
+ uint32_t src_len,
+ char *dst,
+ uint32_t *dst_len);
+
+/*
+ * Return values (< 0 = Error)
+ */
+#define CSNAPPY_E_OK 0
+#define CSNAPPY_E_HEADER_BAD (-1)
+#define CSNAPPY_E_OUTPUT_INSUF (-2)
+#define CSNAPPY_E_OUTPUT_OVERRUN (-3)
+#define CSNAPPY_E_INPUT_NOT_CONSUMED (-4)
+#define CSNAPPY_E_DATA_MALFORMED (-5)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/drivers/staging/snappy/csnappy_compress.c b/drivers/staging/snappy/csnappy_compress.c
new file mode 100644
index 0000000..40bb90d
--- /dev/null
+++ b/drivers/staging/snappy/csnappy_compress.c
@@ -0,0 +1,497 @@
+/*
+Copyright 2011, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+File modified for the Linux Kernel by
+Zeev Tarantov <zeev.tarantov at gmail.com>
+*/
+
+#include "csnappy_internal.h"
+#ifdef __KERNEL__
+#include <linux/kernel.h>
+#include <linux/module.h>
+#endif
+#include "csnappy.h"
+
+
+static inline char*
+encode_varint32(char *sptr, uint32_t v)
+{
+ uint8_t* ptr = (uint8_t *)sptr;
+ static const int B = 128;
+ if (v < (1<<7)) {
+ *(ptr++) = v;
+ } else if (v < (1<<14)) {
+ *(ptr++) = v | B;
+ *(ptr++) = v>>7;
+ } else if (v < (1<<21)) {
+ *(ptr++) = v | B;
+ *(ptr++) = (v>>7) | B;
+ *(ptr++) = v>>14;
+ } else if (v < (1<<28)) {
+ *(ptr++) = v | B;
+ *(ptr++) = (v>>7) | B;
+ *(ptr++) = (v>>14) | B;
+ *(ptr++) = v>>21;
+ } else {
+ *(ptr++) = v | B;
+ *(ptr++) = (v>>7) | B;
+ *(ptr++) = (v>>14) | B;
+ *(ptr++) = (v>>21) | B;
+ *(ptr++) = v>>28;
+ }
+ return (char *)ptr;
+}
+
+
+/*
+ * Any hash function will produce a valid compressed bitstream, but a good
+ * hash function reduces the number of collisions and thus yields better
+ * compression for compressible input, and more speed for incompressible
+ * input. Of course, it doesn't hurt if the hash function is reasonably fast
+ * either, as it gets called a lot.
+ */
+static inline uint32_t HashBytes(uint32_t bytes, int shift)
+{
+ uint32_t kMul = 0x1e35a7bd;
+ return (bytes * kMul) >> shift;
+}
+static inline uint32_t Hash(const char *p, int shift)
+{
+ return HashBytes(UNALIGNED_LOAD32(p), shift);
+}
+
+
+/*
+ * *** DO NOT CHANGE THE VALUE OF kBlockSize ***
+
+ * New Compression code chops up the input into blocks of at most
+ * the following size. This ensures that back-references in the
+ * output never cross kBlockSize block boundaries. This can be
+ * helpful in implementing blocked decompression. However the
+ * decompression code should not rely on this guarantee since older
+ * compression code may not obey it.
+ */
+#define kBlockLog 15
+#define kBlockSize (1 << kBlockLog)
+
+
+/*
+ * Return the largest n such that
+ *
+ * s1[0,n-1] == s2[0,n-1]
+ * and n <= (s2_limit - s2).
+ *
+ * Does not read *s2_limit or beyond.
+ * Does not read *(s1 + (s2_limit - s2)) or beyond.
+ * Requires that s2_limit >= s2.
+ *
+ * Separate implementation for x86_64, for speed. Uses the fact that
+ * x86_64 is little endian.
+ */
+#if defined(__x86_64__)
+static inline int
+FindMatchLength(const char *s1, const char *s2, const char *s2_limit)
+{
+ uint64_t x;
+ int matched, matching_bits;
+ DCHECK_GE(s2_limit, s2);
+ matched = 0;
+ /*
+ * Find out how long the match is. We loop over the data 64 bits at a
+ * time until we find a 64-bit block that doesn't match; then we find
+ * the first non-matching bit and use that to calculate the total
+ * length of the match.
+ */
+ while (likely(s2 <= s2_limit - 8)) {
+ if (unlikely(UNALIGNED_LOAD64(s1 + matched) ==
+ UNALIGNED_LOAD64(s2))) {
+ s2 += 8;
+ matched += 8;
+ } else {
+ /*
+ * On current (mid-2008) Opteron models there is a 3%
+ * more efficient code sequence to find the first
+ * non-matching byte. However, what follows is ~10%
+ * better on Intel Core 2 and newer, and we expect AMD's
+ * bsf instruction to improve.
+ */
+ x = UNALIGNED_LOAD64(s1 + matched) ^
+ UNALIGNED_LOAD64(s2);
+ matching_bits = FindLSBSetNonZero64(x);
+ matched += matching_bits >> 3;
+ return matched;
+ }
+ }
+ while (likely(s2 < s2_limit)) {
+ if (likely(s1[matched] == *s2)) {
+ ++s2;
+ ++matched;
+ } else {
+ return matched;
+ }
+ }
+ return matched;
+}
+#else /* !defined(__x86_64__) */
+static inline int
+FindMatchLength(const char *s1, const char *s2, const char *s2_limit)
+{
+ /* Implementation based on the x86-64 version, above. */
+ int matched = 0;
+ DCHECK_GE(s2_limit, s2);
+
+ while (s2 <= s2_limit - 4 &&
+ UNALIGNED_LOAD32(s2) == UNALIGNED_LOAD32(s1 + matched)) {
+ s2 += 4;
+ matched += 4;
+ }
+#if defined(__LITTLE_ENDIAN)
+ if (s2 <= s2_limit - 4) {
+ uint32_t x = UNALIGNED_LOAD32(s1 + matched) ^
+ UNALIGNED_LOAD32(s2);
+ int matching_bits = FindLSBSetNonZero(x);
+ matched += matching_bits >> 3;
+ } else {
+ while ((s2 < s2_limit) && (s1[matched] == *s2)) {
+ ++s2;
+ ++matched;
+ }
+ }
+#else
+ while ((s2 < s2_limit) && (s1[matched] == *s2)) {
+ ++s2;
+ ++matched;
+ }
+#endif
+ return matched;
+}
+#endif /* !defined(__x86_64__) */
+
+
+static inline char*
+EmitLiteral(char *op, const char *literal, int len, int allow_fast_path)
+{
+ int n = len - 1; /* Zero-length literals are disallowed */
+ if (n < 60) {
+ /* Fits in tag byte */
+ *op++ = LITERAL | (n << 2);
+ /*
+ The vast majority of copies are below 16 bytes, for which a
+ call to memcpy is overkill. This fast path can sometimes
+ copy up to 15 bytes too much, but that is okay in the
+ main loop, since we have a bit to go on for both sides:
+ - The input will always have kInputMarginBytes = 15 extra
+ available bytes, as long as we're in the main loop, and
+ if not, allow_fast_path = false.
+ - The output will always have 32 spare bytes (see
+ snappy_max_compressed_length).
+ */
+ if (allow_fast_path && len <= 16) {
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(literal));
+ UNALIGNED_STORE64(op + 8,
+ UNALIGNED_LOAD64(literal + 8));
+ return op + len;
+ }
+ } else {
+ /* Encode in upcoming bytes */
+ char *base = op;
+ int count = 0;
+ op++;
+ while (n > 0) {
+ *op++ = n & 0xff;
+ n >>= 8;
+ count++;
+ }
+ DCHECK_GE(count, 1);
+ DCHECK_LE(count, 4);
+ *base = LITERAL | ((59+count) << 2);
+ }
+ memcpy(op, literal, len);
+ return op + len;
+}
+
+static inline char*
+EmitCopyLessThan64(char *op, int offset, int len)
+{
+ DCHECK_LE(len, 64);
+ DCHECK_GE(len, 4);
+ DCHECK_LT(offset, 65536);
+
+ if ((len < 12) && (offset < 2048)) {
+ int len_minus_4 = len - 4;
+ DCHECK_LT(len_minus_4, 8); /* Must fit in 3 bits */
+ *op++ = COPY_1_BYTE_OFFSET |
+ ((len_minus_4) << 2) |
+ ((offset >> 8) << 5);
+ *op++ = offset & 0xff;
+ } else {
+ *op++ = COPY_2_BYTE_OFFSET | ((len-1) << 2);
+ put_unaligned_le16(offset, op);
+ op += 2;
+ }
+ return op;
+}
+
+static inline char*
+EmitCopy(char *op, int offset, int len)
+{
+ /* Emit 64 byte copies but make sure to keep at least four bytes
+ * reserved */
+ while (len >= 68) {
+ op = EmitCopyLessThan64(op, offset, 64);
+ len -= 64;
+ }
+
+ /* Emit an extra 60 byte copy if have too much data to fit in one
+ * copy */
+ if (len > 64) {
+ op = EmitCopyLessThan64(op, offset, 60);
+ len -= 60;
+ }
+
+ /* Emit remainder */
+ op = EmitCopyLessThan64(op, offset, len);
+ return op;
+}
+
+
+/*
+ * For 0 <= offset <= 4, GetUint32AtOffset(UNALIGNED_LOAD64(p), offset) will
+ * equal UNALIGNED_LOAD32(p + offset). Motivation: On x86-64 hardware we have
+ * empirically found that overlapping loads such as
+ * UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
+ * are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32_t.
+ */
+static inline uint32_t
+GetUint32AtOffset(uint64_t v, int offset)
+{
+ DCHECK(0 <= offset && offset <= 4);
+#ifdef __LITTLE_ENDIAN
+ return v >> (8 * offset);
+#else
+ return v >> (32 - 8 * offset);
+#endif
+}
+
+#define kInputMarginBytes 15
+char*
+csnappy_compress_fragment(
+ const char *input,
+ const uint32_t input_size,
+ char *op,
+ void *working_memory,
+ const int workmem_bytes_power_of_two)
+{
+ const char *ip, *ip_end, *base_ip, *next_emit, *ip_limit, *next_ip,
+ *candidate, *base;
+ uint16_t *table = (uint16_t *)working_memory;
+ uint64_t input_bytes;
+ uint32_t hash, next_hash, prev_hash, cur_hash, skip, candidate_bytes;
+ int shift, matched;
+
+ DCHECK_GE(workmem_bytes_power_of_two, 9);
+ DCHECK_LE(workmem_bytes_power_of_two, 15);
+ /* Table of 2^X bytes, need (X-1) bits to address table of uint16_t.
+ * How many bits of 32bit hash function result are discarded? */
+ shift = 33 - workmem_bytes_power_of_two;
+ /* "ip" is the input pointer, and "op" is the output pointer. */
+ ip = input;
+ DCHECK_LE(input_size, kBlockSize);
+ ip_end = input + input_size;
+ base_ip = ip;
+ /* Bytes in [next_emit, ip) will be emitted as literal bytes. Or
+ [next_emit, ip_end) after the main loop. */
+ next_emit = ip;
+
+ if (unlikely(input_size < kInputMarginBytes))
+ goto emit_remainder;
+
+ memset(working_memory, 0, 1 << workmem_bytes_power_of_two);
+
+ ip_limit = input + input_size - kInputMarginBytes;
+ next_hash = Hash(++ip, shift);
+
+main_loop:
+ DCHECK_LT(next_emit, ip);
+ /*
+ * The body of this loop calls EmitLiteral once and then EmitCopy one or
+ * more times. (The exception is that when we're close to exhausting
+ * the input we goto emit_remainder.)
+ *
+ * In the first iteration of this loop we're just starting, so
+ * there's nothing to copy, so calling EmitLiteral once is
+ * necessary. And we only start a new iteration when the
+ * current iteration has determined that a call to EmitLiteral will
+ * precede the next call to EmitCopy (if any).
+ *
+ * Step 1: Scan forward in the input looking for a 4-byte-long match.
+ * If we get close to exhausting the input then goto emit_remainder.
+ *
+ * Heuristic match skipping: If 32 bytes are scanned with no matches
+ * found, start looking only at every other byte. If 32 more bytes are
+ * scanned, look at every third byte, etc.. When a match is found,
+ * immediately go back to looking at every byte. This is a small loss
+ * (~5% performance, ~0.1% density) for compressible data due to more
+ * bookkeeping, but for non-compressible data (such as JPEG) it's a huge
+ * win since the compressor quickly "realizes" the data is incompressible
+ * and doesn't bother looking for matches everywhere.
+ *
+ * The "skip" variable keeps track of how many bytes there are since the
+ * last match; dividing it by 32 (ie. right-shifting by five) gives the
+ * number of bytes to move ahead for each iteration.
+ */
+ skip = 32;
+
+ next_ip = ip;
+ do {
+ ip = next_ip;
+ hash = next_hash;
+ DCHECK_EQ(hash, Hash(ip, shift));
+ next_ip = ip + (skip++ >> 5);
+ if (unlikely(next_ip > ip_limit))
+ goto emit_remainder;
+ next_hash = Hash(next_ip, shift);
+ candidate = base_ip + table[hash];
+ DCHECK_GE(candidate, base_ip);
+ DCHECK_LT(candidate, ip);
+
+ table[hash] = ip - base_ip;
+ } while (likely(UNALIGNED_LOAD32(ip) !=
+ UNALIGNED_LOAD32(candidate)));
+
+ /*
+ * Step 2: A 4-byte match has been found. We'll later see if more
+ * than 4 bytes match. But, prior to the match, input
+ * bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
+ */
+ DCHECK_LE(next_emit + 16, ip_end);
+ op = EmitLiteral(op, next_emit, ip - next_emit, 1);
+
+ /*
+ * Step 3: Call EmitCopy, and then see if another EmitCopy could
+ * be our next move. Repeat until we find no match for the
+ * input immediately after what was consumed by the last EmitCopy call.
+ *
+ * If we exit this loop normally then we need to call EmitLiteral next,
+ * though we don't yet know how big the literal will be. We handle that
+ * by proceeding to the next iteration of the main loop. We also can exit
+ * this loop via goto if we get close to exhausting the input.
+ */
+ input_bytes = 0;
+ candidate_bytes = 0;
+
+ do {
+ /* We have a 4-byte match at ip, and no need to emit any
+ "literal bytes" prior to ip. */
+ base = ip;
+ matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end);
+ ip += matched;
+ DCHECK_EQ(0, memcmp(base, candidate, matched));
+ op = EmitCopy(op, base - candidate, matched);
+ /* We could immediately start working at ip now, but to improve
+ compression we first update table[Hash(ip - 1, ...)]. */
+ next_emit = ip;
+ if (unlikely(ip >= ip_limit))
+ goto emit_remainder;
+ input_bytes = UNALIGNED_LOAD64(ip - 1);
+ prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
+ table[prev_hash] = ip - base_ip - 1;
+ cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
+ candidate = base_ip + table[cur_hash];
+ candidate_bytes = UNALIGNED_LOAD32(candidate);
+ table[cur_hash] = ip - base_ip;
+ } while (GetUint32AtOffset(input_bytes, 1) == candidate_bytes);
+
+ next_hash = HashBytes(GetUint32AtOffset(input_bytes, 2), shift);
+ ++ip;
+ goto main_loop;
+
+emit_remainder:
+ /* Emit the remaining bytes as a literal */
+ if (next_emit < ip_end)
+ op = EmitLiteral(op, next_emit, ip_end - next_emit, 0);
+
+ return op;
+}
+#if defined(__KERNEL__) && !defined(STATIC)
+EXPORT_SYMBOL(csnappy_compress_fragment);
+#endif
+
+uint32_t __attribute__((const))
+csnappy_max_compressed_length(uint32_t source_len)
+{
+ return 32 + source_len + source_len/6;
+}
+#if defined(__KERNEL__) && !defined(STATIC)
+EXPORT_SYMBOL(csnappy_max_compressed_length);
+#endif
+
+void
+csnappy_compress(
+ const char *input,
+ uint32_t input_length,
+ char *compressed,
+ uint32_t *compressed_length,
+ void *working_memory,
+ const int workmem_bytes_power_of_two)
+{
+ int workmem_size;
+ int num_to_read;
+ uint32_t written = 0;
+ char *p = encode_varint32(compressed, input_length);
+ written += (p - compressed);
+ compressed = p;
+ while (input_length > 0) {
+ num_to_read = min(input_length, (uint32_t)kBlockSize);
+ workmem_size = workmem_bytes_power_of_two;
+ if (num_to_read < kBlockSize) {
+ for (workmem_size = 9;
+ workmem_size < workmem_bytes_power_of_two;
+ ++workmem_size) {
+ if ((1 << (workmem_size-1)) >= num_to_read)
+ break;
+ }
+ }
+ p = csnappy_compress_fragment(
+ input, num_to_read, compressed,
+ working_memory, workmem_size);
+ written += (p - compressed);
+ compressed = p;
+ input_length -= num_to_read;
+ input += num_to_read;
+ }
+ *compressed_length = written;
+}
+#if defined(__KERNEL__) && !defined(STATIC)
+EXPORT_SYMBOL(csnappy_compress);
+
+MODULE_LICENSE("BSD");
+MODULE_DESCRIPTION("Snappy Compressor");
+#endif
diff --git a/drivers/staging/snappy/csnappy_decompress.c b/drivers/staging/snappy/csnappy_decompress.c
new file mode 100644
index 0000000..4d3a09b
--- /dev/null
+++ b/drivers/staging/snappy/csnappy_decompress.c
@@ -0,0 +1,321 @@
+/*
+Copyright 2011, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+File modified for the Linux Kernel by
+Zeev Tarantov <zeev.tarantov at gmail.com>
+*/
+
+#include "csnappy_internal.h"
+#ifdef __KERNEL__
+#include <linux/kernel.h>
+#include <linux/module.h>
+#endif
+#include "csnappy.h"
+
+
+/* Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits */
+static const uint32_t wordmask[] = {
+ 0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
+};
+
+/*
+ * Data stored per entry in lookup table:
+ * Range Bits-used Description
+ * ------------------------------------
+ * 1..64 0..7 Literal/copy length encoded in opcode byte
+ * 0..7 8..10 Copy offset encoded in opcode byte / 256
+ * 0..4 11..13 Extra bytes after opcode
+ *
+ * We use eight bits for the length even though 7 would have sufficed
+ * because of efficiency reasons:
+ * (1) Extracting a byte is faster than a bit-field
+ * (2) It properly aligns copy offset so we do not need a <<8
+ */
+static const uint16_t char_table[256] = {
+ 0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
+ 0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
+ 0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
+ 0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
+ 0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
+ 0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
+ 0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
+ 0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
+ 0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
+ 0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
+ 0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
+ 0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
+ 0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
+ 0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
+ 0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
+ 0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
+ 0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
+ 0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
+ 0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
+ 0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
+ 0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
+ 0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
+ 0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
+ 0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
+ 0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
+ 0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
+ 0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
+ 0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
+ 0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
+ 0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
+ 0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
+ 0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
+};
+
+/*
+ * Copy "len" bytes from "src" to "op", one byte at a time. Used for
+ * handling COPY operations where the input and output regions may
+ * overlap. For example, suppose:
+ * src == "ab"
+ * op == src + 2
+ * len == 20
+ * After IncrementalCopy(src, op, len), the result will have
+ * eleven copies of "ab"
+ * ababababababababababab
+ * Note that this does not match the semantics of either memcpy()
+ * or memmove().
+ */
+static inline void IncrementalCopy(const char *src, char *op, int len)
+{
+ DCHECK_GT(len, 0);
+ do {
+ *op++ = *src++;
+ } while (--len > 0);
+}
+
+/*
+ * Equivalent to IncrementalCopy except that it can write up to ten extra
+ * bytes after the end of the copy, and that it is faster.
+ *
+ * The main part of this loop is a simple copy of eight bytes at a time until
+ * we've copied (at least) the requested amount of bytes. However, if op and
+ * src are less than eight bytes apart (indicating a repeating pattern of
+ * length < 8), we first need to expand the pattern in order to get the correct
+ * results. For instance, if the buffer looks like this, with the eight-byte
+ * <src> and <op> patterns marked as intervals:
+ *
+ * abxxxxxxxxxxxx
+ * [------] src
+ * [------] op
+ *
+ * a single eight-byte copy from <src> to <op> will repeat the pattern once,
+ * after which we can move <op> two bytes without moving <src>:
+ *
+ * ababxxxxxxxxxx
+ * [------] src
+ * [------] op
+ *
+ * and repeat the exercise until the two no longer overlap.
+ *
+ * This allows us to do very well in the special case of one single byte
+ * repeated many times, without taking a big hit for more general cases.
+ *
+ * The worst case of extra writing past the end of the match occurs when
+ * op - src == 1 and len == 1; the last copy will read from byte positions
+ * [0..7] and write to [4..11], whereas it was only supposed to write to
+ * position 1. Thus, ten excess bytes.
+ */
+static const int kMaxIncrementCopyOverflow = 10;
+static inline void IncrementalCopyFastPath(const char *src, char *op, int len)
+{
+ while (op - src < 8) {
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src));
+ len -= op - src;
+ op += op - src;
+ }
+ while (len > 0) {
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(src));
+ src += 8;
+ op += 8;
+ len -= 8;
+ }
+}
+
+
+/* A type that writes to a flat array. */
+struct SnappyArrayWriter {
+ char *base;
+ char *op;
+ char *op_limit;
+};
+
+static inline int
+SAW__Append(struct SnappyArrayWriter *this,
+ const char *ip, uint32_t len, int allow_fast_path)
+{
+ char *op = this->op;
+ const int space_left = this->op_limit - op;
+ /*Fast path, used for the majority (about 90%) of dynamic invocations.*/
+ if (allow_fast_path && len <= 16 && space_left >= 16) {
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip));
+ UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8));
+ } else {
+ if (space_left < len)
+ return CSNAPPY_E_OUTPUT_OVERRUN;
+ memcpy(op, ip, len);
+ }
+ this->op = op + len;
+ return CSNAPPY_E_OK;
+}
+
+static inline int
+SAW__AppendFromSelf(struct SnappyArrayWriter *this,
+ uint32_t offset, uint32_t len)
+{
+ char *op = this->op;
+ const int space_left = this->op_limit - op;
+ /* -1u catches offset==0 */
+ if (op - this->base <= offset - 1u)
+ return CSNAPPY_E_DATA_MALFORMED;
+ /* Fast path, used for the majority (70-80%) of dynamic invocations. */
+ if (len <= 16 && offset >= 8 && space_left >= 16) {
+ UNALIGNED_STORE64(op, UNALIGNED_LOAD64(op - offset));
+ UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(op - offset + 8));
+ } else if (space_left >= len + kMaxIncrementCopyOverflow) {
+ IncrementalCopyFastPath(op - offset, op, len);
+ } else {
+ if (space_left < len)
+ return CSNAPPY_E_OUTPUT_OVERRUN;
+ IncrementalCopy(op - offset, op, len);
+ }
+ this->op = op + len;
+ return CSNAPPY_E_OK;
+}
+
+
+int
+csnappy_get_uncompressed_length(
+ const char *src,
+ uint32_t src_len,
+ uint32_t *result)
+{
+ const char *src_base = src;
+ uint32_t shift = 0;
+ uint8_t c;
+ /* Length is encoded in 1..5 bytes */
+ *result = 0;
+ for (;;) {
+ if (shift >= 32)
+ goto err_out;
+ if (src_len == 0)
+ goto err_out;
+ c = *(const uint8_t *)src++;
+ src_len -= 1;
+ *result |= (uint32_t)(c & 0x7f) << shift;
+ if (c < 128)
+ break;
+ shift += 7;
+ }
+ return src - src_base;
+err_out:
+ return CSNAPPY_E_HEADER_BAD;
+}
+#if defined(__KERNEL__) && !defined(STATIC)
+EXPORT_SYMBOL(csnappy_get_uncompressed_length);
+#endif
+
+int
+csnappy_decompress_noheader(
+ const char *src,
+ uint32_t src_remaining,
+ char *dst,
+ uint32_t *dst_len)
+{
+ struct SnappyArrayWriter writer;
+ uint32_t length, trailer, opword, extra_bytes;
+ int ret;
+ uint8_t opcode;
+ char scratch[5];
+ writer.op = writer.base = dst;
+ writer.op_limit = writer.op + *dst_len;
+ while (src_remaining) {
+ if (unlikely(src_remaining < 5)) {
+ memcpy(scratch, src, src_remaining);
+ src = scratch;
+ }
+ opcode = *(const uint8_t *)src++;
+ opword = char_table[opcode];
+ extra_bytes = opword >> 11;
+ trailer = get_unaligned_le32(src) & wordmask[extra_bytes];
+ src += extra_bytes;
+ src_remaining -= 1 + extra_bytes;
+ length = opword & 0xff;
+ if (opcode & 0x3) {
+ trailer += opword & 0x700;
+ ret = SAW__AppendFromSelf(&writer, trailer, length);
+ if (ret < 0)
+ return ret;
+ } else {
+ length += trailer;
+ if (unlikely(src_remaining < length))
+ return CSNAPPY_E_DATA_MALFORMED;
+ ret = src_remaining >= 16;
+ ret = SAW__Append(&writer, src, length, ret);
+ if (ret < 0)
+ return ret;
+ src += length;
+ src_remaining -= length;
+ }
+ }
+ *dst_len = writer.op - writer.base;
+ return CSNAPPY_E_OK;
+}
+#if defined(__KERNEL__) && !defined(STATIC)
+EXPORT_SYMBOL(csnappy_decompress_noheader);
+#endif
+
+int
+csnappy_decompress(
+ const char *src,
+ uint32_t src_len,
+ char *dst,
+ uint32_t dst_len)
+{
+ int n;
+ uint32_t olen = 0;
+ /* Read uncompressed length from the front of the compressed input */
+ n = csnappy_get_uncompressed_length(src, src_len, &olen);
+ if (unlikely(n < CSNAPPY_E_OK))
+ return n;
+ /* Protect against possible DoS attack */
+ if (unlikely(olen > dst_len))
+ return CSNAPPY_E_OUTPUT_INSUF;
+ return csnappy_decompress_noheader(src + n, src_len - n, dst, &olen);
+}
+#if defined(__KERNEL__) && !defined(STATIC)
+EXPORT_SYMBOL(csnappy_decompress);
+
+MODULE_LICENSE("BSD");
+MODULE_DESCRIPTION("Snappy Decompressor");
+#endif
diff --git a/drivers/staging/snappy/csnappy_internal.h b/drivers/staging/snappy/csnappy_internal.h
new file mode 100644
index 0000000..6255a3f
--- /dev/null
+++ b/drivers/staging/snappy/csnappy_internal.h
@@ -0,0 +1,83 @@
+/*
+Copyright 2011 Google Inc. All Rights Reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Various stubs for the open-source version of Snappy.
+
+File modified for the Linux Kernel by
+Zeev Tarantov <zeev.tarantov at gmail.com>
+*/
+
+#ifndef CSNAPPY_INTERNAL_H_
+#define CSNAPPY_INTERNAL_H_
+
+#ifndef __KERNEL__
+#include "csnappy_internal_userspace.h"
+#else
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/compiler.h>
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+
+#ifdef DEBUG
+#define DCHECK(cond) if (!(cond)) \
+ printk(KERN_DEBUG "assert failed @ %s:%i\n", \
+ __FILE__, __LINE__)
+#else
+#define DCHECK(cond)
+#endif
+
+#define UNALIGNED_LOAD16(_p) get_unaligned((const uint16_t *)(_p))
+#define UNALIGNED_LOAD32(_p) get_unaligned((const uint32_t *)(_p))
+#define UNALIGNED_LOAD64(_p) get_unaligned((const uint64_t *)(_p))
+#define UNALIGNED_STORE16(_p, _val) put_unaligned((_val), (uint16_t *)(_p))
+#define UNALIGNED_STORE32(_p, _val) put_unaligned((_val), (uint32_t *)(_p))
+#define UNALIGNED_STORE64(_p, _val) put_unaligned((_val), (uint64_t *)(_p))
+
+#define FindLSBSetNonZero(n) __builtin_ctz(n)
+#define FindLSBSetNonZero64(n) __builtin_ctzll(n)
+
+#endif /* __KERNEL__ */
+
+#define DCHECK_EQ(a, b) DCHECK(((a) == (b)))
+#define DCHECK_NE(a, b) DCHECK(((a) != (b)))
+#define DCHECK_GT(a, b) DCHECK(((a) > (b)))
+#define DCHECK_GE(a, b) DCHECK(((a) >= (b)))
+#define DCHECK_LT(a, b) DCHECK(((a) < (b)))
+#define DCHECK_LE(a, b) DCHECK(((a) <= (b)))
+
+enum {
+ LITERAL = 0,
+ COPY_1_BYTE_OFFSET = 1, /* 3 bit length + 3 bits of offset in opcode */
+ COPY_2_BYTE_OFFSET = 2,
+ COPY_4_BYTE_OFFSET = 3
+};
+
+#endif /* CSNAPPY_INTERNAL_H_ */
More information about the devel
mailing list