1 | /* |
2 | * Copyright 2004-2014 H2 Group. Multiple-Licensed under the MPL 2.0, |
3 | * and the EPL 1.0 (http://h2database.com/html/license.html). |
4 | * |
5 | * This code is based on the LZF algorithm from Marc Lehmann. It is a |
6 | * re-implementation of the C code: |
7 | * http://cvs.schmorp.de/liblzf/lzf_c.c?view=markup |
8 | * http://cvs.schmorp.de/liblzf/lzf_d.c?view=markup |
9 | * |
10 | * According to a mail from Marc Lehmann, it's OK to use his algorithm: |
11 | * Date: 2010-07-15 15:57 |
12 | * Subject: Re: Question about LZF licensing |
13 | * ... |
14 | * The algorithm is not copyrighted (and cannot be copyrighted afaik) - as long |
15 | * as you wrote everything yourself, without copying my code, that's just fine |
16 | * (looking is of course fine too). |
17 | * ... |
18 | * |
19 | * Still I would like to keep his copyright info: |
20 | * |
21 | * Copyright (c) 2000-2005 Marc Alexander Lehmann <schmorp@schmorp.de> |
22 | * Copyright (c) 2005 Oren J. Maurice <oymaurice@hazorea.org.il> |
23 | * |
24 | * Redistribution and use in source and binary forms, with or without |
25 | * modification, are permitted provided that the following conditions are met: |
26 | * |
27 | * 1. Redistributions of source code must retain the above copyright notice, |
28 | * this list of conditions and the following disclaimer. |
29 | * |
30 | * 2. Redistributions in binary form must reproduce the above copyright |
31 | * notice, this list of conditions and the following disclaimer in the |
32 | * documentation and/or other materials provided with the distribution. |
33 | * |
34 | * 3. The name of the author may not be used to endorse or promote products |
35 | * derived from this software without specific prior written permission. |
36 | * |
37 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ''AS IS'' AND ANY EXPRESS OR IMPLIED |
38 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
39 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO |
40 | * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
41 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
42 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; |
43 | * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
44 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR |
45 | * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF |
46 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
47 | */ |
48 | |
49 | package org.h2.compress; |
50 | |
51 | import java.nio.ByteBuffer; |
52 | |
53 | /** |
54 | * <p> |
55 | * This class implements the LZF lossless data compression algorithm. LZF is a |
56 | * Lempel-Ziv variant with byte-aligned output, and optimized for speed. |
57 | * </p> |
58 | * <p> |
59 | * Safety/Use Notes: |
60 | * </p> |
61 | * <ul> |
62 | * <li>Each instance should be used by a single thread only.</li> |
63 | * <li>The data buffers should be smaller than 1 GB.</li> |
64 | * <li>For performance reasons, safety checks on expansion are omitted.</li> |
65 | * <li>Invalid compressed data can cause an ArrayIndexOutOfBoundsException.</li> |
66 | * </ul> |
67 | * <p> |
68 | * The LZF compressed format knows literal runs and back-references: |
69 | * </p> |
70 | * <ul> |
71 | * <li>Literal run: directly copy bytes from input to output.</li> |
72 | * <li>Back-reference: copy previous data to output stream, with specified |
73 | * offset from location and length. The length is at least 3 bytes.</li> |
74 | * </ul> |
75 | *<p> |
76 | * The first byte of the compressed stream is the control byte. For literal |
77 | * runs, the highest three bits of the control byte are not set, the the lower |
78 | * bits are the literal run length, and the next bytes are data to copy directly |
79 | * into the output. For back-references, the highest three bits of the control |
80 | * byte are the back-reference length. If all three bits are set, then the |
81 | * back-reference length is stored in the next byte. The lower bits of the |
82 | * control byte combined with the next byte form the offset for the |
83 | * back-reference. |
84 | * </p> |
85 | */ |
86 | public final class CompressLZF implements Compressor { |
87 | |
88 | /** |
89 | * The number of entries in the hash table. The size is a trade-off between |
90 | * hash collisions (reduced compression) and speed (amount that fits in CPU |
91 | * cache). |
92 | */ |
93 | private static final int HASH_SIZE = 1 << 14; |
94 | |
95 | /** |
96 | * The maximum number of literals in a chunk (32). |
97 | */ |
98 | private static final int MAX_LITERAL = 1 << 5; |
99 | |
100 | /** |
101 | * The maximum offset allowed for a back-reference (8192). |
102 | */ |
103 | private static final int MAX_OFF = 1 << 13; |
104 | |
105 | /** |
106 | * The maximum back-reference length (264). |
107 | */ |
108 | private static final int MAX_REF = (1 << 8) + (1 << 3); |
109 | |
110 | /** |
111 | * Hash table for matching byte sequences (reused for performance). |
112 | */ |
113 | private int[] cachedHashTable; |
114 | |
115 | @Override |
116 | public void setOptions(String options) { |
117 | // nothing to do |
118 | } |
119 | |
120 | /** |
121 | * Return the integer with the first two bytes 0, then the bytes at the |
122 | * index, then at index+1. |
123 | */ |
124 | private static int first(byte[] in, int inPos) { |
125 | return (in[inPos] << 8) | (in[inPos + 1] & 255); |
126 | } |
127 | |
128 | /** |
129 | * Return the integer with the first two bytes 0, then the bytes at the |
130 | * index, then at index+1. |
131 | */ |
132 | private static int first(ByteBuffer in, int inPos) { |
133 | return (in.get(inPos) << 8) | (in.get(inPos + 1) & 255); |
134 | } |
135 | |
136 | /** |
137 | * Shift the value 1 byte left, and add the byte at index inPos+2. |
138 | */ |
139 | private static int next(int v, byte[] in, int inPos) { |
140 | return (v << 8) | (in[inPos + 2] & 255); |
141 | } |
142 | |
143 | /** |
144 | * Shift the value 1 byte left, and add the byte at index inPos+2. |
145 | */ |
146 | private static int next(int v, ByteBuffer in, int inPos) { |
147 | return (v << 8) | (in.get(inPos + 2) & 255); |
148 | } |
149 | |
150 | /** |
151 | * Compute the address in the hash table. |
152 | */ |
153 | private static int hash(int h) { |
154 | return ((h * 2777) >> 9) & (HASH_SIZE - 1); |
155 | } |
156 | |
157 | @Override |
158 | public int compress(byte[] in, int inLen, byte[] out, int outPos) { |
159 | int inPos = 0; |
160 | if (cachedHashTable == null) { |
161 | cachedHashTable = new int[HASH_SIZE]; |
162 | } |
163 | int[] hashTab = cachedHashTable; |
164 | int literals = 0; |
165 | outPos++; |
166 | int future = first(in, 0); |
167 | while (inPos < inLen - 4) { |
168 | byte p2 = in[inPos + 2]; |
169 | // next |
170 | future = (future << 8) + (p2 & 255); |
171 | int off = hash(future); |
172 | int ref = hashTab[off]; |
173 | hashTab[off] = inPos; |
174 | // if (ref < inPos |
175 | // && ref > 0 |
176 | // && (off = inPos - ref - 1) < MAX_OFF |
177 | // && in[ref + 2] == p2 |
178 | // && (((in[ref] & 255) << 8) | (in[ref + 1] & 255)) == |
179 | // ((future >> 8) & 0xffff)) { |
180 | if (ref < inPos |
181 | && ref > 0 |
182 | && (off = inPos - ref - 1) < MAX_OFF |
183 | && in[ref + 2] == p2 |
184 | && in[ref + 1] == (byte) (future >> 8) |
185 | && in[ref] == (byte) (future >> 16)) { |
186 | // match |
187 | int maxLen = inLen - inPos - 2; |
188 | if (maxLen > MAX_REF) { |
189 | maxLen = MAX_REF; |
190 | } |
191 | if (literals == 0) { |
192 | // multiple back-references, |
193 | // so there is no literal run control byte |
194 | outPos--; |
195 | } else { |
196 | // set the control byte at the start of the literal run |
197 | // to store the number of literals |
198 | out[outPos - literals - 1] = (byte) (literals - 1); |
199 | literals = 0; |
200 | } |
201 | int len = 3; |
202 | while (len < maxLen && in[ref + len] == in[inPos + len]) { |
203 | len++; |
204 | } |
205 | len -= 2; |
206 | if (len < 7) { |
207 | out[outPos++] = (byte) ((off >> 8) + (len << 5)); |
208 | } else { |
209 | out[outPos++] = (byte) ((off >> 8) + (7 << 5)); |
210 | out[outPos++] = (byte) (len - 7); |
211 | } |
212 | out[outPos++] = (byte) off; |
213 | // move one byte forward to allow for a literal run control byte |
214 | outPos++; |
215 | inPos += len; |
216 | // rebuild the future, and store the last bytes to the |
217 | // hashtable. Storing hashes of the last bytes in back-reference |
218 | // improves the compression ratio and only reduces speed |
219 | // slightly. |
220 | future = first(in, inPos); |
221 | future = next(future, in, inPos); |
222 | hashTab[hash(future)] = inPos++; |
223 | future = next(future, in, inPos); |
224 | hashTab[hash(future)] = inPos++; |
225 | } else { |
226 | // copy one byte from input to output as part of literal |
227 | out[outPos++] = in[inPos++]; |
228 | literals++; |
229 | // at the end of this literal chunk, write the length |
230 | // to the control byte and start a new chunk |
231 | if (literals == MAX_LITERAL) { |
232 | out[outPos - literals - 1] = (byte) (literals - 1); |
233 | literals = 0; |
234 | // move ahead one byte to allow for the |
235 | // literal run control byte |
236 | outPos++; |
237 | } |
238 | } |
239 | } |
240 | // write the remaining few bytes as literals |
241 | while (inPos < inLen) { |
242 | out[outPos++] = in[inPos++]; |
243 | literals++; |
244 | if (literals == MAX_LITERAL) { |
245 | out[outPos - literals - 1] = (byte) (literals - 1); |
246 | literals = 0; |
247 | outPos++; |
248 | } |
249 | } |
250 | // writes the final literal run length to the control byte |
251 | out[outPos - literals - 1] = (byte) (literals - 1); |
252 | if (literals == 0) { |
253 | outPos--; |
254 | } |
255 | return outPos; |
256 | } |
257 | |
258 | /** |
259 | * Compress a number of bytes. |
260 | * |
261 | * @param in the input data |
262 | * @param out the output area |
263 | * @param outPos the offset at the output array |
264 | * @return the end position |
265 | */ |
266 | public int compress(ByteBuffer in, byte[] out, int outPos) { |
267 | int inPos = in.position(); |
268 | int inLen = in.capacity() - inPos; |
269 | if (cachedHashTable == null) { |
270 | cachedHashTable = new int[HASH_SIZE]; |
271 | } |
272 | int[] hashTab = cachedHashTable; |
273 | int literals = 0; |
274 | outPos++; |
275 | int future = first(in, 0); |
276 | while (inPos < inLen - 4) { |
277 | byte p2 = in.get(inPos + 2); |
278 | // next |
279 | future = (future << 8) + (p2 & 255); |
280 | int off = hash(future); |
281 | int ref = hashTab[off]; |
282 | hashTab[off] = inPos; |
283 | // if (ref < inPos |
284 | // && ref > 0 |
285 | // && (off = inPos - ref - 1) < MAX_OFF |
286 | // && in[ref + 2] == p2 |
287 | // && (((in[ref] & 255) << 8) | (in[ref + 1] & 255)) == |
288 | // ((future >> 8) & 0xffff)) { |
289 | if (ref < inPos |
290 | && ref > 0 |
291 | && (off = inPos - ref - 1) < MAX_OFF |
292 | && in.get(ref + 2) == p2 |
293 | && in.get(ref + 1) == (byte) (future >> 8) |
294 | && in.get(ref) == (byte) (future >> 16)) { |
295 | // match |
296 | int maxLen = inLen - inPos - 2; |
297 | if (maxLen > MAX_REF) { |
298 | maxLen = MAX_REF; |
299 | } |
300 | if (literals == 0) { |
301 | // multiple back-references, |
302 | // so there is no literal run control byte |
303 | outPos--; |
304 | } else { |
305 | // set the control byte at the start of the literal run |
306 | // to store the number of literals |
307 | out[outPos - literals - 1] = (byte) (literals - 1); |
308 | literals = 0; |
309 | } |
310 | int len = 3; |
311 | while (len < maxLen && in.get(ref + len) == in.get(inPos + len)) { |
312 | len++; |
313 | } |
314 | len -= 2; |
315 | if (len < 7) { |
316 | out[outPos++] = (byte) ((off >> 8) + (len << 5)); |
317 | } else { |
318 | out[outPos++] = (byte) ((off >> 8) + (7 << 5)); |
319 | out[outPos++] = (byte) (len - 7); |
320 | } |
321 | out[outPos++] = (byte) off; |
322 | // move one byte forward to allow for a literal run control byte |
323 | outPos++; |
324 | inPos += len; |
325 | // rebuild the future, and store the last bytes to the |
326 | // hashtable. Storing hashes of the last bytes in back-reference |
327 | // improves the compression ratio and only reduces speed |
328 | // slightly. |
329 | future = first(in, inPos); |
330 | future = next(future, in, inPos); |
331 | hashTab[hash(future)] = inPos++; |
332 | future = next(future, in, inPos); |
333 | hashTab[hash(future)] = inPos++; |
334 | } else { |
335 | // copy one byte from input to output as part of literal |
336 | out[outPos++] = in.get(inPos++); |
337 | literals++; |
338 | // at the end of this literal chunk, write the length |
339 | // to the control byte and start a new chunk |
340 | if (literals == MAX_LITERAL) { |
341 | out[outPos - literals - 1] = (byte) (literals - 1); |
342 | literals = 0; |
343 | // move ahead one byte to allow for the |
344 | // literal run control byte |
345 | outPos++; |
346 | } |
347 | } |
348 | } |
349 | // write the remaining few bytes as literals |
350 | while (inPos < inLen) { |
351 | out[outPos++] = in.get(inPos++); |
352 | literals++; |
353 | if (literals == MAX_LITERAL) { |
354 | out[outPos - literals - 1] = (byte) (literals - 1); |
355 | literals = 0; |
356 | outPos++; |
357 | } |
358 | } |
359 | // writes the final literal run length to the control byte |
360 | out[outPos - literals - 1] = (byte) (literals - 1); |
361 | if (literals == 0) { |
362 | outPos--; |
363 | } |
364 | return outPos; |
365 | } |
366 | |
367 | @Override |
368 | public void expand(byte[] in, int inPos, int inLen, byte[] out, int outPos, |
369 | int outLen) { |
370 | // if ((inPos | outPos | outLen) < 0) { |
371 | if (inPos < 0 || outPos < 0 || outLen < 0) { |
372 | throw new IllegalArgumentException(); |
373 | } |
374 | do { |
375 | int ctrl = in[inPos++] & 255; |
376 | if (ctrl < MAX_LITERAL) { |
377 | // literal run of length = ctrl + 1, |
378 | ctrl++; |
379 | // copy to output and move forward this many bytes |
380 | // while (ctrl-- > 0) { |
381 | // out[outPos++] = in[inPos++]; |
382 | // } |
383 | System.arraycopy(in, inPos, out, outPos, ctrl); |
384 | outPos += ctrl; |
385 | inPos += ctrl; |
386 | } else { |
387 | // back reference |
388 | // the highest 3 bits are the match length |
389 | int len = ctrl >> 5; |
390 | // if the length is maxed, add the next byte to the length |
391 | if (len == 7) { |
392 | len += in[inPos++] & 255; |
393 | } |
394 | // minimum back-reference is 3 bytes, |
395 | // so 2 was subtracted before storing size |
396 | len += 2; |
397 | |
398 | // ctrl is now the offset for a back-reference... |
399 | // the logical AND operation removes the length bits |
400 | ctrl = -((ctrl & 0x1f) << 8) - 1; |
401 | |
402 | // the next byte augments/increases the offset |
403 | ctrl -= in[inPos++] & 255; |
404 | |
405 | // copy the back-reference bytes from the given |
406 | // location in output to current position |
407 | ctrl += outPos; |
408 | if (outPos + len >= out.length) { |
409 | // reduce array bounds checking |
410 | throw new ArrayIndexOutOfBoundsException(); |
411 | } |
412 | for (int i = 0; i < len; i++) { |
413 | out[outPos++] = out[ctrl++]; |
414 | } |
415 | } |
416 | } while (outPos < outLen); |
417 | } |
418 | |
419 | /** |
420 | * Expand a number of compressed bytes. |
421 | * |
422 | * @param in the compressed data |
423 | * @param out the output area |
424 | */ |
425 | public static void expand(ByteBuffer in, ByteBuffer out) { |
426 | do { |
427 | int ctrl = in.get() & 255; |
428 | if (ctrl < MAX_LITERAL) { |
429 | // literal run of length = ctrl + 1, |
430 | ctrl++; |
431 | // copy to output and move forward this many bytes |
432 | // (maybe slice would be faster) |
433 | for (int i = 0; i < ctrl; i++) { |
434 | out.put(in.get()); |
435 | } |
436 | } else { |
437 | // back reference |
438 | // the highest 3 bits are the match length |
439 | int len = ctrl >> 5; |
440 | // if the length is maxed, add the next byte to the length |
441 | if (len == 7) { |
442 | len += in.get() & 255; |
443 | } |
444 | // minimum back-reference is 3 bytes, |
445 | // so 2 was subtracted before storing size |
446 | len += 2; |
447 | |
448 | // ctrl is now the offset for a back-reference... |
449 | // the logical AND operation removes the length bits |
450 | ctrl = -((ctrl & 0x1f) << 8) - 1; |
451 | |
452 | // the next byte augments/increases the offset |
453 | ctrl -= in.get() & 255; |
454 | |
455 | // copy the back-reference bytes from the given |
456 | // location in output to current position |
457 | // (maybe slice would be faster) |
458 | ctrl += out.position(); |
459 | for (int i = 0; i < len; i++) { |
460 | out.put(out.get(ctrl++)); |
461 | } |
462 | } |
463 | } while (out.position() < out.capacity()); |
464 | } |
465 | |
466 | @Override |
467 | public int getAlgorithm() { |
468 | return Compressor.LZF; |
469 | } |
470 | |
471 | } |