/*
* Copyright (c) 2000-2013 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
/*
This file contains x86_64 hand optimized implementation of WKdm memory page decompressor.
void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, __unused__ unsigned int words);
input :
src_buf : address of input compressed data buffer
dest_buf : address of output decompressed buffer
scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller
words : this argument is not used in the implementation
output :
the input buffer is decompressed and the dest_buf is written with decompressed data.
Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress x86_64 assembly code WKdmCompress.s
The bit stream (*src_buf) consists of
a. 12 bytes header
b. 256 bytes for 1024 packed tags
c. (varying number of) words for new words not matched to dictionary word.
d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
where the header (of 3 words) specifies the ending boundaries (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows
for (i=0;i<1024;i++) {
tag = *next_tag++
switch (tag) {
case 0 : *dest_buf++ = 0; break;
case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break;
case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break;
case 3 : *dest_buf++ = dictionary[*dict_index++]; break;
}
cclee, 11/30/12
Added zero page, single value page, sparse page, early abort optimizations
rsrini, 09/14/14
*/
#define MZV_MAGIC $17185 // magic value used to identify MZV page encoding
.text
.globl _WKdm_decompress_new
_WKdm_decompress_new:
// save registers, and allocate stack memory for local variables
pushq %rbp
movq %rsp, %rbp
pushq %r12
pushq %r13
pushq %rbx
subq $(64+8+16), %rsp
movl 0(%rdi), %eax // read the 1st word from the header
cmpl MZV_MAGIC, %eax // is the alternate packer used (i.e. is MZV page)?
jne L_default_decompressor // default decompressor was used
// Mostly Zero Page Handling...
// {
movq $0, %rax
1: // Zero out the entire page
movq $0, 0(%rsi, %rax)
movq $0, 8(%rsi, %rax)
movq $0, 16(%rsi, %rax)
movq $0, 24(%rsi, %rax)
movq $0, 32(%rsi, %rax)
movq $0, 40(%rsi, %rax)
movq $0, 48(%rsi, %rax)
movq $0, 56(%rsi, %rax)
addq $64, %rax
cmpq $4096, %rax
jne 1b
movq $4, %r12 // current byte position in src to read from
2:
movl 0(%rdi, %r12), %eax // get the word
movzwq 4(%rdi, %r12), %rdx // get the index
movl %eax, 0(%rsi, %rdx) // store non-0 word in the destination buffer
addq $6, %r12 // 6 more bytes processed
cmpl %ecx, %r12d // finished processing all the bytes?
jne 2b
jmp L_done
// }
L_default_decompressor:
movq %rsi, %r12 // dest_buf
movq %rdx, %r13 // scratch_buf
// PRELOAD_DICTONARY; dictionary starting address : starting address 0(%rsp)
// NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR
#if 1
movl $0, 0(%rsp)
movl $0, 4(%rsp)
movl $0, 8(%rsp)
movl $0, 12(%rsp)
movl $0, 16(%rsp)
movl $0, 20(%rsp)
movl $0, 24(%rsp)
movl $0, 28(%rsp)
movl $0, 32(%rsp)
movl $0, 36(%rsp)
movl $0, 40(%rsp)
movl $0, 44(%rsp)
movl $0, 48(%rsp)
movl $0, 52(%rsp)
movl $0, 56(%rsp)
movl $0, 60(%rsp)
#else
mov $0x100000001, %rax
mov %rax, (%rsp)
mov %rax, 8(%rsp)
mov %rax, 16(%rsp)
mov %rax, 24(%rsp)
mov %rax, 32(%rsp)
mov %rax, 40(%rsp)
mov %rax, 48(%rsp)
mov %rax, 56(%rsp)
#endif
// WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray);
leaq 268(%rdi), %r10 // TAGS_AREA_END
leaq 12(%rdi), %rax // TAGS_AREA_START
movq %r13, %rsi // tempTagsArray
cmpq %rax, %r10 // TAGS_AREA_END vs TAGS_AREA_START
jbe 1f // if TAGS_AREA_END <= TAGS_AREA_START, skip L_WK_unpack_2bits
movq %r13, %rcx // next_word
xorl %r8d, %r8d // i = 0
mov $(50529027<<32)+50529027, %r9
L_WK_unpack_2bits:
movl 12(%rdi,%r8, 4), %eax
movl 12(%rdi,%r8, 4), %edx
shrl $2, %eax
shlq $32, %rax
orq %rdx, %rax
movq %rax, %rdx
shrq $4, %rax
andq %r9, %rdx
andq %r9, %rax
incq %r8 // i++
movq %rdx, (%rcx)
movq %rax, 8(%rcx)
addq $16, %rcx // next_tags += 16
cmpq $64, %r8 // i vs 64
jne L_WK_unpack_2bits // repeat loop until i==64
1:
// WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray);
mov 4(%rdi), %eax // WKdm header qpos end
leaq (%rdi,%rax,4), %r9 // QPOS_AREA_END
mov 0(%rdi), %eax // WKdm header qpos start
leaq (%rdi,%rax,4), %r8 // QPOS_AREA_START
leaq 1024(%r13), %rbx // tempQPosArray
cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START
jbe 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits
leaq 8(%rbx), %rcx // next_qpos
mov $(252645135<<32)+252645135, %r11
L_WK_unpack_4bits:
movl (%r8), %eax // w = *next_word
movl %eax, %edx // w
shlq $28, %rax
orq %rdx, %rax
addq $4, %r8 // next_word++
andq %r11, %rax
movq %rax, -8(%rcx)
addq $8, %rcx // next_qpos+=8
cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START
ja L_WK_unpack_4bits // repeat loop until QPOS_AREA_END <= QPOS_AREA_START
1:
// WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray);
movl 8(%rdi), %eax // LOW_BITS_AREA_END offset
leaq (%rdi,%rax,4), %rdi // LOW_BITS_AREA_END
leaq 2048(%r13), %r11 // tempLowBitsArray
leaq 4094(%r13), %r13 // final tenbits addr
sub %r9, %rdi // LOW_BITS_AREA_START vs LOW_BITS_AREA_END
jle 1f // if START>=END, skip L_WK_unpack_3_tenbits
movq %r11, %rcx // next_low_bits
L_WK_unpack_3_tenbits:
movl (%r9), %eax // w = *next_word, 0:c:b:a
movl $(1023<<10), %edx
movl $(1023<<20), %r8d
andl %eax, %edx // b << 10
andl %eax, %r8d // c << 20
andq $1023, %rax
shll $6, %edx
shlq $12, %r8
orl %edx, %eax
orq %r8, %rax
cmp %r13, %rcx
je 2f
mov %rax, (%rcx)
jmp 3f
2: mov %ax, (%rcx)
3:
addq $4, %r9 // next_word++
addq $6, %rcx // next_low_bits += 3
sub $4, %rdi
jg L_WK_unpack_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word
1:
#define next_qpos %rbx
#define hash %r8
#define tags_counter %edi
#define dest_buf %r12
#define next_full_patt %r10
leaq _hashLookupTable_new(%rip), hash // hash look up table
movl $1024, tags_counter // tags_counter
jmp L_next
.align 4,0x90
L_nonpartital:
jl L_ZERO_TAG
cmpb $2, -1(%rsi)
je L_MISS_TAG
L_EXACT_TAG:
movzbl (next_qpos), %eax // qpos = *next_qpos
incq next_qpos // next_qpos++
decl tags_counter // tags_counter--
movl (%rsp,%rax,4), %eax // w = dictionary[qpos]
movl %eax, -4(dest_buf) // *dest_buf = w
je L_done
L_next:
incq %rsi // next_tag++
addq $4, dest_buf
cmpb $1, -1(%rsi)
jne L_nonpartital
L_PARTIAL_TAG:
movzbl (next_qpos),%edx // qpos = *next_qpos
incq next_qpos // next_qpos++
movl (%rsp,%rdx,4), %eax // read dictionary word
andl $-1024, %eax // clear lower 10 bits
or (%r11), %ax // pad the lower 10-bits from *next_low_bits
addq $2, %r11 // next_low_bits++
decl tags_counter // tags_counter--
movl %eax, (%rsp,%rdx,4) // *dict_location = newly formed word
movl %eax, -4(dest_buf) // *dest_buf = newly formed word
jg L_next // repeat loop until next_tag==tag_area_end
L_done:
// release stack memory, restore registers, and return
addq $(64+8+16), %rsp
popq %rbx
popq %r13
popq %r12
leave
ret
.align 4,0x90
L_MISS_TAG:
movl (next_full_patt), %edx // w = *next_full_patt
movl (next_full_patt), %eax // w = *next_full_patt
shrl $10, %edx // w>>10
addq $4, next_full_patt // next_full_patt++
movzbl %dl, %edx // 8-bit hash table index
movl %eax, -4(dest_buf) // *dest_buf = word
movzbl (hash,%rdx),%edx // qpos
decl tags_counter // tags_counter--
movl %eax, (%rsp,%rdx) // dictionary[qpos] = word
jg L_next // repeat the loop
jmp L_done
.align 4,0x90
L_ZERO_TAG:
decl tags_counter // tags_counter--
movl $0, -4(dest_buf) // *dest_buf = 0
jg L_next // repeat the loop
jmp L_done