Moving to non-multiply-free entropy coder
This commit is contained in:
parent
fc43dbb7d6
commit
9c3e22c440
3 changed files with 389 additions and 1 deletions
|
@ -2,7 +2,7 @@ INCLUDES =
|
|||
METASOURCES = AUTO
|
||||
lib_LTLIBRARIES = libentcode.la
|
||||
libentcode_la_SOURCES = bitrdec.c bitree.c bitrenc.c ecintrin.h entcode.c \
|
||||
entdec.c entenc.c laplace.c mfrngdec.c mfrngenc.c probdec.c probenc.c probmod.c
|
||||
entdec.c entenc.c laplace.c rangedec.c rangeenc.c probdec.c probenc.c probmod.c
|
||||
bin_PROGRAMS = ectest
|
||||
ectest_SOURCES = ectest.c
|
||||
ectest_LDADD = $(top_builddir)/libentcode/libentcode.la
|
||||
|
|
243
libentcode/rangedec.c
Normal file
243
libentcode/rangedec.c
Normal file
|
@ -0,0 +1,243 @@
|
|||
#include <stddef.h>
|
||||
#include "entdec.h"
|
||||
#include "mfrngcod.h"
|
||||
|
||||
|
||||
|
||||
/*A range decoder.
|
||||
This is an entropy decoder based upon \cite{Mar79}, which is itself a
|
||||
rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}.
|
||||
It is very similar to arithmetic encoding, except that encoding is done with
|
||||
digits in any base, instead of with bits, and so it is faster when using
|
||||
larger bases (i.e.: a byte).
|
||||
The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$
|
||||
is the base, longer than the theoretical optimum, but to my knowledge there
|
||||
is no published justification for this claim.
|
||||
This only seems true when using near-infinite precision arithmetic so that
|
||||
the process is carried out with no rounding errors.
|
||||
|
||||
IBM (the author's employer) never sought to patent the idea, and to my
|
||||
knowledge the algorithm is unencumbered by any patents, though its
|
||||
performance is very competitive with proprietary arithmetic coding.
|
||||
The two are based on very similar ideas, however.
|
||||
An excellent description of implementation details is available at
|
||||
http://www.arturocampos.com/ac_range.html
|
||||
A recent work \cite{MNW98} which proposes several changes to arithmetic
|
||||
encoding for efficiency actually re-discovers many of the principles
|
||||
behind range encoding, and presents a good theoretical analysis of them.
|
||||
|
||||
The multiply-free variant of this coder (mfrngdec.c) replaces the standard
multiply/divide used to partition the current interval according to the
total frequency count.
Its partition function scales the count so that it differs from the size of
the interval by no more than a factor of two and then assigns each symbol
one or two code words in the interval.
For details see \cite{SM98}.
This file does not use that technique: it partitions the interval with an
explicit divide and multiply.
|
||||
|
||||
This coder also handles the end of the stream in a slightly more graceful
|
||||
fashion than most arithmetic or range coders.
|
||||
Once the final symbol has been encoded, the coder selects the code word with
|
||||
the shortest number of bits that still falls within the final interval.
|
||||
This method is not novel.
|
||||
Here, by the length of the code word, we refer to the number of bits until
|
||||
its final 1.
|
||||
Any trailing zeros may be discarded, since the encoder, once it runs out of
|
||||
input, will pad its buffer with zeros.
|
||||
|
||||
But this means that no encoded stream would ever have any zero bytes at the
|
||||
end.
|
||||
Since there are some coded representations we cannot produce, it implies that
|
||||
there is still some redundancy in the stream.
|
||||
In this case, we can pick a special byte value, RSV1, and should the stream
|
||||
end in a sequence of zeros, followed by the RSV1 byte, we can code the
|
||||
zeros, and discard the RSV1 byte.
|
||||
The decoder, knowing that the encoder would never produce a sequence of zeros
|
||||
at the end, would then know to add in the RSV1 byte if it observed it.
|
||||
|
||||
Now, the encoder would never produce a stream that ended in a sequence of
|
||||
zeros followed by a RSV1 byte.
|
||||
So, if the stream ends in a non-empty sequence of zeros, followed by any
|
||||
positive number of RSV1 bytes, the last RSV1 byte is discarded.
|
||||
The decoder, if it encounters a stream that ends in a non-empty sequence of
|
||||
zeros followed by any non-negative number of RSV1 bytes, adds an additional
|
||||
RSV1 byte to the stream.
|
||||
With this strategy, every possible sequence of input bytes is transformed to
|
||||
one that could actually be produced by the encoder.
|
||||
|
||||
The only question is what non-zero value to use for RSV1.
|
||||
We select 0x80, since it has the nice property of producing the shortest
|
||||
possible byte streams when using our strategy for selecting a number within
|
||||
the final interval to encode.
|
||||
Clearly if the shortest possible code word that falls within the interval has
|
||||
its last one bit as the most significant bit of the final byte, and the
|
||||
previous bytes were a non-empty sequence of zeros followed by a non-negative
|
||||
number of 0x80 bytes, then the last byte would be discarded.
|
||||
If the shortest code word is not so formed, then no other code word in the
|
||||
interval would result in any more bytes being discarded.
|
||||
Any longer code word would have an additional one bit somewhere, and so would
|
||||
require at a minimum that that byte would be coded.
|
||||
If the shortest code word has a 1 before the final one that is preventing the
|
||||
stream from ending in a non-empty sequence of zeros followed by a
|
||||
non-negative number of 0x80's, then there is no code word of the same length
|
||||
which contains that bit as a zero.
|
||||
If there were, then we could simply leave that bit a 1, and drop all the bits
|
||||
after it without leaving the interval, thus producing a shorter code word.
|
||||
|
||||
In this case, RSV1 can only drop 1 bit off the final stream.
|
||||
Other choices could lead to savings of up to 8 bits for particular streams,
|
||||
but this would produce the odd situation that a stream with more non-zero
|
||||
bits is actually encoded in fewer bytes.
|
||||
|
||||
@PHDTHESIS{Pas76,
|
||||
author="Richard Clark Pasco",
|
||||
title="Source coding algorithms for fast data compression",
|
||||
school="Dept. of Electrical Engineering, Stanford University",
|
||||
address="Stanford, CA",
|
||||
month=May,
|
||||
year=1976
|
||||
}
|
||||
@INPROCEEDINGS{Mar79,
|
||||
author="Martin, G.N.N.",
|
||||
title="Range encoding: an algorithm for removing redundancy from a digitised
|
||||
message",
|
||||
booktitle="Video \& Data Recording Conference",
|
||||
year=1979,
|
||||
address="Southampton",
|
||||
month=Jul
|
||||
}
|
||||
@ARTICLE{MNW98,
|
||||
author="Alistair Moffat and Radford Neal and Ian H. Witten",
|
||||
title="Arithmetic Coding Revisited",
|
||||
journal="{ACM} Transactions on Information Systems",
|
||||
year=1998,
|
||||
volume=16,
|
||||
number=3,
|
||||
pages="256--294",
|
||||
month=Jul,
|
||||
URL="http://dev.acm.org/pubs/citations/journals/tois/1998-16-3/p256-moffat/"
|
||||
}
|
||||
@INPROCEEDINGS{SM98,
|
||||
author="Lang Stuiver and Alistair Moffat",
|
||||
title="Piecewise Integer Mapping for Arithmetic Coding",
|
||||
booktitle="Proceedings of the {IEEE} Data Compression Conference",
|
||||
pages="1--10",
|
||||
address="Snowbird, UT",
|
||||
month="Mar./Apr.",
|
||||
year=1998
|
||||
}*/
|
||||
|
||||
|
||||
|
||||
/*Gets the next byte of input.
|
||||
After all the bytes in the current packet have been consumed, and the extra
|
||||
end code returned if needed, this function will continue to return zero each
|
||||
time it is called.
|
||||
Return: The next byte of input.*/
|
||||
static int ec_dec_in(ec_dec *_this){
|
||||
int ret;
|
||||
ret=ec_byte_read1(_this->buf);
|
||||
if(ret<0){
|
||||
unsigned char *buf;
|
||||
long bytes;
|
||||
bytes=ec_byte_bytes(_this->buf);
|
||||
buf=ec_byte_get_buffer(_this->buf);
|
||||
/*Breaking abstraction: don't do this at home, kids.*/
|
||||
if(_this->buf->storage==bytes){
|
||||
ec_byte_adv1(_this->buf);
|
||||
if(bytes>0){
|
||||
unsigned char *p;
|
||||
p=buf+bytes;
|
||||
/*If we end in a string of 0 or more EC_FOF_RSV1 bytes preceded by a
|
||||
zero, return an extra EC_FOF_RSV1 byte.*/
|
||||
do p--;
|
||||
while(p>buf&&p[0]==EC_FOF_RSV1);
|
||||
if(!p[0])return EC_FOF_RSV1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
else return ret;
|
||||
}
|
||||
|
||||
/*Normalizes the contents of low and rng so that rng is contained in the
|
||||
high-order symbol of low.*/
|
||||
static void ec_dec_normalize(ec_dec *_this){
|
||||
/*If the range is too small, rescale it and input some bits.*/
|
||||
while(_this->rng<=EC_CODE_BOT){
|
||||
int sym;
|
||||
_this->rng<<=EC_SYM_BITS;
|
||||
/*Use up the remaining bits from our last symbol.*/
|
||||
sym=_this->rem<<EC_CODE_EXTRA&EC_SYM_MAX;
|
||||
/*Read the next value from the input.*/
|
||||
_this->rem=ec_dec_in(_this);
|
||||
/*Take the rest of the bits we need from this new symbol.*/
|
||||
sym|=_this->rem>>EC_SYM_BITS-EC_CODE_EXTRA;
|
||||
_this->dif=(_this->dif<<EC_SYM_BITS)-sym&EC_CODE_MASK;
|
||||
/*dif can never be larger than EC_CODE_TOP.
|
||||
This is equivalent to the slightly more readable:
|
||||
if(_this->dif>EC_CODE_TOP)_this->dif-=EC_CODE_TOP;*/
|
||||
_this->dif^=(_this->dif&_this->dif-1)&EC_CODE_TOP;
|
||||
}
|
||||
}
|
||||
|
||||
/*Initializes a decoder to read from the given byte buffer.
  The first input byte is fetched immediately and the state is normalized, so
   further bytes may be consumed before this function returns.
  _this: The decoder state to initialize.
  _buf:  The byte buffer to read from; the decoder keeps this pointer.*/
void ec_dec_init(ec_dec *_this,ec_byte_buffer *_buf){
  _this->buf=_buf;
  /*The initial range covers only the EC_CODE_EXTRA bits taken from the first
     byte.*/
  _this->rng=1U<<EC_CODE_EXTRA;
  _this->rem=ec_dec_in(_this);
  /*dif is stored as the range minus the high-order bits of the first byte;
     the remaining bits stay in rem for the normalization step.*/
  _this->dif=_this->rng-(_this->rem>>(EC_SYM_BITS-EC_CODE_EXTRA));
  /*Normalize the interval.*/
  ec_dec_normalize(_this);
}
|
||||
|
||||
|
||||
/*Computes the frequency value selected by the current decoder state.
  The per-count scale factor rng/_ft is stored in _this->nrm as a side effect;
   ec_dec_update() reads it back.
  _this: The decoder state.
  _ft:   The total frequency count.
         It must be non-zero and no larger than _this->rng, since both _ft
          and the resulting nrm are used as divisors.
  Return: A value in the range [0,_ft).
          NOTE(review): presumably the caller maps this to the symbol whose
           cumulative frequency interval contains it -- confirm against the
           callers.*/
unsigned ec_decode(ec_dec *_this,unsigned _ft){
  unsigned below;
  _this->nrm=_this->rng/_ft;
  /*One more than the number of whole nrm-sized steps that fit in dif-1.*/
  below=(unsigned)((_this->dif-1)/_this->nrm)+1;
  /*Clamp to _ft so that the result cannot wrap below zero.*/
  return _ft-EC_MINI(below,_ft);
}
|
||||
|
||||
/*Advances the decoder past a symbol with the given frequency interval.
  This uses the scale factor _this->nrm left behind by ec_decode(), so
   ec_decode() must have been called first with the same _ft.
  _this: The decoder state.
  _fl:   The low end of the symbol's frequency interval.
  _fh:   The high end of the symbol's frequency interval.
  _ft:   The total frequency count.*/
void ec_dec_update(ec_dec *_this,unsigned _fl,unsigned _fh,unsigned _ft){
  ec_uint32 s;
  /*The portion of the range lying above this symbol's interval.*/
  s=_this->nrm*(_ft-_fh);
  _this->dif-=s;
  if(_fl>0)_this->rng=_this->nrm*(_fh-_fl);
  /*The symbol starting at zero also takes whatever part of the range is left
     over after the division by _ft.*/
  else _this->rng-=s;
  ec_dec_normalize(_this);
}
|
||||
|
||||
#if 0
|
||||
/*Checks whether the decoder has finished on a well-formed stream.
  This function is compiled out by the enclosing #if 0 and, per the TODO
   below, is known to be incorrect; it is kept for reference only.
  _this: The decoder state.
  Return: Zero if the read position is not yet past the bytes of the buffer,
           or if the remaining state does not match the expected end code,
           and non-zero otherwise.*/
int ec_dec_done(ec_dec *_this){
  unsigned low;
  /*Check to make sure we've used all the input bytes.
    This ensures that no more ones would ever be inserted into the decoder.*/
  if(_this->buf->ptr-ec_byte_get_buffer(_this->buf)<=
   ec_byte_bytes(_this->buf)){
    return 0;
  }
  /*We compute the value in the current range that has the largest number of
     trailing zeros, mirroring the search done in ec_enc_done(), and compare
     it against the decoder state.*/
  /*TODO: Fix this line, as it is wrong.
    It doesn't seem worth being able to make this check to do an extra
     subtraction for every symbol decoded.*/
  low=/*What we want: _this->top-_this->rng; What we have:*/_this->dif;
  if(low){
    unsigned end;
    end=EC_CODE_TOP;
    /*Ensure that the next free end is in the range.*/
    if(end-low>=_this->rng){
      unsigned msk;
      msk=EC_CODE_TOP-1;
      do{
        msk>>=1;
        /*Round low up to a multiple of msk+1, then set the msk+1 bit.*/
        end=((low+msk)&~msk)|(msk+1);
      }
      while(end-low>=_this->rng);
    }
    /*The remaining input should have been the next free end.*/
    return end-low!=_this->dif;
  }
  return 1;
}
|
||||
#endif
|
145
libentcode/rangeenc.c
Normal file
145
libentcode/rangeenc.c
Normal file
|
@ -0,0 +1,145 @@
|
|||
#include <stddef.h>
|
||||
#include "entenc.h"
|
||||
#include "mfrngcod.h"
|
||||
|
||||
|
||||
|
||||
/*A range encoder.
|
||||
See rangedec.c and the references for implementation details
|
||||
\cite{Mar79,MNW98,SM98}.
|
||||
|
||||
@INPROCEEDINGS{Mar79,
|
||||
author="Martin, G.N.N.",
|
||||
title="Range encoding: an algorithm for removing redundancy from a digitised
|
||||
message",
|
||||
booktitle="Video \& Data Recording Conference",
|
||||
year=1979,
|
||||
address="Southampton",
|
||||
month=Jul
|
||||
}
|
||||
@ARTICLE{MNW98,
|
||||
author="Alistair Moffat and Radford Neal and Ian H. Witten",
|
||||
title="Arithmetic Coding Revisited",
|
||||
journal="{ACM} Transactions on Information Systems",
|
||||
year=1998,
|
||||
volume=16,
|
||||
number=3,
|
||||
pages="256--294",
|
||||
month=Jul,
|
||||
URL="http://dev.acm.org/pubs/citations/journals/tois/1998-16-3/p256-moffat/"
|
||||
}
|
||||
@INPROCEEDINGS{SM98,
|
||||
author="Lang Stuiver and Alistair Moffat",
|
||||
title="Piecewise Integer Mapping for Arithmetic Coding",
|
||||
booktitle="Proceedings of the {IEEE} Data Compression Conference",
|
||||
pages="1--10",
|
||||
address="Snowbird, UT",
|
||||
month="Mar./Apr.",
|
||||
year=1998
|
||||
}*/
|
||||
|
||||
|
||||
|
||||
/*Outputs a symbol, with a carry bit.
|
||||
If there is a potential to propogate a carry over several symbols, they are
|
||||
buffered until it can be determined whether or not an actual carry will
|
||||
occur.
|
||||
If the counter for the buffered symbols overflows, then the range is
|
||||
truncated to force a carry to occur, towards whichever side maximizes the
|
||||
remaining range.*/
|
||||
static void ec_enc_carry_out(ec_enc *_this,int _c){
|
||||
if(_c!=EC_SYM_MAX){
|
||||
/*No further carry propogation possible, flush buffer.*/
|
||||
int carry;
|
||||
carry=_c>>EC_SYM_BITS;
|
||||
/*Don't output a byte on the first write.
|
||||
This compare should be taken care of by branch-prediction thereafter.*/
|
||||
if(_this->rem>=0)ec_byte_write1(_this->buf,_this->rem+carry);
|
||||
if(_this->ext>0){
|
||||
unsigned sym;
|
||||
sym=EC_SYM_MAX+carry&EC_SYM_MAX;
|
||||
do ec_byte_write1(_this->buf,sym);
|
||||
while(--(_this->ext)>0);
|
||||
}
|
||||
_this->rem=_c&EC_SYM_MAX;
|
||||
}
|
||||
else _this->ext++;
|
||||
}
|
||||
|
||||
/*Renormalizes the encoder state.
  For as long as rng is no larger than EC_CODE_BOT, the high-order symbol of
   low is handed to ec_enc_carry_out() and both low and rng are scaled up by
   EC_SYM_BITS bits.
  _this: The encoder state.*/
static void ec_enc_normalize(ec_enc *_this){
  /*The range is rescaled once per symbol emitted; the rest of the loop body
     does not read rng, so the shift is done in the loop's step expression.*/
  for(;_this->rng<=EC_CODE_BOT;_this->rng<<=EC_SYM_BITS){
    /*Emit the high-order symbol, including the carry bit above it.*/
    ec_enc_carry_out(_this,(int)(_this->low>>EC_CODE_SHIFT));
    /*Move the next-to-high-order symbol into the high-order position and
       drop everything at or above the EC_CODE_TOP bit.*/
    _this->low=(_this->low<<EC_SYM_BITS)&(EC_CODE_TOP-1);
  }
}
|
||||
|
||||
/*Initializes an encoder to write to the given byte buffer.
  _this: The encoder state to initialize.
  _buf:  The byte buffer to write to; the encoder keeps this pointer.*/
void ec_enc_init(ec_enc *_this,ec_byte_buffer *_buf){
  /*Start with the full interval: low end at zero, size EC_CODE_TOP.*/
  _this->low=0;
  _this->rng=EC_CODE_TOP;
  /*No symbol is buffered yet (rem is negative) and no EC_SYM_MAX symbols are
     pending.*/
  _this->rem=-1;
  _this->ext=0;
  _this->buf=_buf;
}
|
||||
|
||||
/*Encodes a symbol given its frequency interval.
  _this: The encoder state.
  _fl:   The low end of the symbol's frequency interval.
  _fh:   The high end of the symbol's frequency interval.
  _ft:   The total frequency count.
         It must be non-zero, since it is used as a divisor.*/
void ec_encode(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _ft){
  unsigned r;
  /*The size of the range assigned to a single frequency count.*/
  r=_this->rng/_ft;
  if(_fl==0){
    /*The symbol starting at zero keeps the bottom of the interval, including
       whatever part of the range is left over after the division by _ft.*/
    _this->rng-=r*(_ft-_fh);
  }
  else{
    _this->low+=_this->rng-r*(_ft-_fl);
    _this->rng=r*(_fh-_fl);
  }
  ec_enc_normalize(_this);
}
|
||||
|
||||
/*Finishes encoding: flushes the final code word and trims redundant bytes
   from the end of the output.
  _this: The encoder state.*/
void ec_enc_done(ec_enc *_this){
  /*We compute the integer in the current interval that has the largest number
     of trailing zeros, and write that to the stream.
    This is guaranteed to yield the smallest possible encoding.*/
  if(_this->low){
    unsigned end;
    end=EC_CODE_TOP;
    /*Ensure that the end value is in the range.*/
    if(end-_this->low>=_this->rng){
      unsigned msk;
      msk=EC_CODE_TOP-1;
      do{
        msk>>=1;
        /*Round low up to a multiple of msk+1, then set the msk+1 bit.*/
        end=((_this->low+msk)&~msk)|(msk+1);
      }
      while(end-_this->low>=_this->rng);
    }
    /*The remaining output is the next free end.*/
    while(end){
      ec_enc_carry_out(_this,end>>EC_CODE_SHIFT);
      end=(end<<EC_SYM_BITS)&(EC_CODE_TOP-1);
    }
  }
  /*If we have a buffered byte...*/
  if(_this->rem>=0){
    unsigned char *buf;
    long           len;
    /*Flush it into the output buffer.*/
    ec_enc_carry_out(_this,0);
    /*We may be able to drop some redundant bytes from the end.
      The tail is tracked as a length rather than a pointer, so that an empty
       or all-zero buffer never requires forming a pointer before the start
       of the buffer.*/
    buf=ec_byte_get_buffer(_this->buf);
    len=ec_byte_bytes(_this->buf);
    /*Strip trailing zeros.*/
    while(len>0&&!buf[len-1])len--;
    /*Strip one trailing EC_FOF_RSV1 byte if the buffer ends in a string of
       consecutive EC_FOF_RSV1 bytes preceded by one (or more) zeros.*/
    if(len>1&&buf[len-1]==EC_FOF_RSV1){
      long i;
      i=len-1;
      do i--;
      while(i>0&&buf[i]==EC_FOF_RSV1);
      if(!buf[i])len--;
    }
    ec_byte_writetrunc(_this->buf,len);
  }
}
|
Loading…
Add table
Add a link
Reference in a new issue