From 9c3e22c440274d0be4578115a0c5ba7fc9f49c44 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Fri, 7 Dec 2007 22:25:31 +1100 Subject: [PATCH] Moving to non-multiply-free entropy coder --- libentcode/Makefile.am | 2 +- libentcode/rangedec.c | 243 +++++++++++++++++++++++++++++++++++++++++ libentcode/rangeenc.c | 145 ++++++++++++++++++++++++ 3 files changed, 389 insertions(+), 1 deletion(-) create mode 100644 libentcode/rangedec.c create mode 100644 libentcode/rangeenc.c diff --git a/libentcode/Makefile.am b/libentcode/Makefile.am index 97112e74..45a8f265 100644 --- a/libentcode/Makefile.am +++ b/libentcode/Makefile.am @@ -2,7 +2,7 @@ INCLUDES = METASOURCES = AUTO lib_LTLIBRARIES = libentcode.la libentcode_la_SOURCES = bitrdec.c bitree.c bitrenc.c ecintrin.h entcode.c \ - entdec.c entenc.c laplace.c mfrngdec.c mfrngenc.c probdec.c probenc.c probmod.c + entdec.c entenc.c laplace.c rangedec.c rangeenc.c probdec.c probenc.c probmod.c bin_PROGRAMS = ectest ectest_SOURCES = ectest.c ectest_LDADD = $(top_builddir)/libentcode/libentcode.la diff --git a/libentcode/rangedec.c b/libentcode/rangedec.c new file mode 100644 index 00000000..be93fcfd --- /dev/null +++ b/libentcode/rangedec.c @@ -0,0 +1,243 @@ +#include +#include "entdec.h" +#include "mfrngcod.h" + + + +/*A multiply-free range decoder. + This is an entropy decoder based upon \cite{Mar79}, which is itself a + rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}. + It is very similar to arithmetic encoding, except that encoding is done with + digits in any base, instead of with bits, and so it is faster when using + larger bases (i.e.: a byte). + The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$ + is the base, longer than the theoretical optimum, but to my knowledge there + is no published justification for this claim. + This only seems true when using near-infinite precision arithmetic so that + the process is carried out with no rounding errors. 
+ + IBM (the author's employer) never sought to patent the idea, and to my + knowledge the algorithm is unencumbered by any patents, though its + performance is very competitive with proprietary arithmetic coding. + The two are based on very similar ideas, however. + An excellent description of implementation details is available at + http://www.arturocampos.com/ac_range.html + A recent work \cite{MNW98} which proposes several changes to arithmetic + encoding for efficiency actually re-discovers many of the principles + behind range encoding, and presents a good theoretical analysis of them. + + The coder is made multiply-free by replacing the standard multiply/divide + used to partition the current interval according to the total frequency + count. + The new partition function scales the count so that it differs from the size + of the interval by no more than a factor of two and then assigns each symbol + one or two code words in the interval. + For details see \cite{SM98}. + + This coder also handles the end of the stream in a slightly more graceful + fashion than most arithmetic or range coders. + Once the final symbol has been encoded, the coder selects the code word with + the shortest number of bits that still falls within the final interval. + This method is not novel. + Here, by the length of the code word, we refer to the number of bits until + its final 1. + Any trailing zeros may be discarded, since the encoder, once it runs out of + input, will pad its buffer with zeros. + + But this means that no encoded stream would ever have any zero bytes at the + end. + Since there are some coded representations we cannot produce, it implies that + there is still some redundancy in the stream. + In this case, we can pick a special byte value, RSV1, and should the stream + end in a sequence of zeros, followed by the RSV1 byte, we can code the + zeros, and discard the RSV1 byte. 
+ The decoder, knowing that the encoder would never produce a sequence of zeros + at the end, would then know to add in the RSV1 byte if it observed it. + + Now, the encoder would never produce a stream that ended in a sequence of + zeros followed by a RSV1 byte. + So, if the stream ends in a non-empty sequence of zeros, followed by any + positive number of RSV1 bytes, the last RSV1 byte is discarded. + The decoder, if it encounters a stream that ends in non-empty sequence of + zeros followed by any non-negative number of RSV1 bytes, adds an additional + RSV1 byte to the stream. + With this strategy, every possible sequence of input bytes is transformed to + one that could actually be produced by the encoder. + + The only question is what non-zero value to use for RSV1. + We select 0x80, since it has the nice property of producing the shortest + possible byte streams when using our strategy for selecting a number within + the final interval to encode. + Clearly if the shortest possible code word that falls within the interval has + its last one bit as the most significant bit of the final byte, and the + previous bytes were a non-empty sequence of zeros followed by a non-negative + number of 0x80 bytes, then the last byte would be discarded. + If the shortest code word is not so formed, then no other code word in the + interval would result in any more bytes being discarded. + Any longer code word would have an additional one bit somewhere, and so would + require at a minimum that that byte would be coded. + If the shortest code word has a 1 before the final one that is preventing the + stream from ending in a non-empty sequence of zeros followed by a + non-negative number of 0x80's, then there is no code word of the same length + which contains that bit as a zero. + If there were, then we could simply leave that bit a 1, and drop all the bits + after it without leaving the interval, thus producing a shorter code word. 
+ + In this case, RSV1 can only drop 1 bit off the final stream. + Other choices could lead to savings of up to 8 bits for particular streams, + but this would produce the odd situation that a stream with more non-zero + bits is actually encoded in fewer bytes. + + @PHDTHESIS{Pas76, + author="Richard Clark Pasco", + title="Source coding algorithms for fast data compression", + school="Dept. of Electrical Engineering, Stanford University", + address="Stanford, CA", + month=May, + year=1976 + } + @INPROCEEDINGS{Mar79, + author="Martin, G.N.N.", + title="Range encoding: an algorithm for removing redundancy from a digitised + message", + booktitle="Video \& Data Recording Conference", + year=1979, + address="Southampton", + month=Jul + } + @ARTICLE{MNW98, + author="Alistair Moffat and Radford Neal and Ian H. Witten", + title="Arithmetic Coding Revisited", + journal="{ACM} Transactions on Information Systems", + year=1998, + volume=16, + number=3, + pages="256--294", + month=Jul, + URL="http://dev.acm.org/pubs/citations/journals/tois/1998-16-3/p256-moffat/" + } + @INPROCEEDINGS{SM98, + author="Lang Stuiver and Alistair Moffat", + title="Piecewise Integer Mapping for Arithmetic Coding", + booktitle="Proceedings of the {IEEE} Data Compression Conference", + pages="1--10", + address="Snowbird, UT", + month="Mar./Apr.", + year=1998 + }*/ + + + +/*Gets the next byte of input. + After all the bytes in the current packet have been consumed, and the extra + end code returned if needed, this function will continue to return zero each + time it is called. 
+ Return: The next byte of input.*/ +static int ec_dec_in(ec_dec *_this){ + int ret; + ret=ec_byte_read1(_this->buf); + if(ret<0){ + unsigned char *buf; + long bytes; + bytes=ec_byte_bytes(_this->buf); + buf=ec_byte_get_buffer(_this->buf); + /*Breaking abstraction: don't do this at home, kids. + The end-of-stream check below runs only while the buffer's storage field + equals the byte count reported by ec_byte_bytes().*/ + if(_this->buf->storage==bytes){ + ec_byte_adv1(_this->buf); + if(bytes>0){ + unsigned char *p; + p=buf+bytes; + /*If we end in a string of 0 or more EC_FOF_RSV1 bytes preceded by a + zero, return an extra EC_FOF_RSV1 byte.*/ + do p--; + while(p>buf&&p[0]==EC_FOF_RSV1); + if(!p[0])return EC_FOF_RSV1; + } + } + return 0; + } + else return ret; +} + +/*Renormalizes the decoder state. + While rng<=EC_CODE_BOT, rng is scaled up by EC_SYM_BITS bits and another + input byte is pulled in through ec_dec_in() to refresh rem and dif. + Only rng, rem and dif are touched here; the decoder keeps no low field in + this function.*/ +static void ec_dec_normalize(ec_dec *_this){ + /*If the range is too small, rescale it and input some bits.*/ + while(_this->rng<=EC_CODE_BOT){ + int sym; + _this->rng<<=EC_SYM_BITS; + /*Use up the remaining bits from our last symbol.*/ + sym=_this->rem<rem=ec_dec_in(_this); /*NOTE(review): this statement looks truncated by text extraction (characters between a "<" and a later ">" appear to be missing); check it against the upstream commit before applying the patch.*/ + /*Take the rest of the bits we need from this new symbol.*/ + sym|=_this->rem>>EC_SYM_BITS-EC_CODE_EXTRA; + _this->dif=(_this->dif<dif>EC_CODE_TOP)_this->dif-=EC_CODE_TOP;*/ /*NOTE(review): the line above is damaged in the same way; the stray comment terminator suggests the start of a comment was lost as well.*/ + _this->dif^=(_this->dif&_this->dif-1)&EC_CODE_TOP; + } +} +/*Initializes the decoder to read from _buf. + Stores the buffer, reads the first input byte into rem, sets up rng and dif, + and then normalizes the interval.*/ +void ec_dec_init(ec_dec *_this,ec_byte_buffer *_buf){ + _this->buf=_buf; + _this->rem=ec_dec_in(_this); + _this->rng=1U<dif=_this->rng-(_this->rem>>EC_SYM_BITS-EC_CODE_EXTRA); /*NOTE(review): this statement also looks truncated by text extraction (the shift amount for rng and the start of the dif assignment appear to be missing); confirm against the upstream commit.*/ + /*Normalize the interval.*/ + ec_dec_normalize(_this); +} + +/*Computes the value used to identify the next symbol. + _ft: The divisor applied to rng (presumably the total symbol frequency -- + confirm against entdec.h). + Return: _ft-EC_MINI(s+1,_ft), where s=(dif-1)/nrm and nrm=rng/_ft. + The quotient nrm is stored in _this->nrm, which ec_dec_update() reads. + NOTE(review): this divides by _ft and by nrm, so it presumably requires + 0<_ft<=rng -- confirm against callers.*/ +unsigned ec_decode(ec_dec *_this,unsigned _ft){ + unsigned s; + _this->nrm=_this->rng/_ft; + s=(unsigned)((_this->dif-1)/_this->nrm); + return _ft-EC_MINI(s+1,_ft); +} +/*Advances the decoder state using _fl, _fh and _ft. + Relies on the _this->nrm value stored by ec_decode(): nrm*(_ft-_fh) is + subtracted from dif, rng becomes nrm*(_fh-_fl) when _fl>0 and otherwise + shrinks by that same amount, and the state is then renormalized.*/ +void ec_dec_update(ec_dec *_this,unsigned _fl,unsigned _fh,unsigned _ft){ + ec_uint32 s; + s=_this->nrm*(_ft-_fh); + _this->dif-=s; + _this->rng=_fl>0?_this->nrm*(_fh-_fl):_this->rng-s; + ec_dec_normalize(_this); +} + +#if 0 +int ec_dec_done(ec_dec *_this){ + unsigned low; + int ret; + /*Check to make sure we've used 
all the input bytes. + This ensures that no more ones would ever be inserted into the decoder.*/ + if(_this->buf->ptr-ec_byte_get_buffer(_this->buf)<= + ec_byte_bytes(_this->buf)){ + return 0; + } + /*We compute the smallest finitely odd fraction that fits inside the current + range, and write that to the stream. + This is guaranteed to yield the smallest possible encoding.*/ + /*TODO: Fix this line, as it is wrong. + It doesn't seem worth being able to make this check to do an extra + subtraction for every symbol decoded.*/ + low=/*What we want: _this->top-_this->rng; What we have:*/_this->dif /*NOTE(review): there is no terminating semicolon here and ret is never used; this function is compiled out by the surrounding #if 0 and would not build if it were enabled.*/ + if(low){ + unsigned end; + end=EC_CODE_TOP; + /*Ensure that the next free end is in the range.*/ + if(end-low>=_this->rng){ + unsigned msk; + msk=EC_CODE_TOP-1; + do{ + msk>>=1; + end=(low+msk)&~msk|msk+1; + } + while(end-low>=_this->rng); + } + /*The remaining input should have been the next free end.*/ + return end-low!=_this->dif; + } + return 1; +} +#endif diff --git a/libentcode/rangeenc.c b/libentcode/rangeenc.c new file mode 100644 index 00000000..29ea2919 --- /dev/null +++ b/libentcode/rangeenc.c @@ -0,0 +1,145 @@ +#include +#include "entenc.h" +#include "mfrngcod.h" + + + +/*A multiply-free range encoder. + See mfrngdec.c and the references for implementation details + \cite{Mar79,MNW98,SM98}. + + @INPROCEEDINGS{Mar79, + author="Martin, G.N.N.", + title="Range encoding: an algorithm for removing redundancy from a digitised + message", + booktitle="Video \& Data Recording Conference", + year=1979, + address="Southampton", + month=Jul + } + @ARTICLE{MNW98, + author="Alistair Moffat and Radford Neal and Ian H. 
Witten", + title="Arithmetic Coding Revisited", + journal="{ACM} Transactions on Information Systems", + year=1998, + volume=16, + number=3, + pages="256--294", + month=Jul, + URL="http://dev.acm.org/pubs/citations/journals/tois/1998-16-3/p256-moffat/" + } + @INPROCEEDINGS{SM98, + author="Lang Stuiver and Alistair Moffat", + title="Piecewise Integer Mapping for Arithmetic Coding", + booktitle="Proceedings of the {IEEE} Data Compression Conference", + pages="1--10", + address="Snowbird, UT", + month="Mar./Apr.", + year=1998 + }*/ + + + +/*Outputs a symbol, with a carry bit. + If there is a potential to propagate a carry over several symbols, they are + buffered until it can be determined whether or not an actual carry will + occur. + If the counter for the buffered symbols overflows, then the range is + truncated to force a carry to occur, towards whichever side maximizes the + remaining range. + NOTE(review): no overflow check on the ext counter is visible in this + function; it is only incremented -- confirm whether the paragraph above + still applies. + _c: The symbol to output; a value equal to EC_SYM_MAX is counted in ext + instead of being written, and bits above EC_SYM_BITS are taken as the + carry.*/ +static void ec_enc_carry_out(ec_enc *_this,int _c){ + if(_c!=EC_SYM_MAX){ + /*No further carry propagation possible, flush buffer.*/ + int carry; + carry=_c>>EC_SYM_BITS; + /*Don't output a byte on the first write. 
+ This compare should be taken care of by branch-prediction thereafter.*/ + if(_this->rem>=0)ec_byte_write1(_this->buf,_this->rem+carry); + if(_this->ext>0){ + unsigned sym; + sym=EC_SYM_MAX+carry&EC_SYM_MAX; + do ec_byte_write1(_this->buf,sym); + while(--(_this->ext)>0); + } + _this->rem=_c&EC_SYM_MAX; + } + else _this->ext++; +} +/*Renormalizes the encoder state. + While rng<=EC_CODE_BOT, the top symbol of low is passed to + ec_enc_carry_out() and rng is scaled up by EC_SYM_BITS bits.*/ +static void ec_enc_normalize(ec_enc *_this){ + /*If the range is too small, output some bits and rescale it.*/ + while(_this->rng<=EC_CODE_BOT){ + ec_enc_carry_out(_this,(int)(_this->low>>EC_CODE_SHIFT)); + /*Move the next-to-high-order symbol into the high-order position.*/ + _this->low=_this->low<rng<<=EC_SYM_BITS; /*NOTE(review): this statement looks truncated by text extraction (characters between a "<" and a later ">" appear to be missing, including the rest of the low update); check it against the upstream commit.*/ + } +} +/*Initializes the encoder to write to _buf. + rem=-1 marks that no byte has been buffered yet (ec_enc_carry_out() only + writes rem when it is >=0); ext and low start at 0 and rng starts at + EC_CODE_TOP.*/ +void ec_enc_init(ec_enc *_this,ec_byte_buffer *_buf){ + _this->buf=_buf; + _this->rem=-1; + _this->ext=0; + _this->low=0; + _this->rng=EC_CODE_TOP; +} +/*Encodes one symbol described by _fl, _fh and _ft. + r=rng/_ft is computed first. + When _fl>0, low advances by rng-r*(_ft-_fl) and rng becomes r*(_fh-_fl); + otherwise low is unchanged and rng shrinks by r*(_ft-_fh). + The state is then renormalized. + NOTE(review): this divides by _ft, so _ft must be non-zero (and presumably + no larger than rng) -- confirm against callers.*/ +void ec_encode(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _ft){ + unsigned r; + unsigned s; + r=_this->rng/_ft; + if(_fl>0){ + s=r*(_ft-_fl); + _this->low+=_this->rng-s; + _this->rng=r*(_fh-_fl); + } + else _this->rng-=r*(_ft-_fh); + ec_enc_normalize(_this); +} +/*Terminates the encoded stream. + Emits the bytes of a value chosen inside the final interval, flushes the + buffered byte, and then truncates redundant trailing bytes from the output + buffer.*/ +void ec_enc_done(ec_enc *_this){ + /*We compute the integer in the current interval that has the largest number + of trailing zeros, and write that to the stream. 
+ This is guaranteed to yield the smallest possible encoding.*/ + if(_this->low){ + unsigned end; + end=EC_CODE_TOP; + /*Ensure that the end value is in the range.*/ + if(end-_this->low>=_this->rng){ + unsigned msk; + msk=EC_CODE_TOP-1; + do{ + msk>>=1; + end=(_this->low+msk)&~msk|msk+1; + } + while(end-_this->low>=_this->rng); + } + /*The remaining output is the next free end.*/ + while(end){ + ec_enc_carry_out(_this,end>>EC_CODE_SHIFT); + end=end<rem>=0){ /*NOTE(review): this line looks truncated by text extraction (the rest of the end update, the closing braces of the loop and of the if(_this->low) block, and the start of the rem test appear to be missing); check it against the upstream commit.*/ + unsigned char *p; + unsigned char *buf; + /*Flush it into the output buffer.*/ + ec_enc_carry_out(_this,0); + /*We may be able to drop some redundant bytes from the end.*/ + buf=ec_byte_get_buffer(_this->buf); + p=buf+ec_byte_bytes(_this->buf)-1; + /*Strip trailing zeros.*/ + while(p>=buf&&!p[0])p--; /*NOTE(review): if every byte is zero this loop stops with p at buf-1; the later p>buf test and p+1-buf then give a length of 0, but forming a pointer one element before the buffer is formally undefined in C.*/ + /*Strip one trailing EC_FOF_RSV1 byte if the buffer ends in a string of + consecutive EC_FOF_RSV1 bytes preceded by one (or more) zeros.*/ + if(p>buf&&p[0]==EC_FOF_RSV1){ + unsigned char *q; + q=p; + do q--; + while(q>buf&&q[0]==EC_FOF_RSV1); + if(!q[0])p--; + } + ec_byte_writetrunc(_this->buf,p+1-buf); + } +}