summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJulian Seward <jseward@acm.org>1998-08-23 22:13:13 +0200
committerJulian Seward <jseward@acm.org>1998-08-23 22:13:13 +0200
commit977101ad5f833f5c0a574bfeea408e5301a6b052 (patch)
treefc1e8fed202869c116cbf6b8c362456042494a0a
parentbzip2-0.1pl2 (diff)
downloadbzip2-977101ad5f833f5c0a574bfeea408e5301a6b052.tar.gz
bzip2-977101ad5f833f5c0a574bfeea408e5301a6b052.tar.bz2
bzip2-977101ad5f833f5c0a574bfeea408e5301a6b052.tar.xz
bzip2-0.9.0cbzip2-0.9.0c
-rw-r--r--ALGORITHMS47
-rw-r--r--CHANGES45
-rw-r--r--LICENSE360
-rw-r--r--Makefile52
-rw-r--r--README230
-rw-r--r--README.DOS16
-rw-r--r--blocksort.c709
-rw-r--r--bzip2.1191
-rw-r--r--bzip2.1.preformatted318
-rw-r--r--bzip2.c3389
-rw-r--r--bzip2.txt292
-rw-r--r--bzip2recover.c125
-rw-r--r--bzlib.c1512
-rw-r--r--bzlib.h299
-rw-r--r--bzlib_private.h523
-rw-r--r--compress.c588
-rw-r--r--crctable.c144
-rw-r--r--decompress.c636
-rw-r--r--dlltest.c163
-rw-r--r--dlltest.dsp93
-rw-r--r--howbig.c37
-rw-r--r--huffman.c228
-rw-r--r--libbz2.def25
-rw-r--r--libbz2.dsp130
-rw-r--r--manual.texi2100
-rw-r--r--randtable.c124
-rw-r--r--test.bat9
-rw-r--r--test.cmd9
-rw-r--r--words07
-rw-r--r--words11
-rw-r--r--words21
-rw-r--r--words321
-rw-r--r--words3sh12
33 files changed, 8332 insertions, 4104 deletions
diff --git a/ALGORITHMS b/ALGORITHMS
deleted file mode 100644
index 7c7d2ca..0000000
--- a/ALGORITHMS
+++ /dev/null
@@ -1,47 +0,0 @@
1
2Bzip2 is not research work, in the sense that it doesn't present any
3new ideas. Rather, it's an engineering exercise based on existing
4ideas.
5
6Four documents describe essentially all the ideas behind bzip2:
7
8 Michael Burrows and D. J. Wheeler:
9 "A block-sorting lossless data compression algorithm"
10 10th May 1994.
11 Digital SRC Research Report 124.
12 ftp://ftp.digital.com/pub/DEC/SRC/research-reports/SRC-124.ps.gz
13
14 Daniel S. Hirschberg and Debra A. LeLewer
15 "Efficient Decoding of Prefix Codes"
16 Communications of the ACM, April 1990, Vol 33, Number 4.
17 You might be able to get an electronic copy of this
18 from the ACM Digital Library.
19
20 David J. Wheeler
21 Program bred3.c and accompanying document bred3.ps.
22 This contains the idea behind the multi-table Huffman
23 coding scheme.
24 ftp://ftp.cl.cam.ac.uk/pub/user/djw3/
25
26 Jon L. Bentley and Robert Sedgewick
27 "Fast Algorithms for Sorting and Searching Strings"
28 Available from Sedgewick's web page,
29 www.cs.princeton.edu/~rs
30
31The following paper gives valuable additional insights into the
32algorithm, but is not immediately the basis of any code
33used in bzip2.
34
35 Peter Fenwick:
36 Block Sorting Text Compression
37 Proceedings of the 19th Australasian Computer Science Conference,
38 Melbourne, Australia. Jan 31 - Feb 2, 1996.
39 ftp://ftp.cs.auckland.ac.nz/pub/peter-f/ACSC96paper.ps
40
41All three are well written, and make fascinating reading. If you want
42to modify bzip2 in any non-trivial way, I strongly suggest you obtain,
43read and understand these papers.
44
45I am much indebted to the various authors for their help, support and
46advice.
47
diff --git a/CHANGES b/CHANGES
new file mode 100644
index 0000000..ac00f3a
--- /dev/null
+++ b/CHANGES
@@ -0,0 +1,45 @@
1
2
30.9.0
4~~~~~
5First version.
6
7
80.9.0a
9~~~~~~
10Removed 'ranlib' from Makefile, since most modern Unix-es
11don't need it, or even know about it.
12
13
140.9.0b
15~~~~~~
16Fixed a problem with error reporting in bzip2.c. This does not effect
17the library in any way. Problem is: versions 0.9.0 and 0.9.0a (of the
18program proper) compress and decompress correctly, but give misleading
19error messages (internal panics) when an I/O error occurs, instead of
20reporting the problem correctly. This shouldn't give any data loss
21(as far as I can see), but is confusing.
22
23Made the inline declarations disappear for non-GCC compilers.
24
25
260.9.0c
27~~~~~~
28Fixed some problems in the library pertaining to some boundary cases.
29This makes the library behave more correctly in those situations. The
30fixes apply only to features (calls and parameters) not used by
31bzip2.c, so the non-fixedness of them in previous versions has no
32effect on reliability of bzip2.c.
33
34In bzlib.c:
35 * made zero-length BZ_FLUSH work correctly in bzCompress().
36 * fixed bzWrite/bzRead to ignore zero-length requests.
37 * fixed bzread to correctly handle read requests after EOF.
38 * wrong parameter order in call to bzDecompressInit in
39 bzBuffToBuffDecompress. Fixed.
40
41In compress.c:
42 * changed setting of nGroups in sendMTFValues() so as to
43 do a bit better on small files. This _does_ effect
44 bzip2.c.
45
diff --git a/LICENSE b/LICENSE
index a43ea21..3de0301 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,339 +1,39 @@
1 GNU GENERAL PUBLIC LICENSE
2 Version 2, June 1991
3 1
4 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 2This program, "bzip2" and associated library "libbzip2", are
5 675 Mass Ave, Cambridge, MA 02139, USA 3copyright (C) 1996-1998 Julian R Seward. All rights reserved.
6 Everyone is permitted to copy and distribute verbatim copies
7 of this license document, but changing it is not allowed.
8 4
9 Preamble 5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions
7are met:
10 8
11 The licenses for most software are designed to take away your 91. Redistributions of source code must retain the above copyright
12freedom to share and change it. By contrast, the GNU General Public 10 notice, this list of conditions and the following disclaimer.
13License is intended to guarantee your freedom to share and change free
14software--to make sure the software is free for all its users. This
15General Public License applies to most of the Free Software
16Foundation's software and to any other program whose authors commit to
17using it. (Some other Free Software Foundation software is covered by
18the GNU Library General Public License instead.) You can apply it to
19your programs, too.
20 11
21 When we speak of free software, we are referring to freedom, not 122. The origin of this software must not be misrepresented; you must
22price. Our General Public Licenses are designed to make sure that you 13 not claim that you wrote the original software. If you use this
23have the freedom to distribute copies of free software (and charge for 14 software in a product, an acknowledgment in the product
24this service if you wish), that you receive source code or can get it 15 documentation would be appreciated but is not required.
25if you want it, that you can change the software or use pieces of it
26in new free programs; and that you know you can do these things.
27 16
28 To protect your rights, we need to make restrictions that forbid 173. Altered source versions must be plainly marked as such, and must
29anyone to deny you these rights or to ask you to surrender the rights. 18 not be misrepresented as being the original software.
30These restrictions translate to certain responsibilities for you if you
31distribute copies of the software, or if you modify it.
32 19
33 For example, if you distribute copies of such a program, whether 204. The name of the author may not be used to endorse or promote
34gratis or for a fee, you must give the recipients all the rights that 21 products derived from this software without specific prior written
35you have. You must make sure that they, too, receive or can get the 22 permission.
36source code. And you must show them these terms so they know their
37rights.
38 23
39 We protect your rights with two steps: (1) copyright the software, and 24THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
40(2) offer you this license which gives you legal permission to copy, 25OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
41distribute and/or modify the software. 26WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
28DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
30GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
32WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
33NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42 35
43 Also, for each author's protection and ours, we want to make certain 36Julian Seward, Guildford, Surrey, UK.
44that everyone understands that there is no warranty for this free 37jseward@acm.org
45software. If the software is modified by someone else and passed on, we 38bzip2/libbzip2 version 0.9.0 of 28 June 1998
46want its recipients to know that what they have is not the original, so
47that any problems introduced by others will not reflect on the original
48authors' reputations.
49 39
50 Finally, any free program is threatened constantly by software
51patents. We wish to avoid the danger that redistributors of a free
52program will individually obtain patent licenses, in effect making the
53program proprietary. To prevent this, we have made it clear that any
54patent must be licensed for everyone's free use or not licensed at all.
55
56 The precise terms and conditions for copying, distribution and
57modification follow.
58
59 GNU GENERAL PUBLIC LICENSE
60 TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61
62 0. This License applies to any program or other work which contains
63a notice placed by the copyright holder saying it may be distributed
64under the terms of this General Public License. The "Program", below,
65refers to any such program or work, and a "work based on the Program"
66means either the Program or any derivative work under copyright law:
67that is to say, a work containing the Program or a portion of it,
68either verbatim or with modifications and/or translated into another
69language. (Hereinafter, translation is included without limitation in
70the term "modification".) Each licensee is addressed as "you".
71
72Activities other than copying, distribution and modification are not
73covered by this License; they are outside its scope. The act of
74running the Program is not restricted, and the output from the Program
75is covered only if its contents constitute a work based on the
76Program (independent of having been made by running the Program).
77Whether that is true depends on what the Program does.
78
79 1. You may copy and distribute verbatim copies of the Program's
80source code as you receive it, in any medium, provided that you
81conspicuously and appropriately publish on each copy an appropriate
82copyright notice and disclaimer of warranty; keep intact all the
83notices that refer to this License and to the absence of any warranty;
84and give any other recipients of the Program a copy of this License
85along with the Program.
86
87You may charge a fee for the physical act of transferring a copy, and
88you may at your option offer warranty protection in exchange for a fee.
89
90 2. You may modify your copy or copies of the Program or any portion
91of it, thus forming a work based on the Program, and copy and
92distribute such modifications or work under the terms of Section 1
93above, provided that you also meet all of these conditions:
94
95 a) You must cause the modified files to carry prominent notices
96 stating that you changed the files and the date of any change.
97
98 b) You must cause any work that you distribute or publish, that in
99 whole or in part contains or is derived from the Program or any
100 part thereof, to be licensed as a whole at no charge to all third
101 parties under the terms of this License.
102
103 c) If the modified program normally reads commands interactively
104 when run, you must cause it, when started running for such
105 interactive use in the most ordinary way, to print or display an
106 announcement including an appropriate copyright notice and a
107 notice that there is no warranty (or else, saying that you provide
108 a warranty) and that users may redistribute the program under
109 these conditions, and telling the user how to view a copy of this
110 License. (Exception: if the Program itself is interactive but
111 does not normally print such an announcement, your work based on
112 the Program is not required to print an announcement.)
113
114These requirements apply to the modified work as a whole. If
115identifiable sections of that work are not derived from the Program,
116and can be reasonably considered independent and separate works in
117themselves, then this License, and its terms, do not apply to those
118sections when you distribute them as separate works. But when you
119distribute the same sections as part of a whole which is a work based
120on the Program, the distribution of the whole must be on the terms of
121this License, whose permissions for other licensees extend to the
122entire whole, and thus to each and every part regardless of who wrote it.
123
124Thus, it is not the intent of this section to claim rights or contest
125your rights to work written entirely by you; rather, the intent is to
126exercise the right to control the distribution of derivative or
127collective works based on the Program.
128
129In addition, mere aggregation of another work not based on the Program
130with the Program (or with a work based on the Program) on a volume of
131a storage or distribution medium does not bring the other work under
132the scope of this License.
133
134 3. You may copy and distribute the Program (or a work based on it,
135under Section 2) in object code or executable form under the terms of
136Sections 1 and 2 above provided that you also do one of the following:
137
138 a) Accompany it with the complete corresponding machine-readable
139 source code, which must be distributed under the terms of Sections
140 1 and 2 above on a medium customarily used for software interchange; or,
141
142 b) Accompany it with a written offer, valid for at least three
143 years, to give any third party, for a charge no more than your
144 cost of physically performing source distribution, a complete
145 machine-readable copy of the corresponding source code, to be
146 distributed under the terms of Sections 1 and 2 above on a medium
147 customarily used for software interchange; or,
148
149 c) Accompany it with the information you received as to the offer
150 to distribute corresponding source code. (This alternative is
151 allowed only for noncommercial distribution and only if you
152 received the program in object code or executable form with such
153 an offer, in accord with Subsection b above.)
154
155The source code for a work means the preferred form of the work for
156making modifications to it. For an executable work, complete source
157code means all the source code for all modules it contains, plus any
158associated interface definition files, plus the scripts used to
159control compilation and installation of the executable. However, as a
160special exception, the source code distributed need not include
161anything that is normally distributed (in either source or binary
162form) with the major components (compiler, kernel, and so on) of the
163operating system on which the executable runs, unless that component
164itself accompanies the executable.
165
166If distribution of executable or object code is made by offering
167access to copy from a designated place, then offering equivalent
168access to copy the source code from the same place counts as
169distribution of the source code, even though third parties are not
170compelled to copy the source along with the object code.
171
172 4. You may not copy, modify, sublicense, or distribute the Program
173except as expressly provided under this License. Any attempt
174otherwise to copy, modify, sublicense or distribute the Program is
175void, and will automatically terminate your rights under this License.
176However, parties who have received copies, or rights, from you under
177this License will not have their licenses terminated so long as such
178parties remain in full compliance.
179
180 5. You are not required to accept this License, since you have not
181signed it. However, nothing else grants you permission to modify or
182distribute the Program or its derivative works. These actions are
183prohibited by law if you do not accept this License. Therefore, by
184modifying or distributing the Program (or any work based on the
185Program), you indicate your acceptance of this License to do so, and
186all its terms and conditions for copying, distributing or modifying
187the Program or works based on it.
188
189 6. Each time you redistribute the Program (or any work based on the
190Program), the recipient automatically receives a license from the
191original licensor to copy, distribute or modify the Program subject to
192these terms and conditions. You may not impose any further
193restrictions on the recipients' exercise of the rights granted herein.
194You are not responsible for enforcing compliance by third parties to
195this License.
196
197 7. If, as a consequence of a court judgment or allegation of patent
198infringement or for any other reason (not limited to patent issues),
199conditions are imposed on you (whether by court order, agreement or
200otherwise) that contradict the conditions of this License, they do not
201excuse you from the conditions of this License. If you cannot
202distribute so as to satisfy simultaneously your obligations under this
203License and any other pertinent obligations, then as a consequence you
204may not distribute the Program at all. For example, if a patent
205license would not permit royalty-free redistribution of the Program by
206all those who receive copies directly or indirectly through you, then
207the only way you could satisfy both it and this License would be to
208refrain entirely from distribution of the Program.
209
210If any portion of this section is held invalid or unenforceable under
211any particular circumstance, the balance of the section is intended to
212apply and the section as a whole is intended to apply in other
213circumstances.
214
215It is not the purpose of this section to induce you to infringe any
216patents or other property right claims or to contest validity of any
217such claims; this section has the sole purpose of protecting the
218integrity of the free software distribution system, which is
219implemented by public license practices. Many people have made
220generous contributions to the wide range of software distributed
221through that system in reliance on consistent application of that
222system; it is up to the author/donor to decide if he or she is willing
223to distribute software through any other system and a licensee cannot
224impose that choice.
225
226This section is intended to make thoroughly clear what is believed to
227be a consequence of the rest of this License.
228
229 8. If the distribution and/or use of the Program is restricted in
230certain countries either by patents or by copyrighted interfaces, the
231original copyright holder who places the Program under this License
232may add an explicit geographical distribution limitation excluding
233those countries, so that distribution is permitted only in or among
234countries not thus excluded. In such case, this License incorporates
235the limitation as if written in the body of this License.
236
237 9. The Free Software Foundation may publish revised and/or new versions
238of the General Public License from time to time. Such new versions will
239be similar in spirit to the present version, but may differ in detail to
240address new problems or concerns.
241
242Each version is given a distinguishing version number. If the Program
243specifies a version number of this License which applies to it and "any
244later version", you have the option of following the terms and conditions
245either of that version or of any later version published by the Free
246Software Foundation. If the Program does not specify a version number of
247this License, you may choose any version ever published by the Free Software
248Foundation.
249
250 10. If you wish to incorporate parts of the Program into other free
251programs whose distribution conditions are different, write to the author
252to ask for permission. For software which is copyrighted by the Free
253Software Foundation, write to the Free Software Foundation; we sometimes
254make exceptions for this. Our decision will be guided by the two goals
255of preserving the free status of all derivatives of our free software and
256of promoting the sharing and reuse of software generally.
257
258 NO WARRANTY
259
260 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268REPAIR OR CORRECTION.
269
270 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278POSSIBILITY OF SUCH DAMAGES.
279
280 END OF TERMS AND CONDITIONS
281
282 Appendix: How to Apply These Terms to Your New Programs
283
284 If you develop a new program, and you want it to be of the greatest
285possible use to the public, the best way to achieve this is to make it
286free software which everyone can redistribute and change under these terms.
287
288 To do so, attach the following notices to the program. It is safest
289to attach them to the start of each source file to most effectively
290convey the exclusion of warranty; and each file should have at least
291the "copyright" line and a pointer to where the full notice is found.
292
293 <one line to give the program's name and a brief idea of what it does.>
294 Copyright (C) 19yy <name of author>
295
296 This program is free software; you can redistribute it and/or modify
297 it under the terms of the GNU General Public License as published by
298 the Free Software Foundation; either version 2 of the License, or
299 (at your option) any later version.
300
301 This program is distributed in the hope that it will be useful,
302 but WITHOUT ANY WARRANTY; without even the implied warranty of
303 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 GNU General Public License for more details.
305
306 You should have received a copy of the GNU General Public License
307 along with this program; if not, write to the Free Software
308 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
309
310Also add information on how to contact you by electronic and paper mail.
311
312If the program is interactive, make it output a short notice like this
313when it starts in an interactive mode:
314
315 Gnomovision version 69, Copyright (C) 19yy name of author
316 Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 This is free software, and you are welcome to redistribute it
318 under certain conditions; type `show c' for details.
319
320The hypothetical commands `show w' and `show c' should show the appropriate
321parts of the General Public License. Of course, the commands you use may
322be called something other than `show w' and `show c'; they could even be
323mouse-clicks or menu items--whatever suits your program.
324
325You should also get your employer (if you work as a programmer) or your
326school, if any, to sign a "copyright disclaimer" for the program, if
327necessary. Here is a sample; alter the names:
328
329 Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 `Gnomovision' (which makes passes at compilers) written by James Hacker.
331
332 <signature of Ty Coon>, 1 April 1989
333 Ty Coon, President of Vice
334
335This General Public License does not permit incorporating your program into
336proprietary programs. If your program is a subroutine library, you may
337consider it more useful to permit linking proprietary applications with the
338library. If this is what you want to do, use the GNU Library General
339Public License instead of this License.
diff --git a/Makefile b/Makefile
index 9d35b43..8ebea66 100644
--- a/Makefile
+++ b/Makefile
@@ -1,30 +1,46 @@
1 1
2CC = gcc 2CC=gcc
3SH = /bin/sh 3CFLAGS=-Wall -O2 -fomit-frame-pointer -fno-strength-reduce
4 4
5CFLAGS = -O3 -fomit-frame-pointer -funroll-loops 5OBJS= blocksort.o \
6 6 huffman.o \
7 crctable.o \
8 randtable.o \
9 compress.o \
10 decompress.o \
11 bzlib.o
12
13all: lib bzip2 test
14
15bzip2: lib
16 $(CC) $(CFLAGS) -c bzip2.c
17 $(CC) $(CFLAGS) -o bzip2 bzip2.o -L. -lbz2
18 $(CC) $(CFLAGS) -o bzip2recover bzip2recover.c
7 19
20lib: $(OBJS)
21 rm -f libbz2.a
22 ar clq libbz2.a $(OBJS)
8 23
9all: 24test: bzip2
10 cat words0 25 @cat words1
11 $(CC) $(CFLAGS) -o bzip2 bzip2.c
12 $(CC) $(CFLAGS) -o bzip2recover bzip2recover.c
13 rm -f bunzip2
14 ln -s ./bzip2 ./bunzip2
15 cat words1
16 ./bzip2 -1 < sample1.ref > sample1.rb2 26 ./bzip2 -1 < sample1.ref > sample1.rb2
17 ./bzip2 -2 < sample2.ref > sample2.rb2 27 ./bzip2 -2 < sample2.ref > sample2.rb2
18 ./bunzip2 < sample1.bz2 > sample1.tst 28 ./bzip2 -d < sample1.bz2 > sample1.tst
19 ./bunzip2 < sample2.bz2 > sample2.tst 29 ./bzip2 -d < sample2.bz2 > sample2.tst
20 cat words2 30 @cat words2
21 cmp sample1.bz2 sample1.rb2 31 cmp sample1.bz2 sample1.rb2
22 cmp sample2.bz2 sample2.rb2 32 cmp sample2.bz2 sample2.rb2
23 cmp sample1.tst sample1.ref 33 cmp sample1.tst sample1.ref
24 cmp sample2.tst sample2.ref 34 cmp sample2.tst sample2.ref
25 cat words3 35 @cat words3
36
37
38clean:
39 rm -f *.o libbz2.a bzip2 bzip2recover sample1.rb2 sample2.rb2 sample1.tst sample2.tst
26 40
41.c.o: $*.o bzlib.h bzlib_private.h
42 $(CC) $(CFLAGS) -c $*.c -o $*.o
27 43
28clean: 44tarfile:
29 rm -f bzip2 bunzip2 bzip2recover sample*.tst sample*.rb2 45 tar cvf interim.tar *.c *.h Makefile manual.texi manual.ps LICENSE bzip2.1 bzip2.1.preformatted bzip2.txt words1 words2 words3 sample1.ref sample2.ref sample1.bz2 sample2.bz2 *.html README CHANGES libbz2.def libbz2.dsp dlltest.dsp
30 46
diff --git a/README b/README
index d58bb49..2f59ef7 100644
--- a/README
+++ b/README
@@ -1,194 +1,61 @@
1 1
2GREETINGS!
3 2
4 This is the README for bzip2, my block-sorting file compressor, 3This is the README for bzip2, a block-sorting file compressor, version
5 version 0.1. 40.9.0. This version is fully compatible with the previous public
5release, bzip2-0.1pl2.
6 6
7 bzip2 is distributed under the GNU General Public License version 2; 7bzip2-0.9.0 is distributed under a BSD-style license. For details,
8 for details, see the file LICENSE. Pointers to the algorithms used 8see the file LICENSE.
9 are in ALGORITHMS. Instructions for use are in bzip2.1.preformatted.
10 9
11 Please read all of this file carefully. 10Complete documentation is available in Postscript form (manual.ps)
11or html (manual_toc.html). A plain-text version of the manual page is
12available as bzip2.txt.
12 13
13 14
15HOW TO BUILD -- UNIX
14 16
15HOW TO BUILD 17Type `make'.
16 18
17 -- for UNIX: 19This creates binaries "bzip2" and "bzip2recover".
18 20
19 Type `make'. (tough, huh? :-) 21It also runs four compress-decompress tests to make sure things are
22working properly. If all goes well, you should be up & running.
23Please be sure to read the output from `make' just to be sure that the
24tests went ok.
20 25
21 This creates binaries "bzip2", and "bunzip2", 26To install bzip2 properly:
22 which is a symbolic link to "bzip2".
23 27
24 It also runs four compress-decompress tests to make sure 28* Copy the binaries "bzip2" and "bzip2recover" to a publically visible
25 things are working properly. If all goes well, you should be up & 29 place, possibly /usr/bin or /usr/local/bin.
26 running. Please be sure to read the output from `make'
27 just to be sure that the tests went ok.
28 30
29 To install bzip2 properly: 31* In that directory, make "bunzip2" and "bzcat" be symbolic links
32 to "bzip2".
30 33
31 -- Copy the binary "bzip2" to a publically visible place, 34* Copy the manual page, bzip2.1, to the relevant place.
32 possibly /usr/bin, /usr/common/bin or /usr/local/bin. 35 Probably the right place is /usr/man/man1/.
33
34 -- In that directory, make "bunzip2" be a symbolic link
35 to "bzip2".
36
37 -- Copy the manual page, bzip2.1, to the relevant place.
38 Probably the right place is /usr/man/man1/.
39
40 -- for Windows 95 and NT:
41 36
42 For a start, do you *really* want to recompile bzip2? 37If you want to program with the library, you'll need to copy libbz2.a
43 The standard distribution includes a pre-compiled version 38and bzlib.h to /usr/lib and /usr/include respectively.
44 for Windows 95 and NT, `bzip2.exe'. 39
45 40
46 This executable was created with Jacob Navia's excellent 41HOW TO BUILD -- Windows 95, NT, DOS, Mac, etc.
47 port to Win32 of Chris Fraser & David Hanson's excellent
48 ANSI C compiler, "lcc". You can get to it at the pages
49 of the CS department of Princeton University,
50 www.cs.princeton.edu.
51 I have not tried to compile this version of bzip2 with
52 a commercial C compiler such as MS Visual C, as I don't
53 have one available.
54
55 Note that lcc is designed primarily to be portable and
56 fast. Code quality is a secondary aim, so bzip2.exe
57 runs perhaps 40% slower than it could if compiled with
58 a good optimising compiler.
59
60 I compiled a previous version of bzip (0.21) with Borland
61 C 5.0, which worked fine, and with MS VC++ 2.0, which
62 didn't. Here is an comment from the README for bzip-0.21.
63
64 MS VC++ 2.0's optimising compiler has a bug which, at
65 maximum optimisation, gives an executable which produces
66 garbage compressed files. Proceed with caution.
67 I do not know whether or not this happens with later
68 versions of VC++.
69
70 Edit the defines starting at line 86 of bzip.c to
71 select your platform/compiler combination, and then compile.
72 Then check that the resulting executable (assumed to be
73 called bzip.exe) works correctly, using the SELFTEST.BAT file.
74 Bearing in mind the previous paragraph, the self-test is
75 important.
76
77 Note that the defines which bzip-0.21 had, to support
78 compilation with VC 2.0 and BC 5.0, are gone. Windows
79 is not my preferred operating system, and I am, for the
80 moment, content with the modestly fast executable created
81 by lcc-win32.
82
83 A manual page is supplied, unformatted (bzip2.1),
84 preformatted (bzip2.1.preformatted), and preformatted
85 and sanitised for MS-DOS (bzip2.txt).
86
87
88
89COMPILATION NOTES
90
91 bzip2 should work on any 32 or 64-bit machine. It is known to work
92 [meaning: it has compiled and passed self-tests] on the
93 following platform-os combinations:
94
95 Intel i386/i486 running Linux 2.0.21
96 Sun Sparcs (various) running SunOS 4.1.4 and Solaris 2.5
97 Intel i386/i486 running Windows 95 and NT
98 DEC Alpha running Digital Unix 4.0
99
100 Following the release of bzip-0.21, many people mailed me
101 from around the world to say they had made it work on all sorts
102 of weird and wonderful machines. Chances are, if you have
103 a reasonable ANSI C compiler and a 32-bit machine, you can
104 get it to work.
105
106 The #defines starting at around line 82 of bzip2.c supply some
107 degree of platform-independance. If you configure bzip2 for some
108 new far-out platform which is not covered by the existing definitions,
109 please send me the relevant definitions.
110
111 I recommend GNU C for compilation. The code is standard ANSI C,
112 except for the Unix-specific file handling, so any ANSI C compiler
113 should work. Note however that the many routines marked INLINE
114 should be inlined by your compiler, else performance will be very
115 poor. Asking your compiler to unroll loops gives some
116 small improvement too; for gcc, the relevant flag is
117 -funroll-loops.
118
119 On a 386/486 machines, I'd recommend giving gcc the
120 -fomit-frame-pointer flag; this liberates another register for
121 allocation, which measurably improves performance.
122
123 I used the abovementioned lcc compiler to develop bzip2.
124 I would highly recommend this compiler for day-to-day development;
125 it is fast, reliable, lightweight, has an excellent profiler,
126 and is generally excellent. And it's fun to retarget, if you're
127 into that kind of thing.
128
129 If you compile bzip2 on a new platform or with a new compiler,
130 please be sure to run the four compress-decompress tests, either
131 using the Makefile, or with the test.bat (MSDOS) or test.cmd (OS/2)
132 files. Some compilers have been seen to introduce subtle bugs
133 when optimising, so this check is important. Ideally you should
134 then go on to test bzip2 on a file several megabytes or even
135 tens of megabytes long, just to be 110% sure. ``Professional
136 programmers are paranoid programmers.'' (anon).
137 42
43It's difficult for me to support compilation on all these platforms.
44My approach is to collect binaries for these platforms, and put them
45on my web page (http://www.muraroa.demon.co.uk). Look there.
138 46
139 47
140VALIDATION 48VALIDATION
141 49
142 Correct operation, in the sense that a compressed file can always be 50Correct operation, in the sense that a compressed file can always be
143 decompressed to reproduce the original, is obviously of paramount 51decompressed to reproduce the original, is obviously of paramount
144 importance. To validate bzip2, I used a modified version of 52importance. To validate bzip2, I used a modified version of Mark
145 Mark Nelson's churn program. Churn is an automated test driver 53Nelson's churn program. Churn is an automated test driver which
146 which recursively traverses a directory structure, using bzip2 to 54recursively traverses a directory structure, using bzip2 to compress
147 compress and then decompress each file it encounters, and checking 55and then decompress each file it encounters, and checking that the
148 that the decompressed data is the same as the original. As test 56decompressed data is the same as the original. There are more details
149 material, I used several runs over several filesystems of differing 57in Section 4 of the user guide.
150 sizes.
151
152 One set of tests was done on my base Linux filesystem,
153 410 megabytes in 23,000 files. There were several runs over
154 this filesystem, in various configurations designed to break bzip2.
155 That filesystem also contained some specially constructed test
156 files designed to exercise boundary cases in the code.
157 This included files of zero length, various long, highly repetitive
158 files, and some files which generate blocks with all values the same.
159 58
160 The other set of tests was done just with the "normal" configuration,
161 but on a much larger quantity of data.
162
163 Tests are:
164
165 Linux FS, 410M, 23000 files
166
167 As above, with --repetitive-fast
168
169 As above, with -1
170
171 Low level disk image of a disk containing
172 Windows NT4.0; 420M in a single huge file
173
174 Linux distribution, incl Slackware,
175 all GNU sources. 1900M in 2300 files.
176
177 Approx ~100M compiler sources and related
178 programming tools, running under Purify.
179
180 About 500M of data in 120 files of around
181 4 M each. This is raw data from a
182 biomagnetometer (SQUID-based thing).
183
184 Overall, total volume of test data is about
185 3300 megabytes in 25000 files.
186
187 The distribution does four tests after building bzip. These tests
188 include test decompressions of pre-supplied compressed files, so
189 they not only test that bzip works correctly on the machine it was
190 built on, but can also decompress files compressed on a different
191 machine. This guards against unforseen interoperability problems.
192 59
193 60
194Please read and be aware of the following: 61Please read and be aware of the following:
@@ -234,14 +101,30 @@ PATENTS:
234End of legalities. 101End of legalities.
235 102
236 103
104WHAT'S NEW IN 0.9.0 (as compared to 0.1pl2) ?
105
106 * Approx 10% faster compression, 30% faster decompression
107 * -t (test mode) is a lot quicker
108 * Can decompress concatenated compressed files
109 * Programming interface, so programs can directly read/write .bz2 files
110 * Less restrictive (BSD-style) licensing
111 * Flag handling more compatible with GNU gzip
112 * Much more documentation, i.e., a proper user manual
113 * Hopefully, improved portability (at least of the library)
114
115
237I hope you find bzip2 useful. Feel free to contact me at 116I hope you find bzip2 useful. Feel free to contact me at
238 jseward@acm.org 117 jseward@acm.org
239if you have any suggestions or queries. Many people mailed me with 118if you have any suggestions or queries. Many people mailed me with
240comments, suggestions and patches after the releases of 0.15 and 0.21, 119comments, suggestions and patches after the releases of bzip-0.15,
241and the changes in bzip2 are largely a result of this feedback. 120bzip-0.21 and bzip2-0.1pl2, and the changes in bzip2 are largely a
242I thank you for your comments. 121result of this feedback. I thank you for your comments.
122
123At least for the time being, bzip2's "home" is
124http://www.muraroa.demon.co.uk.
243 125
244Julian Seward 126Julian Seward
127jseward@acm.org
245 128
246Manchester, UK 129Manchester, UK
24718 July 1996 (version 0.15) 13018 July 1996 (version 0.15)
@@ -250,4 +133,5 @@ Manchester, UK
250Guildford, Surrey, UK 133Guildford, Surrey, UK
2517 August 1997 (bzip2, version 0.1) 1347 August 1997 (bzip2, version 0.1)
25229 August 1997 (bzip2, version 0.1pl2) 13529 August 1997 (bzip2, version 0.1pl2)
13623 August 1998 (bzip2, version 0.9.0)
253 137
diff --git a/README.DOS b/README.DOS
deleted file mode 100644
index 048de8c..0000000
--- a/README.DOS
+++ /dev/null
@@ -1,16 +0,0 @@
1
2As of today (3 March 1998) I've removed the
3Win95/NT executables from this distribution, sorry.
4
5You can still get an executable from
6http://www.muraroa.demon.co.uk, or (as a last
7resort) by mailing me at jseward@acm.org.
8
9The reason for this change of packaging is that it
10makes it easier for me to fix problems with specific
11executables if they are not included in the main
12distribution.
13
14J
15
16
diff --git a/blocksort.c b/blocksort.c
new file mode 100644
index 0000000..d8bb26a
--- /dev/null
+++ b/blocksort.c
@@ -0,0 +1,709 @@
1
2/*-------------------------------------------------------------*/
3/*--- Block sorting machinery ---*/
4/*--- blocksort.c ---*/
5/*-------------------------------------------------------------*/
6
7/*--
8 This file is a part of bzip2 and/or libbzip2, a program and
9 library for lossless, block-sorting data compression.
10
11 Copyright (C) 1996-1998 Julian R Seward. All rights reserved.
12
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions
15 are met:
16
17 1. Redistributions of source code must retain the above copyright
18 notice, this list of conditions and the following disclaimer.
19
20 2. The origin of this software must not be misrepresented; you must
21 not claim that you wrote the original software. If you use this
22 software in a product, an acknowledgment in the product
23 documentation would be appreciated but is not required.
24
25 3. Altered source versions must be plainly marked as such, and must
26 not be misrepresented as being the original software.
27
28 4. The name of the author may not be used to endorse or promote
29 products derived from this software without specific prior written
30 permission.
31
32 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
33 OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
34 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35 ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
36 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
38 GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
39 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
40 WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43
44 Julian Seward, Guildford, Surrey, UK.
45 jseward@acm.org
46 bzip2/libbzip2 version 0.9.0c of 18 October 1998
47
48 This program is based on (at least) the work of:
49 Mike Burrows
50 David Wheeler
51 Peter Fenwick
52 Alistair Moffat
53 Radford Neal
54 Ian H. Witten
55 Robert Sedgewick
56 Jon L. Bentley
57
58 For more information on these sources, see the manual.
59--*/
60
61
62#include "bzlib_private.h"
63
64/*---------------------------------------------*/
65/*--
66 Compare two strings in block. We assume (see
67 discussion above) that i1 and i2 have a max
68 offset of 10 on entry, and that the first
69 bytes of both block and quadrant have been
70 copied into the "overshoot area", ie
71 into the subscript range
72 [nblock .. nblock+NUM_OVERSHOOT_BYTES-1].
73--*/
74static __inline__ Bool fullGtU ( UChar* block,
75 UInt16* quadrant,
76 UInt32 nblock,
77 Int32* workDone,
78 Int32 i1,
79 Int32 i2
80 )
81{
82 Int32 k;
83 UChar c1, c2;
84 UInt16 s1, s2;
85
86 AssertD ( i1 != i2, "fullGtU(1)" );
87
88 c1 = block[i1];
89 c2 = block[i2];
90 if (c1 != c2) return (c1 > c2);
91 i1++; i2++;
92
93 c1 = block[i1];
94 c2 = block[i2];
95 if (c1 != c2) return (c1 > c2);
96 i1++; i2++;
97
98 c1 = block[i1];
99 c2 = block[i2];
100 if (c1 != c2) return (c1 > c2);
101 i1++; i2++;
102
103 c1 = block[i1];
104 c2 = block[i2];
105 if (c1 != c2) return (c1 > c2);
106 i1++; i2++;
107
108 c1 = block[i1];
109 c2 = block[i2];
110 if (c1 != c2) return (c1 > c2);
111 i1++; i2++;
112
113 c1 = block[i1];
114 c2 = block[i2];
115 if (c1 != c2) return (c1 > c2);
116 i1++; i2++;
117
118 k = nblock;
119
120 do {
121
122 c1 = block[i1];
123 c2 = block[i2];
124 if (c1 != c2) return (c1 > c2);
125 s1 = quadrant[i1];
126 s2 = quadrant[i2];
127 if (s1 != s2) return (s1 > s2);
128 i1++; i2++;
129
130 c1 = block[i1];
131 c2 = block[i2];
132 if (c1 != c2) return (c1 > c2);
133 s1 = quadrant[i1];
134 s2 = quadrant[i2];
135 if (s1 != s2) return (s1 > s2);
136 i1++; i2++;
137
138 c1 = block[i1];
139 c2 = block[i2];
140 if (c1 != c2) return (c1 > c2);
141 s1 = quadrant[i1];
142 s2 = quadrant[i2];
143 if (s1 != s2) return (s1 > s2);
144 i1++; i2++;
145
146 c1 = block[i1];
147 c2 = block[i2];
148 if (c1 != c2) return (c1 > c2);
149 s1 = quadrant[i1];
150 s2 = quadrant[i2];
151 if (s1 != s2) return (s1 > s2);
152 i1++; i2++;
153
154 if (i1 >= nblock) i1 -= nblock;
155 if (i2 >= nblock) i2 -= nblock;
156
157 k -= 4;
158 (*workDone)++;
159 }
160 while (k >= 0);
161
162 return False;
163}
164
165/*---------------------------------------------*/
166/*--
167 Knuth's increments seem to work better
168 than Incerpi-Sedgewick here. Possibly
169 because the number of elems to sort is
170 usually small, typically <= 20.
171--*/
172static Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
173 9841, 29524, 88573, 265720,
174 797161, 2391484 };
175
176static void simpleSort ( EState* s, Int32 lo, Int32 hi, Int32 d )
177{
178 Int32 i, j, h, bigN, hp;
179 Int32 v;
180
181 UChar* block = s->block;
182 UInt32* zptr = s->zptr;
183 UInt16* quadrant = s->quadrant;
184 Int32* workDone = &(s->workDone);
185 Int32 nblock = s->nblock;
186 Int32 workLimit = s->workLimit;
187 Bool firstAttempt = s->firstAttempt;
188
189 bigN = hi - lo + 1;
190 if (bigN < 2) return;
191
192 hp = 0;
193 while (incs[hp] < bigN) hp++;
194 hp--;
195
196 for (; hp >= 0; hp--) {
197 h = incs[hp];
198 i = lo + h;
199 while (True) {
200
201 /*-- copy 1 --*/
202 if (i > hi) break;
203 v = zptr[i];
204 j = i;
205 while ( fullGtU ( block, quadrant, nblock, workDone,
206 zptr[j-h]+d, v+d ) ) {
207 zptr[j] = zptr[j-h];
208 j = j - h;
209 if (j <= (lo + h - 1)) break;
210 }
211 zptr[j] = v;
212 i++;
213
214 /*-- copy 2 --*/
215 if (i > hi) break;
216 v = zptr[i];
217 j = i;
218 while ( fullGtU ( block, quadrant, nblock, workDone,
219 zptr[j-h]+d, v+d ) ) {
220 zptr[j] = zptr[j-h];
221 j = j - h;
222 if (j <= (lo + h - 1)) break;
223 }
224 zptr[j] = v;
225 i++;
226
227 /*-- copy 3 --*/
228 if (i > hi) break;
229 v = zptr[i];
230 j = i;
231 while ( fullGtU ( block, quadrant, nblock, workDone,
232 zptr[j-h]+d, v+d ) ) {
233 zptr[j] = zptr[j-h];
234 j = j - h;
235 if (j <= (lo + h - 1)) break;
236 }
237 zptr[j] = v;
238 i++;
239
240 if (*workDone > workLimit && firstAttempt) return;
241 }
242 }
243}
244
245
246/*---------------------------------------------*/
247/*--
248 The following is an implementation of
249 an elegant 3-way quicksort for strings,
250 described in a paper "Fast Algorithms for
251 Sorting and Searching Strings", by Robert
252 Sedgewick and Jon L. Bentley.
253--*/
254
255#define swap(lv1, lv2) \
256 { Int32 tmp = lv1; lv1 = lv2; lv2 = tmp; }
257
258static void vswap ( UInt32* zptr, Int32 p1, Int32 p2, Int32 n )
259{
260 while (n > 0) {
261 swap(zptr[p1], zptr[p2]);
262 p1++; p2++; n--;
263 }
264}
265
266static UChar med3 ( UChar a, UChar b, UChar c )
267{
268 UChar t;
269 if (a > b) { t = a; a = b; b = t; };
270 if (b > c) { t = b; b = c; c = t; };
271 if (a > b) b = a;
272 return b;
273}
274
275
276#define min(a,b) ((a) < (b)) ? (a) : (b)
277
278typedef
279 struct { Int32 ll; Int32 hh; Int32 dd; }
280 StackElem;
281
282#define push(lz,hz,dz) { stack[sp].ll = lz; \
283 stack[sp].hh = hz; \
284 stack[sp].dd = dz; \
285 sp++; }
286
287#define pop(lz,hz,dz) { sp--; \
288 lz = stack[sp].ll; \
289 hz = stack[sp].hh; \
290 dz = stack[sp].dd; }
291
292#define SMALL_THRESH 20
293#define DEPTH_THRESH 10
294
295/*--
296 If you are ever unlucky/improbable enough
297 to get a stack overflow whilst sorting,
298 increase the following constant and try
299 again. In practice I have never seen the
300 stack go above 27 elems, so the following
301 limit seems very generous.
302--*/
303#define QSORT_STACK_SIZE 1000
304
305
306static void qSort3 ( EState* s, Int32 loSt, Int32 hiSt, Int32 dSt )
307{
308 Int32 unLo, unHi, ltLo, gtHi, med, n, m;
309 Int32 sp, lo, hi, d;
310 StackElem stack[QSORT_STACK_SIZE];
311
312 UChar* block = s->block;
313 UInt32* zptr = s->zptr;
314 Int32* workDone = &(s->workDone);
315 Int32 workLimit = s->workLimit;
316 Bool firstAttempt = s->firstAttempt;
317
318 sp = 0;
319 push ( loSt, hiSt, dSt );
320
321 while (sp > 0) {
322
323 AssertH ( sp < QSORT_STACK_SIZE, 1001 );
324
325 pop ( lo, hi, d );
326
327 if (hi - lo < SMALL_THRESH || d > DEPTH_THRESH) {
328 simpleSort ( s, lo, hi, d );
329 if (*workDone > workLimit && firstAttempt) return;
330 continue;
331 }
332
333 med = med3 ( block[zptr[ lo ]+d],
334 block[zptr[ hi ]+d],
335 block[zptr[ (lo+hi)>>1 ]+d] );
336
337 unLo = ltLo = lo;
338 unHi = gtHi = hi;
339
340 while (True) {
341 while (True) {
342 if (unLo > unHi) break;
343 n = ((Int32)block[zptr[unLo]+d]) - med;
344 if (n == 0) { swap(zptr[unLo], zptr[ltLo]); ltLo++; unLo++; continue; };
345 if (n > 0) break;
346 unLo++;
347 }
348 while (True) {
349 if (unLo > unHi) break;
350 n = ((Int32)block[zptr[unHi]+d]) - med;
351 if (n == 0) { swap(zptr[unHi], zptr[gtHi]); gtHi--; unHi--; continue; };
352 if (n < 0) break;
353 unHi--;
354 }
355 if (unLo > unHi) break;
356 swap(zptr[unLo], zptr[unHi]); unLo++; unHi--;
357 }
358
359 AssertD ( unHi == unLo-1, "bad termination in qSort3" );
360
361 if (gtHi < ltLo) {
362 push(lo, hi, d+1 );
363 continue;
364 }
365
366 n = min(ltLo-lo, unLo-ltLo); vswap(zptr, lo, unLo-n, n);
367 m = min(hi-gtHi, gtHi-unHi); vswap(zptr, unLo, hi-m+1, m);
368
369 n = lo + unLo - ltLo - 1;
370 m = hi - (gtHi - unHi) + 1;
371
372 push ( lo, n, d );
373 push ( n+1, m-1, d+1 );
374 push ( m, hi, d );
375 }
376}
377
378
379/*---------------------------------------------*/
380
381#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
382
383#define SETMASK (1 << 21)
384#define CLEARMASK (~(SETMASK))
385
386static void sortMain ( EState* s )
387{
388 Int32 i, j, k, ss, sb;
389 Int32 runningOrder[256];
390 Int32 copy[256];
391 Bool bigDone[256];
392 UChar c1, c2;
393 Int32 numQSorted;
394
395 UChar* block = s->block;
396 UInt32* zptr = s->zptr;
397 UInt16* quadrant = s->quadrant;
398 Int32* ftab = s->ftab;
399 Int32* workDone = &(s->workDone);
400 Int32 nblock = s->nblock;
401 Int32 workLimit = s->workLimit;
402 Bool firstAttempt = s->firstAttempt;
403
404 /*--
405 In the various block-sized structures, live data runs
406 from 0 to last+NUM_OVERSHOOT_BYTES inclusive. First,
407 set up the overshoot area for block.
408 --*/
409
410 if (s->verbosity >= 4)
411 VPrintf0( " sort initialise ...\n" );
412
413 for (i = 0; i < BZ_NUM_OVERSHOOT_BYTES; i++)
414 block[nblock+i] = block[i % nblock];
415 for (i = 0; i < nblock+BZ_NUM_OVERSHOOT_BYTES; i++)
416 quadrant[i] = 0;
417
418
419 if (nblock <= 4000) {
420
421 /*--
422 Use simpleSort(), since the full sorting mechanism
423 has quite a large constant overhead.
424 --*/
425 if (s->verbosity >= 4) VPrintf0( " simpleSort ...\n" );
426 for (i = 0; i < nblock; i++) zptr[i] = i;
427 firstAttempt = False;
428 *workDone = workLimit = 0;
429 simpleSort ( s, 0, nblock-1, 0 );
430 if (s->verbosity >= 4) VPrintf0( " simpleSort done.\n" );
431
432 } else {
433
434 numQSorted = 0;
435 for (i = 0; i <= 255; i++) bigDone[i] = False;
436
437 if (s->verbosity >= 4) VPrintf0( " bucket sorting ...\n" );
438
439 for (i = 0; i <= 65536; i++) ftab[i] = 0;
440
441 c1 = block[nblock-1];
442 for (i = 0; i < nblock; i++) {
443 c2 = block[i];
444 ftab[(c1 << 8) + c2]++;
445 c1 = c2;
446 }
447
448 for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
449
450 c1 = block[0];
451 for (i = 0; i < nblock-1; i++) {
452 c2 = block[i+1];
453 j = (c1 << 8) + c2;
454 c1 = c2;
455 ftab[j]--;
456 zptr[ftab[j]] = i;
457 }
458 j = (block[nblock-1] << 8) + block[0];
459 ftab[j]--;
460 zptr[ftab[j]] = nblock-1;
461
462 /*--
463 Now ftab contains the first loc of every small bucket.
464 Calculate the running order, from smallest to largest
465 big bucket.
466 --*/
467
468 for (i = 0; i <= 255; i++) runningOrder[i] = i;
469
470 {
471 Int32 vv;
472 Int32 h = 1;
473 do h = 3 * h + 1; while (h <= 256);
474 do {
475 h = h / 3;
476 for (i = h; i <= 255; i++) {
477 vv = runningOrder[i];
478 j = i;
479 while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
480 runningOrder[j] = runningOrder[j-h];
481 j = j - h;
482 if (j <= (h - 1)) goto zero;
483 }
484 zero:
485 runningOrder[j] = vv;
486 }
487 } while (h != 1);
488 }
489
490 /*--
491 The main sorting loop.
492 --*/
493
494 for (i = 0; i <= 255; i++) {
495
496 /*--
497 Process big buckets, starting with the least full.
498 Basically this is a 4-step process in which we call
499 qSort3 to sort the small buckets [ss, j], but
500 also make a big effort to avoid the calls if we can.
501 --*/
502 ss = runningOrder[i];
503
504 /*--
505 Step 1:
506 Complete the big bucket [ss] by quicksorting
507 any unsorted small buckets [ss, j], for j != ss.
508 Hopefully previous pointer-scanning phases have already
509 completed many of the small buckets [ss, j], so
510 we don't have to sort them at all.
511 --*/
512 for (j = 0; j <= 255; j++) {
513 if (j != ss) {
514 sb = (ss << 8) + j;
515 if ( ! (ftab[sb] & SETMASK) ) {
516 Int32 lo = ftab[sb] & CLEARMASK;
517 Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
518 if (hi > lo) {
519 if (s->verbosity >= 4)
520 VPrintf4( " qsort [0x%x, 0x%x] done %d this %d\n",
521 ss, j, numQSorted, hi - lo + 1 );
522 qSort3 ( s, lo, hi, 2 );
523 numQSorted += ( hi - lo + 1 );
524 if (*workDone > workLimit && firstAttempt) return;
525 }
526 }
527 ftab[sb] |= SETMASK;
528 }
529 }
530
531 /*--
532 Step 2:
533 Deal specially with case [ss, ss]. This establishes the
534 sorted order for [ss, ss] without any comparisons.
535 A clever trick, cryptically described as steps Q6b and Q6c
536 in SRC-124 (aka BW94). This makes it entirely practical to
537 not use a preliminary run-length coder, but unfortunately
538 we are now stuck with the .bz2 file format.
539 --*/
540 {
541 Int32 put0, get0, put1, get1;
542 Int32 sbn = (ss << 8) + ss;
543 Int32 lo = ftab[sbn] & CLEARMASK;
544 Int32 hi = (ftab[sbn+1] & CLEARMASK) - 1;
545 UChar ssc = (UChar)ss;
546 put0 = lo;
547 get0 = ftab[ss << 8] & CLEARMASK;
548 put1 = hi;
549 get1 = (ftab[(ss+1) << 8] & CLEARMASK) - 1;
550 while (get0 < put0) {
551 j = zptr[get0]-1; if (j < 0) j += nblock;
552 c1 = block[j];
553 if (c1 == ssc) { zptr[put0] = j; put0++; };
554 get0++;
555 }
556 while (get1 > put1) {
557 j = zptr[get1]-1; if (j < 0) j += nblock;
558 c1 = block[j];
559 if (c1 == ssc) { zptr[put1] = j; put1--; };
560 get1--;
561 }
562 ftab[sbn] |= SETMASK;
563 }
564
565 /*--
566 Step 3:
567 The [ss] big bucket is now done. Record this fact,
568 and update the quadrant descriptors. Remember to
569 update quadrants in the overshoot area too, if
570 necessary. The "if (i < 255)" test merely skips
571 this updating for the last bucket processed, since
572 updating for the last bucket is pointless.
573
574 The quadrant array provides a way to incrementally
575 cache sort orderings, as they appear, so as to
576 make subsequent comparisons in fullGtU() complete
577 faster. For repetitive blocks this makes a big
578 difference (but not big enough to be able to avoid
579 randomisation for very repetitive data.)
580
581 The precise meaning is: at all times:
582
583 for 0 <= i < nblock and 0 <= j <= nblock
584
585 if block[i] != block[j],
586
587 then the relative values of quadrant[i] and
588 quadrant[j] are meaningless.
589
590 else {
591 if quadrant[i] < quadrant[j]
592 then the string starting at i lexicographically
593 precedes the string starting at j
594
595 else if quadrant[i] > quadrant[j]
596 then the string starting at j lexicographically
597 precedes the string starting at i
598
599 else
600 the relative ordering of the strings starting
601 at i and j has not yet been determined.
602 }
603 --*/
604 bigDone[ss] = True;
605
606 if (i < 255) {
607 Int32 bbStart = ftab[ss << 8] & CLEARMASK;
608 Int32 bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
609 Int32 shifts = 0;
610
611 while ((bbSize >> shifts) > 65534) shifts++;
612
613 for (j = 0; j < bbSize; j++) {
614 Int32 a2update = zptr[bbStart + j];
615 UInt16 qVal = (UInt16)(j >> shifts);
616 quadrant[a2update] = qVal;
617 if (a2update < BZ_NUM_OVERSHOOT_BYTES)
618 quadrant[a2update + nblock] = qVal;
619 }
620
621 AssertH ( ( ((bbSize-1) >> shifts) <= 65535 ), 1002 );
622 }
623
624 /*--
625 Step 4:
626 Now scan this big bucket [ss] so as to synthesise the
627 sorted order for small buckets [t, ss] for all t != ss.
628 This will avoid doing Real Work in subsequent Step 1's.
629 --*/
630 for (j = 0; j <= 255; j++)
631 copy[j] = ftab[(j << 8) + ss] & CLEARMASK;
632
633 for (j = ftab[ss << 8] & CLEARMASK;
634 j < (ftab[(ss+1) << 8] & CLEARMASK);
635 j++) {
636 k = zptr[j]-1; if (k < 0) k += nblock;
637 c1 = block[k];
638 if ( ! bigDone[c1] ) {
639 zptr[copy[c1]] = k;
640 copy[c1] ++;
641 }
642 }
643
644 for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
645 }
646 if (s->verbosity >= 4)
647 VPrintf3( " %d pointers, %d sorted, %d scanned\n",
648 nblock, numQSorted, nblock - numQSorted );
649 }
650}
651
652
653/*---------------------------------------------*/
654static void randomiseBlock ( EState* s )
655{
656 Int32 i;
657 BZ_RAND_INIT_MASK;
658 for (i = 0; i < 256; i++) s->inUse[i] = False;
659
660 for (i = 0; i < s->nblock; i++) {
661 BZ_RAND_UPD_MASK;
662 s->block[i] ^= BZ_RAND_MASK;
663 s->inUse[s->block[i]] = True;
664 }
665}
666
667
668/*---------------------------------------------*/
669void blockSort ( EState* s )
670{
671 Int32 i;
672
673 s->workLimit = s->workFactor * (s->nblock - 1);
674 s->workDone = 0;
675 s->blockRandomised = False;
676 s->firstAttempt = True;
677
678 sortMain ( s );
679
680 if (s->verbosity >= 3)
681 VPrintf3( " %d work, %d block, ratio %5.2f\n",
682 s->workDone, s->nblock-1,
683 (float)(s->workDone) / (float)(s->nblock-1) );
684
685 if (s->workDone > s->workLimit && s->firstAttempt) {
686 if (s->verbosity >= 2)
687 VPrintf0( " sorting aborted; randomising block\n" );
688 randomiseBlock ( s );
689 s->workLimit = s->workDone = 0;
690 s->blockRandomised = True;
691 s->firstAttempt = False;
692 sortMain ( s );
693 if (s->verbosity >= 3)
694 VPrintf3( " %d work, %d block, ratio %f\n",
695 s->workDone, s->nblock-1,
696 (float)(s->workDone) / (float)(s->nblock-1) );
697 }
698
699 s->origPtr = -1;
700 for (i = 0; i < s->nblock; i++)
701 if (s->zptr[i] == 0)
702 { s->origPtr = i; break; };
703
704 AssertH( s->origPtr != -1, 1003 );
705}
706
707/*-------------------------------------------------------------*/
708/*--- end blocksort.c ---*/
709/*-------------------------------------------------------------*/
diff --git a/bzip2.1 b/bzip2.1
index 489668f..a6789a4 100644
--- a/bzip2.1
+++ b/bzip2.1
@@ -1,21 +1,29 @@
1.PU 1.PU
2.TH bzip2 1 2.TH bzip2 1
3.SH NAME 3.SH NAME
4bzip2, bunzip2 \- a block-sorting file compressor, v0.1 4bzip2, bunzip2 \- a block-sorting file compressor, v0.9.0
5.br
6bzcat \- decompresses files to stdout
5.br 7.br
6bzip2recover \- recovers data from damaged bzip2 files 8bzip2recover \- recovers data from damaged bzip2 files
7 9
8.SH SYNOPSIS 10.SH SYNOPSIS
9.ll +8 11.ll +8
10.B bzip2 12.B bzip2
11.RB [ " \-cdfkstvVL123456789 " ] 13.RB [ " \-cdfkstvzVL123456789 " ]
12[ 14[
13.I "filenames \&..." 15.I "filenames \&..."
14] 16]
15.ll -8 17.ll -8
16.br 18.br
17.B bunzip2 19.B bunzip2
18.RB [ " \-kvsVL " ] 20.RB [ " \-fkvsVL " ]
21[
22.I "filenames \&..."
23]
24.br
25.B bzcat
26.RB [ " \-s " ]
19[ 27[
20.I "filenames \&..." 28.I "filenames \&..."
21] 29]
@@ -24,7 +32,7 @@ bzip2recover \- recovers data from damaged bzip2 files
24.I "filename" 32.I "filename"
25 33
26.SH DESCRIPTION 34.SH DESCRIPTION
27.I Bzip2 35.I bzip2
28compresses files using the Burrows-Wheeler block-sorting 36compresses files using the Burrows-Wheeler block-sorting
29text compression algorithm, and Huffman coding. 37text compression algorithm, and Huffman coding.
30Compression is generally considerably 38Compression is generally considerably
@@ -38,7 +46,7 @@ those of
38.I GNU Gzip, 46.I GNU Gzip,
39but they are not identical. 47but they are not identical.
40 48
41.I Bzip2 49.I bzip2
42expects a list of file names to accompany the command-line flags. 50expects a list of file names to accompany the command-line flags.
43Each file is replaced by a compressed version of itself, 51Each file is replaced by a compressed version of itself,
44with the name "original_name.bz2". 52with the name "original_name.bz2".
@@ -50,11 +58,11 @@ original file names, permissions and dates in filesystems
50which lack these concepts, or have serious file name length 58which lack these concepts, or have serious file name length
51restrictions, such as MS-DOS. 59restrictions, such as MS-DOS.
52 60
53.I Bzip2 61.I bzip2
54and 62and
55.I bunzip2 63.I bunzip2
56will not overwrite existing files; if you want this to happen, 64will by default not overwrite existing files;
57you should delete them first. 65if you want this to happen, specify the \-f flag.
58 66
59If no file names are specified, 67If no file names are specified,
60.I bzip2 68.I bzip2
@@ -64,7 +72,7 @@ In this case,
64will decline to write compressed output to a terminal, as 72will decline to write compressed output to a terminal, as
65this would be entirely incomprehensible and therefore pointless. 73this would be entirely incomprehensible and therefore pointless.
66 74
67.I Bunzip2 75.I bunzip2
68(or 76(or
69.I bzip2 \-d 77.I bzip2 \-d
70) decompresses and restores all specified files whose names 78) decompresses and restores all specified files whose names
@@ -73,12 +81,28 @@ Files without this suffix are ignored.
73Again, supplying no filenames 81Again, supplying no filenames
74causes decompression from standard input to standard output. 82causes decompression from standard input to standard output.
75 83
84.I bunzip2
85will correctly decompress a file which is the concatenation
86of two or more compressed files. The result is the concatenation
87of the corresponding uncompressed files. Integrity testing
88(\-t) of concatenated compressed files is also supported.
89
76You can also compress or decompress files to 90You can also compress or decompress files to
77the standard output by giving the \-c flag. 91the standard output by giving the \-c flag.
78You can decompress multiple files like this, but you may 92Multiple files may be compressed and decompressed like this.
79only compress a single file this way, since it would otherwise 93The resulting outputs are fed sequentially to stdout.
80be difficult to separate out the compressed representations of 94Compression of multiple files in this manner generates
81the original files. 95a stream containing multiple compressed file representations.
96Such a stream can be decompressed correctly only by
97.I bzip2
98version 0.9.0 or later. Earlier versions of
99.I bzip2
100will stop after decompressing the first file in the stream.
101
102.I bzcat
103(or
104.I bzip2 \-dc
105) decompresses all specified files to the standard output.
82 106
83Compression is always performed, even if the compressed file is 107Compression is always performed, even if the compressed file is
84slightly larger than the original. Files of less than about 108slightly larger than the original. Files of less than about
@@ -132,7 +156,7 @@ Compression and decompression requirements, in bytes, can be estimated as:
132 156
133 Compression: 400k + ( 7 x block size ) 157 Compression: 400k + ( 7 x block size )
134 158
135 Decompression: 100k + ( 5 x block size ), or 159 Decompression: 100k + ( 4 x block size ), or
136.br 160.br
137 100k + ( 2.5 x block size ) 161 100k + ( 2.5 x block size )
138 162
@@ -147,7 +171,7 @@ choice of block size.
147 171
148For files compressed with the default 900k block size, 172For files compressed with the default 900k block size,
149.I bunzip2 173.I bunzip2
150will require about 4600 kbytes to decompress. 174will require about 3700 kbytes to decompress.
151To support decompression of any file on a 4 megabyte machine, 175To support decompression of any file on a 4 megabyte machine,
152.I bunzip2 176.I bunzip2
153has an option to decompress using approximately half this 177has an option to decompress using approximately half this
@@ -168,8 +192,8 @@ For example, compressing a file 20,000 bytes long with the flag
168\-9 192\-9
169will cause the compressor to allocate around 193will cause the compressor to allocate around
1706700k of memory, but only touch 400k + 20000 * 7 = 540 1946700k of memory, but only touch 400k + 20000 * 7 = 540
171kbytes of it. Similarly, the decompressor will allocate 4600k but 195kbytes of it. Similarly, the decompressor will allocate 3700k but
172only touch 100k + 20000 * 5 = 200 kbytes. 196only touch 100k + 20000 * 4 = 180 kbytes.
173 197
174Here is a table which summarises the maximum memory usage for 198Here is a table which summarises the maximum memory usage for
175different block sizes. Also recorded is the total compressed 199different block sizes. Also recorded is the total compressed
@@ -182,71 +206,73 @@ Corpus is dominated by smaller files.
182 Compress Decompress Decompress Corpus 206 Compress Decompress Decompress Corpus
183 Flag usage usage -s usage Size 207 Flag usage usage -s usage Size
184 208
185 -1 1100k 600k 350k 914704 209 -1 1100k 500k 350k 914704
186 -2 1800k 1100k 600k 877703 210 -2 1800k 900k 600k 877703
187 -3 2500k 1600k 850k 860338 211 -3 2500k 1300k 850k 860338
188 -4 3200k 2100k 1100k 846899 212 -4 3200k 1700k 1100k 846899
189 -5 3900k 2600k 1350k 845160 213 -5 3900k 2100k 1350k 845160
190 -6 4600k 3100k 1600k 838626 214 -6 4600k 2500k 1600k 838626
191 -7 5400k 3600k 1850k 834096 215 -7 5400k 2900k 1850k 834096
192 -8 6000k 4100k 2100k 828642 216 -8 6000k 3300k 2100k 828642
193 -9 6700k 4600k 2350k 828642 217 -9 6700k 3700k 2350k 828642
194 218
195.SH OPTIONS 219.SH OPTIONS
196.TP 220.TP
197.B \-c --stdout 221.B \-c --stdout
198Compress or decompress to standard output. \-c will decompress 222Compress or decompress to standard output. \-c will decompress
199multiple files to stdout, but will only compress a single file to 223multiple files to stdout, but will only compress a single file to
200stdout. 224stdout.
201.TP 225.TP
202.B \-d --decompress 226.B \-d --decompress
203Force decompression. 227Force decompression.
204.I Bzip2 228.I bzip2,
205and
206.I bunzip2 229.I bunzip2
207are really the same program, and the decision about whether to 230and
208compress or decompress is done on the basis of which name is 231.I bzcat
232are really the same program, and the decision about what actions
233to take is done on the basis of which name is
209used. This flag overrides that mechanism, and forces 234used. This flag overrides that mechanism, and forces
210.I bzip2 235.I bzip2
211to decompress. 236to decompress.
212.TP 237.TP
213.B \-f --compress 238.B \-z --compress
214The complement to \-d: forces compression, regardless of the invokation 239The complement to \-d: forces compression, regardless of the invokation
215name. 240name.
216.TP 241.TP
217.B \-t --test 242.B \-t --test
218Check integrity of the specified file(s), but don't decompress them. 243Check integrity of the specified file(s), but don't decompress them.
219This really performs a trial decompression and throws away the result, 244This really performs a trial decompression and throws away the result.
220using the low-memory decompression algorithm (see \-s). 245.TP
246.B \-f --force
247Force overwrite of output files. Normally,
248.I bzip2
249will not overwrite existing output files.
221.TP 250.TP
222.B \-k --keep 251.B \-k --keep
223Keep (don't delete) input files during compression or decompression. 252Keep (don't delete) input files during compression or decompression.
224.TP 253.TP
225.B \-s --small 254.B \-s --small
226Reduce memory usage, both for compression and decompression. 255Reduce memory usage, for compression, decompression and
227Files are decompressed using a modified algorithm which only 256testing.
257Files are decompressed and tested using a modified algorithm which only
228requires 2.5 bytes per block byte. This means any file can be 258requires 2.5 bytes per block byte. This means any file can be
229decompressed in 2300k of memory, albeit somewhat more slowly than 259decompressed in 2300k of memory, albeit at about half the normal
230usual. 260speed.
231 261
232During compression, -s selects a block size of 200k, which limits 262During compression, -s selects a block size of 200k, which limits
233memory use to around the same figure, at the expense of your 263memory use to around the same figure, at the expense of your
234compression ratio. In short, if your machine is low on memory 264compression ratio. In short, if your machine is low on memory
235(8 megabytes or less), use -s for everything. See 265(8 megabytes or less), use -s for everything. See
236MEMORY MANAGEMENT above. 266MEMORY MANAGEMENT above.
237
238.TP 267.TP
239.B \-v --verbose 268.B \-v --verbose
240Verbose mode -- show the compression ratio for each file processed. 269Verbose mode -- show the compression ratio for each file processed.
241Further \-v's increase the verbosity level, spewing out lots of 270Further \-v's increase the verbosity level, spewing out lots of
242information which is primarily of interest for diagnostic purposes. 271information which is primarily of interest for diagnostic purposes.
243.TP 272.TP
244.B \-L --license 273.B \-L --license -V --version
245Display the software version, license terms and conditions. 274Display the software version, license terms and conditions.
246.TP 275.TP
247.B \-V --version
248Same as \-L.
249.TP
250.B \-1 to \-9 276.B \-1 to \-9
251Set the block size to 100 k, 200 k .. 900 k when 277Set the block size to 100 k, 200 k .. 900 k when
252compressing. Has no effect when decompressing. 278compressing. Has no effect when decompressing.
@@ -329,10 +355,6 @@ to compress the latter.
329If you do get a file which causes severe slowness in compression, 355If you do get a file which causes severe slowness in compression,
330try making the block size as small as possible, with flag \-1. 356try making the block size as small as possible, with flag \-1.
331 357
332Incompressible or virtually-incompressible data may decompress
333rather more slowly than one would hope. This is due to
334a naive implementation of the move-to-front coder.
335
336.I bzip2 358.I bzip2
337usually allocates several megabytes of memory to operate in, 359usually allocates several megabytes of memory to operate in,
338and then charges all over it in a fairly random fashion. This 360and then charges all over it in a fairly random fashion. This
@@ -346,28 +368,19 @@ I imagine
346.I bzip2 368.I bzip2
347will perform best on machines with very large caches. 369will perform best on machines with very large caches.
348 370
349Test mode (\-t) uses the low-memory decompression algorithm
350(\-s). This means test mode does not run as fast as it could;
351it could run as fast as the normal decompression machinery.
352This could easily be fixed at the cost of some code bloat.
353
354.SH CAVEATS 371.SH CAVEATS
355I/O error messages are not as helpful as they could be. 372I/O error messages are not as helpful as they could be.
356.I Bzip2 373.I Bzip2
357tries hard to detect I/O errors and exit cleanly, but the 374tries hard to detect I/O errors and exit cleanly, but the
358details of what the problem is sometimes seem rather misleading. 375details of what the problem is sometimes seem rather misleading.
359 376
360This manual page pertains to version 0.1 of 377This manual page pertains to version 0.9.0 of
361.I bzip2. 378.I bzip2.
362It may well happen that some future version will 379Compressed data created by this version is entirely forwards and
363use a different compressed file format. If you try to 380backwards compatible with the previous public release, version 0.1pl2,
364decompress, using 0.1, a .bz2 file created with some 381but with the following exception: 0.9.0 can correctly decompress
365future version which uses a different compressed file format, 382multiple concatenated compressed files. 0.1pl2 cannot do this; it
3660.1 will complain that your file "is not a bzip2 file". 383will stop after decompressing just the first file in the stream.
367If that happens, you should obtain a more recent version
368of
369.I bzip2
370and use that to decompress the file.
371 384
372Wildcard expansion for Windows 95 and NT 385Wildcard expansion for Windows 95 and NT
373is flaky. 386is flaky.
@@ -377,63 +390,25 @@ uses 32-bit integers to represent bit positions in
377compressed files, so it cannot handle compressed files 390compressed files, so it cannot handle compressed files
378more than 512 megabytes long. This could easily be fixed. 391more than 512 megabytes long. This could easily be fixed.
379 392
380.I bzip2recover
381sometimes reports a very small, incomplete final block.
382This is spurious and can be safely ignored.
383
384.SH RELATIONSHIP TO bzip-0.21
385This program is a descendant of the
386.I bzip
387program, version 0.21, which I released in August 1996.
388The primary difference of
389.I bzip2
390is its avoidance of the possibly patented algorithms
391which were used in 0.21.
392.I bzip2
393also brings various useful refinements (\-s, \-t),
394uses less memory, decompresses significantly faster, and
395has support for recovering data from damaged files.
396
397Because
398.I bzip2
399uses Huffman coding to construct the compressed bitstream,
400rather than the arithmetic coding used in 0.21,
401the compressed representations generated by the two programs
402are incompatible, and they will not interoperate. The change
403in suffix from .bz to .bz2 reflects this. It would have been
404helpful to at least allow
405.I bzip2
406to decompress files created by 0.21, but this would
407defeat the primary aim of having a patent-free compressor.
408
409For a more precise statement about patent issues in
410bzip2, please see the README file in the distribution.
411
412Huffman coding necessarily involves some coding inefficiency
413compared to arithmetic coding. This means that
414.I bzip2
415compresses about 1% worse than 0.21, an unfortunate but
416unavoidable fact-of-life. On the other hand, decompression
417is approximately 50% faster for the same reason, and the
418change in file format gave an opportunity to add data-recovery
419features. So it is not all bad.
420
421.SH AUTHOR 393.SH AUTHOR
422Julian Seward, jseward@acm.org. 394Julian Seward, jseward@acm.org.
423 395
396http://www.muraroa.demon.co.uk
397
424The ideas embodied in 398The ideas embodied in
425.I bzip
426and
427.I bzip2 399.I bzip2
428are due to (at least) the following people: 400are due to (at least) the following people:
429Michael Burrows and David Wheeler (for the block sorting 401Michael Burrows and David Wheeler (for the block sorting
430transformation), David Wheeler (again, for the Huffman coder), 402transformation), David Wheeler (again, for the Huffman coder),
431Peter Fenwick (for the structured coding model in 0.21, 403Peter Fenwick (for the structured coding model in the original
404.I bzip,
432and many refinements), 405and many refinements),
433and 406and
434Alistair Moffat, Radford Neal and Ian Witten (for the arithmetic 407Alistair Moffat, Radford Neal and Ian Witten (for the arithmetic
435coder in 0.21). I am much indebted for their help, support and advice. 408coder in the original
436See the file ALGORITHMS in the source distribution for pointers to 409.I bzip).
410I am much indebted for their help, support and advice.
411See the manual in the source distribution for pointers to
437sources of documentation. 412sources of documentation.
438Christian von Roques encouraged me to look for faster 413Christian von Roques encouraged me to look for faster
439sorting algorithms, so as to speed up compression. 414sorting algorithms, so as to speed up compression.
diff --git a/bzip2.1.preformatted b/bzip2.1.preformatted
index 5206e05..8c4fab1 100644
--- a/bzip2.1.preformatted
+++ b/bzip2.1.preformatted
@@ -5,18 +5,20 @@ bzip2(1) bzip2(1)
5 5
6 6
7NNAAMMEE 7NNAAMMEE
8 bzip2, bunzip2 - a block-sorting file compressor, v0.1 8 bzip2, bunzip2 - a block-sorting file compressor, v0.9.0
9 bzcat - decompresses files to stdout
9 bzip2recover - recovers data from damaged bzip2 files 10 bzip2recover - recovers data from damaged bzip2 files
10 11
11 12
12SSYYNNOOPPSSIISS 13SSYYNNOOPPSSIISS
13 bbzziipp22 [ --ccddffkkssttvvVVLL112233445566778899 ] [ _f_i_l_e_n_a_m_e_s _._._. ] 14 bbzziipp22 [ --ccddffkkssttvvzzVVLL112233445566778899 ] [ _f_i_l_e_n_a_m_e_s _._._. ]
14 bbuunnzziipp22 [ --kkvvssVVLL ] [ _f_i_l_e_n_a_m_e_s _._._. ] 15 bbuunnzziipp22 [ --ffkkvvssVVLL ] [ _f_i_l_e_n_a_m_e_s _._._. ]
16 bbzzccaatt [ --ss ] [ _f_i_l_e_n_a_m_e_s _._._. ]
15 bbzziipp22rreeccoovveerr _f_i_l_e_n_a_m_e 17 bbzziipp22rreeccoovveerr _f_i_l_e_n_a_m_e
16 18
17 19
18DDEESSCCRRIIPPTTIIOONN 20DDEESSCCRRIIPPTTIIOONN
19 _B_z_i_p_2 compresses files using the Burrows-Wheeler block- 21 _b_z_i_p_2 compresses files using the Burrows-Wheeler block-
20 sorting text compression algorithm, and Huffman coding. 22 sorting text compression algorithm, and Huffman coding.
21 Compression is generally considerably better than that 23 Compression is generally considerably better than that
22 achieved by more conventional LZ77/LZ78-based compressors, 24 achieved by more conventional LZ77/LZ78-based compressors,
@@ -26,7 +28,7 @@ DDEESSCCRRIIPPTTIIOONN
26 The command-line options are deliberately very similar to 28 The command-line options are deliberately very similar to
27 those of _G_N_U _G_z_i_p_, but they are not identical. 29 those of _G_N_U _G_z_i_p_, but they are not identical.
28 30
29 _B_z_i_p_2 expects a list of file names to accompany the com- 31 _b_z_i_p_2 expects a list of file names to accompany the com-
30 mand-line flags. Each file is replaced by a compressed 32 mand-line flags. Each file is replaced by a compressed
31 version of itself, with the name "original_name.bz2". 33 version of itself, with the name "original_name.bz2".
32 Each compressed file has the same modification date and 34 Each compressed file has the same modification date and
@@ -38,8 +40,8 @@ DDEESSCCRRIIPPTTIIOONN
38 cepts, or have serious file name length restrictions, such 40 cepts, or have serious file name length restrictions, such
39 as MS-DOS. 41 as MS-DOS.
40 42
41 _B_z_i_p_2 and _b_u_n_z_i_p_2 will not overwrite existing files; if 43 _b_z_i_p_2 and _b_u_n_z_i_p_2 will by default not overwrite existing
42 you want this to happen, you should delete them first. 44 files; if you want this to happen, specify the -f flag.
43 45
44 If no file names are specified, _b_z_i_p_2 compresses from 46 If no file names are specified, _b_z_i_p_2 compresses from
45 standard input to standard output. In this case, _b_z_i_p_2 47 standard input to standard output. In this case, _b_z_i_p_2
@@ -47,17 +49,15 @@ DDEESSCCRRIIPPTTIIOONN
47 this would be entirely incomprehensible and therefore 49 this would be entirely incomprehensible and therefore
48 pointless. 50 pointless.
49 51
50 _B_u_n_z_i_p_2 (or _b_z_i_p_2 _-_d ) decompresses and restores all spec- 52 _b_u_n_z_i_p_2 (or _b_z_i_p_2 _-_d ) decompresses and restores all spec-
51 ified files whose names end in ".bz2". Files without this 53 ified files whose names end in ".bz2". Files without this
52 suffix are ignored. Again, supplying no filenames causes 54 suffix are ignored. Again, supplying no filenames causes
53 decompression from standard input to standard output. 55 decompression from standard input to standard output.
54 56
55 You can also compress or decompress files to the standard 57 _b_u_n_z_i_p_2 will correctly decompress a file which is the con-
56 output by giving the -c flag. You can decompress multiple 58 catenation of two or more compressed files. The result is
57 files like this, but you may only compress a single file 59 the concatenation of the corresponding uncompressed files.
58 this way, since it would otherwise be difficult to sepa- 60 Integrity testing (-t) of concatenated compressed files is
59 rate out the compressed representations of the original
60 files.
61 61
62 62
63 63
@@ -70,6 +70,21 @@ DDEESSCCRRIIPPTTIIOONN
70bzip2(1) bzip2(1) 70bzip2(1) bzip2(1)
71 71
72 72
73 also supported.
74
75 You can also compress or decompress files to the standard
76 output by giving the -c flag. Multiple files may be com-
77 pressed and decompressed like this. The resulting outputs
78 are fed sequentially to stdout. Compression of multiple
79 files in this manner generates a stream containing multi-
80 ple compressed file representations. Such a stream can be
81 decompressed correctly only by _b_z_i_p_2 version 0.9.0 or
82 later. Earlier versions of _b_z_i_p_2 will stop after decom-
83 pressing the first file in the stream.
84
85 _b_z_c_a_t (or _b_z_i_p_2 _-_d_c ) decompresses all specified files to
86 the standard output.
87
73 Compression is always performed, even if the compressed 88 Compression is always performed, even if the compressed
74 file is slightly larger than the original. Files of less 89 file is slightly larger than the original. Files of less
75 than about one hundred bytes tend to get larger, since the 90 than about one hundred bytes tend to get larger, since the
@@ -108,36 +123,37 @@ MMEEMMOORRYY MMAANNAAGGEEMMEENNTT
108 file, and _b_u_n_z_i_p_2 then allocates itself just enough memory 123 file, and _b_u_n_z_i_p_2 then allocates itself just enough memory
109 to decompress the file. Since block sizes are stored in 124 to decompress the file. Since block sizes are stored in
110 compressed files, it follows that the flags -1 to -9 are 125 compressed files, it follows that the flags -1 to -9 are
111 irrelevant to and so ignored during decompression. Com- 126 irrelevant to and so ignored during decompression.
112 pression and decompression requirements, in bytes, can be
113 estimated as:
114 127
115 Compression: 400k + ( 7 x block size )
116 128
117 Decompression: 100k + ( 5 x block size ), or
118 100k + ( 2.5 x block size )
119 129
120 Larger block sizes give rapidly diminishing marginal 130 2
121 returns; most of the compression comes from the first two
122 or three hundred k of block size, a fact worth bearing in
123 mind when using _b_z_i_p_2 on small machines. It is also
124 important to appreciate that the decompression memory
125 requirement is set at compression-time by the choice of
126 block size.
127 131
128 132
129 133
130 2
131 134
132 135
136bzip2(1) bzip2(1)
133 137
134 138
139 Compression and decompression requirements, in bytes, can
140 be estimated as:
135 141
136bzip2(1) bzip2(1) 142 Compression: 400k + ( 7 x block size )
137 143
144 Decompression: 100k + ( 4 x block size ), or
145 100k + ( 2.5 x block size )
146
147 Larger block sizes give rapidly diminishing marginal
148 returns; most of the compression comes from the first two
149 or three hundred k of block size, a fact worth bearing in
150 mind when using _b_z_i_p_2 on small machines. It is also
151 important to appreciate that the decompression memory
152 requirement is set at compression-time by the choice of
153 block size.
138 154
139 For files compressed with the default 900k block size, 155 For files compressed with the default 900k block size,
140 _b_u_n_z_i_p_2 will require about 4600 kbytes to decompress. To 156 _b_u_n_z_i_p_2 will require about 3700 kbytes to decompress. To
141 support decompression of any file on a 4 megabyte machine, 157 support decompression of any file on a 4 megabyte machine,
142 _b_u_n_z_i_p_2 has an option to decompress using approximately 158 _b_u_n_z_i_p_2 has an option to decompress using approximately
143 half this amount of memory, about 2300 kbytes. Decompres- 159 half this amount of memory, about 2300 kbytes. Decompres-
@@ -157,8 +173,8 @@ bzip2(1) bzip2(1)
157 file 20,000 bytes long with the flag -9 will cause the 173 file 20,000 bytes long with the flag -9 will cause the
158 compressor to allocate around 6700k of memory, but only 174 compressor to allocate around 6700k of memory, but only
159 touch 400k + 20000 * 7 = 540 kbytes of it. Similarly, the 175 touch 400k + 20000 * 7 = 540 kbytes of it. Similarly, the
160 decompressor will allocate 4600k but only touch 100k + 176 decompressor will allocate 3700k but only touch 100k +
161 20000 * 5 = 200 kbytes. 177 20000 * 4 = 180 kbytes.
162 178
163 Here is a table which summarises the maximum memory usage 179 Here is a table which summarises the maximum memory usage
164 for different block sizes. Also recorded is the total 180 for different block sizes. Also recorded is the total
@@ -172,64 +188,66 @@ bzip2(1) bzip2(1)
172 Compress Decompress Decompress Corpus 188 Compress Decompress Decompress Corpus
173 Flag usage usage -s usage Size 189 Flag usage usage -s usage Size
174 190
175 -1 1100k 600k 350k 914704 191 -1 1100k 500k 350k 914704
176 -2 1800k 1100k 600k 877703 192 -2 1800k 900k 600k 877703
177 -3 2500k 1600k 850k 860338
178 -4 3200k 2100k 1100k 846899
179 -5 3900k 2600k 1350k 845160
180 -6 4600k 3100k 1600k 838626
181 -7 5400k 3600k 1850k 834096
182 -8 6000k 4100k 2100k 828642
183 -9 6700k 4600k 2350k 828642
184 193
185 194
186OOPPTTIIOONNSS
187 --cc ----ssttddoouutt
188 Compress or decompress to standard output. -c will
189 decompress multiple files to stdout, but will only
190 compress a single file to stdout.
191
192 195
196 3
193 197
194 198
195 199
196 3
197 200
198 201
202bzip2(1) bzip2(1)
199 203
200 204
205 -3 2500k 1300k 850k 860338
206 -4 3200k 1700k 1100k 846899
207 -5 3900k 2100k 1350k 845160
208 -6 4600k 2500k 1600k 838626
209 -7 5400k 2900k 1850k 834096
210 -8 6000k 3300k 2100k 828642
211 -9 6700k 3700k 2350k 828642
201 212
202bzip2(1) bzip2(1)
203 213
214OOPPTTIIOONNSS
215 --cc ----ssttddoouutt
216 Compress or decompress to standard output. -c will
217 decompress multiple files to stdout, but will only
218 compress a single file to stdout.
204 219
205 --dd ----ddeeccoommpprreessss 220 --dd ----ddeeccoommpprreessss
206 Force decompression. _B_z_i_p_2 and _b_u_n_z_i_p_2 are really 221 Force decompression. _b_z_i_p_2_, _b_u_n_z_i_p_2 and _b_z_c_a_t are
207 the same program, and the decision about whether to 222 really the same program, and the decision about
208 compress or decompress is done on the basis of 223 what actions to take is done on the basis of which
209 which name is used. This flag overrides that mech- 224 name is used. This flag overrides that mechanism,
210 anism, and forces _b_z_i_p_2 to decompress. 225 and forces _b_z_i_p_2 to decompress.
211 226
212 --ff ----ccoommpprreessss 227 --zz ----ccoommpprreessss
213 The complement to -d: forces compression, regard- 228 The complement to -d: forces compression, regard-
214 less of the invokation name. 229 less of the invokation name.
215 230
216 --tt ----tteesstt 231 --tt ----tteesstt
217 Check integrity of the specified file(s), but don't 232 Check integrity of the specified file(s), but don't
218 decompress them. This really performs a trial 233 decompress them. This really performs a trial
219 decompression and throws away the result, using the 234 decompression and throws away the result.
220 low-memory decompression algorithm (see -s). 235
236 --ff ----ffoorrccee
237 Force overwrite of output files. Normally, _b_z_i_p_2
238 will not overwrite existing output files.
221 239
222 --kk ----kkeeeepp 240 --kk ----kkeeeepp
223 Keep (don't delete) input files during compression 241 Keep (don't delete) input files during compression
224 or decompression. 242 or decompression.
225 243
226 --ss ----ssmmaallll 244 --ss ----ssmmaallll
227 Reduce memory usage, both for compression and 245 Reduce memory usage, for compression, decompression
228 decompression. Files are decompressed using a mod- 246 and testing. Files are decompressed and tested
229 ified algorithm which only requires 2.5 bytes per 247 using a modified algorithm which only requires 2.5
230 block byte. This means any file can be decom- 248 bytes per block byte. This means any file can be
231 pressed in 2300k of memory, albeit somewhat more 249 decompressed in 2300k of memory, albeit at about
232 slowly than usual. 250 half the normal speed.
233 251
234 During compression, -s selects a block size of 252 During compression, -s selects a block size of
235 200k, which limits memory use to around the same 253 200k, which limits memory use to around the same
@@ -239,35 +257,32 @@ bzip2(1) bzip2(1)
239 MEMORY MANAGEMENT above. 257 MEMORY MANAGEMENT above.
240 258
241 259
260
261
262 4
263
264
265
266
267
268bzip2(1) bzip2(1)
269
270
242 --vv ----vveerrbboossee 271 --vv ----vveerrbboossee
243 Verbose mode -- show the compression ratio for each 272 Verbose mode -- show the compression ratio for each
244 file processed. Further -v's increase the ver- 273 file processed. Further -v's increase the ver-
245 bosity level, spewing out lots of information which 274 bosity level, spewing out lots of information which
246 is primarily of interest for diagnostic purposes. 275 is primarily of interest for diagnostic purposes.
247 276
248 --LL ----lliicceennssee 277 --LL ----lliicceennssee --VV ----vveerrssiioonn
249 Display the software version, license terms and 278 Display the software version, license terms and
250 conditions. 279 conditions.
251 280
252 --VV ----vveerrssiioonn
253 Same as -L.
254
255 --11 ttoo --99 281 --11 ttoo --99
256 Set the block size to 100 k, 200 k .. 900 k when 282 Set the block size to 100 k, 200 k .. 900 k when
257 compressing. Has no effect when decompressing. 283 compressing. Has no effect when decompressing.
258 See MEMORY MANAGEMENT above. 284 See MEMORY MANAGEMENT above.
259 285
260
261
262 4
263
264
265
266
267
268bzip2(1) bzip2(1)
269
270
271 ----rreeppeettiittiivvee--ffaasstt 286 ----rreeppeettiittiivvee--ffaasstt
272 _b_z_i_p_2 injects some small pseudo-random variations 287 _b_z_i_p_2 injects some small pseudo-random variations
273 into very repetitive blocks to limit worst-case 288 into very repetitive blocks to limit worst-case
@@ -306,34 +321,34 @@ RREECCOOVVEERRIINNGG DDAATTAA FFRROOMM DDAAMMAAGGEEDD F
306 _b_z_i_p_2_r_e_c_o_v_e_r takes a single argument, the name of the dam- 321 _b_z_i_p_2_r_e_c_o_v_e_r takes a single argument, the name of the dam-
307 aged file, and writes a number of files "rec0001file.bz2", 322 aged file, and writes a number of files "rec0001file.bz2",
308 "rec0002file.bz2", etc, containing the extracted blocks. 323 "rec0002file.bz2", etc, containing the extracted blocks.
309 The output filenames are designed so that the use of wild- 324 The output filenames are designed so that the use of
310 cards in subsequent processing -- for example, "bzip2 -dc
311 rec*file.bz2 > recovered_data" -- lists the files in the
312 "right" order.
313 325
314 _b_z_i_p_2_r_e_c_o_v_e_r should be of most use dealing with large .bz2
315 files, as these will contain many blocks. It is clearly
316 futile to use it on damaged single-block files, since a
317 damaged block cannot be recovered. If you wish to min-
318 imise any potential data loss through media or transmis-
319 sion errors, you might consider compressing with a smaller
320 block size.
321 326
322 327
323PPEERRFFOORRMMAANNCCEE NNOOTTEESS 328 5
324 The sorting phase of compression gathers together similar
325 329
326 330
327 331
328 5
329 332
330 333
334bzip2(1) bzip2(1)
331 335
332 336
337 wildcards in subsequent processing -- for example, "bzip2
338 -dc rec*file.bz2 > recovered_data" -- lists the files in
339 the "right" order.
333 340
334bzip2(1) bzip2(1) 341 _b_z_i_p_2_r_e_c_o_v_e_r should be of most use dealing with large .bz2
342 files, as these will contain many blocks. It is clearly
343 futile to use it on damaged single-block files, since a
344 damaged block cannot be recovered. If you wish to min-
345 imise any potential data loss through media or transmis-
346 sion errors, you might consider compressing with a smaller
347 block size.
335 348
336 349
350PPEERRFFOORRMMAANNCCEE NNOOTTEESS
351 The sorting phase of compression gathers together similar
337 strings in the file. Because of this, files containing 352 strings in the file. Because of this, files containing
338 very long runs of repeated symbols, like "aabaabaabaab 353 very long runs of repeated symbols, like "aabaabaabaab
339 ..." (repeated several hundred times) may compress 354 ..." (repeated several hundred times) may compress
@@ -348,10 +363,6 @@ bzip2(1) bzip2(1)
348 severe slowness in compression, try making the block size 363 severe slowness in compression, try making the block size
349 as small as possible, with flag -1. 364 as small as possible, with flag -1.
350 365
351 Incompressible or virtually-incompressible data may decom-
352 press rather more slowly than one would hope. This is due
353 to a naive implementation of the move-to-front coder.
354
355 _b_z_i_p_2 usually allocates several megabytes of memory to 366 _b_z_i_p_2 usually allocates several megabytes of memory to
356 operate in, and then charges all over it in a fairly ran- 367 operate in, and then charges all over it in a fairly ran-
357 dom fashion. This means that performance, both for com- 368 dom fashion. This means that performance, both for com-
@@ -362,12 +373,6 @@ bzip2(1) bzip2(1)
362 large performance improvements. I imagine _b_z_i_p_2 will per- 373 large performance improvements. I imagine _b_z_i_p_2 will per-
363 form best on machines with very large caches. 374 form best on machines with very large caches.
364 375
365 Test mode (-t) uses the low-memory decompression algorithm
366 (-s). This means test mode does not run as fast as it
367 could; it could run as fast as the normal decompression
368 machinery. This could easily be fixed at the cost of some
369 code bloat.
370
371 376
372CCAAVVEEAATTSS 377CCAAVVEEAATTSS
373 I/O error messages are not as helpful as they could be. 378 I/O error messages are not as helpful as they could be.
@@ -375,19 +380,14 @@ CCAAVVEEAATTSS
375 but the details of what the problem is sometimes seem 380 but the details of what the problem is sometimes seem
376 rather misleading. 381 rather misleading.
377 382
378 This manual page pertains to version 0.1 of _b_z_i_p_2_. It may 383 This manual page pertains to version 0.9.0 of _b_z_i_p_2_. Com-
379 well happen that some future version will use a different 384 pressed data created by this version is entirely forwards
380 compressed file format. If you try to decompress, using 385 and backwards compatible with the previous public release,
381 0.1, a .bz2 file created with some future version which 386 version 0.1pl2, but with the following exception: 0.9.0
382 uses a different compressed file format, 0.1 will complain 387 can correctly decompress multiple concatenated compressed
383 that your file "is not a bzip2 file". If that happens, 388 files. 0.1pl2 cannot do this; it will stop after decom-
384 you should obtain a more recent version of _b_z_i_p_2 and use 389 pressing just the first file in the stream.
385 that to decompress the file.
386 390
387 Wildcard expansion for Windows 95 and NT is flaky.
388
389 _b_z_i_p_2_r_e_c_o_v_e_r uses 32-bit integers to represent bit posi-
390 tions in compressed files, so it cannot handle compressed
391 391
392 392
393 393
@@ -400,61 +400,59 @@ CCAAVVEEAATTSS
400bzip2(1) bzip2(1) 400bzip2(1) bzip2(1)
401 401
402 402
403 files more than 512 megabytes long. This could easily be 403 Wildcard expansion for Windows 95 and NT is flaky.
404
405 _b_z_i_p_2_r_e_c_o_v_e_r uses 32-bit integers to represent bit posi-
406 tions in compressed files, so it cannot handle compressed
407 files more than 512 megabytes long. This could easily be
404 fixed. 408 fixed.
405 409
406 _b_z_i_p_2_r_e_c_o_v_e_r sometimes reports a very small, incomplete 410
407 final block. This is spurious and can be safely ignored. 411AAUUTTHHOORR
412 Julian Seward, jseward@acm.org.
413 http://www.muraroa.demon.co.uk
414
415 The ideas embodied in _b_z_i_p_2 are due to (at least) the fol-
416 lowing people: Michael Burrows and David Wheeler (for the
417 block sorting transformation), David Wheeler (again, for
418 the Huffman coder), Peter Fenwick (for the structured cod-
419 ing model in the original _b_z_i_p_, and many refinements), and
420 Alistair Moffat, Radford Neal and Ian Witten (for the
421 arithmetic coder in the original _b_z_i_p_)_. I am much
422 indebted for their help, support and advice. See the man-
423 ual in the source distribution for pointers to sources of
424 documentation. Christian von Roques encouraged me to look
425 for faster sorting algorithms, so as to speed up compres-
426 sion. Bela Lubkin encouraged me to improve the worst-case
427 compression performance. Many people sent patches, helped
428 with portability problems, lent machines, gave advice and
429 were generally helpful.
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
408 448
409 449
410RREELLAATTIIOONNSSHHIIPP TTOO bbzziipp--00..2211
411 This program is a descendant of the _b_z_i_p program, version
412 0.21, which I released in August 1996. The primary dif-
413 ference of _b_z_i_p_2 is its avoidance of the possibly patented
414 algorithms which were used in 0.21. _b_z_i_p_2 also brings
415 various useful refinements (-s, -t), uses less memory,
416 decompresses significantly faster, and has support for
417 recovering data from damaged files.
418 450
419 Because _b_z_i_p_2 uses Huffman coding to construct the com-
420 pressed bitstream, rather than the arithmetic coding used
421 in 0.21, the compressed representations generated by the
422 two programs are incompatible, and they will not interop-
423 erate. The change in suffix from .bz to .bz2 reflects
424 this. It would have been helpful to at least allow _b_z_i_p_2
425 to decompress files created by 0.21, but this would defeat
426 the primary aim of having a patent-free compressor.
427 451
428 For a more precise statement about patent issues in bzip2,
429 please see the README file in the distribution.
430 452
431 Huffman coding necessarily involves some coding ineffi-
432 ciency compared to arithmetic coding. This means that
433 _b_z_i_p_2 compresses about 1% worse than 0.21, an unfortunate
434 but unavoidable fact-of-life. On the other hand, decom-
435 pression is approximately 50% faster for the same reason,
436 and the change in file format gave an opportunity to add
437 data-recovery features. So it is not all bad.
438 453
439 454
440AAUUTTHHOORR
441 Julian Seward, jseward@acm.org.
442 455
443 The ideas embodied in _b_z_i_p and _b_z_i_p_2 are due to (at least)
444 the following people: Michael Burrows and David Wheeler
445 (for the block sorting transformation), David Wheeler
446 (again, for the Huffman coder), Peter Fenwick (for the
447 structured coding model in 0.21, and many refinements),
448 and Alistair Moffat, Radford Neal and Ian Witten (for the
449 arithmetic coder in 0.21). I am much indebted for their
450 help, support and advice. See the file ALGORITHMS in the
451 source distribution for pointers to sources of documenta-
452 tion. Christian von Roques encouraged me to look for
453 faster sorting algorithms, so as to speed up compression.
454 Bela Lubkin encouraged me to improve the worst-case com-
455 pression performance. Many people sent patches, helped
456 with portability problems, lent machines, gave advice and
457 were generally helpful.
458 456
459 457
460 458
diff --git a/bzip2.c b/bzip2.c
index 53ce10d..6a3ab95 100644
--- a/bzip2.c
+++ b/bzip2.c
@@ -4,28 +4,45 @@
4/*-----------------------------------------------------------*/ 4/*-----------------------------------------------------------*/
5 5
6/*-- 6/*--
7 This program is bzip2, a lossless, block-sorting data compressor, 7 This file is a part of bzip2 and/or libbzip2, a program and
8 version 0.1pl2, dated 29-Aug-1997. 8 library for lossless, block-sorting data compression.
9 9
10 Copyright (C) 1996, 1997 by Julian Seward. 10 Copyright (C) 1996-1998 Julian R Seward. All rights reserved.
11 Guildford, Surrey, UK 11
12 email: jseward@acm.org 12 Redistribution and use in source and binary forms, with or without
13 13 modification, are permitted provided that the following conditions
14 This program is free software; you can redistribute it and/or modify 14 are met:
15 it under the terms of the GNU General Public License as published by 15
16 the Free Software Foundation; either version 2 of the License, or 16 1. Redistributions of source code must retain the above copyright
17 (at your option) any later version. 17 notice, this list of conditions and the following disclaimer.
18 18
19 This program is distributed in the hope that it will be useful, 19 2. The origin of this software must not be misrepresented; you must
20 but WITHOUT ANY WARRANTY; without even the implied warranty of 20 not claim that you wrote the original software. If you use this
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 21 software in a product, an acknowledgment in the product
22 GNU General Public License for more details. 22 documentation would be appreciated but is not required.
23 23
24 You should have received a copy of the GNU General Public License 24 3. Altered source versions must be plainly marked as such, and must
25 along with this program; if not, write to the Free Software 25 not be misrepresented as being the original software.
26 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 26
27 27 4. The name of the author may not be used to endorse or promote
28 The GNU General Public License is contained in the file LICENSE. 28 products derived from this software without specific prior written
29 permission.
30
31 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
32 OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
33 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
35 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
37 GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
38 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
39 WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
40 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
41 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42
43 Julian Seward, Guildford, Surrey, UK.
44 jseward@acm.org
45 bzip2/libbzip2 version 0.9.0c of 18 October 1998
29 46
30 This program is based on (at least) the work of: 47 This program is based on (at least) the work of:
31 Mike Burrows 48 Mike Burrows
@@ -37,21 +54,23 @@
37 Robert Sedgewick 54 Robert Sedgewick
38 Jon L. Bentley 55 Jon L. Bentley
39 56
40 For more information on these sources, see the file ALGORITHMS. 57 For more information on these sources, see the manual.
41--*/ 58--*/
42 59
60
43/*----------------------------------------------------*/ 61/*----------------------------------------------------*/
44/*--- IMPORTANT ---*/ 62/*--- IMPORTANT ---*/
45/*----------------------------------------------------*/ 63/*----------------------------------------------------*/
46 64
47/*-- 65/*--
48 WARNING: 66 WARNING:
49 This program (attempts to) compress data by performing several 67 This program and library (attempts to) compress data by
50 non-trivial transformations on it. Unless you are 100% familiar 68 performing several non-trivial transformations on it.
51 with *all* the algorithms contained herein, and with the 69 Unless you are 100% familiar with *all* the algorithms
52 consequences of modifying them, you should NOT meddle with the 70 contained herein, and with the consequences of modifying them,
53 compression or decompression machinery. Incorrect changes can 71 you should NOT meddle with the compression or decompression
54 and very likely *will* lead to disasterous loss of data. 72 machinery. Incorrect changes can and very likely *will*
73 lead to disasterous loss of data.
55 74
56 DISCLAIMER: 75 DISCLAIMER:
57 I TAKE NO RESPONSIBILITY FOR ANY LOSS OF DATA ARISING FROM THE 76 I TAKE NO RESPONSIBILITY FOR ANY LOSS OF DATA ARISING FROM THE
@@ -65,18 +84,19 @@
65 of various special cases in the code which occur with very low 84 of various special cases in the code which occur with very low
66 but non-zero probability make it impossible to rule out the 85 but non-zero probability make it impossible to rule out the
67 possibility of bugs remaining in the program. DO NOT COMPRESS 86 possibility of bugs remaining in the program. DO NOT COMPRESS
68 ANY DATA WITH THIS PROGRAM UNLESS YOU ARE PREPARED TO ACCEPT THE 87 ANY DATA WITH THIS PROGRAM AND/OR LIBRARY UNLESS YOU ARE PREPARED
69 POSSIBILITY, HOWEVER SMALL, THAT THE DATA WILL NOT BE RECOVERABLE. 88 TO ACCEPT THE POSSIBILITY, HOWEVER SMALL, THAT THE DATA WILL
89 NOT BE RECOVERABLE.
70 90
71 That is not to say this program is inherently unreliable. 91 That is not to say this program is inherently unreliable.
72 Indeed, I very much hope the opposite is true. bzip2 has been 92 Indeed, I very much hope the opposite is true. bzip2/libbzip2
73 carefully constructed and extensively tested. 93 has been carefully constructed and extensively tested.
74 94
75 PATENTS: 95 PATENTS:
76 To the best of my knowledge, bzip2 does not use any patented 96 To the best of my knowledge, bzip2/libbzip2 does not use any
77 algorithms. However, I do not have the resources available to 97 patented algorithms. However, I do not have the resources
78 carry out a full patent search. Therefore I cannot give any 98 available to carry out a full patent search. Therefore I cannot
79 guarantee of the above statement. 99 give any guarantee of the above statement.
80--*/ 100--*/
81 101
82 102
@@ -103,6 +123,10 @@
103--*/ 123--*/
104#define BZ_LCCWIN32 0 124#define BZ_LCCWIN32 0
105 125
126#ifdef _WIN32
127#define BZ_LCCWIN32 1
128#define BZ_UNIX 0
129#endif
106 130
107 131
108/*---------------------------------------------*/ 132/*---------------------------------------------*/
@@ -112,12 +136,10 @@
112 136
113#include <stdio.h> 137#include <stdio.h>
114#include <stdlib.h> 138#include <stdlib.h>
115#if DEBUG
116 #include <assert.h>
117#endif
118#include <string.h> 139#include <string.h>
119#include <signal.h> 140#include <signal.h>
120#include <math.h> 141#include <math.h>
142#include "bzlib.h"
121 143
122#define ERROR_IF_EOF(i) { if ((i) == EOF) ioError(); } 144#define ERROR_IF_EOF(i) { if ((i) == EOF) ioError(); }
123#define ERROR_IF_NOT_ZERO(i) { if ((i) != 0) ioError(); } 145#define ERROR_IF_NOT_ZERO(i) { if ((i) != 0) ioError(); }
@@ -130,68 +152,45 @@
130--*/ 152--*/
131 153
132#if BZ_UNIX 154#if BZ_UNIX
133 #include <sys/types.h> 155# include <sys/types.h>
134 #include <utime.h> 156# include <utime.h>
135 #include <unistd.h> 157# include <unistd.h>
136 #include <malloc.h> 158# include <sys/stat.h>
137 #include <sys/stat.h> 159# include <sys/times.h>
138 #include <sys/times.h> 160
139 161# define PATH_SEP '/'
140 #define Int32 int 162# define MY_LSTAT lstat
141 #define UInt32 unsigned int 163# define MY_S_IFREG S_ISREG
142 #define Char char 164# define MY_STAT stat
143 #define UChar unsigned char 165
144 #define Int16 short 166# define APPEND_FILESPEC(root, name) \
145 #define UInt16 unsigned short
146
147 #define PATH_SEP '/'
148 #define MY_LSTAT lstat
149 #define MY_S_IFREG S_ISREG
150 #define MY_STAT stat
151
152 #define APPEND_FILESPEC(root, name) \
153 root=snocString((root), (name)) 167 root=snocString((root), (name))
154 168
155 #define SET_BINARY_MODE(fd) /**/ 169# define SET_BINARY_MODE(fd) /**/
156 170
157 /*-- 171# ifdef __GNUC__
158 You should try very hard to persuade your C compiler 172# define NORETURN __attribute__ ((noreturn))
159 to inline the bits marked INLINE. Otherwise bzip2 will 173# else
160 run rather slowly. gcc version 2.x is recommended. 174# define NORETURN /**/
161 --*/ 175# endif
162 #ifdef __GNUC__
163 #define INLINE inline
164 #define NORETURN __attribute__ ((noreturn))
165 #else
166 #define INLINE /**/
167 #define NORETURN /**/
168 #endif
169#endif 176#endif
170 177
171 178
172 179
173#if BZ_LCCWIN32 180#if BZ_LCCWIN32
174 #include <io.h> 181# include <io.h>
175 #include <fcntl.h> 182# include <fcntl.h>
176 #include <sys\stat.h> 183# include <sys\stat.h>
177 184
178 #define Int32 int 185# define NORETURN /**/
179 #define UInt32 unsigned int 186# define PATH_SEP '\\'
180 #define Int16 short 187# define MY_LSTAT _stat
181 #define UInt16 unsigned short 188# define MY_STAT _stat
182 #define Char char 189# define MY_S_IFREG(x) ((x) & _S_IFREG)
183 #define UChar unsigned char 190
184 191# if 0
185 #define INLINE /**/
186 #define NORETURN /**/
187 #define PATH_SEP '\\'
188 #define MY_LSTAT _stat
189 #define MY_STAT _stat
190 #define MY_S_IFREG(x) ((x) & _S_IFREG)
191
192 #if 0
193 /*-- lcc-win32 seems to expand wildcards itself --*/ 192 /*-- lcc-win32 seems to expand wildcards itself --*/
194 #define APPEND_FILESPEC(root, spec) \ 193# define APPEND_FILESPEC(root, spec) \
195 do { \ 194 do { \
196 if ((spec)[0] == '-') { \ 195 if ((spec)[0] == '-') { \
197 root = snocString((root), (spec)); \ 196 root = snocString((root), (spec)); \
@@ -211,12 +210,12 @@
211 } \ 210 } \
212 } \ 211 } \
213 } while ( 0 ) 212 } while ( 0 )
214 #else 213# else
215 #define APPEND_FILESPEC(root, name) \ 214# define APPEND_FILESPEC(root, name) \
216 root = snocString ((root), (name)) 215 root = snocString ((root), (name))
217 #endif 216# endif
218 217
219 #define SET_BINARY_MODE(fd) \ 218# define SET_BINARY_MODE(fd) \
220 do { \ 219 do { \
221 int retVal = setmode ( fileno ( fd ), \ 220 int retVal = setmode ( fileno ( fd ), \
222 O_BINARY ); \ 221 O_BINARY ); \
@@ -231,111 +230,32 @@
231 Some more stuff for all platforms :-) 230 Some more stuff for all platforms :-)
232--*/ 231--*/
233 232
234#define Bool unsigned char 233typedef char Char;
235#define True 1 234typedef unsigned char Bool;
236#define False 0 235typedef unsigned char UChar;
236typedef int Int32;
237typedef unsigned int UInt32;
238typedef short Int16;
239typedef unsigned short UInt16;
240
241#define True ((Bool)1)
242#define False ((Bool)0)
237 243
238/*-- 244/*--
239 IntNative is your platform's `native' int size. 245 IntNative is your platform's `native' int size.
240 Only here to avoid probs with 64-bit platforms. 246 Only here to avoid probs with 64-bit platforms.
241--*/ 247--*/
242#define IntNative int 248typedef int IntNative;
243
244
245/*--
246 change to 1, or compile with -DDEBUG=1 to debug
247--*/
248#ifndef DEBUG
249#define DEBUG 0
250#endif
251
252
253/*---------------------------------------------------*/
254/*--- ---*/
255/*---------------------------------------------------*/
256
257/*--
258 Implementation notes, July 1997
259 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
260
261 Memory allocation
262 ~~~~~~~~~~~~~~~~~
263 All large data structures are allocated on the C heap,
264 for better or for worse. That includes the various
265 arrays of pointers, striped words, bytes, frequency
266 tables and buffers for compression and decompression.
267
268 bzip2 can operate at various block-sizes, ranging from
269 100k to 900k in 100k steps, and it allocates only as
270 much as it needs to. When compressing, we know from the
271 command-line options what the block-size is going to be,
272 so all allocation can be done at start-up; if that
273 succeeds, there can be no further allocation problems.
274
275 Decompression is more complicated. Each compressed file
276 contains, in its header, a byte indicating the block
277 size used for compression. This means bzip2 potentially
278 needs to reallocate memory for each file it deals with,
279 which in turn opens the possibility for a memory allocation
280 failure part way through a run of files, by encountering
281 a file requiring a much larger block size than all the
282 ones preceding it.
283
284 The policy is to simply give up if a memory allocation
285 failure occurs. During decompression, it would be
286 possible to move on to subsequent files in the hope that
287 some might ask for a smaller block size, but the
288 complications for doing this seem more trouble than they
289 are worth.
290
291
292 Compressed file formats
293 ~~~~~~~~~~~~~~~~~~~~~~~
294 [This is now entirely different from both 0.21, and from
295 any previous Huffman-coded variant of bzip.
296 See the associated file bzip2.txt for details.]
297
298
299 Error conditions
300 ~~~~~~~~~~~~~~~~
301 Dealing with error conditions is the least satisfactory
302 aspect of bzip2. The policy is to try and leave the
303 filesystem in a consistent state, then quit, even if it
304 means not processing some of the files mentioned in the
305 command line. `A consistent state' means that a file
306 exists either in its compressed or uncompressed form,
307 but not both. This boils down to the rule `delete the
308 output file if an error condition occurs, leaving the
309 input intact'. Input files are only deleted when we can
310 be pretty sure the output file has been written and
311 closed successfully.
312
313 Errors are a dog because there's so many things to
314 deal with. The following can happen mid-file, and
315 require cleaning up.
316
317 internal `panics' -- indicating a bug
318 corrupted or inconsistent compressed file
319 can't allocate enough memory to decompress this file
320 I/O error reading/writing/opening/closing
321 signal catches -- Control-C, SIGTERM, SIGHUP.
322
323 Other conditions, primarily pertaining to file names,
324 can be checked in-between files, which makes dealing
325 with them easier.
326--*/
327
328 249
329 250
330/*---------------------------------------------------*/ 251/*---------------------------------------------------*/
331/*--- Misc (file handling) data decls ---*/ 252/*--- Misc (file handling) data decls ---*/
332/*---------------------------------------------------*/ 253/*---------------------------------------------------*/
333 254
334UInt32 bytesIn, bytesOut;
335Int32 verbosity; 255Int32 verbosity;
336Bool keepInputFiles, smallMode, testFailsExist; 256Bool keepInputFiles, smallMode;
337UInt32 globalCrc; 257Bool forceOverwrite, testFailsExist;
338Int32 numFileNames, numFilesProcessed; 258Int32 numFileNames, numFilesProcessed, blockSize100k;
339 259
340 260
341/*-- source modes; F==file, I==stdin, O==stdout --*/ 261/*-- source modes; F==file, I==stdin, O==stdout --*/
@@ -351,2691 +271,304 @@ Int32 numFileNames, numFilesProcessed;
351Int32 opMode; 271Int32 opMode;
352Int32 srcMode; 272Int32 srcMode;
353 273
274#define FILE_NAME_LEN 1034
354 275
355Int32 longestFileName; 276Int32 longestFileName;
356Char inName[1024]; 277Char inName[FILE_NAME_LEN];
357Char outName[1024]; 278Char outName[FILE_NAME_LEN];
358Char *progName; 279Char *progName;
359Char progNameReally[1024]; 280Char progNameReally[FILE_NAME_LEN];
360FILE *outputHandleJustInCase; 281FILE *outputHandleJustInCase;
361 282Int32 workFactor;
362void panic ( Char* ) NORETURN; 283
363void ioError ( void ) NORETURN; 284void panic ( Char* ) NORETURN;
364void compressOutOfMemory ( Int32, Int32 ) NORETURN; 285void ioError ( void ) NORETURN;
365void uncompressOutOfMemory ( Int32, Int32 ) NORETURN; 286void outOfMemory ( void ) NORETURN;
366void blockOverrun ( void ) NORETURN; 287void blockOverrun ( void ) NORETURN;
367void badBlockHeader ( void ) NORETURN; 288void badBlockHeader ( void ) NORETURN;
368void badBGLengths ( void ) NORETURN; 289void badBGLengths ( void ) NORETURN;
369void crcError ( UInt32, UInt32 ) NORETURN; 290void crcError ( void ) NORETURN;
370void bitStreamEOF ( void ) NORETURN; 291void bitStreamEOF ( void ) NORETURN;
371void cleanUpAndFail ( Int32 ) NORETURN; 292void cleanUpAndFail ( Int32 ) NORETURN;
372void compressedStreamEOF ( void ) NORETURN; 293void compressedStreamEOF ( void ) NORETURN;
373 294
295void copyFileName ( Char*, Char* );
374void* myMalloc ( Int32 ); 296void* myMalloc ( Int32 );
375 297
376 298
377 299
378/*---------------------------------------------------*/ 300/*---------------------------------------------------*/
379/*--- Data decls for the front end ---*/ 301/*--- Processing of complete files and streams ---*/
380/*---------------------------------------------------*/
381
382/*--
383 The overshoot bytes allow us to avoid most of
384 the cost of pointer renormalisation during
385 comparison of rotations in sorting.
386 The figure of 20 is derived as follows:
387 qSort3 allows an overshoot of up to 10.
388 It then calls simpleSort, which calls
389 fullGtU, also with max overshoot 10.
390 fullGtU does up to 10 comparisons without
391 renormalising, giving 10+10 == 20.
392--*/
393#define NUM_OVERSHOOT_BYTES 20
394
395/*--
396 These are the main data structures for
397 the Burrows-Wheeler transform.
398--*/
399
400/*--
401 Pointers to compression and decompression
402 structures. Set by
403 allocateCompressStructures and
404 setDecompressStructureSizes
405
406 The structures are always set to be suitable
407 for a block of size 100000 * blockSize100k.
408--*/
409UChar *block; /*-- compress --*/
410UInt16 *quadrant; /*-- compress --*/
411Int32 *zptr; /*-- compress --*/
412UInt16 *szptr; /*-- overlays zptr ---*/
413Int32 *ftab; /*-- compress --*/
414
415UInt16 *ll16; /*-- small decompress --*/
416UChar *ll4; /*-- small decompress --*/
417
418Int32 *tt; /*-- fast decompress --*/
419UChar *ll8; /*-- fast decompress --*/
420
421
422/*--
423 freq table collected to save a pass over the data
424 during decompression.
425--*/
426Int32 unzftab[256];
427
428
429/*--
430 index of the last char in the block, so
431 the block size == last + 1.
432--*/
433Int32 last;
434
435
436/*--
437 index in zptr[] of original string after sorting.
438--*/
439Int32 origPtr;
440
441
442/*--
443 always: in the range 0 .. 9.
444 The current block size is 100000 * this number.
445--*/
446Int32 blockSize100k;
447
448
449/*--
450 Used when sorting. If too many long comparisons
451 happen, we stop sorting, randomise the block
452 slightly, and try again.
453--*/
454
455Int32 workFactor;
456Int32 workDone;
457Int32 workLimit;
458Bool blockRandomised;
459Bool firstAttempt;
460Int32 nBlocksRandomised;
461
462
463
464/*---------------------------------------------------*/
465/*--- Data decls for the back end ---*/
466/*---------------------------------------------------*/
467
468#define MAX_ALPHA_SIZE 258
469#define MAX_CODE_LEN 23
470
471#define RUNA 0
472#define RUNB 1
473
474#define N_GROUPS 6
475#define G_SIZE 50
476#define N_ITERS 4
477
478#define MAX_SELECTORS (2 + (900000 / G_SIZE))
479
480Bool inUse[256];
481Int32 nInUse;
482
483UChar seqToUnseq[256];
484UChar unseqToSeq[256];
485
486UChar selector [MAX_SELECTORS];
487UChar selectorMtf[MAX_SELECTORS];
488
489Int32 nMTF;
490
491Int32 mtfFreq[MAX_ALPHA_SIZE];
492
493UChar len [N_GROUPS][MAX_ALPHA_SIZE];
494
495/*-- decompress only --*/
496Int32 limit [N_GROUPS][MAX_ALPHA_SIZE];
497Int32 base [N_GROUPS][MAX_ALPHA_SIZE];
498Int32 perm [N_GROUPS][MAX_ALPHA_SIZE];
499Int32 minLens[N_GROUPS];
500
501/*-- compress only --*/
502Int32 code [N_GROUPS][MAX_ALPHA_SIZE];
503Int32 rfreq[N_GROUPS][MAX_ALPHA_SIZE];
504
505
506/*---------------------------------------------------*/
507/*--- 32-bit CRC grunge ---*/
508/*---------------------------------------------------*/
509
510/*--
511 I think this is an implementation of the AUTODIN-II,
512 Ethernet & FDDI 32-bit CRC standard. Vaguely derived
513 from code by Rob Warnock, in Section 51 of the
514 comp.compression FAQ.
515--*/
516
517UInt32 crc32Table[256] = {
518
519 /*-- Ugly, innit? --*/
520
521 0x00000000UL, 0x04c11db7UL, 0x09823b6eUL, 0x0d4326d9UL,
522 0x130476dcUL, 0x17c56b6bUL, 0x1a864db2UL, 0x1e475005UL,
523 0x2608edb8UL, 0x22c9f00fUL, 0x2f8ad6d6UL, 0x2b4bcb61UL,
524 0x350c9b64UL, 0x31cd86d3UL, 0x3c8ea00aUL, 0x384fbdbdUL,
525 0x4c11db70UL, 0x48d0c6c7UL, 0x4593e01eUL, 0x4152fda9UL,
526 0x5f15adacUL, 0x5bd4b01bUL, 0x569796c2UL, 0x52568b75UL,
527 0x6a1936c8UL, 0x6ed82b7fUL, 0x639b0da6UL, 0x675a1011UL,
528 0x791d4014UL, 0x7ddc5da3UL, 0x709f7b7aUL, 0x745e66cdUL,
529 0x9823b6e0UL, 0x9ce2ab57UL, 0x91a18d8eUL, 0x95609039UL,
530 0x8b27c03cUL, 0x8fe6dd8bUL, 0x82a5fb52UL, 0x8664e6e5UL,
531 0xbe2b5b58UL, 0xbaea46efUL, 0xb7a96036UL, 0xb3687d81UL,
532 0xad2f2d84UL, 0xa9ee3033UL, 0xa4ad16eaUL, 0xa06c0b5dUL,
533 0xd4326d90UL, 0xd0f37027UL, 0xddb056feUL, 0xd9714b49UL,
534 0xc7361b4cUL, 0xc3f706fbUL, 0xceb42022UL, 0xca753d95UL,
535 0xf23a8028UL, 0xf6fb9d9fUL, 0xfbb8bb46UL, 0xff79a6f1UL,
536 0xe13ef6f4UL, 0xe5ffeb43UL, 0xe8bccd9aUL, 0xec7dd02dUL,
537 0x34867077UL, 0x30476dc0UL, 0x3d044b19UL, 0x39c556aeUL,
538 0x278206abUL, 0x23431b1cUL, 0x2e003dc5UL, 0x2ac12072UL,
539 0x128e9dcfUL, 0x164f8078UL, 0x1b0ca6a1UL, 0x1fcdbb16UL,
540 0x018aeb13UL, 0x054bf6a4UL, 0x0808d07dUL, 0x0cc9cdcaUL,
541 0x7897ab07UL, 0x7c56b6b0UL, 0x71159069UL, 0x75d48ddeUL,
542 0x6b93dddbUL, 0x6f52c06cUL, 0x6211e6b5UL, 0x66d0fb02UL,
543 0x5e9f46bfUL, 0x5a5e5b08UL, 0x571d7dd1UL, 0x53dc6066UL,
544 0x4d9b3063UL, 0x495a2dd4UL, 0x44190b0dUL, 0x40d816baUL,
545 0xaca5c697UL, 0xa864db20UL, 0xa527fdf9UL, 0xa1e6e04eUL,
546 0xbfa1b04bUL, 0xbb60adfcUL, 0xb6238b25UL, 0xb2e29692UL,
547 0x8aad2b2fUL, 0x8e6c3698UL, 0x832f1041UL, 0x87ee0df6UL,
548 0x99a95df3UL, 0x9d684044UL, 0x902b669dUL, 0x94ea7b2aUL,
549 0xe0b41de7UL, 0xe4750050UL, 0xe9362689UL, 0xedf73b3eUL,
550 0xf3b06b3bUL, 0xf771768cUL, 0xfa325055UL, 0xfef34de2UL,
551 0xc6bcf05fUL, 0xc27dede8UL, 0xcf3ecb31UL, 0xcbffd686UL,
552 0xd5b88683UL, 0xd1799b34UL, 0xdc3abdedUL, 0xd8fba05aUL,
553 0x690ce0eeUL, 0x6dcdfd59UL, 0x608edb80UL, 0x644fc637UL,
554 0x7a089632UL, 0x7ec98b85UL, 0x738aad5cUL, 0x774bb0ebUL,
555 0x4f040d56UL, 0x4bc510e1UL, 0x46863638UL, 0x42472b8fUL,
556 0x5c007b8aUL, 0x58c1663dUL, 0x558240e4UL, 0x51435d53UL,
557 0x251d3b9eUL, 0x21dc2629UL, 0x2c9f00f0UL, 0x285e1d47UL,
558 0x36194d42UL, 0x32d850f5UL, 0x3f9b762cUL, 0x3b5a6b9bUL,
559 0x0315d626UL, 0x07d4cb91UL, 0x0a97ed48UL, 0x0e56f0ffUL,
560 0x1011a0faUL, 0x14d0bd4dUL, 0x19939b94UL, 0x1d528623UL,
561 0xf12f560eUL, 0xf5ee4bb9UL, 0xf8ad6d60UL, 0xfc6c70d7UL,
562 0xe22b20d2UL, 0xe6ea3d65UL, 0xeba91bbcUL, 0xef68060bUL,
563 0xd727bbb6UL, 0xd3e6a601UL, 0xdea580d8UL, 0xda649d6fUL,
564 0xc423cd6aUL, 0xc0e2d0ddUL, 0xcda1f604UL, 0xc960ebb3UL,
565 0xbd3e8d7eUL, 0xb9ff90c9UL, 0xb4bcb610UL, 0xb07daba7UL,
566 0xae3afba2UL, 0xaafbe615UL, 0xa7b8c0ccUL, 0xa379dd7bUL,
567 0x9b3660c6UL, 0x9ff77d71UL, 0x92b45ba8UL, 0x9675461fUL,
568 0x8832161aUL, 0x8cf30badUL, 0x81b02d74UL, 0x857130c3UL,
569 0x5d8a9099UL, 0x594b8d2eUL, 0x5408abf7UL, 0x50c9b640UL,
570 0x4e8ee645UL, 0x4a4ffbf2UL, 0x470cdd2bUL, 0x43cdc09cUL,
571 0x7b827d21UL, 0x7f436096UL, 0x7200464fUL, 0x76c15bf8UL,
572 0x68860bfdUL, 0x6c47164aUL, 0x61043093UL, 0x65c52d24UL,
573 0x119b4be9UL, 0x155a565eUL, 0x18197087UL, 0x1cd86d30UL,
574 0x029f3d35UL, 0x065e2082UL, 0x0b1d065bUL, 0x0fdc1becUL,
575 0x3793a651UL, 0x3352bbe6UL, 0x3e119d3fUL, 0x3ad08088UL,
576 0x2497d08dUL, 0x2056cd3aUL, 0x2d15ebe3UL, 0x29d4f654UL,
577 0xc5a92679UL, 0xc1683bceUL, 0xcc2b1d17UL, 0xc8ea00a0UL,
578 0xd6ad50a5UL, 0xd26c4d12UL, 0xdf2f6bcbUL, 0xdbee767cUL,
579 0xe3a1cbc1UL, 0xe760d676UL, 0xea23f0afUL, 0xeee2ed18UL,
580 0xf0a5bd1dUL, 0xf464a0aaUL, 0xf9278673UL, 0xfde69bc4UL,
581 0x89b8fd09UL, 0x8d79e0beUL, 0x803ac667UL, 0x84fbdbd0UL,
582 0x9abc8bd5UL, 0x9e7d9662UL, 0x933eb0bbUL, 0x97ffad0cUL,
583 0xafb010b1UL, 0xab710d06UL, 0xa6322bdfUL, 0xa2f33668UL,
584 0xbcb4666dUL, 0xb8757bdaUL, 0xb5365d03UL, 0xb1f740b4UL
585};
586
587
588/*---------------------------------------------*/
589void initialiseCRC ( void )
590{
591 globalCrc = 0xffffffffUL;
592}
593
594
595/*---------------------------------------------*/
596UInt32 getFinalCRC ( void )
597{
598 return ~globalCrc;
599}
600
601
602/*---------------------------------------------*/
603UInt32 getGlobalCRC ( void )
604{
605 return globalCrc;
606}
607
608
609/*---------------------------------------------*/
610void setGlobalCRC ( UInt32 newCrc )
611{
612 globalCrc = newCrc;
613}
614
615
616/*---------------------------------------------*/
617#define UPDATE_CRC(crcVar,cha) \
618{ \
619 crcVar = (crcVar << 8) ^ \
620 crc32Table[(crcVar >> 24) ^ \
621 ((UChar)cha)]; \
622}
623
624
625/*---------------------------------------------------*/
626/*--- Bit stream I/O ---*/
627/*---------------------------------------------------*/ 302/*---------------------------------------------------*/
628 303
629
630UInt32 bsBuff;
631Int32 bsLive;
632FILE* bsStream;
633Bool bsWriting;
634
635
636/*---------------------------------------------*/
637void bsSetStream ( FILE* f, Bool wr )
638{
639 if (bsStream != NULL) panic ( "bsSetStream" );
640 bsStream = f;
641 bsLive = 0;
642 bsBuff = 0;
643 bytesOut = 0;
644 bytesIn = 0;
645 bsWriting = wr;
646}
647
648
649/*---------------------------------------------*/
650void bsFinishedWithStream ( void )
651{
652 if (bsWriting)
653 while (bsLive > 0) {
654 fputc ( (UChar)(bsBuff >> 24), bsStream );
655 bsBuff <<= 8;
656 bsLive -= 8;
657 bytesOut++;
658 }
659 bsStream = NULL;
660}
661
662
663/*---------------------------------------------*/
664#define bsNEEDR(nz) \
665{ \
666 while (bsLive < nz) { \
667 Int32 zzi = fgetc ( bsStream ); \
668 if (zzi == EOF) compressedStreamEOF(); \
669 bsBuff = (bsBuff << 8) | (zzi & 0xffL); \
670 bsLive += 8; \
671 } \
672}
673
674
675/*---------------------------------------------*/
676#define bsNEEDW(nz) \
677{ \
678 while (bsLive >= 8) { \
679 fputc ( (UChar)(bsBuff >> 24), \
680 bsStream ); \
681 bsBuff <<= 8; \
682 bsLive -= 8; \
683 bytesOut++; \
684 } \
685}
686
687
688/*---------------------------------------------*/
689#define bsR1(vz) \
690{ \
691 bsNEEDR(1); \
692 vz = (bsBuff >> (bsLive-1)) & 1; \
693 bsLive--; \
694}
695
696
697/*---------------------------------------------*/
698INLINE UInt32 bsR ( Int32 n )
699{
700 UInt32 v;
701 bsNEEDR ( n );
702 v = (bsBuff >> (bsLive-n)) & ((1 << n)-1);
703 bsLive -= n;
704 return v;
705}
706
707
708/*---------------------------------------------*/
709INLINE void bsW ( Int32 n, UInt32 v )
710{
711 bsNEEDW ( n );
712 bsBuff |= (v << (32 - bsLive - n));
713 bsLive += n;
714}
715
716
717/*---------------------------------------------*/
718UChar bsGetUChar ( void )
719{
720 return (UChar)bsR(8);
721}
722
723
724/*---------------------------------------------*/
725void bsPutUChar ( UChar c )
726{
727 bsW(8, (UInt32)c );
728}
729
730
731/*---------------------------------------------*/ 304/*---------------------------------------------*/
732Int32 bsGetUInt32 ( void ) 305Bool myfeof ( FILE* f )
733{ 306{
734 UInt32 u; 307 Int32 c = fgetc ( f );
735 u = 0; 308 if (c == EOF) return True;
736 u = (u << 8) | bsR(8); 309 ungetc ( c, f );
737 u = (u << 8) | bsR(8); 310 return False;
738 u = (u << 8) | bsR(8);
739 u = (u << 8) | bsR(8);
740 return u;
741}
742
743
744/*---------------------------------------------*/
745UInt32 bsGetIntVS ( UInt32 numBits )
746{
747 return (UInt32)bsR(numBits);
748}
749
750
751/*---------------------------------------------*/
752UInt32 bsGetInt32 ( void )
753{
754 return (Int32)bsGetUInt32();
755}
756
757
758/*---------------------------------------------*/
759void bsPutUInt32 ( UInt32 u )
760{
761 bsW ( 8, (u >> 24) & 0xffL );
762 bsW ( 8, (u >> 16) & 0xffL );
763 bsW ( 8, (u >> 8) & 0xffL );
764 bsW ( 8, u & 0xffL );
765}
766
767
768/*---------------------------------------------*/
769void bsPutInt32 ( Int32 c )
770{
771 bsPutUInt32 ( (UInt32)c );
772} 311}
773 312
774 313
775/*---------------------------------------------*/ 314/*---------------------------------------------*/
776void bsPutIntVS ( Int32 numBits, UInt32 c ) 315void compressStream ( FILE *stream, FILE *zStream )
777{ 316{
778 bsW ( numBits, c ); 317 BZFILE* bzf = NULL;
779} 318 UChar ibuf[5000];
780 319 Int32 nIbuf;
781 320 UInt32 nbytes_in, nbytes_out;
782/*---------------------------------------------------*/ 321 Int32 bzerr, bzerr_dummy, ret;
783/*--- Huffman coding low-level stuff ---*/
784/*---------------------------------------------------*/
785
786#define WEIGHTOF(zz0) ((zz0) & 0xffffff00)
787#define DEPTHOF(zz1) ((zz1) & 0x000000ff)
788#define MYMAX(zz2,zz3) ((zz2) > (zz3) ? (zz2) : (zz3))
789
790#define ADDWEIGHTS(zw1,zw2) \
791 (WEIGHTOF(zw1)+WEIGHTOF(zw2)) | \
792 (1 + MYMAX(DEPTHOF(zw1),DEPTHOF(zw2)))
793
794#define UPHEAP(z) \
795{ \
796 Int32 zz, tmp; \
797 zz = z; tmp = heap[zz]; \
798 while (weight[tmp] < weight[heap[zz >> 1]]) { \
799 heap[zz] = heap[zz >> 1]; \
800 zz >>= 1; \
801 } \
802 heap[zz] = tmp; \
803}
804
805#define DOWNHEAP(z) \
806{ \
807 Int32 zz, yy, tmp; \
808 zz = z; tmp = heap[zz]; \
809 while (True) { \
810 yy = zz << 1; \
811 if (yy > nHeap) break; \
812 if (yy < nHeap && \
813 weight[heap[yy+1]] < weight[heap[yy]]) \
814 yy++; \
815 if (weight[tmp] < weight[heap[yy]]) break; \
816 heap[zz] = heap[yy]; \
817 zz = yy; \
818 } \
819 heap[zz] = tmp; \
820}
821 322
323 SET_BINARY_MODE(stream);
324 SET_BINARY_MODE(zStream);
822 325
823/*---------------------------------------------*/ 326 if (ferror(stream)) goto errhandler_io;
824void hbMakeCodeLengths ( UChar *len, 327 if (ferror(zStream)) goto errhandler_io;
825 Int32 *freq,
826 Int32 alphaSize,
827 Int32 maxLen )
828{
829 /*--
830 Nodes and heap entries run from 1. Entry 0
831 for both the heap and nodes is a sentinel.
832 --*/
833 Int32 nNodes, nHeap, n1, n2, i, j, k;
834 Bool tooLong;
835 328
836 Int32 heap [ MAX_ALPHA_SIZE + 2 ]; 329 bzf = bzWriteOpen ( &bzerr, zStream,
837 Int32 weight [ MAX_ALPHA_SIZE * 2 ]; 330 blockSize100k, verbosity, workFactor );
838 Int32 parent [ MAX_ALPHA_SIZE * 2 ]; 331 if (bzerr != BZ_OK) goto errhandler;
839 332
840 for (i = 0; i < alphaSize; i++) 333 if (verbosity >= 2) fprintf ( stderr, "\n" );
841 weight[i+1] = (freq[i] == 0 ? 1 : freq[i]) << 8;
842 334
843 while (True) { 335 while (True) {
844 336
845 nNodes = alphaSize; 337 if (myfeof(stream)) break;
846 nHeap = 0; 338 nIbuf = fread ( ibuf, sizeof(UChar), 5000, stream );
847 339 if (ferror(stream)) goto errhandler_io;
848 heap[0] = 0; 340 if (nIbuf > 0) bzWrite ( &bzerr, bzf, (void*)ibuf, nIbuf );
849 weight[0] = 0; 341 if (bzerr != BZ_OK) goto errhandler;
850 parent[0] = -2;
851
852 for (i = 1; i <= alphaSize; i++) {
853 parent[i] = -1;
854 nHeap++;
855 heap[nHeap] = i;
856 UPHEAP(nHeap);
857 }
858 if (!(nHeap < (MAX_ALPHA_SIZE+2)))
859 panic ( "hbMakeCodeLengths(1)" );
860
861 while (nHeap > 1) {
862 n1 = heap[1]; heap[1] = heap[nHeap]; nHeap--; DOWNHEAP(1);
863 n2 = heap[1]; heap[1] = heap[nHeap]; nHeap--; DOWNHEAP(1);
864 nNodes++;
865 parent[n1] = parent[n2] = nNodes;
866 weight[nNodes] = ADDWEIGHTS(weight[n1], weight[n2]);
867 parent[nNodes] = -1;
868 nHeap++;
869 heap[nHeap] = nNodes;
870 UPHEAP(nHeap);
871 }
872 if (!(nNodes < (MAX_ALPHA_SIZE * 2)))
873 panic ( "hbMakeCodeLengths(2)" );
874
875 tooLong = False;
876 for (i = 1; i <= alphaSize; i++) {
877 j = 0;
878 k = i;
879 while (parent[k] >= 0) { k = parent[k]; j++; }
880 len[i-1] = j;
881 if (j > maxLen) tooLong = True;
882 }
883
884 if (! tooLong) break;
885 342
886 for (i = 1; i < alphaSize; i++) {
887 j = weight[i] >> 8;
888 j = 1 + (j / 2);
889 weight[i] = j << 8;
890 }
891 } 343 }
892}
893
894 344
895/*---------------------------------------------*/ 345 bzWriteClose ( &bzerr, bzf, 0, &nbytes_in, &nbytes_out );
896void hbAssignCodes ( Int32 *code, 346 if (bzerr != BZ_OK) goto errhandler;
897 UChar *length,
898 Int32 minLen,
899 Int32 maxLen,
900 Int32 alphaSize )
901{
902 Int32 n, vec, i;
903 347
904 vec = 0; 348 if (ferror(zStream)) goto errhandler_io;
905 for (n = minLen; n <= maxLen; n++) { 349 ret = fflush ( zStream );
906 for (i = 0; i < alphaSize; i++) 350 if (ret == EOF) goto errhandler_io;
907 if (length[i] == n) { code[i] = vec; vec++; }; 351 if (zStream != stdout) {
908 vec <<= 1; 352 ret = fclose ( zStream );
353 if (ret == EOF) goto errhandler_io;
909 } 354 }
910} 355 if (ferror(stream)) goto errhandler_io;
911 356 ret = fclose ( stream );
912 357 if (ret == EOF) goto errhandler_io;
913/*---------------------------------------------*/
914void hbCreateDecodeTables ( Int32 *limit,
915 Int32 *base,
916 Int32 *perm,
917 UChar *length,
918 Int32 minLen,
919 Int32 maxLen,
920 Int32 alphaSize )
921{
922 Int32 pp, i, j, vec;
923
924 pp = 0;
925 for (i = minLen; i <= maxLen; i++)
926 for (j = 0; j < alphaSize; j++)
927 if (length[j] == i) { perm[pp] = j; pp++; };
928
929 for (i = 0; i < MAX_CODE_LEN; i++) base[i] = 0;
930 for (i = 0; i < alphaSize; i++) base[length[i]+1]++;
931 358
932 for (i = 1; i < MAX_CODE_LEN; i++) base[i] += base[i-1]; 359 if (nbytes_in == 0) nbytes_in = 1;
933 360
934 for (i = 0; i < MAX_CODE_LEN; i++) limit[i] = 0; 361 if (verbosity >= 1)
935 vec = 0; 362 fprintf ( stderr, "%6.3f:1, %6.3f bits/byte, "
936 363 "%5.2f%% saved, %d in, %d out.\n",
937 for (i = minLen; i <= maxLen; i++) { 364 (float)nbytes_in / (float)nbytes_out,
938 vec += (base[i+1] - base[i]); 365 (8.0 * (float)nbytes_out) / (float)nbytes_in,
939 limit[i] = vec-1; 366 100.0 * (1.0 - (float)nbytes_out / (float)nbytes_in),
940 vec <<= 1; 367 nbytes_in,
941 } 368 nbytes_out
942 for (i = minLen + 1; i <= maxLen; i++) 369 );
943 base[i] = ((limit[i-1] + 1) << 1) - base[i];
944}
945
946
947
948/*---------------------------------------------------*/
949/*--- Undoing the reversible transformation ---*/
950/*---------------------------------------------------*/
951
952/*---------------------------------------------*/
953#define SET_LL4(i,n) \
954 { if (((i) & 0x1) == 0) \
955 ll4[(i) >> 1] = (ll4[(i) >> 1] & 0xf0) | (n); else \
956 ll4[(i) >> 1] = (ll4[(i) >> 1] & 0x0f) | ((n) << 4); \
957 }
958
959#define GET_LL4(i) \
960 (((UInt32)(ll4[(i) >> 1])) >> (((i) << 2) & 0x4) & 0xF)
961
962#define SET_LL(i,n) \
963 { ll16[i] = (UInt16)(n & 0x0000ffff); \
964 SET_LL4(i, n >> 16); \
965 }
966
967#define GET_LL(i) \
968 (((UInt32)ll16[i]) | (GET_LL4(i) << 16))
969
970
971/*---------------------------------------------*/
972/*--
973 Manage memory for compression/decompression.
974 When compressing, a single block size applies to
975 all files processed, and that's set when the
976 program starts. But when decompressing, each file
977 processed could have been compressed with a
978 different block size, so we may have to free
979 and reallocate on a per-file basis.
980
981 A call with argument of zero means
982 `free up everything.' And a value of zero for
983 blockSize100k means no memory is currently allocated.
984--*/
985 370
371 return;
986 372
987/*---------------------------------------------*/ 373 errhandler:
988void allocateCompressStructures ( void ) 374 bzWriteClose ( &bzerr_dummy, bzf, 1, &nbytes_in, &nbytes_out );
989{ 375 switch (bzerr) {
990 Int32 n = 100000 * blockSize100k; 376 case BZ_MEM_ERROR:
991 block = malloc ( (n + 1 + NUM_OVERSHOOT_BYTES) * sizeof(UChar) ); 377 outOfMemory ();
992 quadrant = malloc ( (n + NUM_OVERSHOOT_BYTES) * sizeof(Int16) ); 378 case BZ_IO_ERROR:
993 zptr = malloc ( n * sizeof(Int32) ); 379 errhandler_io:
994 ftab = malloc ( 65537 * sizeof(Int32) ); 380 ioError(); break;
995 381 default:
996 if (block == NULL || quadrant == NULL || 382 panic ( "compress:unexpected error" );
997 zptr == NULL || ftab == NULL) {
998 Int32 totalDraw
999 = (n + 1 + NUM_OVERSHOOT_BYTES) * sizeof(UChar) +
1000 (n + NUM_OVERSHOOT_BYTES) * sizeof(Int16) +
1001 n * sizeof(Int32) +
1002 65537 * sizeof(Int32);
1003
1004 compressOutOfMemory ( totalDraw, n );
1005 } 383 }
1006 384
1007 /*-- 385 panic ( "compress:end" );
1008 Since we want valid indexes for block of 386 /*notreached*/
1009 -1 to n + NUM_OVERSHOOT_BYTES - 1
1010 inclusive.
1011 --*/
1012 block++;
1013
1014 /*--
1015 The back end needs a place to store the MTF values
1016 whilst it calculates the coding tables. We could
1017 put them in the zptr array. However, these values
1018 will fit in a short, so we overlay szptr at the
1019 start of zptr, in the hope of reducing the number
1020 of cache misses induced by the multiple traversals
1021 of the MTF values when calculating coding tables.
1022 Seems to improve compression speed by about 1%.
1023 --*/
1024 szptr = (UInt16*)zptr;
1025}
1026
1027
1028/*---------------------------------------------*/
1029void setDecompressStructureSizes ( Int32 newSize100k )
1030{
1031 if (! (0 <= newSize100k && newSize100k <= 9 &&
1032 0 <= blockSize100k && blockSize100k <= 9))
1033 panic ( "setDecompressStructureSizes" );
1034
1035 if (newSize100k == blockSize100k) return;
1036
1037 blockSize100k = newSize100k;
1038
1039 if (ll16 != NULL) free ( ll16 );
1040 if (ll4 != NULL) free ( ll4 );
1041 if (ll8 != NULL) free ( ll8 );
1042 if (tt != NULL) free ( tt );
1043
1044 if (newSize100k == 0) return;
1045
1046 if (smallMode) {
1047
1048 Int32 n = 100000 * newSize100k;
1049 ll16 = malloc ( n * sizeof(UInt16) );
1050 ll4 = malloc ( ((n+1) >> 1) * sizeof(UChar) );
1051
1052 if (ll4 == NULL || ll16 == NULL) {
1053 Int32 totalDraw
1054 = n * sizeof(Int16) + ((n+1) >> 1) * sizeof(UChar);
1055 uncompressOutOfMemory ( totalDraw, n );
1056 }
1057
1058 } else {
1059
1060 Int32 n = 100000 * newSize100k;
1061 ll8 = malloc ( n * sizeof(UChar) );
1062 tt = malloc ( n * sizeof(Int32) );
1063
1064 if (ll8 == NULL || tt == NULL) {
1065 Int32 totalDraw
1066 = n * sizeof(UChar) + n * sizeof(UInt32);
1067 uncompressOutOfMemory ( totalDraw, n );
1068 }
1069
1070 }
1071} 387}
1072 388
1073 389
1074 390
1075/*---------------------------------------------------*/
1076/*--- The new back end ---*/
1077/*---------------------------------------------------*/
1078
1079/*---------------------------------------------*/
1080void makeMaps ( void )
1081{
1082 Int32 i;
1083 nInUse = 0;
1084 for (i = 0; i < 256; i++)
1085 if (inUse[i]) {
1086 seqToUnseq[nInUse] = i;
1087 unseqToSeq[i] = nInUse;
1088 nInUse++;
1089 }
1090}
1091
1092
1093/*---------------------------------------------*/ 391/*---------------------------------------------*/
1094void generateMTFValues ( void ) 392Bool uncompressStream ( FILE *zStream, FILE *stream )
1095{
1096 UChar yy[256];
1097 Int32 i, j;
1098 UChar tmp;
1099 UChar tmp2;
1100 Int32 zPend;
1101 Int32 wr;
1102 Int32 EOB;
1103
1104 makeMaps();
1105 EOB = nInUse+1;
1106
1107 for (i = 0; i <= EOB; i++) mtfFreq[i] = 0;
1108
1109 wr = 0;
1110 zPend = 0;
1111 for (i = 0; i < nInUse; i++) yy[i] = (UChar) i;
1112
1113
1114 for (i = 0; i <= last; i++) {
1115 UChar ll_i;
1116
1117 #if DEBUG
1118 assert (wr <= i);
1119 #endif
1120
1121 ll_i = unseqToSeq[block[zptr[i] - 1]];
1122 #if DEBUG
1123 assert (ll_i < nInUse);
1124 #endif
1125
1126 j = 0;
1127 tmp = yy[j];
1128 while ( ll_i != tmp ) {
1129 j++;
1130 tmp2 = tmp;
1131 tmp = yy[j];
1132 yy[j] = tmp2;
1133 };
1134 yy[0] = tmp;
1135
1136 if (j == 0) {
1137 zPend++;
1138 } else {
1139 if (zPend > 0) {
1140 zPend--;
1141 while (True) {
1142 switch (zPend % 2) {
1143 case 0: szptr[wr] = RUNA; wr++; mtfFreq[RUNA]++; break;
1144 case 1: szptr[wr] = RUNB; wr++; mtfFreq[RUNB]++; break;
1145 };
1146 if (zPend < 2) break;
1147 zPend = (zPend - 2) / 2;
1148 };
1149 zPend = 0;
1150 }
1151 szptr[wr] = j+1; wr++; mtfFreq[j+1]++;
1152 }
1153 }
1154
1155 if (zPend > 0) {
1156 zPend--;
1157 while (True) {
1158 switch (zPend % 2) {
1159 case 0: szptr[wr] = RUNA; wr++; mtfFreq[RUNA]++; break;
1160 case 1: szptr[wr] = RUNB; wr++; mtfFreq[RUNB]++; break;
1161 };
1162 if (zPend < 2) break;
1163 zPend = (zPend - 2) / 2;
1164 };
1165 }
1166
1167 szptr[wr] = EOB; wr++; mtfFreq[EOB]++;
1168
1169 nMTF = wr;
1170}
1171
1172
1173/*---------------------------------------------*/
1174#define LESSER_ICOST 0
1175#define GREATER_ICOST 15
1176
1177void sendMTFValues ( void )
1178{ 393{
1179 Int32 v, t, i, j, gs, ge, totc, bt, bc, iter; 394 BZFILE* bzf = NULL;
1180 Int32 nSelectors, alphaSize, minLen, maxLen, selCtr; 395 Int32 bzerr, bzerr_dummy, ret, nread, streamNo, i;
1181 Int32 nGroups, nBytes; 396 UChar obuf[5000];
1182 397 UChar unused[BZ_MAX_UNUSED];
1183 /*-- 398 Int32 nUnused;
1184 UChar len [N_GROUPS][MAX_ALPHA_SIZE]; 399 UChar* unusedTmp;
1185 is a global since the decoder also needs it.
1186
1187 Int32 code[N_GROUPS][MAX_ALPHA_SIZE];
1188 Int32 rfreq[N_GROUPS][MAX_ALPHA_SIZE];
1189 are also globals only used in this proc.
1190 Made global to keep stack frame size small.
1191 --*/
1192
1193
1194 UInt16 cost[N_GROUPS];
1195 Int32 fave[N_GROUPS];
1196
1197 if (verbosity >= 3)
1198 fprintf ( stderr,
1199 " %d in block, %d after MTF & 1-2 coding, %d+2 syms in use\n",
1200 last+1, nMTF, nInUse );
1201
1202 alphaSize = nInUse+2;
1203 for (t = 0; t < N_GROUPS; t++)
1204 for (v = 0; v < alphaSize; v++)
1205 len[t][v] = GREATER_ICOST;
1206
1207 /*--- Decide how many coding tables to use ---*/
1208 if (nMTF <= 0) panic ( "sendMTFValues(0)" );
1209 if (nMTF < 200) nGroups = 2; else
1210 if (nMTF < 800) nGroups = 4; else
1211 nGroups = 6;
1212
1213 /*--- Generate an initial set of coding tables ---*/
1214 {
1215 Int32 nPart, remF, tFreq, aFreq;
1216
1217 nPart = nGroups;
1218 remF = nMTF;
1219 gs = 0;
1220 while (nPart > 0) {
1221 tFreq = remF / nPart;
1222 ge = gs-1;
1223 aFreq = 0;
1224 while (aFreq < tFreq && ge < alphaSize-1) {
1225 ge++;
1226 aFreq += mtfFreq[ge];
1227 }
1228
1229 if (ge > gs
1230 && nPart != nGroups && nPart != 1
1231 && ((nGroups-nPart) % 2 == 1)) {
1232 aFreq -= mtfFreq[ge];
1233 ge--;
1234 }
1235 400
1236 if (verbosity >= 3) 401 nUnused = 0;
1237 fprintf ( stderr, 402 streamNo = 0;
1238 " initial group %d, [%d .. %d], has %d syms (%4.1f%%)\n",
1239 nPart, gs, ge, aFreq,
1240 (100.0 * (float)aFreq) / (float)nMTF );
1241
1242 for (v = 0; v < alphaSize; v++)
1243 if (v >= gs && v <= ge)
1244 len[nPart-1][v] = LESSER_ICOST; else
1245 len[nPart-1][v] = GREATER_ICOST;
1246
1247 nPart--;
1248 gs = ge+1;
1249 remF -= aFreq;
1250 }
1251 }
1252
1253 /*---
1254 Iterate up to N_ITERS times to improve the tables.
1255 ---*/
1256 for (iter = 0; iter < N_ITERS; iter++) {
1257
1258 for (t = 0; t < nGroups; t++) fave[t] = 0;
1259
1260 for (t = 0; t < nGroups; t++)
1261 for (v = 0; v < alphaSize; v++)
1262 rfreq[t][v] = 0;
1263
1264 nSelectors = 0;
1265 totc = 0;
1266 gs = 0;
1267 while (True) {
1268
1269 /*--- Set group start & end marks. --*/
1270 if (gs >= nMTF) break;
1271 ge = gs + G_SIZE - 1;
1272 if (ge >= nMTF) ge = nMTF-1;
1273
1274 /*--
1275 Calculate the cost of this group as coded
1276 by each of the coding tables.
1277 --*/
1278 for (t = 0; t < nGroups; t++) cost[t] = 0;
1279
1280 if (nGroups == 6) {
1281 register UInt16 cost0, cost1, cost2, cost3, cost4, cost5;
1282 cost0 = cost1 = cost2 = cost3 = cost4 = cost5 = 0;
1283 for (i = gs; i <= ge; i++) {
1284 UInt16 icv = szptr[i];
1285 cost0 += len[0][icv];
1286 cost1 += len[1][icv];
1287 cost2 += len[2][icv];
1288 cost3 += len[3][icv];
1289 cost4 += len[4][icv];
1290 cost5 += len[5][icv];
1291 }
1292 cost[0] = cost0; cost[1] = cost1; cost[2] = cost2;
1293 cost[3] = cost3; cost[4] = cost4; cost[5] = cost5;
1294 } else {
1295 for (i = gs; i <= ge; i++) {
1296 UInt16 icv = szptr[i];
1297 for (t = 0; t < nGroups; t++) cost[t] += len[t][icv];
1298 }
1299 }
1300
1301 /*--
1302 Find the coding table which is best for this group,
1303 and record its identity in the selector table.
1304 --*/
1305 bc = 999999999; bt = -1;
1306 for (t = 0; t < nGroups; t++)
1307 if (cost[t] < bc) { bc = cost[t]; bt = t; };
1308 totc += bc;
1309 fave[bt]++;
1310 selector[nSelectors] = bt;
1311 nSelectors++;
1312
1313 /*--
1314 Increment the symbol frequencies for the selected table.
1315 --*/
1316 for (i = gs; i <= ge; i++)
1317 rfreq[bt][ szptr[i] ]++;
1318
1319 gs = ge+1;
1320 }
1321 if (verbosity >= 3) {
1322 fprintf ( stderr,
1323 " pass %d: size is %d, grp uses are ",
1324 iter+1, totc/8 );
1325 for (t = 0; t < nGroups; t++)
1326 fprintf ( stderr, "%d ", fave[t] );
1327 fprintf ( stderr, "\n" );
1328 }
1329
1330 /*--
1331 Recompute the tables based on the accumulated frequencies.
1332 --*/
1333 for (t = 0; t < nGroups; t++)
1334 hbMakeCodeLengths ( &len[t][0], &rfreq[t][0], alphaSize, 20 );
1335 }
1336 403
404 SET_BINARY_MODE(stream);
405 SET_BINARY_MODE(zStream);
1337 406
1338 if (!(nGroups < 8)) panic ( "sendMTFValues(1)" ); 407 if (ferror(stream)) goto errhandler_io;
1339 if (!(nSelectors < 32768 && 408 if (ferror(zStream)) goto errhandler_io;
1340 nSelectors <= (2 + (900000 / G_SIZE))))
1341 panic ( "sendMTFValues(2)" );
1342
1343
1344 /*--- Compute MTF values for the selectors. ---*/
1345 {
1346 UChar pos[N_GROUPS], ll_i, tmp2, tmp;
1347 for (i = 0; i < nGroups; i++) pos[i] = i;
1348 for (i = 0; i < nSelectors; i++) {
1349 ll_i = selector[i];
1350 j = 0;
1351 tmp = pos[j];
1352 while ( ll_i != tmp ) {
1353 j++;
1354 tmp2 = tmp;
1355 tmp = pos[j];
1356 pos[j] = tmp2;
1357 };
1358 pos[0] = tmp;
1359 selectorMtf[i] = j;
1360 }
1361 };
1362
1363 /*--- Assign actual codes for the tables. --*/
1364 for (t = 0; t < nGroups; t++) {
1365 minLen = 32;
1366 maxLen = 0;
1367 for (i = 0; i < alphaSize; i++) {
1368 if (len[t][i] > maxLen) maxLen = len[t][i];
1369 if (len[t][i] < minLen) minLen = len[t][i];
1370 }
1371 if (maxLen > 20) panic ( "sendMTFValues(3)" );
1372 if (minLen < 1) panic ( "sendMTFValues(4)" );
1373 hbAssignCodes ( &code[t][0], &len[t][0],
1374 minLen, maxLen, alphaSize );
1375 }
1376
1377 /*--- Transmit the mapping table. ---*/
1378 {
1379 Bool inUse16[16];
1380 for (i = 0; i < 16; i++) {
1381 inUse16[i] = False;
1382 for (j = 0; j < 16; j++)
1383 if (inUse[i * 16 + j]) inUse16[i] = True;
1384 }
1385
1386 nBytes = bytesOut;
1387 for (i = 0; i < 16; i++)
1388 if (inUse16[i]) bsW(1,1); else bsW(1,0);
1389
1390 for (i = 0; i < 16; i++)
1391 if (inUse16[i])
1392 for (j = 0; j < 16; j++)
1393 if (inUse[i * 16 + j]) bsW(1,1); else bsW(1,0);
1394
1395 if (verbosity >= 3)
1396 fprintf ( stderr, " bytes: mapping %d, ", bytesOut-nBytes );