summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Julian Seward <jseward@acm.org> 1999-09-04 22:13:13 +0200
committer Julian Seward <jseward@acm.org> 1999-09-04 22:13:13 +0200
commit f93cd82a9a7094ad90fd19bbc6ccf6f4627f8060 (patch)
tree c95407df5665f5a7395683f07552f2b13f2e501f
parent bzip2-0.9.0c (diff)
download bzip2-f93cd82a9a7094ad90fd19bbc6ccf6f4627f8060.tar.gz
bzip2-f93cd82a9a7094ad90fd19bbc6ccf6f4627f8060.tar.bz2
bzip2-f93cd82a9a7094ad90fd19bbc6ccf6f4627f8060.tar.xz
bzip2-0.9.5d bzip2-0.9.5d
-rw-r--r-- CHANGES 55
-rw-r--r-- LICENSE 6
-rw-r--r-- Makefile 89
-rw-r--r-- README 78
-rw-r--r-- Y2K_INFO 34
-rw-r--r-- blocksort.c 1242
-rw-r--r-- bzip2.1 610
-rw-r--r-- bzip2.1.preformatted 440
-rw-r--r-- bzip2.c 545
-rw-r--r-- bzip2.txt 336
-rw-r--r-- bzip2recover.c 21
-rw-r--r-- bzlib.c 138
-rw-r--r-- bzlib.h 13
-rw-r--r-- bzlib_private.h 47
-rw-r--r-- compress.c 141
-rw-r--r-- crctable.c 6
-rw-r--r-- decompress.c 8
-rw-r--r-- dlltest.c 242
-rw-r--r-- howbig.c 37
-rw-r--r-- huffman.c 6
-rw-r--r-- makefile.msc 65
-rw-r--r-- manual.texi 869
-rw-r--r-- randtable.c 6
-rw-r--r-- sample3.bz2 bin 0 -> 235 bytes
-rw-r--r-- sample3.ref 30007
-rw-r--r-- words1 2
-rw-r--r-- words3 21
27 files changed, 32912 insertions, 2152 deletions
diff --git a/CHANGES b/CHANGES
index ac00f3a..0acb1c2 100644
--- a/CHANGES
+++ b/CHANGES
@@ -43,3 +43,58 @@ In compress.c:
43 do a bit better on small files. This _does_ affect 43 do a bit better on small files. This _does_ affect
44 bzip2.c. 44 bzip2.c.
45 45
46
470.9.5a
48~~~~~~
49Major change: add a fallback sorting algorithm (blocksort.c)
50to give reasonable behaviour even for very repetitive inputs.
51Nuked --repetitive-best and --repetitive-fast since they are
52no longer useful.
53
54Minor changes: mostly a whole bunch of small changes/
55bugfixes in the driver (bzip2.c). Changes pertaining to the
56user interface are:
57
58 allow decompression of symlink'd files to stdout
59 decompress/test files even without .bz2 extension
60 give more accurate error messages for I/O errors
61 when compressing/decompressing to stdout, don't catch control-C
62 read flags from BZIP2 and BZIP environment variables
63 decline to break hard links to a file unless forced with -f
64 allow -c flag even with no filenames
65 preserve file ownerships as far as possible
66 make -s -1 give the expected block size (100k)
67 add a flag -q --quiet to suppress nonessential warnings
68 stop decoding flags after --, so files beginning in - can be handled
69 resolved inconsistent naming: bzcat or bz2cat ?
70 bzip2 --help now returns 0
71
72Programming-level changes are:
73
74 fixed syntax error in GET_LL4 for Borland C++ 5.02
75 let bzBuffToBuffDecompress return BZ_DATA_ERROR{_MAGIC}
76 fix overshoot of mode-string end in bzopen_or_bzdopen
77 wrapped bzlib.h in #ifdef __cplusplus ... extern "C" { ... }
78 close file handles under all error conditions
79 added minor mods so it compiles with DJGPP out of the box
80 fixed Makefile so it doesn't give problems with BSD make
81 fix uninitialised memory reads in dlltest.c
82
830.9.5b
84~~~~~~
85Open stdin/stdout in binary mode for DJGPP.
86
870.9.5c
88~~~~~~
89Changed BZ_N_OVERSHOOT to be ... + 2 instead of ... + 1. The + 1
90version could cause the sorted order to be wrong in some extremely
91obscure cases. Also changed setting of quadrant in blocksort.c.
92
930.9.5d
94~~~~~~
95The only functional change is to make bzlibVersion() in the library
96return the correct string. This has no effect whatsoever on the
97functioning of the bzip2 program or library. Added a couple of casts
98so the library compiles without warnings at level 3 in MS Visual
99Studio 6.0. Included a Y2K statement in the file Y2K_INFO. All other
100changes are minor documentation changes.
diff --git a/LICENSE b/LICENSE
index 3de0301..bc0069a 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
1 1
2This program, "bzip2" and associated library "libbzip2", are 2This program, "bzip2" and associated library "libbzip2", are
3copyright (C) 1996-1998 Julian R Seward. All rights reserved. 3copyright (C) 1996-1999 Julian R Seward. All rights reserved.
4 4
5Redistribution and use in source and binary forms, with or without 5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions 6modification, are permitted provided that the following conditions
@@ -33,7 +33,7 @@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
33NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 35
36Julian Seward, Guildford, Surrey, UK. 36Julian Seward, Cambridge, UK.
37jseward@acm.org 37jseward@acm.org
38bzip2/libbzip2 version 0.9.0 of 28 June 1998 38bzip2/libbzip2 version 0.9.5 of 24 May 1999
39 39
diff --git a/Makefile b/Makefile
index 8ebea66..8a1235d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,7 @@
1 1
2SHELL=/bin/sh
2CC=gcc 3CC=gcc
3CFLAGS=-Wall -O2 -fomit-frame-pointer -fno-strength-reduce 4CFLAGS=-Wall -Winline -O2 -fomit-frame-pointer -fno-strength-reduce
4 5
5OBJS= blocksort.o \ 6OBJS= blocksort.o \
6 huffman.o \ 7 huffman.o \
@@ -10,37 +11,93 @@ OBJS= blocksort.o \
10 decompress.o \ 11 decompress.o \
11 bzlib.o 12 bzlib.o
12 13
13all: lib bzip2 test 14all: libbz2.a bzip2 bzip2recover test
14 15
15bzip2: lib 16bzip2: libbz2.a bzip2.o
16 $(CC) $(CFLAGS) -c bzip2.c
17 $(CC) $(CFLAGS) -o bzip2 bzip2.o -L. -lbz2 17 $(CC) $(CFLAGS) -o bzip2 bzip2.o -L. -lbz2
18 $(CC) $(CFLAGS) -o bzip2recover bzip2recover.c
19 18
20lib: $(OBJS) 19bzip2recover: bzip2recover.o
20 $(CC) $(CFLAGS) -o bzip2recover bzip2recover.o
21
22libbz2.a: $(OBJS)
21 rm -f libbz2.a 23 rm -f libbz2.a
22 ar clq libbz2.a $(OBJS) 24 ar cq libbz2.a $(OBJS)
25 @if ( test -f /usr/bin/ranlib -o -f /bin/ranlib -o \
26 -f /usr/ccs/bin/ranlib ) ; then \
27 echo ranlib libbz2.a ; \
28 ranlib libbz2.a ; \
29 fi
23 30
24test: bzip2 31test: bzip2
25 @cat words1 32 @cat words1
26 ./bzip2 -1 < sample1.ref > sample1.rb2 33 ./bzip2 -1 < sample1.ref > sample1.rb2
27 ./bzip2 -2 < sample2.ref > sample2.rb2 34 ./bzip2 -2 < sample2.ref > sample2.rb2
28 ./bzip2 -d < sample1.bz2 > sample1.tst 35 ./bzip2 -3 < sample3.ref > sample3.rb2
29 ./bzip2 -d < sample2.bz2 > sample2.tst 36 ./bzip2 -d < sample1.bz2 > sample1.tst
30 @cat words2 37 ./bzip2 -d < sample2.bz2 > sample2.tst
38 ./bzip2 -ds < sample3.bz2 > sample3.tst
31 cmp sample1.bz2 sample1.rb2 39 cmp sample1.bz2 sample1.rb2
32 cmp sample2.bz2 sample2.rb2 40 cmp sample2.bz2 sample2.rb2
41 cmp sample3.bz2 sample3.rb2
33 cmp sample1.tst sample1.ref 42 cmp sample1.tst sample1.ref
34 cmp sample2.tst sample2.ref 43 cmp sample2.tst sample2.ref
44 cmp sample3.tst sample3.ref
35 @cat words3 45 @cat words3
36 46
47PREFIX=/usr
48
49install: bzip2 bzip2recover
50 if ( test ! -d $(PREFIX)/bin ) ; then mkdir $(PREFIX)/bin ; fi
51 if ( test ! -d $(PREFIX)/lib ) ; then mkdir $(PREFIX)/lib ; fi
52 if ( test ! -d $(PREFIX)/man ) ; then mkdir $(PREFIX)/man ; fi
53 if ( test ! -d $(PREFIX)/man/man1 ) ; then mkdir $(PREFIX)/man/man1 ; fi
54 if ( test ! -d $(PREFIX)/include ) ; then mkdir $(PREFIX)/include ; fi
55 cp -f bzip2 $(PREFIX)/bin/bzip2
56 cp -f bzip2 $(PREFIX)/bin/bunzip2
57 cp -f bzip2 $(PREFIX)/bin/bzcat
58 cp -f bzip2recover $(PREFIX)/bin/bzip2recover
59 chmod a+x $(PREFIX)/bin/bzip2
60 chmod a+x $(PREFIX)/bin/bunzip2
61 chmod a+x $(PREFIX)/bin/bzcat
62 chmod a+x $(PREFIX)/bin/bzip2recover
63 cp -f bzip2.1 $(PREFIX)/man/man1
64 chmod a+r $(PREFIX)/man/man1/bzip2.1
65 cp -f bzlib.h $(PREFIX)/include
66 chmod a+r $(PREFIX)/include/bzlib.h
67 cp -f libbz2.a $(PREFIX)/lib
68 chmod a+r $(PREFIX)/lib/libbz2.a
37 69
38clean: 70clean:
39 rm -f *.o libbz2.a bzip2 bzip2recover sample1.rb2 sample2.rb2 sample1.tst sample2.tst 71 rm -f *.o libbz2.a bzip2 bzip2recover \
72 sample1.rb2 sample2.rb2 sample3.rb2 \
73 sample1.tst sample2.tst sample3.tst
40 74
41.c.o: $*.o bzlib.h bzlib_private.h 75blocksort.o: blocksort.c
42 $(CC) $(CFLAGS) -c $*.c -o $*.o 76 $(CC) $(CFLAGS) -c blocksort.c
77huffman.o: huffman.c
78 $(CC) $(CFLAGS) -c huffman.c
79crctable.o: crctable.c
80 $(CC) $(CFLAGS) -c crctable.c
81randtable.o: randtable.c
82 $(CC) $(CFLAGS) -c randtable.c
83compress.o: compress.c
84 $(CC) $(CFLAGS) -c compress.c
85decompress.o: decompress.c
86 $(CC) $(CFLAGS) -c decompress.c
87bzlib.o: bzlib.c
88 $(CC) $(CFLAGS) -c bzlib.c
89bzip2.o: bzip2.c
90 $(CC) $(CFLAGS) -c bzip2.c
91bzip2recover.o: bzip2recover.c
92 $(CC) $(CFLAGS) -c bzip2recover.c
43 93
44tarfile: 94tarfile:
45 tar cvf interim.tar *.c *.h Makefile manual.texi manual.ps LICENSE bzip2.1 bzip2.1.preformatted bzip2.txt words1 words2 words3 sample1.ref sample2.ref sample1.bz2 sample2.bz2 *.html README CHANGES libbz2.def libbz2.dsp dlltest.dsp 95 tar cvf interim.tar blocksort.c huffman.c crctable.c \
96 randtable.c compress.c decompress.c bzlib.c bzip2.c \
97 bzip2recover.c bzlib.h bzlib_private.h Makefile manual.texi \
98 manual.ps LICENSE bzip2.1 bzip2.1.preformatted bzip2.txt \
99 words1 words2 words3 sample1.ref sample2.ref sample3.ref \
100 sample1.bz2 sample2.bz2 sample3.bz2 dlltest.c \
101 *.html README CHANGES libbz2.def libbz2.dsp \
102 dlltest.dsp makefile.msc Y2K_INFO
46 103
diff --git a/README b/README
index 2f59ef7..ee70649 100644
--- a/README
+++ b/README
@@ -1,48 +1,44 @@
1 1
2
3This is the README for bzip2, a block-sorting file compressor, version 2This is the README for bzip2, a block-sorting file compressor, version
40.9.0. This version is fully compatible with the previous public 30.9.5d. This version is fully compatible with the previous public
5release, bzip2-0.1pl2. 4releases, bzip2-0.1pl2 and bzip2-0.9.0.
6 5
7bzip2-0.9.0 is distributed under a BSD-style license. For details, 6bzip2-0.9.5 is distributed under a BSD-style license. For details,
8see the file LICENSE. 7see the file LICENSE.
9 8
10Complete documentation is available in Postscript form (manual.ps) 9Complete documentation is available in Postscript form (manual.ps) or
11or html (manual_toc.html). A plain-text version of the manual page is 10html (manual_toc.html). A plain-text version of the manual page is
12available as bzip2.txt. 11available as bzip2.txt. A statement about Y2K issues is now included
12in the file Y2K_INFO.
13 13
14 14
15HOW TO BUILD -- UNIX 15HOW TO BUILD -- UNIX
16 16
17Type `make'. 17Type `make'. This builds the library libbz2.a and then the
18 18programs bzip2 and bzip2recover. Six self-tests are run.
19This creates binaries "bzip2" and "bzip2recover". 19If the self-tests complete ok, carry on to installation:
20
21It also runs four compress-decompress tests to make sure things are
22working properly. If all goes well, you should be up & running.
23Please be sure to read the output from `make' just to be sure that the
24tests went ok.
25
26To install bzip2 properly:
27
28* Copy the binaries "bzip2" and "bzip2recover" to a publically visible
29 place, possibly /usr/bin or /usr/local/bin.
30 20
31* In that directory, make "bunzip2" and "bzcat" be symbolic links 21To install in /usr/bin, /usr/lib, /usr/man and /usr/include, type
32 to "bzip2". 22 make install
23To install somewhere else, eg, /xxx/yyy/{bin,lib,man,include}, type
24 make install PREFIX=/xxx/yyy
25If you are (justifiably) paranoid and want to see what 'make install'
26is going to do, you can first do
27 make -n install or
28 make -n install PREFIX=/xxx/yyy respectively.
29The -n instructs make to show the commands it would execute, but
30not actually execute them.
33 31
34* Copy the manual page, bzip2.1, to the relevant place.
35 Probably the right place is /usr/man/man1/.
36
37If you want to program with the library, you'll need to copy libbz2.a
38and bzlib.h to /usr/lib and /usr/include respectively.
39
40 32
41HOW TO BUILD -- Windows 95, NT, DOS, Mac, etc. 33HOW TO BUILD -- Windows 95, NT, DOS, Mac, etc.
42 34
43It's difficult for me to support compilation on all these platforms. 35It's difficult for me to support compilation on all these platforms.
44My approach is to collect binaries for these platforms, and put them 36My approach is to collect binaries for these platforms, and put them
45on my web page (http://www.muraroa.demon.co.uk). Look there. 37on my web page (http://www.muraroa.demon.co.uk). Look there. However
38(FWIW), bzip2-0.9.5 is very standard ANSI C and should compile
39unmodified with MS Visual C. For Win32, there is one important
40caveat: in bzip2.c, you must set BZ_UNIX to 0 and BZ_LCCWIN32 to 1
41before building.
46 42
47 43
48VALIDATION 44VALIDATION
@@ -112,26 +108,32 @@ WHAT'S NEW IN 0.9.0 (as compared to 0.1pl2) ?
112 * Much more documentation, i.e., a proper user manual 108 * Much more documentation, i.e., a proper user manual
113 * Hopefully, improved portability (at least of the library) 109 * Hopefully, improved portability (at least of the library)
114 110
111WHAT'S NEW IN 0.9.5 ?
112
113 * Compression speed is much less sensitive to the input
114 data than in previous versions. Specifically, the very
115 slow performance caused by repetitive data is fixed.
116 * Many small improvements in file and flag handling.
117 * A Y2K statement.
115 118
116I hope you find bzip2 useful. Feel free to contact me at 119I hope you find bzip2 useful. Feel free to contact me at
117 jseward@acm.org 120 jseward@acm.org
118if you have any suggestions or queries. Many people mailed me with 121if you have any suggestions or queries. Many people mailed me with
119comments, suggestions and patches after the releases of bzip-0.15, 122comments, suggestions and patches after the releases of bzip-0.15,
120bzip-0.21 and bzip2-0.1pl2, and the changes in bzip2 are largely a 123bzip-0.21, bzip2-0.1pl2 and bzip2-0.9.0, and the changes in bzip2 are
121result of this feedback. I thank you for your comments. 124largely a result of this feedback. I thank you for your comments.
122 125
123At least for the time being, bzip2's "home" is 126At least for the time being, bzip2's "home" is (or can be reached via)
124http://www.muraroa.demon.co.uk. 127http://www.muraroa.demon.co.uk.
125 128
126Julian Seward 129Julian Seward
127jseward@acm.org 130jseward@acm.org
128 131
129Manchester, UK 132Cambridge, UK
13018 July 1996 (version 0.15) 13318 July 1996 (version 0.15)
13125 August 1996 (version 0.21) 13425 August 1996 (version 0.21)
132 135 7 August 1997 (bzip2, version 0.1)
133Guildford, Surrey, UK
1347 August 1997 (bzip2, version 0.1)
13529 August 1997 (bzip2, version 0.1pl2) 13629 August 1997 (bzip2, version 0.1pl2)
13623 August 1998 (bzip2, version 0.9.0) 13723 August 1998 (bzip2, version 0.9.0)
137 138 8 June 1999 (bzip2, version 0.9.5)
139 4 Sept 1999 (bzip2, version 0.9.5d)
diff --git a/Y2K_INFO b/Y2K_INFO
new file mode 100644
index 0000000..55fd56a
--- /dev/null
+++ b/Y2K_INFO
@@ -0,0 +1,34 @@
1
2Y2K status of bzip2 and libbzip2, versions 0.1, 0.9.0 and 0.9.5
3~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4
5Informally speaking:
6 bzip2 is a compression program built on top of libbzip2,
7 a library which does the real work of compression and
8 decompression. As far as I am aware, libbzip2 does not have
9 any date-related code at all.
10
11 bzip2 itself copies dates from source to destination files
12 when compressing or decompressing, using the 'stat' and 'utime'
13 UNIX system calls. It doesn't examine, manipulate or store the
14 dates in any way. So as far as I can see, there shouldn't be any
15 problem with bzip2 providing 'stat' and 'utime' work correctly
16 on your system.
17
18 On non-unix platforms (those for which BZ_UNIX in bzip2.c is
19 not set to 1), bzip2 doesn't even do the date copying.
20
21 Overall, informally speaking, I don't think bzip2 or libbzip2
22 have a Y2K problem.
23
24Formally speaking:
25 I am not prepared to offer you any assurance whatsoever
26 regarding Y2K issues in my software. You alone assume the
27 entire risk of using the software. The disclaimer of liability
28 in the LICENSE file in the bzip2 source distribution continues
29 to apply on this issue as with every other issue pertaining
30 to the software.
31
32Julian Seward
33Cambridge, UK
3425 August 1999
diff --git a/blocksort.c b/blocksort.c
index d8bb26a..85a02de 100644
--- a/blocksort.c
+++ b/blocksort.c
@@ -8,7 +8,7 @@
8 This file is a part of bzip2 and/or libbzip2, a program and 8 This file is a part of bzip2 and/or libbzip2, a program and
9 library for lossless, block-sorting data compression. 9 library for lossless, block-sorting data compression.
10 10
11 Copyright (C) 1996-1998 Julian R Seward. All rights reserved. 11 Copyright (C) 1996-1999 Julian R Seward. All rights reserved.
12 12
13 Redistribution and use in source and binary forms, with or without 13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions 14 modification, are permitted provided that the following conditions
@@ -41,9 +41,9 @@
41 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 41 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 42 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 43
44 Julian Seward, Guildford, Surrey, UK. 44 Julian Seward, Cambridge, UK.
45 jseward@acm.org 45 jseward@acm.org
46 bzip2/libbzip2 version 0.9.0c of 18 October 1998 46 bzip2/libbzip2 version 0.9.5 of 24 May 1999
47 47
48 This program is based on (at least) the work of: 48 This program is based on (at least) the work of:
49 Mike Burrows 49 Mike Burrows
@@ -62,106 +62,404 @@
62#include "bzlib_private.h" 62#include "bzlib_private.h"
63 63
64/*---------------------------------------------*/ 64/*---------------------------------------------*/
65/*-- 65/*--- Fallback O(N log(N)^2) sorting ---*/
66 Compare two strings in block. We assume (see 66/*--- algorithm, for repetitive blocks ---*/
67 discussion above) that i1 and i2 have a max 67/*---------------------------------------------*/
68 offset of 10 on entry, and that the first 68
69 bytes of both block and quadrant have been 69/*---------------------------------------------*/
70 copied into the "overshoot area", ie 70static
71 into the subscript range 71__inline__
72 [nblock .. nblock+NUM_OVERSHOOT_BYTES-1]. 72void fallbackSimpleSort ( UInt32* fmap,
73--*/ 73 UInt32* eclass,
74static __inline__ Bool fullGtU ( UChar* block, 74 Int32 lo,
75 UInt16* quadrant, 75 Int32 hi )
76 UInt32 nblock, 76{
77 Int32* workDone, 77 Int32 i, j, tmp;
78 Int32 i1, 78 UInt32 ec_tmp;
79 Int32 i2 79
80 ) 80 if (lo == hi) return;
81
82 if (hi - lo > 3) {
83 for ( i = hi-4; i >= lo; i-- ) {
84 tmp = fmap[i];
85 ec_tmp = eclass[tmp];
86 for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
87 fmap[j-4] = fmap[j];
88 fmap[j-4] = tmp;
89 }
90 }
91
92 for ( i = hi-1; i >= lo; i-- ) {
93 tmp = fmap[i];
94 ec_tmp = eclass[tmp];
95 for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
96 fmap[j-1] = fmap[j];
97 fmap[j-1] = tmp;
98 }
99}
100
101
/*---------------------------------------------*/
/*--
   Helper macros private to fallbackQSort3.  They refer to that
   function's locals (fmap, stackLo, stackHi, sp) by name, so they
   can only be used where those names are in scope.

   Every statement-like macro is wrapped in do { ... } while (0) so
   that "macro(...);" is exactly one statement, even as the body of
   an un-braced if/else.
--*/

/*-- Exchange two lvalues; the value passes through an Int32 temporary. --*/
#define fswap(zz1, zz2)                                   \
   do {                                                   \
      Int32 zztmp = (zz1); (zz1) = (zz2); (zz2) = zztmp;  \
   } while (0)

/*--
   Exchange fmap[zzp1 .. zzp1+zzn-1] with fmap[zzp2 .. zzp2+zzn-1].
   Each argument is evaluated once; nothing happens when zzn <= 0.
--*/
#define fvswap(zzp1, zzp2, zzn)          \
   do {                                  \
      Int32 yyp1 = (zzp1);               \
      Int32 yyp2 = (zzp2);               \
      Int32 yyn  = (zzn);                \
      while (yyn > 0) {                  \
         fswap(fmap[yyp1], fmap[yyp2]);  \
         yyp1++; yyp2++; yyn--;          \
      }                                  \
   } while (0)


/*--
   Smaller of a and b.  The whole expansion is parenthesised so the
   macro can be embedded in a larger expression; the selected
   argument is evaluated twice, so avoid side effects in a and b.
--*/
#define fmin(a,b) (((a) < (b)) ? (a) : (b))

/*-- Push the range [lz, hz] onto the explicit work stack. --*/
#define fpush(lz,hz)          \
   do {                       \
      stackLo[sp] = (lz);     \
      stackHi[sp] = (hz);     \
      sp++;                   \
   } while (0)

/*-- Pop the most recently pushed range into the lvalues lz, hz. --*/
#define fpop(lz,hz)           \
   do {                       \
      sp--;                   \
      (lz) = stackLo[sp];     \
      (hz) = stackHi[sp];     \
   } while (0)

/*-- Ranges with hi - lo below this go to fallbackSimpleSort. --*/
#define FALLBACK_QSORT_SMALL_THRESH 10
/*-- Capacity (in ranges) of the stackLo / stackHi arrays. --*/
#define FALLBACK_QSORT_STACK_SIZE   100
130
131
/*--
   fallbackQSort3: reorder fmap[loSt .. hiSt] in place so that
   eclass[fmap[i]] is non-decreasing over that range.

   fmap    array of indices into eclass; only the entries in
           [loSt .. hiSt] are touched.
   eclass  sort keys, read only.
   loSt    first index of the range (inclusive).
   hiSt    last index of the range (inclusive).

   Works iteratively with an explicit stack of pending ranges
   (stackLo / stackHi, FALLBACK_QSORT_STACK_SIZE entries each).
   Each range is split into "less than", "equal to" and "greater
   than" a pivot key; only the outer two parts are pushed back.
   Ranges with hi - lo < FALLBACK_QSORT_SMALL_THRESH are handed
   to fallbackSimpleSort instead.
--*/
132static
133void fallbackQSort3 ( UInt32* fmap,
134 UInt32* eclass,
135 Int32 loSt,
136 Int32 hiSt )
137{
138 Int32 unLo, unHi, ltLo, gtHi, n, m;
139 Int32 sp, lo, hi;
140 UInt32 med, r, r3;
141 Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
142 Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
143
/*-- r is the state of the pivot-selection recurrence below;
     it always starts at 0, so pivot choices are repeatable. --*/
144 r = 0;
145
146 sp = 0;
147 fpush ( loSt, hiSt );
148
149 while (sp > 0) {
150
/*-- One range is popped and at most two are pushed per pass, so
     sp grows by at most 1; this check is made before that happens.
     NOTE(review): what AssertH does on failure is defined outside
     this view. --*/
151 AssertH ( sp < FALLBACK_QSORT_STACK_SIZE, 1004 );
152
153 fpop ( lo, hi );
154 if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
155 fallbackSimpleSort ( fmap, eclass, lo, hi );
156 continue;
157 }
158
159 /* Random partitioning. Median of 3 sometimes fails to
160 avoid bad cases. Median of 9 seems to help but
161 looks rather expensive. This too seems to work but
162 is cheaper. Guidance for the magic constants
163 7621 and 32768 is taken from Sedgewick's algorithms
164 book, chapter 35.
165 */
/*-- r % 3 selects the key at lo, at the midpoint, or at hi
     as the pivot value med. --*/
166 r = ((r * 7621) + 1) % 32768;
167 r3 = r % 3;
168 if (r3 == 0) med = eclass[fmap[lo]]; else
169 if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
170 med = eclass[fmap[hi]];
171
/*-- [unLo .. unHi] is the part not yet examined.  Keys equal to
     med are parked at the two ends: [lo .. ltLo-1] on the left
     and [gtHi+1 .. hi] on the right. --*/
172 unLo = ltLo = lo;
173 unHi = gtHi = hi;
174
175 while (1) {
/*-- advance unLo over keys <= med, parking equal keys at ltLo;
     stop at the first key greater than med --*/
176 while (1) {
177 if (unLo > unHi) break;
178 n = (Int32)eclass[fmap[unLo]] - (Int32)med;
179 if (n == 0) {
180 fswap(fmap[unLo], fmap[ltLo]);
181 ltLo++; unLo++;
182 continue;
183 };
184 if (n > 0) break;
185 unLo++;
186 }
/*-- retreat unHi over keys >= med, parking equal keys at gtHi;
     stop at the first key less than med --*/
187 while (1) {
188 if (unLo > unHi) break;
189 n = (Int32)eclass[fmap[unHi]] - (Int32)med;
190 if (n == 0) {
191 fswap(fmap[unHi], fmap[gtHi]);
192 gtHi--; unHi--;
193 continue;
194 };
195 if (n < 0) break;
196 unHi--;
197 }
198 if (unLo > unHi) break;
/*-- fmap[unLo] is greater and fmap[unHi] is less: exchange them --*/
199 fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
200 }
201
202 AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
203
/*-- every key in [lo .. hi] equalled med: nothing left to order --*/
204 if (gtHi < ltLo) continue;
205
/*-- move the parked equal keys from the two ends into the middle,
     between the "less" and "greater" parts --*/
206 n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
207 m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
208
/*-- n = last index of the "less" part, m = first index of the
     "greater" part --*/
209 n = lo + unLo - ltLo - 1;
210 m = hi - (gtHi - unHi) + 1;
211
/*-- push the larger part first so the smaller one is popped and
     processed next (presumably to keep the stack shallow) --*/
212 if (n - lo > hi - m) {
213 fpush ( lo, n );
214 fpush ( m, hi );
215 } else {
216 fpush ( m, hi );
217 fpush ( lo, n );
218 }
219 }
220}
221
/*-- the helper macros are private to the function above --*/
222#undef fmin
223#undef fpush
224#undef fpop
225#undef fswap
226#undef fvswap
227#undef FALLBACK_QSORT_SMALL_THRESH
228#undef FALLBACK_QSORT_STACK_SIZE
229
230
231/*---------------------------------------------*/
232/* Pre:
233 nblock > 0
234 eclass exists for [0 .. nblock-1]
235 ((UInt16*)eclass) [0 .. nblock-1] [15:8] holds block
236 ptr exists for [0 .. nblock-1]
237
238 Post:
239 ((UInt16*)eclass) [0 .. nblock-1] [15:8] holds block
240 All other areas of eclass destroyed
241 fmap [0 .. nblock-1] holds sorted order
242 bhtab [ 0 .. 2+(nblock/32) ] destroyed
243*/
244
/*--
   Bit-table helpers for fallbackSort.  bhtab is an array of UInt32
   words holding one bit per index zz: word zz >> 5, bit zz & 31.
   The mask is built from an unsigned 1 so that selecting bit 31
   never shifts a signed int into its sign bit.
--*/
#define SET_BH(zz)       bhtab[(zz) >> 5] |= ((UInt32)1 << ((zz) & 31))
#define CLEAR_BH(zz)     bhtab[(zz) >> 5] &= ~((UInt32)1 << ((zz) & 31))
#define ISSET_BH(zz)     (bhtab[(zz) >> 5] & ((UInt32)1 << ((zz) & 31)))
/*-- the whole 32-bit word that contains bit zz --*/
#define WORD_BH(zz)      bhtab[(zz) >> 5]
/*-- non-zero unless zz is the first bit of a word --*/
#define UNALIGNED_BH(zz) ((zz) & 0x01f)
250
/*--
   fallbackSort: fill fmap[0 .. nblock-1] with the sorted order of
   the block, by repeatedly refining buckets of positions whose
   keys are still indistinguishable.

   fmap    output; receives position indices 0 .. nblock-1.
   eclass  on entry it is read through a UInt16 view (eclass16)
           whose top 8 bits per entry give the byte at each
           position; during sorting it is overwritten as a UInt32
           array of bucket ids; the UInt16 view is rewritten at the
           end from the sorted order.
   bhtab   bit table of bucket-header flags; words 0 .. 1+nblock/32
           are cleared here and sentinel bits are written at bit
           positions nblock .. nblock+63.
   nblock  number of positions to sort.
   verb    verbosity; progress text is printed when verb >= 4.
--*/
251static
252void fallbackSort ( UInt32* fmap,
253 UInt32* eclass,
254 UInt32* bhtab,
255 Int32 nblock,
256 Int32 verb )
257{
258 Int32 ftab[257];
259 Int32 ftabCopy[256];
260 Int32 H, i, j, k, l, r, cc, cc1;
261 Int32 nNotDone;
262 Int32 nBhtab;
/*-- same storage as eclass, viewed as 16-bit entries --*/
263 UInt16* eclass16 = (UInt16*)eclass;
264
265 /*--
266 Initial 1-char radix sort to generate
267 initial fmap and initial BH bits.
268 --*/
269 if (verb >= 4)
270 VPrintf0 ( " bucket sorting ...\n" );
/*-- count each byte value, keep a copy of the counts for the
     reconstruction step at the end, then turn the counts into
     running totals (end of each bucket, exclusive) --*/
271 for (i = 0; i < 257; i++) ftab[i] = 0;
272 for (i = 0; i < nblock; i++) ftab[eclass16[i] >> 8]++;
273 for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i];
274 for (i = 1; i < 257; i++) ftab[i] += ftab[i-1];
275
/*-- place every position into its bucket, filling each bucket
     from its end downwards; afterwards ftab[j] is the index in
     fmap where bucket j starts --*/
276 for (i = 0; i < nblock; i++) {
277 j = eclass16[i] >> 8;
278 k = ftab[j] - 1;
279 ftab[j] = k;
280 fmap[k] = i;
281 }
282
/*-- clear the header bits, then flag the start of each of the
     256 first-byte buckets --*/
283 nBhtab = 2 + (nblock / 32);
284 for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
285 for (i = 0; i < 256; i++) SET_BH(ftab[i]);
286
287 /*--
288 Inductively refine the buckets. Kind-of an
289 "exponential radix sort" (!), inspired by the
290 Manber-Myers suffix array construction algorithm.
291 --*/
292
293 /*-- set sentinel bits for block-end detection --*/
/*-- bits nblock .. nblock+63 alternate set / clear, so the bit
     scans in the loop below always stop past the end of the data --*/
294 for (i = 0; i < 32; i++) {
295 SET_BH(nblock + 2*i);
296 CLEAR_BH(nblock + 2*i + 1);
297 }
298
299 /*-- the log(N) loop --*/
/*-- H is the offset used to derive this pass's keys; it doubles
     after every pass --*/
300 H = 1;
301 while (1) {
302
303 if (verb >= 4)
304 VPrintf1 ( " depth %6d has ", H );
305
/*-- j tracks the most recent header bit, i.e. the start index of
     the bucket holding fmap[i]; that bucket id is stored as the
     key of the position H places earlier (wrapping around) --*/
306 j = 0;
307 for (i = 0; i < nblock; i++) {
308 if (ISSET_BH(i)) j = i;
309 k = fmap[i] - H; if (k < 0) k += nblock;
310 eclass[k] = j;
311 }
312
/*-- nNotDone counts positions that sit in multi-element buckets
     during this pass; r is the right end of the bucket handled
     previously --*/
313 nNotDone = 0;
314 r = -1;
315 while (1) {
316
317 /*-- find the next non-singleton bucket --*/
/*-- skip a run of set bits: bit by bit up to a word boundary,
     then a whole word at a time, then bit by bit again --*/
318 k = r + 1;
319 while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
320 if (ISSET_BH(k)) {
321 while (WORD_BH(k) == 0xffffffff) k += 32;
322 while (ISSET_BH(k)) k++;
323 }
324 l = k - 1;
325 if (l >= nblock) break;
/*-- skip the following run of clear bits in the same manner --*/
326 while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
327 if (!ISSET_BH(k)) {
328 while (WORD_BH(k) == 0x00000000) k += 32;
329 while (!ISSET_BH(k)) k++;
330 }
331 r = k - 1;
332 if (r >= nblock) break;
333
334 /*-- now [l, r] bracket current bucket --*/
335 if (r > l) {
336 nNotDone += (r - l + 1);
337 fallbackQSort3 ( fmap, eclass, l, r );
338
339 /*-- scan bucket and generate header bits-- */
/*-- a new header bit is set wherever the key changes, which
     splits the bucket into smaller ones --*/
340 cc = -1;
341 for (i = l; i <= r; i++) {
342 cc1 = eclass[fmap[i]];
343 if (cc != cc1) { SET_BH(i); cc = cc1; };
344 }
345 }
346 }
347
348 if (verb >= 4)
349 VPrintf1 ( "%6d unresolved strings\n", nNotDone );
350
/*-- stop once the doubled H exceeds nblock or no multi-element
     bucket was found in this pass --*/
351 H *= 2;
352 if (H > nblock || nNotDone == 0) break;
353 }
354
355 /*--
356 Reconstruct the original block in
357 eclass16 [0 .. nblock-1] [15:8], since the
358 previous phase destroyed it.
359 --*/
360 if (verb >= 4)
361 VPrintf0 ( " reconstructing block ...\n" );
/*-- walk fmap in sorted order; the saved counts say how many
     consecutive entries start with each byte value j --*/
362 j = 0;
363 for (i = 0; i < nblock; i++) {
364 while (ftabCopy[j] == 0) j++;
365 ftabCopy[j]--;
366 eclass16[fmap[i]] = j << 8;
367 }
368 AssertH ( j < 256, 1005 );
369}
370
/*-- the bit-table macros are private to the function above --*/
371#undef SET_BH
372#undef CLEAR_BH
373#undef ISSET_BH
374#undef WORD_BH
375#undef UNALIGNED_BH
376
377
378/*---------------------------------------------*/
379/*--- The main, O(N^2 log(N)) sorting ---*/
380/*--- algorithm. Faster for "normal" ---*/
381/*--- non-repetitive blocks. ---*/
382/*---------------------------------------------*/
383
384/*---------------------------------------------*/
385static
386__inline__
387Bool mainGtU ( UInt32 i1,
388 UInt32 i2,
389 UInt16* block,
390 UInt16* quadrant,
391 UInt32 nblock,
392 Int32* budget )
81{ 393{
82 Int32 k; 394 Int32 k;
83 UChar c1, c2;
84 UInt16 s1, s2; 395 UInt16 s1, s2;
85 396
86 AssertD ( i1 != i2, "fullGtU(1)" ); 397 AssertD ( i1 != i2, "mainGtU" );
87 398
88 c1 = block[i1]; 399 s1 = block[i1]; s2 = block[i2];
89 c2 = block[i2]; 400 if (s1 != s2) return (s1 > s2);
90 if (c1 != c2) return (c1 > c2); 401 i1 += 2; i2 += 2;
91 i1++; i2++;
92 402
93 c1 = block[i1]; 403 s1 = block[i1]; s2 = block[i2];
94 c2 = block[i2]; 404 if (s1 != s2) return (s1 > s2);
95 if (c1 != c2) return (c1 > c2); 405 i1 += 2; i2 += 2;
96 i1++; i2++;
97 406
98 c1 = block[i1]; 407 s1 = block[i1]; s2 = block[i2];
99 c2 = block[i2]; 408 if (s1 != s2) return (s1 > s2);
100 if (c1 != c2) return (c1 > c2); 409 i1 += 2; i2 += 2;
101 i1++; i2++;
102 410
103 c1 = block[i1]; 411 s1 = block[i1]; s2 = block[i2];
104 c2 = block[i2]; 412 if (s1 != s2) return (s1 > s2);
105 if (c1 != c2) return (c1 > c2); 413 i1 += 2; i2 += 2;
106 i1++; i2++;
107 414
108 c1 = block[i1]; 415 s1 = block[i1]; s2 = block[i2];
109 c2 = block[i2]; 416 if (s1 != s2) return (s1 > s2);
110 if (c1 != c2) return (c1 > c2); 417 i1 += 2; i2 += 2;
111 i1++; i2++;
112 418
113 c1 = block[i1]; 419 s1 = block[i1]; s2 = block[i2];
114 c2 = block[i2]; 420 if (s1 != s2) return (s1 > s2);
115 if (c1 != c2) return (c1 > c2); 421 i1 += 2; i2 += 2;
116 i1++; i2++;
117 422
118 k = nblock; 423 k = nblock + 8;
119 424
120 do { 425 do {
121 426
122 c1 = block[i1]; 427 s1 = block[i1]; s2 = block[i2];
123 c2 = block[i2];
124 if (c1 != c2) return (c1 > c2);
125 s1 = quadrant[i1];
126 s2 = quadrant[i2];
127 if (s1 != s2) return (s1 > s2); 428 if (s1 != s2) return (s1 > s2);
128 i1++; i2++; 429 s1 = quadrant[i1]; s2 = quadrant[i2];
430 if (s1 != s2) return (s1 > s2);
431 i1 += 2; i2 += 2;
129 432
130 c1 = block[i1]; 433 s1 = block[i1]; s2 = block[i2];
131 c2 = block[i2]; 434 if (s1 != s2) return (s1 > s2);
132 if (c1 != c2) return (c1 > c2); 435 s1 = quadrant[i1]; s2 = quadrant[i2];
133 s1 = quadrant[i1];
134 s2 = quadrant[i2];
135 if (s1 != s2) return (s1 > s2); 436 if (s1 != s2) return (s1 > s2);
136 i1++; i2++; 437 i1 += 2; i2 += 2;
137 438
138 c1 = block[i1]; 439 s1 = block[i1]; s2 = block[i2];
139 c2 = block[i2];
140 if (c1 != c2) return (c1 > c2);
141 s1 = quadrant[i1];
142 s2 = quadrant[i2];
143 if (s1 != s2) return (s1 > s2); 440 if (s1 != s2) return (s1 > s2);
144 i1++; i2++; 441 s1 = quadrant[i1]; s2 = quadrant[i2];
442 if (s1 != s2) return (s1 > s2);
443 i1 += 2; i2 += 2;
145 444
146 c1 = block[i1]; 445 s1 = block[i1]; s2 = block[i2];
147 c2 = block[i2]; 446 if (s1 != s2) return (s1 > s2);
148 if (c1 != c2) return (c1 > c2); 447 s1 = quadrant[i1]; s2 = quadrant[i2];
149 s1 = quadrant[i1];
150 s2 = quadrant[i2];
151 if (s1 != s2) return (s1 > s2); 448 if (s1 != s2) return (s1 > s2);
152 i1++; i2++; 449 i1 += 2; i2 += 2;
153 450
154 if (i1 >= nblock) i1 -= nblock; 451 if (i1 >= nblock) i1 -= nblock;
155 if (i2 >= nblock) i2 -= nblock; 452 if (i2 >= nblock) i2 -= nblock;
156 453
157 k -= 4; 454 k -= 8;
158 (*workDone)++; 455 (*budget)--;
159 } 456 }
160 while (k >= 0); 457 while (k >= 0);
161 458
162 return False; 459 return False;
163} 460}
164 461
462
165/*---------------------------------------------*/ 463/*---------------------------------------------*/
166/*-- 464/*--
167 Knuth's increments seem to work better 465 Knuth's increments seem to work better
@@ -169,22 +467,22 @@ static __inline__ Bool fullGtU ( UChar* block,
169 because the number of elems to sort is 467 because the number of elems to sort is
170 usually small, typically <= 20. 468 usually small, typically <= 20.
171--*/ 469--*/
172static Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280, 470Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
173 9841, 29524, 88573, 265720, 471 9841, 29524, 88573, 265720,
174 797161, 2391484 }; 472 797161, 2391484 };
175 473
176static void simpleSort ( EState* s, Int32 lo, Int32 hi, Int32 d ) 474static
475void mainSimpleSort ( UInt32* ptr,
476 UInt16* block,
477 UInt16* quadrant,
478 Int32 nblock,
479 Int32 lo,
480 Int32 hi,
481 Int32 d,
482 Int32* budget )
177{ 483{
178 Int32 i, j, h, bigN, hp; 484 Int32 i, j, h, bigN, hp;
179 Int32 v; 485 UInt32 v;
180
181 UChar* block = s->block;
182 UInt32* zptr = s->zptr;
183 UInt16* quadrant = s->quadrant;
184 Int32* workDone = &(s->workDone);
185 Int32 nblock = s->nblock;
186 Int32 workLimit = s->workLimit;
187 Bool firstAttempt = s->firstAttempt;
188 486
189 bigN = hi - lo + 1; 487 bigN = hi - lo + 1;
190 if (bigN < 2) return; 488 if (bigN < 2) return;
@@ -195,49 +493,53 @@ static void simpleSort ( EState* s, Int32 lo, Int32 hi, Int32 d )
195 493
196 for (; hp >= 0; hp--) { 494 for (; hp >= 0; hp--) {
197 h = incs[hp]; 495 h = incs[hp];
496
198 i = lo + h; 497 i = lo + h;
199 while (True) { 498 while (True) {
200 499
201 /*-- copy 1 --*/ 500 /*-- copy 1 --*/
202 if (i > hi) break; 501 if (i > hi) break;
203 v = zptr[i]; 502 v = ptr[i];
204 j = i; 503 j = i;
205 while ( fullGtU ( block, quadrant, nblock, workDone, 504 while ( mainGtU (
206 zptr[j-h]+d, v+d ) ) { 505 ptr[j-h]+d, v+d, block, quadrant, nblock, budget
207 zptr[j] = zptr[j-h]; 506 ) ) {
507 ptr[j] = ptr[j-h];
208 j = j - h; 508 j = j - h;
209 if (j <= (lo + h - 1)) break; 509 if (j <= (lo + h - 1)) break;
210 } 510 }
211 zptr[j] = v; 511 ptr[j] = v;
212 i++; 512 i++;
213 513
214 /*-- copy 2 --*/ 514 /*-- copy 2 --*/
215 if (i > hi) break; 515 if (i > hi) break;
216 v = zptr[i]; 516 v = ptr[i];
217 j = i; 517 j = i;
218 while ( fullGtU ( block, quadrant, nblock, workDone, 518 while ( mainGtU (
219 zptr[j-h]+d, v+d ) ) { 519 ptr[j-h]+d, v+d, block, quadrant, nblock, budget
220 zptr[j] = zptr[j-h]; 520 ) ) {
521 ptr[j] = ptr[j-h];
221 j = j - h; 522 j = j - h;
222 if (j <= (lo + h - 1)) break; 523 if (j <= (lo + h - 1)) break;
223 } 524 }
224 zptr[j] = v; 525 ptr[j] = v;
225 i++; 526 i++;
226 527
227 /*-- copy 3 --*/ 528 /*-- copy 3 --*/
228 if (i > hi) break; 529 if (i > hi) break;
229 v = zptr[i]; 530 v = ptr[i];
230 j = i; 531 j = i;
231 while ( fullGtU ( block, quadrant, nblock, workDone, 532 while ( mainGtU (
232 zptr[j-h]+d, v+d ) ) { 533 ptr[j-h]+d, v+d, block, quadrant, nblock, budget
233 zptr[j] = zptr[j-h]; 534 ) ) {
535 ptr[j] = ptr[j-h];
234 j = j - h; 536 j = j - h;
235 if (j <= (lo + h - 1)) break; 537 if (j <= (lo + h - 1)) break;
236 } 538 }
237 zptr[j] = v; 539 ptr[j] = v;
238 i++; 540 i++;
239 541
240 if (*workDone > workLimit && firstAttempt) return; 542 if (*budget < 0) return;
241 } 543 }
242 } 544 }
243} 545}
@@ -252,20 +554,26 @@ static void simpleSort ( EState* s, Int32 lo, Int32 hi, Int32 d )
252 Sedgewick and Jon L. Bentley. 554 Sedgewick and Jon L. Bentley.
253--*/ 555--*/
254 556
255#define swap(lv1, lv2) \ 557#define mswap(zz1, zz2) \
256 { Int32 tmp = lv1; lv1 = lv2; lv2 = tmp; } 558 { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
257 559
258static void vswap ( UInt32* zptr, Int32 p1, Int32 p2, Int32 n ) 560#define mvswap(zzp1, zzp2, zzn) \
259{ 561{ \
260 while (n > 0) { 562 Int32 yyp1 = (zzp1); \
261 swap(zptr[p1], zptr[p2]); 563 Int32 yyp2 = (zzp2); \
262 p1++; p2++; n--; 564 Int32 yyn = (zzn); \
263 } 565 while (yyn > 0) { \
566 mswap(ptr[yyp1], ptr[yyp2]); \
567 yyp1++; yyp2++; yyn--; \
568 } \
264} 569}
265 570
266static UChar med3 ( UChar a, UChar b, UChar c ) 571
572static
573__inline__
574UInt16 mmed3 ( UInt16 a, UInt16 b, UInt16 c )
267{ 575{
268 UChar t; 576 UInt16 t;
269 if (a > b) { t = a; a = b; b = t; }; 577 if (a > b) { t = a; a = b; b = t; };
270 if (b > c) { t = b; b = c; c = t; }; 578 if (b > c) { t = b; b = c; c = t; };
271 if (a > b) b = a; 579 if (a > b) b = a;
@@ -273,66 +581,72 @@ static UChar med3 ( UChar a, UChar b, UChar c )
273} 581}
274 582
275 583
276#define min(a,b) ((a) < (b)) ? (a) : (b) 584#define mmin(a,b) ((a) < (b)) ? (a) : (b)
277 585
278typedef 586#define mpush(lz,hz,dz) { stackLo[sp] = lz; \
279 struct { Int32 ll; Int32 hh; Int32 dd; } 587 stackHi[sp] = hz; \
280 StackElem; 588 stackD [sp] = dz; \
589 sp++; }
281 590
282#define push(lz,hz,dz) { stack[sp].ll = lz; \ 591#define mpop(lz,hz,dz) { sp--; \
283 stack[sp].hh = hz; \ 592 lz = stackLo[sp]; \
284 stack[sp].dd = dz; \ 593 hz = stackHi[sp]; \
285 sp++; } 594 dz = stackD [sp]; }
286 595
287#define pop(lz,hz,dz) { sp--; \
288 lz = stack[sp].ll; \
289 hz = stack[sp].hh; \
290 dz = stack[sp].dd; }
291 596
292#define SMALL_THRESH 20 597#define mnextsize(az) (nextHi[az]-nextLo[az])
293#define DEPTH_THRESH 10 598
599#define mnextswap(az,bz) \
600 { Int32 tz; \
601 tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
602 tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
603 tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
294 604
295/*--
296 If you are ever unlucky/improbable enough
297 to get a stack overflow whilst sorting,
298 increase the following constant and try
299 again. In practice I have never seen the
300 stack go above 27 elems, so the following
301 limit seems very generous.
302--*/
303#define QSORT_STACK_SIZE 1000
304 605
606#define MAIN_QSORT_SMALL_THRESH 20
607#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
608#define MAIN_QSORT_STACK_SIZE 100
305 609
306static void qSort3 ( EState* s, Int32 loSt, Int32 hiSt, Int32 dSt ) 610static
611void mainQSort3 ( UInt32* ptr,
612 UInt16* block,
613 UInt16* quadrant,
614 Int32 nblock,
615 Int32 loSt,
616 Int32 hiSt,
617 Int32 dSt,
618 Int32* budget )
307{ 619{
308 Int32 unLo, unHi, ltLo, gtHi, med, n, m; 620 Int32 unLo, unHi, ltLo, gtHi, n, m, med;
309 Int32 sp, lo, hi, d; 621 Int32 sp, lo, hi, d;
310 StackElem stack[QSORT_STACK_SIZE];
311 622
312 UChar* block = s->block; 623 Int32 stackLo[MAIN_QSORT_STACK_SIZE];
313 UInt32* zptr = s->zptr; 624 Int32 stackHi[MAIN_QSORT_STACK_SIZE];
314 Int32* workDone = &(s->workDone); 625 Int32 stackD [MAIN_QSORT_STACK_SIZE];
315 Int32 workLimit = s->workLimit; 626
316 Bool firstAttempt = s->firstAttempt; 627 Int32 nextLo[3];
628 Int32 nextHi[3];
629 Int32 nextD [3];
317 630
318 sp = 0; 631 sp = 0;
319 push ( loSt, hiSt, dSt ); 632 mpush ( loSt, hiSt, dSt );
320 633
321 while (sp > 0) { 634 while (sp > 0) {
322 635
323 AssertH ( sp < QSORT_STACK_SIZE, 1001 ); 636 AssertH ( sp < MAIN_QSORT_STACK_SIZE, 1001 );
324 637
325 pop ( lo, hi, d ); 638 mpop ( lo, hi, d );
326 639 if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
327 if (hi - lo < SMALL_THRESH || d > DEPTH_THRESH) { 640 d > MAIN_QSORT_DEPTH_THRESH) {
328 simpleSort ( s, lo, hi, d ); 641 mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
329 if (*workDone > workLimit && firstAttempt) return; 642 if (*budget < 0) return;
330 continue; 643 continue;
331 } 644 }
332 645
333 med = med3 ( block[zptr[ lo ]+d], 646 med = (Int32)
334 block[zptr[ hi ]+d], 647 mmed3 ( block[ptr[ lo ]+d],
335 block[zptr[ (lo+hi)>>1 ]+d] ); 648 block[ptr[ hi ]+d],
649 block[ptr[ (lo+hi)>>1 ]+d] );
336 650
337 unLo = ltLo = lo; 651 unLo = ltLo = lo;
338 unHi = gtHi = hi; 652 unHi = gtHi = hi;
@@ -340,370 +654,412 @@ static void qSort3 ( EState* s, Int32 loSt, Int32 hiSt, Int32 dSt )
340 while (True) { 654 while (True) {
341 while (True) { 655 while (True) {
342 if (unLo > unHi) break; 656 if (unLo > unHi) break;
343 n = ((Int32)block[zptr[unLo]+d]) - med; 657 n = ((Int32)block[ptr[unLo]+d]) - med;
344 if (n == 0) { swap(zptr[unLo], zptr[ltLo]); ltLo++; unLo++; continue; }; 658 if (n == 0) {
659 mswap(ptr[unLo], ptr[ltLo]);
660 ltLo++; unLo++; continue;
661 };
345 if (n > 0) break; 662 if (n > 0) break;
346 unLo++; 663 unLo++;
347 } 664 }
348 while (True) { 665 while (True) {
349 if (unLo > unHi) break; 666 if (unLo > unHi) break;
350 n = ((Int32)block[zptr[unHi]+d]) - med; 667 n = ((Int32)block[ptr[unHi]+d]) - med;
351 if (n == 0) { swap(zptr[unHi], zptr[gtHi]); gtHi--; unHi--; continue; }; 668 if (n == 0) {
669 mswap(ptr[unHi], ptr[gtHi]);
670 gtHi--; unHi--; continue;
671 };
352 if (n < 0) break; 672 if (n < 0) break;
353 unHi--; 673 unHi--;
354 } 674 }
355 if (unLo > unHi) break; 675 if (unLo > unHi) break;
356 swap(zptr[unLo], zptr[unHi]); unLo++; unHi--; 676 mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
357 } 677 }
358 678
359 AssertD ( unHi == unLo-1, "bad termination in qSort3" ); 679 AssertD ( unHi == unLo-1, "mainQSort3(2)" );
360 680
361 if (gtHi < ltLo) { 681 if (gtHi < ltLo) {
362 push(lo, hi, d+1 ); 682 mpush(lo, hi, d+2 );
363 continue; 683 continue;
364 } 684 }
365 685
366 n = min(ltLo-lo, unLo-ltLo); vswap(zptr, lo, unLo-n, n); 686 n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
367 m = min(hi-gtHi, gtHi-unHi); vswap(zptr, unLo, hi-m+1, m); 687 m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
368 688
369 n = lo + unLo - ltLo - 1; 689 n = lo + unLo - ltLo - 1;
370 m = hi - (gtHi - unHi) + 1; 690 m = hi - (gtHi - unHi) + 1;
371 691
372 push ( lo, n, d ); 692 nextLo[0] = lo; nextHi[0] = n; nextD[0] = d;
373 push ( n+1, m-1, d+1 ); 693 nextLo[1] = m; nextHi[1] = hi; nextD[1] = d;
374 push ( m, hi, d ); 694 nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+2;
695
696 if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
697 if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
698 if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
699
700 AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
701 AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
702
703 mpush (nextLo[0], nextHi[0], nextD[0]);
704 mpush (nextLo[1], nextHi[1], nextD[1]);
705 mpush (nextLo[2], nextHi[2], nextD[2]);
375 } 706 }
376} 707}
377 708
709#undef mswap
710#undef mvswap
711#undef mpush
712#undef mpop
713#undef mmin
714#undef mnextsize
715#undef mnextswap
716#undef MAIN_QSORT_SMALL_THRESH
717#undef MAIN_QSORT_DEPTH_THRESH
718#undef MAIN_QSORT_STACK_SIZE
719
378 720
379/*---------------------------------------------*/ 721/*---------------------------------------------*/
722/* Pre:
723 nblock > N_OVERSHOOT
724 block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
725 ((UInt16*)block32) [0 .. nblock-1] [15:8] holds block
726 ptr exists for [0 .. nblock-1]
727
728 Post:
729 ((UInt16*)block32) [0 .. nblock-1] [15:8] holds block
730 All other areas of block32 destroyed
731 ftab [0 .. 65536 ] destroyed
732 ptr [0 .. nblock-1] holds sorted order
733 if (*budget < 0), sorting was abandoned
734*/
380 735
381#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8]) 736#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
382
383#define SETMASK (1 << 21) 737#define SETMASK (1 << 21)
384#define CLEARMASK (~(SETMASK)) 738#define CLEARMASK (~(SETMASK))
385 739
386static void sortMain ( EState* s ) 740static
741void mainSort ( UInt32* ptr,
742 UInt16* block,
743 UInt16* quadrant,
744 UInt32* ftab,
745 Int32 nblock,
746 Int32 verb,
747 Int32* budget )
387{ 748{
388 Int32 i, j, k, ss, sb; 749 Int32 i, j, k, m, ss, sb;
389 Int32 runningOrder[256]; 750 Int32 runningOrder[256];
390 Int32 copy[256]; 751 Int32 copy[256];
391 Bool bigDone[256]; 752 Bool bigDone[256];
392 UChar c1, c2; 753 UChar c1;
393 Int32 numQSorted; 754 Int32 numQSorted;
394 755 Int32 biggestSoFar;
395 UChar* block = s->block; 756 UInt16 s;
396 UInt32* zptr = s->zptr; 757
397 UInt16* quadrant = s->quadrant; 758 if (verb >= 4) VPrintf0 ( " main sort initialise ...\n" );
398 Int32* ftab = s->ftab; 759
399 Int32* workDone = &(s->workDone); 760 /*-- Stripe the block data into 16 bits, and at the
400 Int32 nblock = s->nblock; 761 same time set up the 2-byte frequency table
401 Int32 workLimit = s->workLimit;
402 Bool firstAttempt = s->firstAttempt;
403
404 /*--
405 In the various block-sized structures, live data runs
406 from 0 to last+NUM_OVERSHOOT_BYTES inclusive. First,
407 set up the overshoot area for block.
408 --*/ 762 --*/
763 for (i = 65536; i >= 0; i--) ftab[i] = 0;
764
765 s = block[0];
766 for (i = 1; i < nblock; i++) {
767 quadrant[i] = 0;
768 s = (s << 8) | block[i];
769 block[i-1] = s;
770 ftab[s]++;
771 }
772 quadrant[0] = 0;
773 s = (s << 8) | (block[0] >> 8);
774 block[nblock-1] = s;
775 ftab[s]++;
776
777 /*-- (emphasises close relationship of block & quadrant) --*/
778 for (i = 0; i < BZ_N_OVERSHOOT; i++) {
779 block [nblock+i] = block[i];
780 quadrant[nblock+i] = 0;
781 }
409 782
410 if (s->verbosity >= 4) 783 if (verb >= 4) VPrintf0 ( " bucket sorting ...\n" );
411 VPrintf0( " sort initialise ...\n" );
412
413 for (i = 0; i < BZ_NUM_OVERSHOOT_BYTES; i++)
414 block[nblock+i] = block[i % nblock];
415 for (i = 0; i < nblock+BZ_NUM_OVERSHOOT_BYTES; i++)
416 quadrant[i] = 0;
417
418
419 if (nblock <= 4000) {
420
421 /*--
422 Use simpleSort(), since the full sorting mechanism
423 has quite a large constant overhead.
424 --*/
425 if (s->verbosity >= 4) VPrintf0( " simpleSort ...\n" );
426 for (i = 0; i < nblock; i++) zptr[i] = i;
427 firstAttempt = False;
428 *workDone = workLimit = 0;
429 simpleSort ( s, 0, nblock-1, 0 );
430 if (s->verbosity >= 4) VPrintf0( " simpleSort done.\n" );
431 784
432 } else { 785 /*-- Complete the initial radix sort --*/
786 for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
433 787
434 numQSorted = 0; 788 for (i = 0; i < nblock; i++) {
435 for (i = 0; i <= 255; i++) bigDone[i] = False; 789 s = block[i];
790 j = ftab[s] - 1;
791 ftab[s] = j;
792 ptr[j] = i;
793 }
436 794
437 if (s->verbosity >= 4) VPrintf0( " bucket sorting ...\n" ); 795 /*--
796 Now ftab contains the first loc of every small bucket.
797 Calculate the running order, from smallest to largest
798 big bucket.
799 --*/
800 for (i = 0; i <= 255; i++) {
801 bigDone [i] = False;
802 runningOrder[i] = i;
803 }
438 804
439 for (i = 0; i <= 65536; i++) ftab[i] = 0; 805 {
806 Int32 vv;
807 Int32 h = 1;
808 do h = 3 * h + 1; while (h <= 256);
809 do {
810 h = h / 3;
811 for (i = h; i <= 255; i++) {
812 vv = runningOrder[i];
813 j = i;
814 while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
815 runningOrder[j] = runningOrder[j-h];
816 j = j - h;
817 if (j <= (h - 1)) goto zero;
818 }
819 zero:
820 runningOrder[j] = vv;
821 }
822 } while (h != 1);
823 }
440 824
441 c1 = block[nblock-1]; 825 /*--
442 for (i = 0; i < nblock; i++) { 826 The main sorting loop.
443 c2 = block[i]; 827 --*/
444 ftab[(c1 << 8) + c2]++;
445 c1 = c2;
446 }
447 828
448 for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1]; 829 biggestSoFar = numQSorted = 0;
449 830
450 c1 = block[0]; 831 for (i = 0; i <= 255; i++) {
451 for (i = 0; i < nblock-1; i++) {
452 c2 = block[i+1];
453 j = (c1 << 8) + c2;
454 c1 = c2;
455 ftab[j]--;
456 zptr[ftab[j]] = i;
457 }
458 j = (block[nblock-1] << 8) + block[0];
459 ftab[j]--;
460 zptr[ftab[j]] = nblock-1;
461 832
462 /*-- 833 /*--
463 Now ftab contains the first loc of every small bucket. 834 Process big buckets, starting with the least full.
464 Calculate the running order, from smallest to largest 835 Basically this is a 4-step process in which we call
465 big bucket. 836 mainQSort3 to sort the small buckets [ss, j], but
837 also make a big effort to avoid the calls if we can.
466 --*/ 838 --*/
839 ss = runningOrder[i];
467 840
468 for (i = 0; i <= 255; i++) runningOrder[i] = i; 841 /*--
469 842 Step 1:
470 { 843 Complete the big bucket [ss] by quicksorting
471 Int32 vv; 844 any unsorted small buckets [ss, j], for j != ss.
472 Int32 h = 1; 845 Hopefully previous pointer-scanning phases have already
473 do h = 3 * h + 1; while (h <= 256); 846 completed many of the small buckets [ss, j], so
474 do { 847 we don't have to sort them at all.
475 h = h / 3; 848 --*/
476 for (i = h; i <= 255; i++) { 849 for (j = 0; j <= 255; j++) {
477 vv = runningOrder[i]; 850 if (j != ss) {
478 j = i; 851 sb = (ss << 8) + j;
479 while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) { 852 if ( ! (ftab[sb] & SETMASK) ) {
480 runningOrder[j] = runningOrder[j-h]; 853 Int32 lo = ftab[sb] & CLEARMASK;
481 j = j - h; 854 Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
482 if (j <= (h - 1)) goto zero; 855 if (hi > lo) {
856 if (verb >= 4)
857 VPrintf4 ( " qsort [0x%x, 0x%x] "
858 "done %d this %d\n",
859 ss, j, numQSorted, hi - lo + 1 );
860 mainQSort3 (
861 ptr, block, quadrant, nblock,
862 lo, hi, BZ_N_RADIX, budget
863 );
864 numQSorted += (hi - lo + 1);
865 if (*budget < 0) return;
483 } 866 }
484 zero:
485 runningOrder[j] = vv;
486 } 867 }
487 } while (h != 1); 868 ftab[sb] |= SETMASK;
869 }
488 } 870 }
489 871
490 /*-- 872 /*--
491 The main sorting loop. 873 Step 2:
874 Deal specially with case [ss, ss]. This establishes the
875 sorted order for [ss, ss] without any comparisons.
876 A clever trick, cryptically described as steps Q6b and Q6c
877 in SRC-124 (aka BW94). Compared to bzip2, this makes it
878 practical not to use a preliminary run-length coder.
492 --*/ 879 --*/
880 {
881 Int32 put0, get0, put1, get1;
882 Int32 sbn = (ss << 8) + ss;
883 Int32 lo = ftab[sbn] & CLEARMASK;
884 Int32 hi = (ftab[sbn+1] & CLEARMASK) - 1;
885 UChar ssc = (UChar)ss;
886 put0 = lo;
887 get0 = ftab[ss << 8] & CLEARMASK;
888 put1 = hi;
889 get1 = (ftab[(ss+1) << 8] & CLEARMASK) - 1;
890 while (get0 < put0) {
891 j = ptr[get0]-1; if (j < 0) j += nblock;
892 c1 = (UChar)(block[j] >> 8);
893 if (c1 == ssc) { ptr[put0] = j; put0++; };
894 get0++;
895 }
896 while (get1 > put1) {
897 j = ptr[get1]-1; if (j < 0) j += nblock;
898 c1 = (UChar)(block[j] >> 8);
899 if (c1 == ssc) { ptr[put1] = j; put1--; };
900 get1--;
901 }
902 ftab[sbn] |= SETMASK;
903 }
493 904
494 for (i = 0; i <= 255; i++) { 905 /*--
495 906 Step 3:
496 /*-- 907 The [ss] big bucket is now done. Record this fact,
497 Process big buckets, starting with the least full. 908 and update the quadrant descriptors. Remember to
498 Basically this is a 4-step process in which we call 909 update quadrants in the overshoot area too, if
499 qSort3 to sort the small buckets [ss, j], but 910 necessary. The "if (i < 255)" test merely skips
500 also make a big effort to avoid the calls if we can. 911 this updating for the last bucket processed, since
501 --*/ 912 updating for the last bucket is pointless.
502 ss = runningOrder[i]; 913
503 914 The quadrant array provides a way to incrementally
504 /*-- 915 cache sort orderings, as they appear, so as to
505 Step 1: 916 make subsequent comparisons in fullGtU() complete
506 Complete the big bucket [ss] by quicksorting 917 faster. For repetitive blocks this makes a big
507 any unsorted small buckets [ss, j], for j != ss. 918 difference (but not big enough to be able to avoid
508 Hopefully previous pointer-scanning phases have already 919 the fallback sorting mechanism, exponential radix sort).
509 completed many of the small buckets [ss, j], so 920
510 we don't have to sort them at all. 921 The precise meaning is: at all times:
511 --*/ 922
512 for (j = 0; j <= 255; j++) { 923 for 0 <= i < nblock and 0 <= j <= nblock
513 if (j != ss) { 924
514 sb = (ss << 8) + j; 925 if block[i] != block[j],
515 if ( ! (ftab[sb] & SETMASK) ) { 926
516 Int32 lo = ftab[sb] & CLEARMASK; 927 then the relative values of quadrant[i] and
517 Int32 hi = (ftab[sb+1] & CLEARMASK) - 1; 928 quadrant[j] are meaningless.
518 if (hi > lo) { 929
519 if (s->verbosity >= 4) 930 else {
520 VPrintf4( " qsort [0x%x, 0x%x] done %d this %d\n", 931 if quadrant[i] < quadrant[j]
521 ss, j, numQSorted, hi - lo + 1 ); 932 then the string starting at i lexicographically
522 qSort3 ( s, lo, hi, 2 ); 933 precedes the string starting at j
523 numQSorted += ( hi - lo + 1 ); 934
524 if (*workDone > workLimit && firstAttempt) return; 935 else if quadrant[i] > quadrant[j]
525 } 936 then the string starting at j lexicographically
937 precedes the string starting at i
938
939 else
940 the relative ordering of the strings starting
941 at i and j has not yet been determined.
526 } 942 }
527 ftab[sb] |= SETMASK; 943 --*/
528 } 944 bigDone[ss] = True;
529 }
530 945
531 /*-- 946 if (i < 255) {
532 Step 2: 947 Int32 bbStart = ftab[ss << 8] & CLEARMASK;
533 Deal specially with case [ss, ss]. This establishes the 948 Int32 bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
534 sorted order for [ss, ss] without any comparisons. 949 Int32 shifts = 0;
535 A clever trick, cryptically described as steps Q6b and Q6c
536 in SRC-124 (aka BW94). This makes it entirely practical to
537 not use a preliminary run-length coder, but unfortunately
538 we are now stuck with the .bz2 file format.
539 --*/
540 {
541 Int32 put0, get0, put1, get1;
542 Int32 sbn = (ss << 8) + ss;
543 Int32 lo = ftab[sbn] & CLEARMASK;
544 Int32 hi = (ftab[sbn+1] & CLEARMASK) - 1;
545 UChar ssc = (UChar)ss;
546 put0 = lo;
547 get0 = ftab[ss << 8] & CLEARMASK;
548 put1 = hi;
549 get1 = (ftab[(ss+1) << 8] & CLEARMASK) - 1;
550 while (get0 < put0) {
551 j = zptr[get0]-1; if (j < 0) j += nblock;
552 c1 = block[j];
553 if (c1 == ssc) { zptr[put0] = j; put0++; };
554 get0++;
555 }
556 while (get1 > put1) {
557 j = zptr[get1]-1; if (j < 0) j += nblock;
558 c1 = block[j];
559 if (c1 == ssc) { zptr[put1] = j; put1--; };
560 get1--;
561 }
562 ftab[sbn] |= SETMASK;
563 }
564 950
565 /*-- 951 while ((bbSize >> shifts) > 65534) shifts++;
566 Step 3:
567 The [ss] big bucket is now done. Record this fact,
568 and update the quadrant descriptors. Remember to
569 update quadrants in the overshoot area too, if
570 necessary. The "if (i < 255)" test merely skips
571 this updating for the last bucket processed, since
572 updating for the last bucket is pointless.
573
574 The quadrant array provides a way to incrementally
575 cache sort orderings, as they appear, so as to
576 make subsequent comparisons in fullGtU() complete
577 faster. For repetitive blocks this makes a big
578 difference (but not big enough to be able to avoid
579 randomisation for very repetitive data.)
580
581 The precise meaning is: at all times:
582
583 for 0 <= i < nblock and 0 <= j <= nblock
584
585 if block[i] != block[j],
586
587 then the relative values of quadrant[i] and
588 quadrant[j] are meaningless.
589
590 else {
591 if quadrant[i] < quadrant[j]
592 then the string starting at i lexicographically
593 precedes the string starting at j
594
595 else if quadrant[i] > quadrant[j]
596 then the string starting at j lexicographically
597 precedes the string starting at i
598
599 else
600 the relative ordering of the strings starting
601 at i and j has not yet been determined.
602 }
603 --*/
604 bigDone[ss] = True;
605
606 if (i < 255) {
607 Int32 bbStart = ftab[ss << 8] & CLEARMASK;
608 Int32 bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
609 Int32 shifts = 0;
610
611 while ((bbSize >> shifts) > 65534) shifts++;
612
613 for (j = 0; j < bbSize; j++) {
614 Int32 a2update = zptr[bbStart + j];
615 UInt16 qVal = (UInt16)(j >> shifts);
616 quadrant[a2update] = qVal;
617 if (a2update < BZ_NUM_OVERSHOOT_BYTES)
618 quadrant[a2update + nblock] = qVal;
619 }
620 952
621 AssertH ( ( ((bbSize-1) >> shifts) <= 65535 ), 1002 ); 953 for (j = 0; j < bbSize; j++) {
954 Int32 a2update = ptr[bbStart + j];
955 UInt16 qVal = (UInt16)(j >> shifts);
956 quadrant[a2update] = qVal;
957 if (a2update < BZ_N_OVERSHOOT)
958 quadrant[a2update + nblock] = qVal;
622 } 959 }
960 AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
961 }
623 962
624 /*-- 963 /*--
625 Step 4: 964 Step 4:
626 Now scan this big bucket [ss] so as to synthesise the 965 Now scan this big bucket [ss] so as to synthesise the
627 sorted order for small buckets [t, ss] for all t != ss. 966 sorted order for small buckets [t, ss] for all t != ss.
628 This will avoid doing Real Work in subsequent Step 1's. 967 This will avoid doing Real Work in subsequent Step 1's.
629 --*/ 968 --*/
630 for (j = 0; j <= 255; j++) 969 for (j = 0; j <= 255; j++)
631 copy[j] = ftab[(j << 8) + ss] & CLEARMASK; 970 copy[j] = ftab[(j << 8) + ss] & CLEARMASK;
632 971
633 for (j = ftab[ss << 8] & CLEARMASK; 972 m = ftab[(ss+1) << 8] & CLEARMASK;
634 j < (ftab[(ss+1) << 8] & CLEARMASK); 973 for (j = ftab[ss << 8] & CLEARMASK; j < m; j++) {
635 j++) { 974 k = ptr[j] - 1; if (k < 0) k += nblock;
636 k = zptr[j]-1; if (k < 0) k += nblock; 975 c1 = (UChar)(block[k] >> 8);
637 c1 = block[k]; 976 if ( ! bigDone[c1] ) {
638 if ( ! bigDone[c1] ) { 977 ptr[copy[c1]] = k;
639 zptr[copy[c1]] = k; 978 copy[c1] ++;
640 copy[c1] ++;
641 }
642 } 979 }
643
644 for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
645 } 980 }
646 if (s->verbosity >= 4)
647 VPrintf3( " %d pointers, %d sorted, %d scanned\n",
648 nblock, numQSorted, nblock - numQSorted );
649 }
650}
651
652 981
653/*---------------------------------------------*/ 982 for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
654static void randomiseBlock ( EState* s )
655{
656 Int32 i;
657 BZ_RAND_INIT_MASK;
658 for (i = 0; i < 256; i++) s->inUse[i] = False;
659
660 for (i = 0; i < s->nblock; i++) {
661 BZ_RAND_UPD_MASK;
662 s->block[i] ^= BZ_RAND_MASK;
663 s->inUse[s->block[i]] = True;
664 } 983 }
984
985 if (verb >= 4)
986 VPrintf3 ( " %d pointers, %d sorted, %d scanned\n",
987 nblock, numQSorted, nblock - numQSorted );
665} 988}
666 989
990#undef BIGFREQ
991#undef SETMASK
992#undef CLEARMASK
993
667 994
668/*---------------------------------------------*/ 995/*---------------------------------------------*/
996/* Pre:
997 nblock > 0
998 arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
999 ((UInt16*)arr2) [0 .. nblock-1] [15:8] holds block
1000 arr1 exists for [0 .. nblock-1]
1001
1002 Post:
1003 ((UInt16*)arr2) [0 .. nblock-1] [15:8] holds block
1004 All other areas of block destroyed
1005 ftab [ 0 .. 65536 ] destroyed
1006 arr1 [0 .. nblock-1] holds sorted order
1007*/
669void blockSort ( EState* s ) 1008void blockSort ( EState* s )
670{ 1009{
671 Int32 i; 1010 UInt32* ptr = s->ptr;
672 1011 UInt16* block = s->block;
673 s->workLimit = s->workFactor * (s->nblock - 1); 1012 UInt32* ftab = s->ftab;
674 s->workDone = 0; 1013 Int32 nblock = s->nblock;
675 s->blockRandomised = False; 1014 Int32 verb = s->verbosity;
676 s->firstAttempt = True; 1015 Int32 wfact = s->workFactor;
677 1016 UInt16* quadrant;
678 sortMain ( s ); 1017 Int32 budget;
679 1018 Int32 budgetInit;
680 if (s->verbosity >= 3) 1019 Int32 i;
681 VPrintf3( " %d work, %d block, ratio %5.2f\n", 1020
682 s->workDone, s->nblock-1, 1021 if (nblock < 10000) {
683 (float)(s->workDone) / (float)(s->nblock-1) ); 1022 for (i = 0; i < nblock; i++) block[i] <<= 8;
684 1023 fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
685 if (s->workDone > s->workLimit && s->firstAttempt) { 1024 } else {
686 if (s->verbosity >= 2) 1025 quadrant = &(block[nblock+BZ_N_OVERSHOOT]);
687 VPrintf0( " sorting aborted; randomising block\n" ); 1026
688 randomiseBlock ( s ); 1027 /* (wfact-1) / 3 puts the default-factor-30
689 s->workLimit = s->workDone = 0; 1028 transition point at very roughly the same place as
690 s->blockRandomised = True; 1029 with v0.1 and v0.9.0.
691 s->firstAttempt = False; 1030 Not that it particularly matters any more, since the
692 sortMain ( s ); 1031 resulting compressed stream is now the same regardless
693 if (s->verbosity >= 3) 1032 of whether or not we use the main sort or fallback sort.
694 VPrintf3( " %d work, %d block, ratio %f\n", 1033 */
695 s->workDone, s->nblock-1, 1034 if (wfact < 1 ) wfact = 1;
696 (float)(s->workDone) / (float)(s->nblock-1) ); 1035 if (wfact > 100) wfact = 100;
1036 budgetInit = nblock * ((wfact-1) / 3);
1037 budget = budgetInit;
1038
1039 mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1040 if (verb >= 3)
1041 VPrintf3 ( " %d work, %d block, ratio %5.2f\n",
1042 budgetInit - budget,
1043 nblock,
1044 (float)(budgetInit - budget) /
1045 (float)(nblock==0 ? 1 : nblock) );
1046 if (budget < 0) {
1047 if (verb >= 2)
1048 VPrintf0 ( " too repetitive; using fallback"
1049 " sorting algorithm\n" );
1050 fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1051 }
697 } 1052 }
698 1053
699 s->origPtr = -1; 1054 s->origPtr = -1;
700 for (i = 0; i < s->nblock; i++) 1055 for (i = 0; i < s->nblock; i++)
701 if (s->zptr[i] == 0) 1056 if (ptr[i] == 0)
702 { s->origPtr = i; break; }; 1057 { s->origPtr = i; break; };
703 1058
704 AssertH( s->origPtr != -1, 1003 ); 1059 AssertH( s->origPtr != -1, 1003 );
705} 1060}
706 1061
1062
707/*-------------------------------------------------------------*/ 1063/*-------------------------------------------------------------*/
708/*--- end blocksort.c ---*/ 1064/*--- end blocksort.c ---*/
709/*-------------------------------------------------------------*/ 1065/*-------------------------------------------------------------*/
diff --git a/bzip2.1 b/bzip2.1
index a6789a4..99eda9b 100644
--- a/bzip2.1
+++ b/bzip2.1
@@ -1,7 +1,7 @@
1.PU 1.PU
2.TH bzip2 1 2.TH bzip2 1
3.SH NAME 3.SH NAME
4bzip2, bunzip2 \- a block-sorting file compressor, v0.9.0 4bzip2, bunzip2 \- a block-sorting file compressor, v0.9.5
5.br 5.br
6bzcat \- decompresses files to stdout 6bzcat \- decompresses files to stdout
7.br 7.br
@@ -10,7 +10,7 @@ bzip2recover \- recovers data from damaged bzip2 files
10.SH SYNOPSIS 10.SH SYNOPSIS
11.ll +8 11.ll +8
12.B bzip2 12.B bzip2
13.RB [ " \-cdfkstvzVL123456789 " ] 13.RB [ " \-cdfkqstvzVL123456789 " ]
14[ 14[
15.I "filenames \&..." 15.I "filenames \&..."
16] 16]
@@ -18,13 +18,13 @@ bzip2recover \- recovers data from damaged bzip2 files
18.br 18.br
19.B bunzip2 19.B bunzip2
20.RB [ " \-fkvsVL " ] 20.RB [ " \-fkvsVL " ]
21[ 21[
22.I "filenames \&..." 22.I "filenames \&..."
23] 23]
24.br 24.br
25.B bzcat 25.B bzcat
26.RB [ " \-s " ] 26.RB [ " \-s " ]
27[ 27[
28.I "filenames \&..." 28.I "filenames \&..."
29] 29]
30.br 30.br
@@ -33,211 +33,171 @@ bzip2recover \- recovers data from damaged bzip2 files
33 33
34.SH DESCRIPTION 34.SH DESCRIPTION
35.I bzip2 35.I bzip2
36compresses files using the Burrows-Wheeler block-sorting 36compresses files using the Burrows-Wheeler block sorting
37text compression algorithm, and Huffman coding. 37text compression algorithm, and Huffman coding. Compression is
38Compression is generally considerably 38generally considerably better than that achieved by more conventional
39better than that 39LZ77/LZ78-based compressors, and approaches the performance of the PPM
40achieved by more conventional LZ77/LZ78-based compressors, 40family of statistical compressors.
41and approaches the performance of the PPM family of statistical
42compressors.
43 41
44The command-line options are deliberately very similar to 42The command-line options are deliberately very similar to
45those of 43those of
46.I GNU Gzip, 44.I GNU gzip,
47but they are not identical. 45but they are not identical.
48 46
49.I bzip2 47.I bzip2
50expects a list of file names to accompany the command-line flags. 48expects a list of file names to accompany the
51Each file is replaced by a compressed version of itself, 49command-line flags. Each file is replaced by a compressed version of
52with the name "original_name.bz2". 50itself, with the name "original_name.bz2".
53Each compressed file has the same modification date and permissions 51Each compressed file
54as the corresponding original, so that these properties can be 52has the same modification date, permissions, and, when possible,
55correctly restored at decompression time. File name handling is 53ownership as the corresponding original, so that these properties can
56naive in the sense that there is no mechanism for preserving 54be correctly restored at decompression time. File name handling is
57original file names, permissions and dates in filesystems 55naive in the sense that there is no mechanism for preserving original
58which lack these concepts, or have serious file name length 56file names, permissions, ownerships or dates in filesystems which lack
59restrictions, such as MS-DOS. 57these concepts, or have serious file name length restrictions, such as
58MS-DOS.
60 59
61.I bzip2 60.I bzip2
62and 61and
63.I bunzip2 62.I bunzip2
64will by default not overwrite existing files; 63will by default not overwrite existing
65if you want this to happen, specify the \-f flag. 64files. If you want this to happen, specify the \-f flag.
66 65
67If no file names are specified, 66If no file names are specified,
68.I bzip2 67.I bzip2
69compresses from standard input to standard output. 68compresses from standard
70In this case, 69input to standard output. In this case,
71.I bzip2 70.I bzip2
72will decline to write compressed output to a terminal, as 71will decline to
73this would be entirely incomprehensible and therefore pointless. 72write compressed output to a terminal, as this would be entirely
73incomprehensible and therefore pointless.
74 74
75.I bunzip2 75.I bunzip2
76(or 76(or
77.I bzip2 \-d 77.I bzip2 \-d)
78) decompresses and restores all specified files whose names 78decompresses all
79end in ".bz2". 79specified files. Files which were not created by
80Files without this suffix are ignored.
81Again, supplying no filenames
82causes decompression from standard input to standard output.
83
84.I bunzip2
85will correctly decompress a file which is the concatenation
86of two or more compressed files. The result is the concatenation
87of the corresponding uncompressed files. Integrity testing
88(\-t) of concatenated compressed files is also supported.
89
90You can also compress or decompress files to
91the standard output by giving the \-c flag.
92Multiple files may be compressed and decompressed like this.
93The resulting outputs are fed sequentially to stdout.
94Compression of multiple files in this manner generates
95a stream containing multiple compressed file representations.
96Such a stream can be decompressed correctly only by
97.I bzip2 80.I bzip2
98version 0.9.0 or later. Earlier versions of 81will be detected and ignored, and a warning issued.
99.I bzip2 82.I bzip2
100will stop after decompressing the first file in the stream. 83attempts to guess the filename for the decompressed file
84from that of the compressed file as follows:
85
86 filename.bz2 becomes filename
87 filename.bz becomes filename
88 filename.tbz2 becomes filename.tar
89 filename.tbz becomes filename.tar
90 anyothername becomes anyothername.out
91
92If the file does not end in one of the recognised endings,
93.I .bz2,
94.I .bz,
95.I .tbz2
96or
97.I .tbz,
98.I bzip2
99complains that it cannot
100guess the name of the original file, and uses the original name
101with
102.I .out
103appended.
104
105As with compression, supplying no
106filenames causes decompression from
107standard input to standard output.
108
109.I bunzip2
110will correctly decompress a file which is the
111concatenation of two or more compressed files. The result is the
112concatenation of the corresponding uncompressed files. Integrity
113testing (\-t)
114of concatenated
115compressed files is also supported.
116
117You can also compress or decompress files to the standard output by
118giving the \-c flag. Multiple files may be compressed and
119decompressed like this. The resulting outputs are fed sequentially to
120stdout. Compression of multiple files
121in this manner generates a stream
122containing multiple compressed file representations. Such a stream
123can be decompressed correctly only by
124.I bzip2
125version 0.9.0 or
126later. Earlier versions of
127.I bzip2
128will stop after decompressing
129the first file in the stream.
101 130
102.I bzcat 131.I bzcat
103(or 132(or
104.I bzip2 \-dc 133.I bzip2 \-dc)
105) decompresses all specified files to the standard output. 134decompresses all specified files to
106 135the standard output.
107Compression is always performed, even if the compressed file is 136
108slightly larger than the original. Files of less than about
109one hundred bytes tend to get larger, since the compression
110mechanism has a constant overhead in the region of 50 bytes.
111Random data (including the output of most file compressors)
112is coded at about 8.05 bits per byte, giving an expansion of
113around 0.5%.
114
115As a self-check for your protection,
116.I bzip2 137.I bzip2
117uses 32-bit CRCs to make sure that the decompressed 138will read arguments from the environment variables
118version of a file is identical to the original. 139.I BZIP2
119This guards against corruption of the compressed data, 140and
120and against undetected bugs in 141.I BZIP,
142in that order, and will process them
143before any arguments read from the command line. This gives a
144convenient way to supply default arguments.
145
146Compression is always performed, even if the compressed
147file is slightly
148larger than the original. Files of less than about one hundred bytes
149tend to get larger, since the compression mechanism has a constant
150overhead in the region of 50 bytes. Random data (including the output
151of most file compressors) is coded at about 8.05 bits per byte, giving
152an expansion of around 0.5%.
153
154As a self-check for your protection,
155.I
156bzip2
157uses 32-bit CRCs to
158make sure that the decompressed version of a file is identical to the
159original. This guards against corruption of the compressed data, and
160against undetected bugs in
121.I bzip2 161.I bzip2
122(hopefully very unlikely). 162(hopefully very unlikely). The
123The chances of data corruption going undetected are 163chances of data corruption going undetected are microscopic, about one
124microscopic, about one chance in four billion 164chance in four billion for each file processed. Be aware, though, that
125for each file processed. Be aware, though, that the check 165the check occurs upon decompression, so it can only tell you that
126occurs upon decompression, so it can only tell you that 166something is wrong. It can't help you
127something is wrong. It can't help you recover the 167recover the original uncompressed
128original uncompressed data. 168data. You can use
129You can use
130.I bzip2recover 169.I bzip2recover
131to try to recover data from damaged files. 170to try to recover data from
132 171damaged files.
133Return values:
1340 for a normal exit,
1351 for environmental
136problems (file not found, invalid flags, I/O errors, &c),
1372 to indicate a corrupt compressed file,
1383 for an internal consistency error (eg, bug) which caused
139.I bzip2
140to panic.
141 172
142.SH MEMORY MANAGEMENT 173Return values: 0 for a normal exit, 1 for environmental problems (file
143.I Bzip2 174not found, invalid flags, I/O errors, &c), 2 to indicate a corrupt
144compresses large files in blocks. The block size affects both the 175compressed file, 3 for an internal consistency error (eg, bug) which
145compression ratio achieved, and the amount of memory needed both for 176caused
146compression and decompression. The flags \-1 through \-9
147specify the block size to be 100,000 bytes through 900,000 bytes
148(the default) respectively. At decompression-time, the block size used for
149compression is read from the header of the compressed file, and
150.I bunzip2
151then allocates itself just enough memory to decompress the file.
152Since block sizes are stored in compressed files, it follows that the flags
153\-1 to \-9
154are irrelevant to and so ignored during decompression.
155Compression and decompression requirements, in bytes, can be estimated as:
156
157 Compression: 400k + ( 7 x block size )
158
159 Decompression: 100k + ( 4 x block size ), or
160.br
161 100k + ( 2.5 x block size )
162
163Larger block sizes give rapidly diminishing marginal returns; most
164of the
165compression comes from the first two or three hundred k of block size,
166a fact worth bearing in mind when using
167.I bzip2 177.I bzip2
168on small machines. It is also important to appreciate that the 178to panic.
169decompression memory requirement is set at compression-time by the
170choice of block size.
171
172For files compressed with the default 900k block size,
173.I bunzip2
174will require about 3700 kbytes to decompress.
175To support decompression of any file on a 4 megabyte machine,
176.I bunzip2
177has an option to decompress using approximately half this
178amount of memory, about 2300 kbytes. Decompression speed is
179also halved, so you should use this option only where necessary.
180The relevant flag is \-s.
181
182In general, try and use the largest block size
183memory constraints allow, since that maximises the compression
184achieved. Compression and decompression
185speed are virtually unaffected by block size.
186
187Another significant point applies to files which fit in a single
188block -- that means most files you'd encounter using a large
189block size. The amount of real memory touched is proportional
190to the size of the file, since the file is smaller than a block.
191For example, compressing a file 20,000 bytes long with the flag
192\-9
193will cause the compressor to allocate around
1946700k of memory, but only touch 400k + 20000 * 7 = 540
195kbytes of it. Similarly, the decompressor will allocate 3700k but
196only touch 100k + 20000 * 4 = 180 kbytes.
197
198Here is a table which summarises the maximum memory usage for
199different block sizes. Also recorded is the total compressed
200size for 14 files of the Calgary Text Compression Corpus
201totalling 3,141,622 bytes. This column gives some feel for how
202compression varies with block size. These figures tend to understate
203the advantage of larger block sizes for larger files, since the
204Corpus is dominated by smaller files.
205
206 Compress Decompress Decompress Corpus
207 Flag usage usage -s usage Size
208
209 -1 1100k 500k 350k 914704
210 -2 1800k 900k 600k 877703
211 -3 2500k 1300k 850k 860338
212 -4 3200k 1700k 1100k 846899
213 -5 3900k 2100k 1350k 845160
214 -6 4600k 2500k 1600k 838626
215 -7 5400k 2900k 1850k 834096
216 -8 6000k 3300k 2100k 828642
217 -9 6700k 3700k 2350k 828642
218 179
219.SH OPTIONS 180.SH OPTIONS
220.TP 181.TP
221.B \-c --stdout 182.B \-c --stdout
222Compress or decompress to standard output. \-c will decompress 183Compress or decompress to standard output.
223multiple files to stdout, but will only compress a single file to
224stdout.
225.TP 184.TP
226.B \-d --decompress 185.B \-d --decompress
227Force decompression. 186Force decompression.
228.I bzip2, 187.I bzip2,
229.I bunzip2 188.I bunzip2
230and 189and
231.I bzcat 190.I bzcat
232are really the same program, and the decision about what actions 191are
233to take is done on the basis of which name is 192really the same program, and the decision about what actions to take is
234used. This flag overrides that mechanism, and forces 193done on the basis of which name is used. This flag overrides that
194mechanism, and forces
235.I bzip2 195.I bzip2
236to decompress. 196to decompress.
237.TP 197.TP
238.B \-z --compress 198.B \-z --compress
239The complement to \-d: forces compression, regardless of the invocation 199The complement to \-d: forces compression, regardless of the
240name. 200invocation name.
241.TP 201.TP
242.B \-t --test 202.B \-t --test
243Check integrity of the specified file(s), but don't decompress them. 203Check integrity of the specified file(s), but don't decompress them.
@@ -245,25 +205,31 @@ This really performs a trial decompression and throws away the result.
245.TP 205.TP
246.B \-f --force 206.B \-f --force
247Force overwrite of output files. Normally, 207Force overwrite of output files. Normally,
248.I bzip2 208.I bzip2
249will not overwrite existing output files. 209will not overwrite
210existing output files. Also forces
211.I bzip2
212to break hard links
213to files, which it otherwise wouldn't do.
250.TP 214.TP
251.B \-k --keep 215.B \-k --keep
252Keep (don't delete) input files during compression or decompression. 216Keep (don't delete) input files during compression
217or decompression.
253.TP 218.TP
254.B \-s --small 219.B \-s --small
255Reduce memory usage, for compression, decompression and 220Reduce memory usage, for compression, decompression and testing. Files
256testing. 221are decompressed and tested using a modified algorithm which only
257Files are decompressed and tested using a modified algorithm which only
258requires 2.5 bytes per block byte. This means any file can be 222requires 2.5 bytes per block byte. This means any file can be
259decompressed in 2300k of memory, albeit at about half the normal 223decompressed in 2300k of memory, albeit at about half the normal speed.
260speed. 224
261 225During compression, \-s selects a block size of 200k, which limits
262During compression, -s selects a block size of 200k, which limits 226memory use to around the same figure, at the expense of your compression
263memory use to around the same figure, at the expense of your 227ratio. In short, if your machine is low on memory (8 megabytes or
264compression ratio. In short, if your machine is low on memory 228less), use \-s for everything. See MEMORY MANAGEMENT below.
265(8 megabytes or less), use -s for everything. See 229.TP
266MEMORY MANAGEMENT above. 230.B \-q --quiet
231Suppress non-essential warning messages. Messages pertaining to
232I/O errors and other critical events will not be suppressed.
267.TP 233.TP
268.B \-v --verbose 234.B \-v --verbose
269Verbose mode -- show the compression ratio for each file processed. 235Verbose mode -- show the compression ratio for each file processed.
@@ -273,147 +239,199 @@ information which is primarily of interest for diagnostic purposes.
273.B \-L --license -V --version 239.B \-L --license -V --version
274Display the software version, license terms and conditions. 240Display the software version, license terms and conditions.
275.TP 241.TP
276.B \-1 to \-9 242.B \-1 to \-9
277Set the block size to 100 k, 200 k .. 900 k when 243Set the block size to 100 k, 200 k .. 900 k when compressing. Has no
278compressing. Has no effect when decompressing. 244effect when decompressing. See MEMORY MANAGEMENT below.
279See MEMORY MANAGEMENT above.
280.TP 245.TP
281.B \--repetitive-fast 246.B \--
282.I bzip2 247Treats all subsequent arguments as file names, even if they start
283injects some small pseudo-random variations 248with a dash. This is so you can handle files with names beginning
284into very repetitive blocks to limit 249with a dash, for example: bzip2 \-- \-myfilename.
285worst-case performance during compression. 250.TP
286If sorting runs into difficulties, the block 251.B \--repetitive-fast --repetitive-best
287is randomised, and sorting is restarted. 252These flags are redundant in versions 0.9.5 and above. They provided
288Very roughly, 253some coarse control over the behaviour of the sorting algorithm in
254earlier versions, which was sometimes useful. 0.9.5 and above have an
255improved algorithm which renders these flags irrelevant.
256
257.SH MEMORY MANAGEMENT
258.I bzip2
259compresses large files in blocks. The block size affects
260both the compression ratio achieved, and the amount of memory needed for
261compression and decompression. The flags \-1 through \-9
262specify the block size to be 100,000 bytes through 900,000 bytes (the
263default) respectively. At decompression time, the block size used for
264compression is read from the header of the compressed file, and
265.I bunzip2
266then allocates itself just enough memory to decompress
267the file. Since block sizes are stored in compressed files, it follows
268that the flags \-1 to \-9 are irrelevant to and so ignored
269during decompression.
270
271Compression and decompression requirements,
272in bytes, can be estimated as:
273
274 Compression: 400k + ( 8 x block size )
275
276 Decompression: 100k + ( 4 x block size ), or
277 100k + ( 2.5 x block size )
278
279Larger block sizes give rapidly diminishing marginal returns. Most of
280the compression comes from the first two or three hundred k of block
281size, a fact worth bearing in mind when using
289.I bzip2 282.I bzip2
290persists for three times as long as a well-behaved input 283on small machines.
291would take before resorting to randomisation. 284It is also important to appreciate that the decompression memory
292This flag makes it give up much sooner. 285requirement is set at compression time by the choice of block size.
293 286
294.TP 287For files compressed with the default 900k block size,
295.B \--repetitive-best 288.I bunzip2
296Opposite of \--repetitive-fast; try a lot harder before 289will require about 3700 kbytes to decompress. To support decompression
297resorting to randomisation. 290of any file on a 4 megabyte machine,
291.I bunzip2
292has an option to
293decompress using approximately half this amount of memory, about 2300
294kbytes. Decompression speed is also halved, so you should use this
295option only where necessary. The relevant flag is -s.
296
297In general, try and use the largest block size memory constraints allow,
298since that maximises the compression achieved. Compression and
299decompression speed are virtually unaffected by block size.
300
301Another significant point applies to files which fit in a single block
302-- that means most files you'd encounter using a large block size. The
303amount of real memory touched is proportional to the size of the file,
304since the file is smaller than a block. For example, compressing a file
30520,000 bytes long with the flag \-9 will cause the compressor to
306allocate around 7600k of memory, but only touch 400k + 20000 * 8 = 560
307kbytes of it. Similarly, the decompressor will allocate 3700k but only
308touch 100k + 20000 * 4 = 180 kbytes.
309
310Here is a table which summarises the maximum memory usage for different
311block sizes. Also recorded is the total compressed size for 14 files of
312the Calgary Text Compression Corpus totalling 3,141,622 bytes. This
313column gives some feel for how compression varies with block size.
314These figures tend to understate the advantage of larger block sizes for
315larger files, since the Corpus is dominated by smaller files.
316
317 Compress Decompress Decompress Corpus
318 Flag usage usage -s usage Size
319
320 -1 1200k 500k 350k 914704
321 -2 2000k 900k 600k 877703
322 -3 2800k 1300k 850k 860338
323 -4 3600k 1700k 1100k 846899
324 -5 4400k 2100k 1350k 845160
325 -6 5200k 2500k 1600k 838626
326 -7 6100k 2900k 1850k 834096
327 -8 6800k 3300k 2100k 828642
328 -9 7600k 3700k 2350k 828642
298 329
299.SH RECOVERING DATA FROM DAMAGED FILES 330.SH RECOVERING DATA FROM DAMAGED FILES
300.I bzip2 331.I bzip2
301compresses files in blocks, usually 900kbytes long. 332compresses files in blocks, usually 900kbytes long. Each
302Each block is handled independently. If a media or 333block is handled independently. If a media or transmission error causes
303transmission error causes a multi-block .bz2 334a multi-block .bz2
304file to become damaged, 335file to become damaged, it may be possible to
305it may be possible to recover data from the undamaged blocks 336recover data from the undamaged blocks in the file.
306in the file. 337
307 338The compressed representation of each block is delimited by a 48-bit
308The compressed representation of each block is delimited by 339pattern, which makes it possible to find the block boundaries with
309a 48-bit pattern, which makes it possible to find the block 340reasonable certainty. Each block also carries its own 32-bit CRC, so
310boundaries with reasonable certainty. Each block also carries 341damaged blocks can be distinguished from undamaged ones.
311its own 32-bit CRC, so damaged blocks can be
312distinguished from undamaged ones.
313 342
314.I bzip2recover 343.I bzip2recover
315is a simple program whose purpose is to search for 344is a simple program whose purpose is to search for
316blocks in .bz2 files, and write each block out into 345blocks in .bz2 files, and write each block out into its own .bz2
317its own .bz2 file. You can then use 346file. You can then use
318.I bzip2 -t 347.I bzip2
319to test the integrity of the resulting files, 348\-t
320and decompress those which are undamaged. 349to test the
350integrity of the resulting files, and decompress those which are
351undamaged.
321 352
322.I bzip2recover 353.I bzip2recover
323takes a single argument, the name of the damaged file, 354takes a single argument, the name of the damaged file,
324and writes a number of files "rec0001file.bz2", "rec0002file.bz2", 355and writes a number of files "rec0001file.bz2",
325etc, containing the extracted blocks. The output filenames 356"rec0002file.bz2", etc, containing the extracted blocks.
326are designed so that the use of wildcards in subsequent processing 357The output filenames are designed so that the use of
327-- for example, "bzip2 -dc rec*file.bz2 > recovered_data" -- 358wildcards in subsequent processing -- for example,
328lists the files in the "right" order. 359"bzip2 -dc rec*file.bz2 > recovered_data" -- lists the files in
360the correct order.
329 361
330.I bzip2recover 362.I bzip2recover
331should be of most use dealing with large .bz2 files, as 363should be of most use dealing with large .bz2
332these will contain many blocks. It is clearly futile to 364files, as these will contain many blocks. It is clearly
333use it on damaged single-block files, since a damaged 365futile to use it on damaged single-block files, since a
334block cannot be recovered. If you wish to minimise 366damaged block cannot be recovered. If you wish to minimise
335any potential data loss through media or transmission 367any potential data loss through media or transmission errors,
336errors, you might consider compressing with a smaller 368you might consider compressing with a smaller
337block size. 369block size.
338 370
339.SH PERFORMANCE NOTES 371.SH PERFORMANCE NOTES
340The sorting phase of compression gathers together similar strings 372The sorting phase of compression gathers together similar strings in the
341in the file. Because of this, files containing very long 373file. Because of this, files containing very long runs of repeated
342runs of repeated symbols, like "aabaabaabaab ..." (repeated 374symbols, like "aabaabaabaab ..." (repeated several hundred times) may
343several hundred times) may compress extraordinarily slowly. 375compress more slowly than normal. Versions 0.9.5 and above fare much
344You can use the 376better than previous versions in this respect. The ratio between
345\-vvvvv 377worst-case and average-case compression time is in the region of 10:1.
346option to monitor progress in great detail, if you want. 378For previous versions, this figure was more like 100:1. You can use the
347Decompression speed is unaffected. 379\-vvvv option to monitor progress in great detail, if you want.
348 380
349Such pathological cases 381Decompression speed is unaffected by these phenomena.
350seem rare in practice, appearing mostly in artificially-constructed
351test files, and in low-level disk images. It may be inadvisable to
352use
353.I bzip2
354to compress the latter.
355If you do get a file which causes severe slowness in compression,
356try making the block size as small as possible, with flag \-1.
357 382
358.I bzip2 383.I bzip2
359usually allocates several megabytes of memory to operate in, 384usually allocates several megabytes of memory to operate
360and then charges all over it in a fairly random fashion. This 385in, and then charges all over it in a fairly random fashion. This means
361means that performance, both for compressing and decompressing, 386that performance, both for compressing and decompressing, is largely
362is largely determined by the speed 387determined by the speed at which your machine can service cache misses.
363at which your machine can service cache misses. 388Because of this, small changes to the code to reduce the miss rate have
364Because of this, small changes 389been observed to give disproportionately large performance improvements.
365to the code to reduce the miss rate have been observed to give
366disproportionately large performance improvements.
367I imagine 390I imagine
368.I bzip2 391.I bzip2
369will perform best on machines with very large caches. 392will perform best on machines with very large caches.
370 393
371.SH CAVEATS 394.SH CAVEATS
372I/O error messages are not as helpful as they could be. 395I/O error messages are not as helpful as they could be.
373.I Bzip2 396.I bzip2
374tries hard to detect I/O errors and exit cleanly, but the 397tries hard to detect I/O errors and exit cleanly, but the details of
375details of what the problem is sometimes seem rather misleading. 398what the problem is sometimes seem rather misleading.
376 399
377This manual page pertains to version 0.9.0 of 400This manual page pertains to version 0.9.5 of
378.I bzip2. 401.I bzip2.
379Compressed data created by this version is entirely forwards and 402Compressed
380backwards compatible with the previous public release, version 0.1pl2, 403data created by this version is entirely forwards and backwards
381but with the following exception: 0.9.0 can correctly decompress 404compatible with the previous public releases, versions 0.1pl2 and 0.9.0,
382multiple concatenated compressed files. 0.1pl2 cannot do this; it 405but with the following exception: 0.9.0 and above can correctly
383will stop after decompressing just the first file in the stream. 406decompress multiple concatenated compressed files. 0.1pl2 cannot do
384 407this; it will stop after decompressing just the first file in the
385Wildcard expansion for Windows 95 and NT 408stream.
386is flaky.
387 409
388.I bzip2recover 410.I bzip2recover
389uses 32-bit integers to represent bit positions in 411uses 32-bit integers to represent bit positions in
390compressed files, so it cannot handle compressed files 412compressed files, so it cannot handle compressed files more than 512
391more than 512 megabytes long. This could easily be fixed. 413megabytes long. This could easily be fixed.
392 414
393.SH AUTHOR 415.SH AUTHOR
394Julian Seward, jseward@acm.org. 416Julian Seward, jseward@acm.org.
395 417
396http://www.muraroa.demon.co.uk 418http://www.muraroa.demon.co.uk
397 419
398The ideas embodied in 420The ideas embodied in
399.I bzip2 421.I bzip2
400are due to (at least) the following people: 422are due to (at least) the following
401Michael Burrows and David Wheeler (for the block sorting 423people: Michael Burrows and David Wheeler (for the block sorting
402transformation), David Wheeler (again, for the Huffman coder), 424transformation), David Wheeler (again, for the Huffman coder), Peter
403Peter Fenwick (for the structured coding model in the original 425Fenwick (for the structured coding model in the original
404.I bzip, 426.I bzip,
405and many refinements), 427and many refinements), and Alistair Moffat, Radford Neal and Ian Witten
406and 428(for the arithmetic coder in the original
407Alistair Moffat, Radford Neal and Ian Witten (for the arithmetic
408coder in the original
409.I bzip). 429.I bzip).
410I am much indebted for their help, support and advice. 430I am much
411See the manual in the source distribution for pointers to 431indebted for their help, support and advice. See the manual in the
412sources of documentation. 432source distribution for pointers to sources of documentation. Christian
413Christian von Roques encouraged me to look for faster 433von Roques encouraged me to look for faster sorting algorithms, so as to
414sorting algorithms, so as to speed up compression. 434speed up compression. Bela Lubkin encouraged me to improve the
415Bela Lubkin encouraged me to improve the worst-case 435worst-case compression performance. Many people sent patches, helped
416compression performance. 436with portability problems, lent machines, gave advice and were generally
417Many people sent patches, helped with portability problems, 437helpful.
418lent machines, gave advice and were generally helpful.
419
diff --git a/bzip2.1.preformatted b/bzip2.1.preformatted
index 8c4fab1..96b44be 100644
--- a/bzip2.1.preformatted
+++ b/bzip2.1.preformatted
@@ -1,24 +1,20 @@
1 1
2 2
3
4bzip2(1) bzip2(1)
5
6
7NNAAMMEE 3NNAAMMEE
8 bzip2, bunzip2 - a block-sorting file compressor, v0.9.0 4 bzip2, bunzip2 - a block-sorting file compressor, v0.9.5
9 bzcat - decompresses files to stdout 5 bzcat - decompresses files to stdout
10 bzip2recover - recovers data from damaged bzip2 files 6 bzip2recover - recovers data from damaged bzip2 files
11 7
12 8
13SSYYNNOOPPSSIISS 9SSYYNNOOPPSSIISS
14 bbzziipp22 [ --ccddffkkssttvvzzVVLL112233445566778899 ] [ _f_i_l_e_n_a_m_e_s _._._. ] 10 bbzziipp22 [ --ccddffkkqqssttvvzzVVLL112233445566778899 ] [ _f_i_l_e_n_a_m_e_s _._._. ]
15 bbuunnzziipp22 [ --ffkkvvssVVLL ] [ _f_i_l_e_n_a_m_e_s _._._. ] 11 bbuunnzziipp22 [ --ffkkvvssVVLL ] [ _f_i_l_e_n_a_m_e_s _._._. ]
16 bbzzccaatt [ --ss ] [ _f_i_l_e_n_a_m_e_s _._._. ] 12 bbzzccaatt [ --ss ] [ _f_i_l_e_n_a_m_e_s _._._. ]
17 bbzziipp22rreeccoovveerr _f_i_l_e_n_a_m_e 13 bbzziipp22rreeccoovveerr _f_i_l_e_n_a_m_e
18 14
19 15
20DDEESSCCRRIIPPTTIIOONN 16DDEESSCCRRIIPPTTIIOONN
21 _b_z_i_p_2 compresses files using the Burrows-Wheeler block- 17 _b_z_i_p_2 compresses files using the Burrows-Wheeler block
22 sorting text compression algorithm, and Huffman coding. 18 sorting text compression algorithm, and Huffman coding.
23 Compression is generally considerably better than that 19 Compression is generally considerably better than that
24 achieved by more conventional LZ77/LZ78-based compressors, 20 achieved by more conventional LZ77/LZ78-based compressors,
@@ -26,22 +22,22 @@ DDEESSCCRRIIPPTTIIOONN
26 tistical compressors. 22 tistical compressors.
27 23
28 The command-line options are deliberately very similar to 24 The command-line options are deliberately very similar to
29 those of _G_N_U _G_z_i_p_, but they are not identical. 25 those of _G_N_U _g_z_i_p_, but they are not identical.
30 26
31 _b_z_i_p_2 expects a list of file names to accompany the com- 27 _b_z_i_p_2 expects a list of file names to accompany the com-
32 mand-line flags. Each file is replaced by a compressed 28 mand-line flags. Each file is replaced by a compressed
33 version of itself, with the name "original_name.bz2". 29 version of itself, with the name "original_name.bz2".
34 Each compressed file has the same modification date and 30 Each compressed file has the same modification date, per-
35 permissions as the corresponding original, so that these 31 missions, and, when possible, ownership as the correspond-
36 properties can be correctly restored at decompression 32 ing original, so that these properties can be correctly
37 time. File name handling is naive in the sense that there 33 restored at decompression time. File name handling is
38 is no mechanism for preserving original file names, per- 34 naive in the sense that there is no mechanism for preserv-
39 missions and dates in filesystems which lack these con- 35 ing original file names, permissions, ownerships or dates
40 cepts, or have serious file name length restrictions, such 36 in filesystems which lack these concepts, or have serious
41 as MS-DOS. 37 file name length restrictions, such as MS-DOS.
42 38
43 _b_z_i_p_2 and _b_u_n_z_i_p_2 will by default not overwrite existing 39 _b_z_i_p_2 and _b_u_n_z_i_p_2 will by default not overwrite existing
44 files; if you want this to happen, specify the -f flag. 40 files. If you want this to happen, specify the -f flag.
45 41
46 If no file names are specified, _b_z_i_p_2 compresses from 42 If no file names are specified, _b_z_i_p_2 compresses from
47 standard input to standard output. In this case, _b_z_i_p_2 43 standard input to standard output. In this case, _b_z_i_p_2
@@ -49,42 +45,50 @@ DDEESSCCRRIIPPTTIIOONN
49 this would be entirely incomprehensible and therefore 45 this would be entirely incomprehensible and therefore
50 pointless. 46 pointless.
51 47
52 _b_u_n_z_i_p_2 (or _b_z_i_p_2 _-_d ) decompresses and restores all spec- 48 _b_u_n_z_i_p_2 (or _b_z_i_p_2 _-_d_) decompresses all specified files.
53 ified files whose names end in ".bz2". Files without this 49 Files which were not created by _b_z_i_p_2 will be detected and
54 suffix are ignored. Again, supplying no filenames causes 50 ignored, and a warning issued. _b_z_i_p_2 attempts to guess
55 decompression from standard input to standard output. 51 the filename for the decompressed file from that of the
52 compressed file as follows:
53
54 filename.bz2 becomes filename
55 filename.bz becomes filename
56 filename.tbz2 becomes filename.tar
57 filename.tbz becomes filename.tar
58 anyothername becomes anyothername.out
59
60 If the file does not end in one of the recognised endings,
61 _._b_z_2_, _._b_z_, _._t_b_z_2 or _._t_b_z_, _b_z_i_p_2 complains that it cannot
62 guess the name of the original file, and uses the original
63 name with _._o_u_t appended.
64
65 As with compression, supplying no filenames causes decom-
66 pression from standard input to standard output.
56 67
57 _b_u_n_z_i_p_2 will correctly decompress a file which is the con- 68 _b_u_n_z_i_p_2 will correctly decompress a file which is the con-
58 catenation of two or more compressed files. The result is 69 catenation of two or more compressed files. The result is
59 the concatenation of the corresponding uncompressed files. 70 the concatenation of the corresponding uncompressed files.
60 Integrity testing (-t) of concatenated compressed files is 71 Integrity testing (-t) of concatenated compressed files is
61
62
63
64 1
65
66
67
68
69
70bzip2(1) bzip2(1)
71
72
73 also supported. 72 also supported.
74 73
75 You can also compress or decompress files to the standard 74 You can also compress or decompress files to the standard
76 output by giving the -c flag. Multiple files may be com- 75 output by giving the -c flag. Multiple files may be com-
77 pressed and decompressed like this. The resulting outputs 76 pressed and decompressed like this. The resulting outputs
78 are fed sequentially to stdout. Compression of multiple 77 are fed sequentially to stdout. Compression of multiple
79 files in this manner generates a stream containing multi- 78 files in this manner generates a stream containing multi-
80 ple compressed file representations. Such a stream can be 79 ple compressed file representations. Such a stream can be
81 decompressed correctly only by _b_z_i_p_2 version 0.9.0 or 80 decompressed correctly only by _b_z_i_p_2 version 0.9.0 or
82 later. Earlier versions of _b_z_i_p_2 will stop after decom- 81 later. Earlier versions of _b_z_i_p_2 will stop after decom-
83 pressing the first file in the stream. 82 pressing the first file in the stream.
84 83
85 _b_z_c_a_t (or _b_z_i_p_2 _-_d_c ) decompresses all specified files to 84 _b_z_c_a_t (or _b_z_i_p_2 _-_d_c_) decompresses all specified files to
86 the standard output. 85 the standard output.
87 86
87 _b_z_i_p_2 will read arguments from the environment variables
88 _B_Z_I_P_2 and _B_Z_I_P_, in that order, and will process them
89 before any arguments read from the command line. This
90 gives a convenient way to supply default arguments.
91
88 Compression is always performed, even if the compressed 92 Compression is always performed, even if the compressed
89 file is slightly larger than the original. Files of less 93 file is slightly larger than the original. Files of less
90 than about one hundred bytes tend to get larger, since the 94 than about one hundred bytes tend to get larger, since the
@@ -101,121 +105,19 @@ bzip2(1) bzip2(1)
101 corruption going undetected is microscopic, about one 105 corruption going undetected is microscopic, about one
102 chance in four billion for each file processed. Be aware, 106 chance in four billion for each file processed. Be aware,
103 though, that the check occurs upon decompression, so it 107 though, that the check occurs upon decompression, so it
104 can only tell you that that something is wrong. It can't 108 can only tell you that something is wrong. It can't help
105 help you recover the original uncompressed data. You can 109 you recover the original uncompressed data. You can use
106 use _b_z_i_p_2_r_e_c_o_v_e_r to try to recover data from damaged 110 _b_z_i_p_2_r_e_c_o_v_e_r to try to recover data from damaged files.
107 files.
108 111
109 Return values: 0 for a normal exit, 1 for environmental 112 Return values: 0 for a normal exit, 1 for environmental
110 problems (file not found, invalid flags, I/O errors, &c), 113 problems (file not found, invalid flags, I/O errors, &c),
111 2 to indicate a corrupt compressed file, 3 for an internal 114 2 to indicate a corrupt compressed file, 3 for an internal
112 consistency error (eg, bug) which caused _b_z_i_p_2 to panic. 115 consistency error (eg, bug) which caused _b_z_i_p_2 to panic.
113 116
114 117
115MMEEMMOORRYY MMAANNAAGGEEMMEENNTT
116 _B_z_i_p_2 compresses large files in blocks. The block size
117 affects both the compression ratio achieved, and the
118 amount of memory needed both for compression and decom-
119 pression. The flags -1 through -9 specify the block size
120 to be 100,000 bytes through 900,000 bytes (the default)
121 respectively. At decompression-time, the block size used
122 for compression is read from the header of the compressed
123 file, and _b_u_n_z_i_p_2 then allocates itself just enough memory
124 to decompress the file. Since block sizes are stored in
125 compressed files, it follows that the flags -1 to -9 are
126 irrelevant to and so ignored during decompression.
127
128
129
130 2
131
132
133
134
135
136bzip2(1) bzip2(1)
137
138
139 Compression and decompression requirements, in bytes, can
140 be estimated as:
141
142 Compression: 400k + ( 7 x block size )
143
144 Decompression: 100k + ( 4 x block size ), or
145 100k + ( 2.5 x block size )
146
147 Larger block sizes give rapidly diminishing marginal
148 returns; most of the compression comes from the first two
149 or three hundred k of block size, a fact worth bearing in
150 mind when using _b_z_i_p_2 on small machines. It is also
151 important to appreciate that the decompression memory
152 requirement is set at compression-time by the choice of
153 block size.
154
155 For files compressed with the default 900k block size,
156 _b_u_n_z_i_p_2 will require about 3700 kbytes to decompress. To
157 support decompression of any file on a 4 megabyte machine,
158 _b_u_n_z_i_p_2 has an option to decompress using approximately
159 half this amount of memory, about 2300 kbytes. Decompres-
160 sion speed is also halved, so you should use this option
161 only where necessary. The relevant flag is -s.
162
163 In general, try and use the largest block size memory con-
164 straints allow, since that maximises the compression
165 achieved. Compression and decompression speed are virtu-
166 ally unaffected by block size.
167
168 Another significant point applies to files which fit in a
169 single block -- that means most files you'd encounter
170 using a large block size. The amount of real memory
171 touched is proportional to the size of the file, since the
172 file is smaller than a block. For example, compressing a
173 file 20,000 bytes long with the flag -9 will cause the
174 compressor to allocate around 6700k of memory, but only
175 touch 400k + 20000 * 7 = 540 kbytes of it. Similarly, the
176 decompressor will allocate 3700k but only touch 100k +
177 20000 * 4 = 180 kbytes.
178
179 Here is a table which summarises the maximum memory usage
180 for different block sizes. Also recorded is the total
181 compressed size for 14 files of the Calgary Text Compres-
182 sion Corpus totalling 3,141,622 bytes. This column gives
183 some feel for how compression varies with block size.
184 These figures tend to understate the advantage of larger
185 block sizes for larger files, since the Corpus is domi-
186 nated by smaller files.
187
188 Compress Decompress Decompress Corpus
189 Flag usage usage -s usage Size
190
191 -1 1100k 500k 350k 914704
192 -2 1800k 900k 600k 877703
193
194
195
196 3
197
198
199
200
201
202bzip2(1) bzip2(1)
203
204
205 -3 2500k 1300k 850k 860338
206 -4 3200k 1700k 1100k 846899
207 -5 3900k 2100k 1350k 845160
208 -6 4600k 2500k 1600k 838626
209 -7 5400k 2900k 1850k 834096
210 -8 6000k 3300k 2100k 828642
211 -9 6700k 3700k 2350k 828642
212
213
214OOPPTTIIOONNSS 118OOPPTTIIOONNSS
215 --cc ----ssttddoouutt 119 --cc ----ssttddoouutt
216 Compress or decompress to standard output. -c will 120 Compress or decompress to standard output.
217 decompress multiple files to stdout, but will only
218 compress a single file to stdout.
219 121
220 --dd ----ddeeccoommpprreessss 122 --dd ----ddeeccoommpprreessss
221 Force decompression. _b_z_i_p_2_, _b_u_n_z_i_p_2 and _b_z_c_a_t are 123 Force decompression. _b_z_i_p_2_, _b_u_n_z_i_p_2 and _b_z_c_a_t are
@@ -235,7 +137,9 @@ OOPPTTIIOONNSS
235 137
236 --ff ----ffoorrccee 138 --ff ----ffoorrccee
237 Force overwrite of output files. Normally, _b_z_i_p_2 139 Force overwrite of output files. Normally, _b_z_i_p_2
238 will not overwrite existing output files. 140 will not overwrite existing output files. Also
141 forces _b_z_i_p_2 to break hard links to files, which it
142 otherwise wouldn't do.
239 143
240 --kk ----kkeeeepp 144 --kk ----kkeeeepp
241 Keep (don't delete) input files during compression 145 Keep (don't delete) input files during compression
@@ -254,19 +158,12 @@ OOPPTTIIOONNSS
254 figure, at the expense of your compression ratio. 158 figure, at the expense of your compression ratio.
255 In short, if your machine is low on memory (8 159 In short, if your machine is low on memory (8
256 megabytes or less), use -s for everything. See 160 megabytes or less), use -s for everything. See
257 MEMORY MANAGEMENT above. 161 MEMORY MANAGEMENT below.
258
259
260
261
262 4
263
264
265
266
267
268bzip2(1) bzip2(1)
269 162
163 --qq ----qquuiieett
164 Suppress non-essential warning messages. Messages
165 pertaining to I/O errors and other critical events
166 will not be suppressed.
270 167
271 --vv ----vveerrbboossee 168 --vv ----vveerrbboossee
272 Verbose mode -- show the compression ratio for each 169 Verbose mode -- show the compression ratio for each
@@ -281,22 +178,96 @@ bzip2(1) bzip2(1)
281 --11 ttoo --99 178 --11 ttoo --99
282 Set the block size to 100 k, 200 k .. 900 k when 179 Set the block size to 100 k, 200 k .. 900 k when
283 compressing. Has no effect when decompressing. 180 compressing. Has no effect when decompressing.
284 See MEMORY MANAGEMENT above. 181 See MEMORY MANAGEMENT below.
182
183 ---- Treats all subsequent arguments as file names, even
184 if they start with a dash. This is so you can han-
185 dle files with names beginning with a dash, for
186 example: bzip2 -- -myfilename.
187
188 ----rreeppeettiittiivvee--ffaasstt ----rreeppeettiittiivvee--bbeesstt
189 These flags are redundant in versions 0.9.5 and
190 above. They provided some coarse control over the
191 behaviour of the sorting algorithm in earlier ver-
192 sions, which was sometimes useful. 0.9.5 and above
193 have an improved algorithm which renders these
194 flags irrelevant.
195
196
197MMEEMMOORRYY MMAANNAAGGEEMMEENNTT
198 _b_z_i_p_2 compresses large files in blocks. The block size
199 affects both the compression ratio achieved, and the
200 amount of memory needed for compression and decompression.
201 The flags -1 through -9 specify the block size to be
202 100,000 bytes through 900,000 bytes (the default) respec-
203 tively. At decompression time, the block size used for
204 compression is read from the header of the compressed
205 file, and _b_u_n_z_i_p_2 then allocates itself just enough memory
206 to decompress the file. Since block sizes are stored in
207 compressed files, it follows that the flags -1 to -9 are
208 irrelevant to and so ignored during decompression.
285 209
286 ----rreeppeettiittiivvee--ffaasstt 210 Compression and decompression requirements, in bytes, can
287 _b_z_i_p_2 injects some small pseudo-random variations 211 be estimated as:
288 into very repetitive blocks to limit worst-case
289 performance during compression. If sorting runs
290 into difficulties, the block is randomised, and
291 sorting is restarted. Very roughly, _b_z_i_p_2 persists
292 for three times as long as a well-behaved input
293 would take before resorting to randomisation. This
294 flag makes it give up much sooner.
295 212
213 Compression: 400k + ( 8 x block size )
296 214
297 ----rreeppeettiittiivvee--bbeesstt 215 Decompression: 100k + ( 4 x block size ), or
298 Opposite of --repetitive-fast; try a lot harder 216 100k + ( 2.5 x block size )
299 before resorting to randomisation. 217
218 Larger block sizes give rapidly diminishing marginal
219 returns. Most of the compression comes from the first two
220 or three hundred k of block size, a fact worth bearing in
221 mind when using _b_z_i_p_2 on small machines. It is also
222 important to appreciate that the decompression memory
223 requirement is set at compression time by the choice of
224 block size.
225
226 For files compressed with the default 900k block size,
227 _b_u_n_z_i_p_2 will require about 3700 kbytes to decompress. To
228 support decompression of any file on a 4 megabyte machine,
229 _b_u_n_z_i_p_2 has an option to decompress using approximately
230 half this amount of memory, about 2300 kbytes. Decompres-
231 sion speed is also halved, so you should use this option
232 only where necessary. The relevant flag is -s.
233
234 In general, try and use the largest block size memory con-
235 straints allow, since that maximises the compression
236 achieved. Compression and decompression speed are virtu-
237 ally unaffected by block size.
238
239 Another significant point applies to files which fit in a
240 single block -- that means most files you'd encounter
241 using a large block size. The amount of real memory
242 touched is proportional to the size of the file, since the
243 file is smaller than a block. For example, compressing a
244 file 20,000 bytes long with the flag -9 will cause the
245 compressor to allocate around 7600k of memory, but only
246 touch 400k + 20000 * 8 = 560 kbytes of it. Similarly, the
247 decompressor will allocate 3700k but only touch 100k +
248 20000 * 4 = 180 kbytes.
249
250 Here is a table which summarises the maximum memory usage
251 for different block sizes. Also recorded is the total
252 compressed size for 14 files of the Calgary Text Compres-
253 sion Corpus totalling 3,141,622 bytes. This column gives
254 some feel for how compression varies with block size.
255 These figures tend to understate the advantage of larger
256 block sizes for larger files, since the Corpus is domi-
257 nated by smaller files.
258
259 Compress Decompress Decompress Corpus
260 Flag usage usage -s usage Size
261
262 -1 1200k 500k 350k 914704
263 -2 2000k 900k 600k 877703
264 -3 2800k 1300k 850k 860338
265 -4 3600k 1700k 1100k 846899
266 -5 4400k 2100k 1350k 845160
267 -6 5200k 2500k 1600k 838626
268 -7 6000k 2900k 1850k 834096
269 -8 6800k 3300k 2100k 828642
270 -9 7600k 3700k 2350k 828642
300 271
301 272
302RREECCOOVVEERRIINNGG DDAATTAA FFRROOMM DDAAMMAAGGEEDD FFIILLEESS 273RREECCOOVVEERRIINNGG DDAATTAA FFRROOMM DDAAMMAAGGEEDD FFIILLEESS
@@ -314,7 +285,7 @@ RREECCOOVVEERRIINNGG DDAATTAA FFRROOMM DDAAMMAAGGEEDD F
314 285
315 _b_z_i_p_2_r_e_c_o_v_e_r is a simple program whose purpose is to 286 _b_z_i_p_2_r_e_c_o_v_e_r is a simple program whose purpose is to
316 search for blocks in .bz2 files, and write each block out 287 search for blocks in .bz2 files, and write each block out
317 into its own .bz2 file. You can then use _b_z_i_p_2 _-_t to test 288 into its own .bz2 file. You can then use _b_z_i_p_2 -t to test
318 the integrity of the resulting files, and decompress those 289 the integrity of the resulting files, and decompress those
319 which are undamaged. 290 which are undamaged.
320 291
@@ -322,21 +293,9 @@ RREECCOOVVEERRIINNGG DDAATTAA FFRROOMM DDAAMMAAGGEEDD F
322 aged file, and writes a number of files "rec0001file.bz2", 293 aged file, and writes a number of files "rec0001file.bz2",
323 "rec0002file.bz2", etc, containing the extracted blocks. 294 "rec0002file.bz2", etc, containing the extracted blocks.
324 The output filenames are designed so that the use of 295 The output filenames are designed so that the use of
325
326
327
328 5
329
330
331
332
333
334bzip2(1) bzip2(1)
335
336
337 wildcards in subsequent processing -- for example, "bzip2 296 wildcards in subsequent processing -- for example, "bzip2
338 -dc rec*file.bz2 > recovered_data" -- lists the files in 297 -dc rec*file.bz2 > recovered_data" -- lists the files in
339 the "right" order. 298 the correct order.
340 299
341 _b_z_i_p_2_r_e_c_o_v_e_r should be of most use dealing with large .bz2 300 _b_z_i_p_2_r_e_c_o_v_e_r should be of most use dealing with large .bz2
342 files, as these will contain many blocks. It is clearly 301 files, as these will contain many blocks. It is clearly
@@ -351,17 +310,15 @@ PPEERRFFOORRMMAANNCCEE NNOOTTEESS
351 The sorting phase of compression gathers together similar 310 The sorting phase of compression gathers together similar
352 strings in the file. Because of this, files containing 311 strings in the file. Because of this, files containing
353 very long runs of repeated symbols, like "aabaabaabaab 312 very long runs of repeated symbols, like "aabaabaabaab
354 ..." (repeated several hundred times) may compress 313 ..." (repeated several hundred times) may compress more
355 extraordinarily slowly. You can use the -vvvvv option to 314 slowly than normal. Versions 0.9.5 and above fare much
356 monitor progress in great detail, if you want. Decompres- 315 better than previous versions in this respect. The ratio
357 sion speed is unaffected. 316 between worst-case and average-case compression time is in
358 317 the region of 10:1. For previous versions, this figure
359 Such pathological cases seem rare in practice, appearing 318 was more like 100:1. You can use the -vvvv option to mon-
360 mostly in artificially-constructed test files, and in low- 319 itor progress in great detail, if you want.
361 level disk images. It may be inadvisable to use _b_z_i_p_2 to 320
362 compress the latter. If you do get a file which causes 321 Decompression speed is unaffected by these phenomena.
363 severe slowness in compression, try making the block size
364 as small as possible, with flag -1.
365 322
366 _b_z_i_p_2 usually allocates several megabytes of memory to 323 _b_z_i_p_2 usually allocates several megabytes of memory to
367 operate in, and then charges all over it in a fairly ran- 324 operate in, and then charges all over it in a fairly ran-
@@ -376,88 +333,43 @@ PPEERRFFOORRMMAANNCCEE NNOOTTEESS
376 333
377CCAAVVEEAATTSS 334CCAAVVEEAATTSS
378 I/O error messages are not as helpful as they could be. 335 I/O error messages are not as helpful as they could be.
379 _B_z_i_p_2 tries hard to detect I/O errors and exit cleanly, 336 _b_z_i_p_2 tries hard to detect I/O errors and exit cleanly,
380 but the details of what the problem is sometimes seem 337 but the details of what the problem is sometimes seem
381 rather misleading. 338 rather misleading.
382 339
383 This manual page pertains to version 0.9.0 of _b_z_i_p_2_. Com- 340 This manual page pertains to version 0.9.5 of _b_z_i_p_2_. Com-
384 pressed data created by this version is entirely forwards 341 pressed data created by this version is entirely forwards
385 and backwards compatible with the previous public release, 342 and backwards compatible with the previous public
386 version 0.1pl2, but with the following exception: 0.9.0 343 releases, versions 0.1pl2 and 0.9.0, but with the follow-
387 can correctly decompress multiple concatenated compressed 344 ing exception: 0.9.0 and above can correctly decompress
388 files. 0.1pl2 cannot do this; it will stop after decom- 345 multiple concatenated compressed files. 0.1pl2 cannot do
389 pressing just the first file in the stream. 346 this; it will stop after decompressing just the first file
390 347 in the stream.
391 348
392 349 _b_z_i_p_2_r_e_c_o_v_e_r uses 32-bit integers to represent bit posi-
393 350 tions in compressed files, so it cannot handle compressed
394 6 351 files more than 512 megabytes long. This could easily be
395
396
397
398
399
400bzip2(1) bzip2(1)
401
402
403 Wildcard expansion for Windows 95 and NT is flaky.
404
405 _b_z_i_p_2_r_e_c_o_v_e_r uses 32-bit integers to represent bit posi-
406 tions in compressed files, so it cannot handle compressed
407 files more than 512 megabytes long. This could easily be
408 fixed. 352 fixed.
409 353
410 354
411AAUUTTHHOORR 355AAUUTTHHOORR
412 Julian Seward, jseward@acm.org. 356 Julian Seward, jseward@acm.org.
357
413 http://www.muraroa.demon.co.uk 358 http://www.muraroa.demon.co.uk
414 359
415 The ideas embodied in _b_z_i_p_2 are due to (at least) the fol- 360 The ideas embodied in _b_z_i_p_2 are due to (at least) the fol-
416 lowing people: Michael Burrows and David Wheeler (for the 361 lowing people: Michael Burrows and David Wheeler (for the
417 block sorting transformation), David Wheeler (again, for 362 block sorting transformation), David Wheeler (again, for
418 the Huffman coder), Peter Fenwick (for the structured cod- 363 the Huffman coder), Peter Fenwick (for the structured cod-
419 ing model in the original _b_z_i_p_, and many refinements), and 364 ing model in the original _b_z_i_p_, and many refinements), and
420 Alistair Moffat, Radford Neal and Ian Witten (for the 365 Alistair Moffat, Radford Neal and Ian Witten (for the
421 arithmetic coder in the original _b_z_i_p_)_. I am much 366 arithmetic coder in the original _b_z_i_p_)_. I am much
422 indebted for their help, support and advice. See the man- 367 indebted for their help, support and advice. See the man-
423 ual in the source distribution for pointers to sources of 368 ual in the source distribution for pointers to sources of
424 documentation. Christian von Roques encouraged me to look 369 documentation. Christian von Roques encouraged me to look
425 for faster sorting algorithms, so as to speed up compres- 370 for faster sorting algorithms, so as to speed up compres-
426 sion. Bela Lubkin encouraged me to improve the worst-case 371 sion. Bela Lubkin encouraged me to improve the worst-case
427 compression performance. Many people sent patches, helped 372 compression performance. Many people sent patches, helped
428 with portability problems, lent machines, gave advice and 373 with portability problems, lent machines, gave advice and
429 were generally helpful. 374 were generally helpful.
430 375
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461 7
462
463
diff --git a/bzip2.c b/bzip2.c
index 6a3ab95..abb9530 100644
--- a/bzip2.c
+++ b/bzip2.c
@@ -7,7 +7,7 @@
7 This file is a part of bzip2 and/or libbzip2, a program and 7 This file is a part of bzip2 and/or libbzip2, a program and
8 library for lossless, block-sorting data compression. 8 library for lossless, block-sorting data compression.
9 9
10 Copyright (C) 1996-1998 Julian R Seward. All rights reserved. 10 Copyright (C) 1996-1999 Julian R Seward. All rights reserved.
11 11
12 Redistribution and use in source and binary forms, with or without 12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions 13 modification, are permitted provided that the following conditions
@@ -40,9 +40,9 @@
40 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 40 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
41 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 41 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42 42
43 Julian Seward, Guildford, Surrey, UK. 43 Julian Seward, Cambridge, UK.
44 jseward@acm.org 44 jseward@acm.org
45 bzip2/libbzip2 version 0.9.0c of 18 October 1998 45 bzip2/libbzip2 version 0.9.5 of 24 May 1999
46 46
47 This program is based on (at least) the work of: 47 This program is based on (at least) the work of:
48 Mike Burrows 48 Mike Burrows
@@ -123,8 +123,10 @@
123--*/ 123--*/
124#define BZ_LCCWIN32 0 124#define BZ_LCCWIN32 0
125 125
126#ifdef _WIN32 126#if defined(_WIN32) && !defined(__CYGWIN32__)
127#undef BZ_LCCWIN32
127#define BZ_LCCWIN32 1 128#define BZ_LCCWIN32 1
129#undef BZ_UNIX
128#define BZ_UNIX 0 130#define BZ_UNIX 0
129#endif 131#endif
130 132
@@ -139,6 +141,8 @@
139#include <string.h> 141#include <string.h>
140#include <signal.h> 142#include <signal.h>
141#include <math.h> 143#include <math.h>
144#include <errno.h>
145#include <ctype.h>
142#include "bzlib.h" 146#include "bzlib.h"
143 147
144#define ERROR_IF_EOF(i) { if ((i) == EOF) ioError(); } 148#define ERROR_IF_EOF(i) { if ((i) == EOF) ioError(); }
@@ -166,6 +170,9 @@
166# define APPEND_FILESPEC(root, name) \ 170# define APPEND_FILESPEC(root, name) \
167 root=snocString((root), (name)) 171 root=snocString((root), (name))
168 172
173# define APPEND_FLAG(root, name) \
174 root=snocString((root), (name))
175
169# define SET_BINARY_MODE(fd) /**/ 176# define SET_BINARY_MODE(fd) /**/
170 177
171# ifdef __GNUC__ 178# ifdef __GNUC__
@@ -173,6 +180,19 @@
173# else 180# else
174# define NORETURN /**/ 181# define NORETURN /**/
175# endif 182# endif
183# ifdef __DJGPP__
184# include <io.h>
185# include <fcntl.h>
186# undef MY_LSTAT
187# define MY_LSTAT stat
188# undef SET_BINARY_MODE
189# define SET_BINARY_MODE(fd) \
190 do { \
191 int retVal = setmode ( fileno ( fd ), \
192 O_BINARY ); \
193 ERROR_IF_MINUS_ONE ( retVal ); \
194 } while ( 0 )
195# endif
176#endif 196#endif
177 197
178 198
@@ -188,6 +208,9 @@
188# define MY_STAT _stat 208# define MY_STAT _stat
189# define MY_S_IFREG(x) ((x) & _S_IFREG) 209# define MY_S_IFREG(x) ((x) & _S_IFREG)
190 210
211# define APPEND_FLAG(root, name) \
212 root=snocString((root), (name))
213
191# if 0 214# if 0
192 /*-- lcc-win32 seems to expand wildcards itself --*/ 215 /*-- lcc-win32 seems to expand wildcards itself --*/
193# define APPEND_FILESPEC(root, spec) \ 216# define APPEND_FILESPEC(root, spec) \
@@ -254,7 +277,7 @@ typedef int IntNative;
254 277
255Int32 verbosity; 278Int32 verbosity;
256Bool keepInputFiles, smallMode; 279Bool keepInputFiles, smallMode;
257Bool forceOverwrite, testFailsExist; 280Bool forceOverwrite, testFailsExist, noisy;
258Int32 numFileNames, numFilesProcessed, blockSize100k; 281Int32 numFileNames, numFilesProcessed, blockSize100k;
259 282
260 283
@@ -274,8 +297,9 @@ Int32 srcMode;
274#define FILE_NAME_LEN 1034 297#define FILE_NAME_LEN 1034
275 298
276Int32 longestFileName; 299Int32 longestFileName;
277Char inName[FILE_NAME_LEN]; 300Char inName [FILE_NAME_LEN];
278Char outName[FILE_NAME_LEN]; 301Char outName[FILE_NAME_LEN];
302Char tmpName[FILE_NAME_LEN];
279Char *progName; 303Char *progName;
280Char progNameReally[FILE_NAME_LEN]; 304Char progNameReally[FILE_NAME_LEN];
281FILE *outputHandleJustInCase; 305FILE *outputHandleJustInCase;
@@ -467,6 +491,7 @@ Bool uncompressStream ( FILE *zStream, FILE *stream )
467 if (streamNo == 1) { 491 if (streamNo == 1) {
468 return False; 492 return False;
469 } else { 493 } else {
494 if (noisy)
470 fprintf ( stderr, 495 fprintf ( stderr,
471 "\n%s: %s: trailing garbage after EOF ignored\n", 496 "\n%s: %s: trailing garbage after EOF ignored\n",
472 progName, inName ); 497 progName, inName );
@@ -532,32 +557,31 @@ Bool testStream ( FILE *zStream )
532 557
533 errhandler: 558 errhandler:
534 bzReadClose ( &bzerr_dummy, bzf ); 559 bzReadClose ( &bzerr_dummy, bzf );
560 if (verbosity == 0)
561 fprintf ( stderr, "%s: %s: ", progName, inName );
535 switch (bzerr) { 562 switch (bzerr) {
536 case BZ_IO_ERROR: 563 case BZ_IO_ERROR:
537 errhandler_io: 564 errhandler_io:
538 ioError(); break; 565 ioError(); break;
539 case BZ_DATA_ERROR: 566 case BZ_DATA_ERROR:
540 fprintf ( stderr, 567 fprintf ( stderr,
541 "\n%s: data integrity (CRC) error in data\n", 568 "data integrity (CRC) error in data\n" );
542 inName );
543 return False; 569 return False;
544 case BZ_MEM_ERROR: 570 case BZ_MEM_ERROR:
545 outOfMemory(); 571 outOfMemory();
546 case BZ_UNEXPECTED_EOF: 572 case BZ_UNEXPECTED_EOF:
547 fprintf ( stderr, 573 fprintf ( stderr,
548 "\n%s: file ends unexpectedly\n", 574 "file ends unexpectedly\n" );
549 inName );
550 return False; 575 return False;
551 case BZ_DATA_ERROR_MAGIC: 576 case BZ_DATA_ERROR_MAGIC:
552 if (streamNo == 1) { 577 if (streamNo == 1) {
553 fprintf ( stderr, 578 fprintf ( stderr,
554 "\n%s: bad magic number (ie, not created by bzip2)\n", 579 "bad magic number (file not created by bzip2)\n" );
555 inName );
556 return False; 580 return False;
557 } else { 581 } else {
582 if (noisy)
558 fprintf ( stderr, 583 fprintf ( stderr,
559 "\n%s: %s: trailing garbage after EOF ignored\n", 584 "trailing garbage after EOF ignored\n" );
560 progName, inName );
561 return True; 585 return True;
562 } 586 }
563 default: 587 default:
@@ -576,6 +600,7 @@ Bool testStream ( FILE *zStream )
576/*---------------------------------------------*/ 600/*---------------------------------------------*/
577void cadvise ( void ) 601void cadvise ( void )
578{ 602{
603 if (noisy)
579 fprintf ( 604 fprintf (
580 stderr, 605 stderr,
581 "\nIt is possible that the compressed file(s) have become corrupted.\n" 606 "\nIt is possible that the compressed file(s) have become corrupted.\n"
@@ -589,6 +614,7 @@ void cadvise ( void )
589/*---------------------------------------------*/ 614/*---------------------------------------------*/
590void showFileNames ( void ) 615void showFileNames ( void )
591{ 616{
617 if (noisy)
592 fprintf ( 618 fprintf (
593 stderr, 619 stderr,
594 "\tInput file = %s, output file = %s\n", 620 "\tInput file = %s, output file = %s\n",
@@ -603,6 +629,7 @@ void cleanUpAndFail ( Int32 ec )
603 IntNative retVal; 629 IntNative retVal;
604 630
605 if ( srcMode == SM_F2F && opMode != OM_TEST ) { 631 if ( srcMode == SM_F2F && opMode != OM_TEST ) {
632 if (noisy)
606 fprintf ( stderr, "%s: Deleting output file %s, if it exists.\n", 633 fprintf ( stderr, "%s: Deleting output file %s, if it exists.\n",
607 progName, outName ); 634 progName, outName );
608 if (outputHandleJustInCase != NULL) 635 if (outputHandleJustInCase != NULL)
@@ -613,7 +640,7 @@ void cleanUpAndFail ( Int32 ec )
613 "%s: WARNING: deletion of output file (apparently) failed.\n", 640 "%s: WARNING: deletion of output file (apparently) failed.\n",
614 progName ); 641 progName );
615 } 642 }
616 if (numFileNames > 0 && numFilesProcessed < numFileNames) { 643 if (noisy && numFileNames > 0 && numFilesProcessed < numFileNames) {
617 fprintf ( stderr, 644 fprintf ( stderr,
618 "%s: WARNING: some files have not been processed:\n" 645 "%s: WARNING: some files have not been processed:\n"
619 "\t%d specified on command line, %d not processed yet.\n\n", 646 "\t%d specified on command line, %d not processed yet.\n\n",
@@ -639,7 +666,7 @@ void panic ( Char* s )
639 666
640 667
641/*---------------------------------------------*/ 668/*---------------------------------------------*/
642void crcError () 669void crcError ( void )
643{ 670{
644 fprintf ( stderr, 671 fprintf ( stderr,
645 "\n%s: Data integrity error when decompressing.\n", 672 "\n%s: Data integrity error when decompressing.\n",
@@ -665,7 +692,7 @@ void compressedStreamEOF ( void )
665 692
666 693
667/*---------------------------------------------*/ 694/*---------------------------------------------*/
668void ioError ( ) 695void ioError ( void )
669{ 696{
670 fprintf ( stderr, 697 fprintf ( stderr,
671 "\n%s: I/O or other error, bailing out. Possible reason follows.\n", 698 "\n%s: I/O or other error, bailing out. Possible reason follows.\n",
@@ -680,7 +707,7 @@ void ioError ( )
680void mySignalCatcher ( IntNative n ) 707void mySignalCatcher ( IntNative n )
681{ 708{
682 fprintf ( stderr, 709 fprintf ( stderr,
683 "\n%s: Control-C (or similar) caught, quitting.\n", 710 "\n%s: Control-C or similar caught, quitting.\n",
684 progName ); 711 progName );
685 cleanUpAndFail(1); 712 cleanUpAndFail(1);
686} 713}
@@ -740,9 +767,10 @@ void copyFileName ( Char* to, Char* from )
740 if ( strlen(from) > FILE_NAME_LEN-10 ) { 767 if ( strlen(from) > FILE_NAME_LEN-10 ) {
741 fprintf ( 768 fprintf (
742 stderr, 769 stderr,
743 "bzip2: file name\n`%s'\nis suspiciously (> 1024 chars) long.\n" 770 "bzip2: file name\n`%s'\n"
744 "Try using a reasonable file name instead. Sorry! :)\n", 771 "is suspiciously (more than %d chars) long.\n"
745 from 772 "Try using a reasonable file name instead. Sorry! :-)\n",
773 from, FILE_NAME_LEN-10
746 ); 774 );
747 exit(1); 775 exit(1);
748 } 776 }
@@ -779,6 +807,21 @@ Bool notAStandardFile ( Char* name )
779 807
780 808
781/*---------------------------------------------*/ 809/*---------------------------------------------*/
810/*--
811 rac 11/21/98 see if file has hard links to it
812--*/
813Int32 countHardLinks ( Char* name )
814{
815 IntNative i;
816 struct MY_STAT statBuf;
817
818 i = MY_LSTAT ( name, &statBuf );
819 if (i != 0) return 0;
820 return (statBuf.st_nlink - 1);
821}
822
823
824/*---------------------------------------------*/
782void copyDatePermissionsAndOwner ( Char *srcName, Char *dstName ) 825void copyDatePermissionsAndOwner ( Char *srcName, Char *dstName )
783{ 826{
784#if BZ_UNIX 827#if BZ_UNIX
@@ -793,17 +836,14 @@ void copyDatePermissionsAndOwner ( Char *srcName, Char *dstName )
793 836
794 retVal = chmod ( dstName, statBuf.st_mode ); 837 retVal = chmod ( dstName, statBuf.st_mode );
795 ERROR_IF_NOT_ZERO ( retVal ); 838 ERROR_IF_NOT_ZERO ( retVal );
796 /* Not sure if this is really portable or not. Causes 839
797 problems on my x86-Linux Redhat 5.0 box. Decided
798 to omit it from 0.9.0. JRS, 27 June 98. If you
799 understand Unix file semantics and portability issues
800 well enough to fix this properly, drop me a line
801 at jseward@acm.org.
802 retVal = chown ( dstName, statBuf.st_uid, statBuf.st_gid );
803 ERROR_IF_NOT_ZERO ( retVal );
804 */
805 retVal = utime ( dstName, &uTimBuf ); 840 retVal = utime ( dstName, &uTimBuf );
806 ERROR_IF_NOT_ZERO ( retVal ); 841 ERROR_IF_NOT_ZERO ( retVal );
842
843 retVal = chown ( dstName, statBuf.st_uid, statBuf.st_gid );
844 /* chown() will in many cases return with EPERM, which can
845 be safely ignored.
846 */
807#endif 847#endif
808} 848}
809 849
@@ -819,20 +859,6 @@ void setInterimPermissions ( Char *dstName )
819} 859}
820 860
821 861
822
823/*---------------------------------------------*/
824Bool endsInBz2 ( Char* name )
825{
826 Int32 n = strlen ( name );
827 if (n <= 4) return False;
828 return
829 (name[n-4] == '.' &&
830 name[n-3] == 'b' &&
831 name[n-2] == 'z' &&
832 name[n-1] == '2');
833}
834
835
836/*---------------------------------------------*/ 862/*---------------------------------------------*/
837Bool containsDubiousChars ( Char* name ) 863Bool containsDubiousChars ( Char* name )
838{ 864{
@@ -844,49 +870,94 @@ Bool containsDubiousChars ( Char* name )
844 870
845 871
846/*---------------------------------------------*/ 872/*---------------------------------------------*/
847void compress ( Char *name ) 873#define BZ_N_SUFFIX_PAIRS 4
874
875Char* zSuffix[BZ_N_SUFFIX_PAIRS]
876 = { ".bz2", ".bz", ".tbz2", ".tbz" };
877Char* unzSuffix[BZ_N_SUFFIX_PAIRS]
878 = { "", "", ".tar", ".tar" };
879
880Bool hasSuffix ( Char* s, Char* suffix )
848{ 881{
849 FILE *inStr; 882 Int32 ns = strlen(s);
850 FILE *outStr; 883 Int32 nx = strlen(suffix);
884 if (ns < nx) return False;
885 if (strcmp(s + ns - nx, suffix) == 0) return True;
886 return False;
887}
851 888
889Bool mapSuffix ( Char* name,
890 Char* oldSuffix, Char* newSuffix )
891{
892 if (!hasSuffix(name,oldSuffix)) return False;
893 name[strlen(name)-strlen(oldSuffix)] = 0;
894 strcat ( name, newSuffix );
895 return True;
896}
897
898
899/*---------------------------------------------*/
900void compress ( Char *name )
901{
902 FILE *inStr;
903 FILE *outStr;
904 Int32 n, i;
852 if (name == NULL && srcMode != SM_I2O) 905 if (name == NULL && srcMode != SM_I2O)
853 panic ( "compress: bad modes\n" ); 906 panic ( "compress: bad modes\n" );
854 907
855 switch (srcMode) { 908 switch (srcMode) {
856 case SM_I2O: copyFileName ( inName, "(stdin)" ); 909 case SM_I2O:
857 copyFileName ( outName, "(stdout)" ); break; 910 copyFileName ( inName, "(stdin)" );
858 case SM_F2F: copyFileName ( inName, name ); 911 copyFileName ( outName, "(stdout)" );
859 copyFileName ( outName, name ); 912 break;
860 strcat ( outName, ".bz2" ); break; 913 case SM_F2F:
861 case SM_F2O: copyFileName ( inName, name ); 914 copyFileName ( inName, name );
862 copyFileName ( outName, "(stdout)" ); break; 915 copyFileName ( outName, name );
916 strcat ( outName, ".bz2" );
917 break;
918 case SM_F2O:
919 copyFileName ( inName, name );
920 copyFileName ( outName, "(stdout)" );
921 break;
863 } 922 }
864 923
865 if ( srcMode != SM_I2O && containsDubiousChars ( inName ) ) { 924 if ( srcMode != SM_I2O && containsDubiousChars ( inName ) ) {
925 if (noisy)
866 fprintf ( stderr, "%s: There are no files matching `%s'.\n", 926 fprintf ( stderr, "%s: There are no files matching `%s'.\n",
867 progName, inName ); 927 progName, inName );
868 return; 928 return;
869 } 929 }
870 if ( srcMode != SM_I2O && !fileExists ( inName ) ) { 930 if ( srcMode != SM_I2O && !fileExists ( inName ) ) {
871 fprintf ( stderr, "%s: Input file %s doesn't exist, skipping.\n", 931 fprintf ( stderr, "%s: Can't open input file %s: %s.\n",
872 progName, inName ); 932 progName, inName, strerror(errno) );
873 return; 933 return;
874 } 934 }
875 if ( srcMode != SM_I2O && endsInBz2 ( inName )) { 935 for (i = 0; i < BZ_N_SUFFIX_PAIRS; i++) {
876 fprintf ( stderr, "%s: Input file name %s ends in `.bz2', skipping.\n", 936 if (hasSuffix(inName, zSuffix[i])) {
877 progName, inName ); 937 if (noisy)
878 return; 938 fprintf ( stderr,
939 "%s: Input file %s already has %s suffix.\n",
940 progName, inName, zSuffix[i] );
941 return;
942 }
879 } 943 }
880 if ( srcMode != SM_I2O && notAStandardFile ( inName )) { 944 if ( srcMode == SM_F2F && !forceOverwrite && notAStandardFile ( inName )) {
881 fprintf ( stderr, "%s: Input file %s is not a normal file, skipping.\n", 945 if (noisy)
946 fprintf ( stderr, "%s: Input file %s is not a normal file.\n",
882 progName, inName ); 947 progName, inName );
883 return; 948 return;
884 } 949 }
885 if ( srcMode == SM_F2F && !forceOverwrite && fileExists ( outName ) ) { 950 if ( srcMode == SM_F2F && !forceOverwrite && fileExists ( outName ) ) {
886 fprintf ( stderr, "%s: Output file %s already exists, skipping.\n", 951 fprintf ( stderr, "%s: Output file %s already exists.\n",
887 progName, outName ); 952 progName, outName );
888 return; 953 return;
889 } 954 }
955 if ( srcMode == SM_F2F && !forceOverwrite &&
956 (n=countHardLinks ( inName )) > 0) {
957 fprintf ( stderr, "%s: Input file %s has %d other link%s.\n",
958 progName, inName, n, n > 1 ? "s" : "" );
959 return;
960 }
890 961
891 switch ( srcMode ) { 962 switch ( srcMode ) {
892 963
@@ -912,11 +983,12 @@ void compress ( Char *name )
912 progName ); 983 progName );
913 fprintf ( stderr, "%s: For help, type: `%s --help'.\n", 984 fprintf ( stderr, "%s: For help, type: `%s --help'.\n",
914 progName, progName ); 985 progName, progName );
986 if ( inStr != NULL ) fclose ( inStr );
915 return; 987 return;
916 }; 988 };
917 if ( inStr == NULL ) { 989 if ( inStr == NULL ) {
918 fprintf ( stderr, "%s: Can't open input file %s, skipping.\n", 990 fprintf ( stderr, "%s: Can't open input file %s: %s.\n",
919 progName, inName ); 991 progName, inName, strerror(errno) );
920 return; 992 return;
921 }; 993 };
922 break; 994 break;
@@ -925,13 +997,15 @@ void compress ( Char *name )
925 inStr = fopen ( inName, "rb" ); 997 inStr = fopen ( inName, "rb" );
926 outStr = fopen ( outName, "wb" ); 998 outStr = fopen ( outName, "wb" );
927 if ( outStr == NULL) { 999 if ( outStr == NULL) {
928 fprintf ( stderr, "%s: Can't create output file %s, skipping.\n", 1000 fprintf ( stderr, "%s: Can't create output file %s: %s.\n",
929 progName, outName ); 1001 progName, outName, strerror(errno) );
1002 if ( inStr != NULL ) fclose ( inStr );
930 return; 1003 return;
931 } 1004 }
932 if ( inStr == NULL ) { 1005 if ( inStr == NULL ) {
933 fprintf ( stderr, "%s: Can't open input file %s, skipping.\n", 1006 fprintf ( stderr, "%s: Can't open input file %s: %s.\n",
934 progName, inName ); 1007 progName, inName, strerror(errno) );
1008 if ( outStr != NULL ) fclose ( outStr );
935 return; 1009 return;
936 }; 1010 };
937 setInterimPermissions ( outName ); 1011 setInterimPermissions ( outName );
@@ -967,51 +1041,72 @@ void compress ( Char *name )
967/*---------------------------------------------*/ 1041/*---------------------------------------------*/
968void uncompress ( Char *name ) 1042void uncompress ( Char *name )
969{ 1043{
970 FILE *inStr; 1044 FILE *inStr;
971 FILE *outStr; 1045 FILE *outStr;
972 Bool magicNumberOK; 1046 Int32 n, i;
1047 Bool magicNumberOK;
1048 Bool cantGuess;
973 1049
974 if (name == NULL && srcMode != SM_I2O) 1050 if (name == NULL && srcMode != SM_I2O)
975 panic ( "uncompress: bad modes\n" ); 1051 panic ( "uncompress: bad modes\n" );
976 1052
1053 cantGuess = False;
977 switch (srcMode) { 1054 switch (srcMode) {
978 case SM_I2O: copyFileName ( inName, "(stdin)" ); 1055 case SM_I2O:
979 copyFileName ( outName, "(stdout)" ); break; 1056 copyFileName ( inName, "(stdin)" );
980 case SM_F2F: copyFileName ( inName, name ); 1057 copyFileName ( outName, "(stdout)" );
981 copyFileName ( outName, name ); 1058 break;
982 if (endsInBz2 ( outName )) 1059 case SM_F2F:
983 outName [ strlen ( outName ) - 4 ] = '\0'; 1060 copyFileName ( inName, name );
984 break; 1061 copyFileName ( outName, name );
985 case SM_F2O: copyFileName ( inName, name ); 1062 for (i = 0; i < BZ_N_SUFFIX_PAIRS; i++)
986 copyFileName ( outName, "(stdout)" ); break; 1063 if (mapSuffix(outName,zSuffix[i],unzSuffix[i]))
1064 goto zzz;
1065 cantGuess = True;
1066 strcat ( outName, ".out" );
1067 break;
1068 case SM_F2O:
1069 copyFileName ( inName, name );
1070 copyFileName ( outName, "(stdout)" );
1071 break;
987 } 1072 }
988 1073
1074 zzz:
989 if ( srcMode != SM_I2O && containsDubiousChars ( inName ) ) { 1075 if ( srcMode != SM_I2O && containsDubiousChars ( inName ) ) {
1076 if (noisy)
990 fprintf ( stderr, "%s: There are no files matching `%s'.\n", 1077 fprintf ( stderr, "%s: There are no files matching `%s'.\n",
991 progName, inName ); 1078 progName, inName );
992 return; 1079 return;
993 } 1080 }
994 if ( srcMode != SM_I2O && !fileExists ( inName ) ) { 1081 if ( srcMode != SM_I2O && !fileExists ( inName ) ) {
995 fprintf ( stderr, "%s: Input file %s doesn't exist, skipping.\n", 1082 fprintf ( stderr, "%s: Can't open input file %s: %s.\n",
996 progName, inName ); 1083 progName, inName, strerror(errno) );
997 return; 1084 return;
998 } 1085 }
999 if ( srcMode != SM_I2O && !endsInBz2 ( inName )) { 1086 if ( srcMode == SM_F2F && !forceOverwrite && notAStandardFile ( inName )) {
1000 fprintf ( stderr, 1087 if (noisy)
1001 "%s: Input file name %s doesn't end in `.bz2', skipping.\n", 1088 fprintf ( stderr, "%s: Input file %s is not a normal file.\n",
1002 progName, inName );
1003 return;
1004 }
1005 if ( srcMode != SM_I2O && notAStandardFile ( inName )) {
1006 fprintf ( stderr, "%s: Input file %s is not a normal file, skipping.\n",
1007 progName, inName ); 1089 progName, inName );
1008 return; 1090 return;
1009 } 1091 }
1092 if ( /* srcMode == SM_F2F implied && */ cantGuess ) {
1093 if (noisy)
1094 fprintf ( stderr,
1095 "%s: Can't guess original name for %s -- using %s\n",
1096 progName, inName, outName );
1097 /* just a warning, no return */
1098 }
1010 if ( srcMode == SM_F2F && !forceOverwrite && fileExists ( outName ) ) { 1099 if ( srcMode == SM_F2F && !forceOverwrite && fileExists ( outName ) ) {
1011 fprintf ( stderr, "%s: Output file %s already exists, skipping.\n", 1100 fprintf ( stderr, "%s: Output file %s already exists.\n",
1012 progName, outName ); 1101 progName, outName );
1013 return; 1102 return;
1014 } 1103 }
1104 if ( srcMode == SM_F2F && !forceOverwrite &&
1105 (n=countHardLinks ( inName ) ) > 0) {
1106 fprintf ( stderr, "%s: Input file %s has %d other link%s.\n",
1107 progName, inName, n, n > 1 ? "s" : "" );
1108 return;
1109 }
1015 1110
1016 switch ( srcMode ) { 1111 switch ( srcMode ) {
1017 1112
@@ -1032,8 +1127,9 @@ void uncompress ( Char *name )
1032 inStr = fopen ( inName, "rb" ); 1127 inStr = fopen ( inName, "rb" );
1033 outStr = stdout; 1128 outStr = stdout;
1034 if ( inStr == NULL ) { 1129 if ( inStr == NULL ) {
1035 fprintf ( stderr, "%s: Can't open input file %s, skipping.\n", 1130 fprintf ( stderr, "%s: Can't open input file %s:%s.\n",
1036 progName, inName ); 1131 progName, inName, strerror(errno) );
1132 if ( inStr != NULL ) fclose ( inStr );
1037 return; 1133 return;
1038 }; 1134 };
1039 break; 1135 break;
@@ -1042,13 +1138,15 @@ void uncompress ( Char *name )
1042 inStr = fopen ( inName, "rb" ); 1138 inStr = fopen ( inName, "rb" );
1043 outStr = fopen ( outName, "wb" ); 1139 outStr = fopen ( outName, "wb" );
1044 if ( outStr == NULL) { 1140 if ( outStr == NULL) {
1045 fprintf ( stderr, "%s: Can't create output file %s, skipping.\n", 1141 fprintf ( stderr, "%s: Can't create output file %s: %s.\n",
1046 progName, outName ); 1142 progName, outName, strerror(errno) );
1143 if ( inStr != NULL ) fclose ( inStr );
1047 return; 1144 return;
1048 } 1145 }
1049 if ( inStr == NULL ) { 1146 if ( inStr == NULL ) {
1050 fprintf ( stderr, "%s: Can't open input file %s, skipping.\n", 1147 fprintf ( stderr, "%s: Can't open input file %s: %s.\n",
1051 progName, inName ); 1148 progName, inName, strerror(errno) );
1149 if ( outStr != NULL ) fclose ( outStr );
1052 return; 1150 return;
1053 }; 1151 };
1054 setInterimPermissions ( outName ); 1152 setInterimPermissions ( outName );
@@ -1091,9 +1189,9 @@ void uncompress ( Char *name )
1091 fprintf ( stderr, "done\n" ); 1189 fprintf ( stderr, "done\n" );
1092 } else { 1190 } else {
1093 if (verbosity >= 1) 1191 if (verbosity >= 1)
1094 fprintf ( stderr, "not a bzip2 file, skipping.\n" ); else 1192 fprintf ( stderr, "not a bzip2 file.\n" ); else
1095 fprintf ( stderr, 1193 fprintf ( stderr,
1096 "%s: %s is not a bzip2 file, skipping.\n", 1194 "%s: %s is not a bzip2 file.\n",
1097 progName, inName ); 1195 progName, inName );
1098 } 1196 }
1099 1197
@@ -1117,24 +1215,14 @@ void testf ( Char *name )
1117 } 1215 }
1118 1216
1119 if ( srcMode != SM_I2O && containsDubiousChars ( inName ) ) { 1217 if ( srcMode != SM_I2O && containsDubiousChars ( inName ) ) {
1218 if (noisy)
1120 fprintf ( stderr, "%s: There are no files matching `%s'.\n", 1219 fprintf ( stderr, "%s: There are no files matching `%s'.\n",
1121 progName, inName ); 1220 progName, inName );
1122 return; 1221 return;
1123 } 1222 }
1124 if ( srcMode != SM_I2O && !fileExists ( inName ) ) { 1223 if ( srcMode != SM_I2O && !fileExists ( inName ) ) {
1125 fprintf ( stderr, "%s: Input file %s doesn't exist, skipping.\n", 1224 fprintf ( stderr, "%s: Can't open input %s: %s.\n",
1126 progName, inName ); 1225 progName, inName, strerror(errno) );
1127 return;
1128 }
1129 if ( srcMode != SM_I2O && !endsInBz2 ( inName )) {
1130 fprintf ( stderr,
1131 "%s: Input file name %s doesn't end in `.bz2', skipping.\n",
1132 progName, inName );
1133 return;
1134 }
1135 if ( srcMode != SM_I2O && notAStandardFile ( inName )) {
1136 fprintf ( stderr, "%s: Input file %s is not a normal file, skipping.\n",
1137 progName, inName );
1138 return; 1226 return;
1139 } 1227 }
1140 1228
@@ -1155,8 +1243,8 @@ void testf ( Char *name )
1155 case SM_F2O: case SM_F2F: 1243 case SM_F2O: case SM_F2F:
1156 inStr = fopen ( inName, "rb" ); 1244 inStr = fopen ( inName, "rb" );
1157 if ( inStr == NULL ) { 1245 if ( inStr == NULL ) {
1158 fprintf ( stderr, "%s: Can't open input file %s, skipping.\n", 1246 fprintf ( stderr, "%s: Can't open input file %s:%s.\n",
1159 progName, inName ); 1247 progName, inName, strerror(errno) );
1160 return; 1248 return;
1161 }; 1249 };
1162 break; 1250 break;
@@ -1186,13 +1274,13 @@ void license ( void )
1186 fprintf ( stderr, 1274 fprintf ( stderr,
1187 1275
1188 "bzip2, a block-sorting file compressor. " 1276 "bzip2, a block-sorting file compressor. "
1189 "Version 0.9.0c, 18-Oct-98.\n" 1277 "Version 0.9.5d, 4-Sept-99.\n"
1190 " \n" 1278 " \n"
1191 " Copyright (C) 1996, 1997, 1998 by Julian Seward.\n" 1279 " Copyright (C) 1996, 1997, 1998, 1999 by Julian Seward.\n"
1192 " \n" 1280 " \n"
1193 " This program is free software; you can redistribute it and/or modify\n" 1281 " This program is free software; you can redistribute it and/or modify\n"
1194 " it under the terms set out in the LICENSE file, which is included\n" 1282 " it under the terms set out in the LICENSE file, which is included\n"
1195 " in the bzip2-0.9.0c source distribution.\n" 1283 " in the bzip2-0.9.5 source distribution.\n"
1196 " \n" 1284 " \n"
1197 " This program is distributed in the hope that it will be useful,\n" 1285 " This program is distributed in the hope that it will be useful,\n"
1198 " but WITHOUT ANY WARRANTY; without even the implied warranty of\n" 1286 " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
@@ -1209,27 +1297,26 @@ void usage ( Char *fullProgName )
1209 fprintf ( 1297 fprintf (
1210 stderr, 1298 stderr,
1211 "bzip2, a block-sorting file compressor. " 1299 "bzip2, a block-sorting file compressor. "
1212 "Version 0.9.0c, 18-Oct-98.\n" 1300 "Version 0.9.5d, 4-Sept-99.\n"
1213 "\n usage: %s [flags and input files in any order]\n" 1301 "\n usage: %s [flags and input files in any order]\n"
1214 "\n" 1302 "\n"
1215 " -h --help print this message\n" 1303 " -h --help print this message\n"
1216 " -d --decompress force decompression\n" 1304 " -d --decompress force decompression\n"
1217 " -z --compress force compression\n" 1305 " -z --compress force compression\n"
1218 " -k --keep keep (don't delete) input files\n" 1306 " -k --keep keep (don't delete) input files\n"
1219 " -f --force overwrite existing output filess\n" 1307 " -f --force overwrite existing output files\n"
1220 " -t --test test compressed file integrity\n" 1308 " -t --test test compressed file integrity\n"
1221 " -c --stdout output to standard out\n" 1309 " -c --stdout output to standard out\n"
1310 " -q --quiet suppress noncritical error messages\n"
1222 " -v --verbose be verbose (a 2nd -v gives more)\n" 1311 " -v --verbose be verbose (a 2nd -v gives more)\n"
1223 " -L --license display software version & license\n" 1312 " -L --license display software version & license\n"
1224 " -V --version display software version & license\n" 1313 " -V --version display software version & license\n"
1225 " -s --small use less memory (at most 2500k)\n" 1314 " -s --small use less memory (at most 2500k)\n"
1226 " -1 .. -9 set block size to 100k .. 900k\n" 1315 " -1 .. -9 set block size to 100k .. 900k\n"
1227 " --repetitive-fast compress repetitive blocks faster\n"
1228 " --repetitive-best compress repetitive blocks better\n"
1229 "\n" 1316 "\n"
1230 " If invoked as `bzip2', default action is to compress.\n" 1317 " If invoked as `bzip2', default action is to compress.\n"
1231 " as `bunzip2', default action is to decompress.\n" 1318 " as `bunzip2', default action is to decompress.\n"
1232 " as `bz2cat', default action is to decompress to stdout.\n" 1319 " as `bzcat', default action is to decompress to stdout.\n"
1233 "\n" 1320 "\n"
1234 " If no file names are given, bzip2 compresses or decompresses\n" 1321 " If no file names are given, bzip2 compresses or decompresses\n"
1235 " from standard input to standard output. You can combine\n" 1322 " from standard input to standard output. You can combine\n"
@@ -1245,18 +1332,28 @@ void usage ( Char *fullProgName )
1245 1332
1246 1333
1247/*---------------------------------------------*/ 1334/*---------------------------------------------*/
1335void redundant ( Char* flag )
1336{
1337 fprintf (
1338 stderr,
1339 "%s: %s is redundant in versions 0.9.5 and above\n",
1340 progName, flag );
1341}
1342
1343
1344/*---------------------------------------------*/
1248/*-- 1345/*--
1249 All the garbage from here to main() is purely to 1346 All the garbage from here to main() is purely to
1250 implement a linked list of command-line arguments, 1347 implement a linked list of command-line arguments,
1251 into which main() copies argv[1 .. argc-1]. 1348 into which main() copies argv[1 .. argc-1].
1252 1349
1253 The purpose of this ridiculous exercise is to 1350 The purpose of this exercise is to facilitate
1254 facilitate the expansion of wildcard characters 1351 the expansion of wildcard characters * and ? in
1255 * and ? in filenames for halfwitted OSs like 1352 filenames for OSs which don't know how to do it
1256 MSDOS, Windows 95 and NT. 1353 themselves, like MSDOS, Windows 95 and NT.
1257 1354
1258 The actual Dirty Work is done by the platform-specific 1355 The actual Dirty Work is done by the platform-
1259 macro APPEND_FILESPEC. 1356 specific macro APPEND_FILESPEC.
1260--*/ 1357--*/
1261 1358
1262typedef 1359typedef
@@ -1308,15 +1405,42 @@ Cell *snocString ( Cell *root, Char *name )
1308 1405
1309 1406
1310/*---------------------------------------------*/ 1407/*---------------------------------------------*/
1311#define ISFLAG(s) (strcmp(aa->name, (s))==0) 1408void addFlagsFromEnvVar ( Cell** argList, Char* varName )
1409{
1410 Int32 i, j, k;
1411 Char *envbase, *p;
1412
1413 envbase = getenv(varName);
1414 if (envbase != NULL) {
1415 p = envbase;
1416 i = 0;
1417 while (True) {
1418 if (p[i] == 0) break;
1419 p += i;
1420 i = 0;
1421 while (isspace((Int32)(p[0]))) p++;
1422 while (p[i] != 0 && !isspace((Int32)(p[i]))) i++;
1423 if (i > 0) {
1424 k = i; if (k > FILE_NAME_LEN-10) k = FILE_NAME_LEN-10;
1425 for (j = 0; j < k; j++) tmpName[j] = p[j];
1426 tmpName[k] = 0;
1427 APPEND_FLAG(*argList, tmpName);
1428 }
1429 }
1430 }
1431}
1312 1432
1313 1433
1434/*---------------------------------------------*/
1435#define ISFLAG(s) (strcmp(aa->name, (s))==0)
1436
1314IntNative main ( IntNative argc, Char *argv[] ) 1437IntNative main ( IntNative argc, Char *argv[] )
1315{ 1438{
1316 Int32 i, j; 1439 Int32 i, j;
1317 Char *tmp; 1440 Char *tmp;
1318 Cell *argList; 1441 Cell *argList;
1319 Cell *aa; 1442 Cell *aa;
1443 Bool decode;
1320 1444
1321 /*-- Be really really really paranoid :-) --*/ 1445 /*-- Be really really really paranoid :-) --*/
1322 if (sizeof(Int32) != 4 || sizeof(UInt32) != 4 || 1446 if (sizeof(Int32) != 4 || sizeof(UInt32) != 4 ||
@@ -1332,27 +1456,27 @@ IntNative main ( IntNative argc, Char *argv[] )
1332 } 1456 }
1333 1457
1334 1458
1335 /*-- Set up signal handlers --*/
1336 signal (SIGINT, mySignalCatcher);
1337 signal (SIGTERM, mySignalCatcher);
1338 signal (SIGSEGV, mySIGSEGVorSIGBUScatcher);
1339#if BZ_UNIX
1340 signal (SIGHUP, mySignalCatcher);
1341 signal (SIGBUS, mySIGSEGVorSIGBUScatcher);