59 byte = byte & 0x000000FF;
89 int i, j, last = UCHAR_MAX + 1;
109 for (i=UCHAR_MAX, j=
alphasize-1; i>=0; i--) {
133 int i, j, k, last = UCHAR_MAX + 1, count;
162 for (; j>count; i--, j--) {
173 for (; j>0; i--, j--) {
194 static void zip(
char *filename,
char *compressed,
BOOL algorithm,
int parts,
BOOL see) {
195 Uchar *origText, *prevText = NULL;
196 Uint origTextLen, partTextLen, currentTextLen;
197 FILE *compressed_file;
209 if(origText == NULL) {
210 fprintf(stderr,
"Cannot open file %s\n", filename);
216 strcpy(compressed, filename);
217 strcat(compressed,
".ctx");
221 compressed_file = fopen(compressed,
"wb");
222 if (!compressed_file) {
223 printf(
"Could not open output file");
226 if (alloc)
FREE(compressed);
230 printf(
"Algorithm %d\n", algorithm);
235 putc(
MAGIC >> 8, compressed_file);
236 putc(
MAGIC, compressed_file);
238 putc(parts, compressed_file);
246 for (part = 1; part <= parts; part++) {
247 printf(
"---------- part %d ---------------\n", part);
249 partTextLen = floor(origTextLen / parts);
252 partTextLen = origTextLen - (floor(origTextLen / parts) * (parts - 1));
267 printf(
"Tree built\n");
273 printf(
"Tree built\n");
282 DEBUGCODE(printf(
"gamma hits: %d gamma Misses: %d\n", getHits(), getMisses()));
283 printf(
"height: %ld\n",
getHeight(stree));
286 for (i=3; i>=0; i--) {
289 printf (
"Textlen: %ld\n",
textlen);
294 printf(
"Encoding...\n");
296 encode(stree, compressed_file, origText + currentTextLen, partTextLen, see);
298 currentTextLen += partTextLen;
313 fclose(compressed_file);
323 static void unzip(
char *filename,
char *output,
BOOL see) {
324 FILE *output_file, *compressed_file;
325 int i, header, parts, part;
331 compressed_file = fopen(filename,
"rb");
332 if (!compressed_file) {
333 perror(
"Could not open input file");
338 output = strrchr(filename,
'.');
343 strcat(filename,
".out");
348 output_file = fopen(output,
"wb");
350 perror(
"Could not open output file");
355 header = getc(compressed_file) << 8;
356 header += getc(compressed_file);
357 if (header !=
MAGIC) {
358 fprintf(stderr,
"Invalid compressed file\n");
363 parts = getc(compressed_file);
372 for (part = 1; part <= parts; part++) {
373 printf(
"---------- part %d ---------------\n", part);
375 for (textlen=0, i=3; i>=0; i--) {
376 textlen +=
readByte(compressed_file) << (8 * i);
381 printf(
"Tree built\n");
382 printf(
"Textlen: %ld\n", textlen);
389 printf(
"Decoding...\n");
390 decode(tree, textlen, compressed_file, output_file, see);
392 fclose(compressed_file);
402 fprintf(stderr,
"Usage: %s [options] input_file [output_file]\n\n", progname);
403 fprintf(stderr,
"Options:\n");
404 fprintf(stderr,
"\t-z: compress (default)\n");
405 fprintf(stderr,
"\t-d: decompress (default if invoked as hpunzip)\n");
406 fprintf(stderr,
"\t-s: use secondary espace estimation (SEE)\n");
407 fprintf(stderr,
"\t-h: show this message\n");
409 fprintf(stderr,
"\nOptions for compression only:\n");
410 fprintf(stderr,
"\t-k: use Kurtz algorithm (default)\n");
411 fprintf(stderr,
"\t-u: use Ukkonnen algorithm (default with -b)\n");
412 fprintf(stderr,
"\t-p <num>: number of parts to partition the file\n");
422 int main(
int argc,
char *argv[])
424 int i, algorithm = 0, parts = 1;
426 char *error = NULL, *
pos;
429 pos = strrchr(argv[0],
'\\');
431 pos = strrchr(argv[0],
'/');
440 compress = !(strncmp(pos,
"uncontext", 9) == 0);
443 while (i < argc && argv[i][0] ==
'-') {
444 switch (argv[i][1]) {
462 parts = atoi(argv[i]);
470 error =
"Invalid option parameter";
476 error =
"Invalid number of parts";
481 if (!error && argc > i) {
482 if (algorithm == 0) {
488 zip(argv[i], argv[i+1], algorithm, parts, see);
491 zip(argv[i], NULL, algorithm, parts, see);
496 unzip(argv[i], argv[i+1], see);
499 unzip(argv[i], NULL, see);
508 error =
"Not enough parameters";
511 fprintf(stderr,
"%s\n\n", error);
void initialize_input_bitstream()
void initialize_output_bitstream()
int main(int argc, char *argv[])
Parses input parameters and calls the appropriate function.
void flush_output_bitstream(FILE *stream)
fsmTree_t buildSTree()
Builds a pruned fsm tree from the input text.
void buildAlpha(Uchar *text, const Uint textlen)
Reads input text and initializes alphabet variables.
void buildSuffixTree(suffixTree_t tree)
Builds a suffix tree based on the input string.
void initialize_arithmetic_decoder(FILE *stream)
static int readByte(FILE *file)
Reads a byte using the arithmetic decoder.
void encode(fsmTree_t tree, FILE *compressedFile, const Uchar *text, const Uint textlen, const BOOL useSee)
Encode the input data into the output file using the tree as model.
Uint textlen
Length of the input text in the encoder.
void encode_symbol(FILE *stream, SYMBOL *s)
static void unzip(char *filename, char *output, BOOL see)
Decompresses the data in an input file and writes it in a new file.
void initialize_arithmetic_encoder()
Uint getHeight(const fsmTree_t tree)
#define True
True boolean constant.
Uint alphasize
Alphabet size.
unsigned char Uchar
Unsigned char type.
void reversestring(Uchar *s, Uint len, Uchar *sreverse)
Reverses a string and returns the reverse in a new string.
static void printUsage(char *progname)
Prints program usage instructions in the standard error output.
Uchar * text
Input text to the encoder.
#define MAGIC
Magic number to detect valid ctx files.
static void zip(char *filename, char *compressed, BOOL algorithm, int parts, BOOL see)
Compresses the input text and writes the compressed data to a file.
#define KURTZ
Kurtz suffix tree contruction algorithm.
fsmTree_t fsmSuffixTree(suffixTree_t tree)
Transforms this tree in an equivalent fsm tree structure.
#define FREE(P)
Frees a block of memory and makes the pointer NULL.
static void writeAlphabet(FILE *file)
Writes the alphabet to the output file.
static void writeByte(int byte, FILE *file)
Writes a byte to the output file using the arithmetic encoder.
#define CALLOC(S, T, N)
Allocs a new block of memory and sets all its bits to zero.
unsigned long Uint
Unsigned int type.
#define BOOL
Boolean data type.
void pruneSuffixTree(suffixTree_t tree)
Prunes this suffix tree according to some cost function.
unsigned short int low_count
suffixTree_t initSuffixTree()
Creates and initializes a new suffix tree structure instance.
Encoder context tree structure.
void writeFsmTree(const fsmTree_t tree, FILE *file)
Writes this tree into a file.
static short pos
Current position in the buffer.
Uint MAX_COUNT
Reset threshold.
void flush_arithmetic_encoder(FILE *stream)
void setMaxCount()
Sets the reset threshold.
void makeDecoderFsm(decoderTree_t tree)
Calculates the FSM closure of this tree.
Uchar characters[UCHAR_MAX+1]
Characters in text in alphabetical order.
#define False
False boolean constant.
void initDecoderTreeStack()
static void readAlphabet(FILE *file)
Reads the alphabet from the input file.
void makeFsm(fsmTree_t tree)
Calculates the FSM closure of this tree.
Uint alphaindex[UCHAR_MAX+1]
Index of each alphabet character.
void * file2String(char *name, Uint *textlen)
Opens a file and maps it into memory.
unsigned short int high_count
void freeFsmTree(fsmTree_t tree)
Deletes a fsm tree structure instance.
void freetextspace(Uchar *text, Uint textlen)
Frees the memory used by the mapping of a file.
void remove_symbol_from_stream(FILE *stream, SYMBOL *s)
long bit_ftell_output(FILE *stream)
Decoder context tree structure.
short int get_current_count(SYMBOL *s)
decoderTree_t readDecoderTree(FILE *file)
Creates a new decoder tree reading it from a file.
#define UKKONEN
Ukkonen linear suffix tree contruction algorithm.
void decode(decoderTree_t tree, const Uint textlen, FILE *compressedFile, FILE *output, const BOOL useSee)
Decodes the compressed file data to the output file (for non-binary alphabet).