Context algorithm
Semi-predictive context algorithm implementation
 All Data Structures Files Functions Variables Typedefs Macros Pages
main.c
Go to the documentation of this file.
1 /* Copyright 2013 Jorge Merlino
2 
3  This file is part of Context.
4 
5  Context is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  Context is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with Context. If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <assert.h>
20 #include <math.h>
21 #include "types.h"
22 #include "spacedef.h"
23 #include "mapfile.h"
24 #include "reverse.h"
25 #include "wotd.h"
26 #include "fsmTree.h"
27 #include "suffixTree.h"
28 #include "decoderTree.h"
29 #include "encoder.h"
30 #include "decoder.h"
31 #include "debug.h"
32 #include "alpha.h"
33 #include "text.h"
34 #include "reset.h"
35 #include "arithmetic/coder.h"
36 #include "arithmetic/bitio.h"
37 
38 #ifdef DEBUG
39 #include "gammaFunc.h"
40 #endif
41 
43 #define UKKONEN 1
44 
46 #define KURTZ 2
47 
49 #define MAGIC 0x3276
50 
56 static void writeByte(int byte, FILE *file) {
57  SYMBOL s;
58 
59  byte = byte & 0x000000FF;
60  s.scale = 256;
61  s.low_count = byte;
62  s.high_count = byte + 1;
63  encode_symbol(file, &s);
64 }
65 
71 static int readByte(FILE *file) {
72  SYMBOL s;
73  int ret;
74 
75  s.scale = 256;
76  ret = get_current_count(&s);
77  s.low_count = ret;
78  s.high_count = ret + 1;
79  remove_symbol_from_stream(file, &s);
80 
81  return ret;
82 }
83 
88 static void writeAlphabet(FILE *file) {
89  int i, j, last = UCHAR_MAX + 1;
90  SYMBOL s;
91  long cost;
92 
93  /* write alphabet size */
94  writeByte(alphasize, file);
95 
96  cost = bit_ftell_output(file);
97 
98  if (alphasize <= UCHAR_MAX) {
99  if (alphasize < 128) { /* send alphabet */
100  for (i=alphasize-1; i>=0; i--) {
101  s.scale = last;
102  s.low_count = characters[i];
103  s.high_count = characters[i] + 1;
104  encode_symbol(file, &s);
105  last = characters[i];
106  }
107  }
108  else { /* send complement of alphabet */
109  for (i=UCHAR_MAX, j=alphasize-1; i>=0; i--) {
110  if (j<0 || characters[j] != i) {
111  s.scale = last;
112  s.low_count = i;
113  s.high_count = i+1;
114  encode_symbol(file, &s);
115  last = i;
116  }
117  else {
118  j--;
119  }
120  }
121  }
122  }
123  printf("Alphabet cost: %ld\n", bit_ftell_output(file) - cost);
124 }
125 
126 
131 static void readAlphabet(FILE *file) {
132  SYMBOL s;
133  int i, j, k, last = UCHAR_MAX + 1, count;
134 
135  /* read alphabet size */
136  alphasize = readByte(file);
137 
138  if (alphasize == 0) alphasize = 256;
139  printf("Alphasize: %ld\n", alphasize);
140 
141  if (alphasize <= UCHAR_MAX) {
142  if (alphasize < 128) { /* read alphabet */
143  for (i=alphasize-1; i>=0; i--) {
144  s.scale = last;
145  count = get_current_count(&s);
146  characters[i] = count;
147  alphaindex[characters[i]] = i;
148  last = count;
149 
150  s.low_count = count;
151  s.high_count = count + 1;
152  remove_symbol_from_stream(file, &s);
153  }
154  }
155  else { /* read complement of alphabet */
156  for (i=alphasize-1, j=UCHAR_MAX, k=UCHAR_MAX - alphasize; k>=0; k--) {
157  s.scale = last;
158  count = get_current_count(&s);
159  last = count;
160 
161  if (j>count) {
162  for (; j>count; i--, j--) {
163  characters[i] = j;
164  alphaindex[characters[i]] = i;
165  }
166  }
167  j = count-1;
168 
169  s.low_count = count;
170  s.high_count = count + 1;
171  remove_symbol_from_stream(file, &s);
172  }
173  for (; j>0; i--, j--) {
174  characters[i] = j;
175  alphaindex[characters[i]] = i;
176  }
177  }
178  }
179  else {
180  for (i=0; i<alphasize; i++) {
181  characters[i] = i;
182  alphaindex[i] = i;
183  }
184  }
185 }
186 
194 static void zip(char *filename, char *compressed, BOOL algorithm, int parts, BOOL see) {
195  Uchar *origText, *prevText = NULL;
196  Uint origTextLen, partTextLen, currentTextLen;
197  FILE *compressed_file;
198  int i, part;
199  fsmTree_t stree = NULL, prevTree = NULL;
200  BOOL alloc = False;
201 
202 #ifdef WIN32
203  HANDLE hndl;
204  origText = (Uchar *) file2String(filename, &origTextLen, &hndl);
205 #else
206  origText = (Uchar *) file2String(filename, &origTextLen);
207 #endif
208 
209  if(origText == NULL) {
210  fprintf(stderr,"Cannot open file %s\n", filename);
211  exit(EXIT_FAILURE);
212  }
213 
214  if (!compressed) {
215  CALLOC(compressed, Uchar, strlen(filename) + 5);
216  strcpy(compressed, filename);
217  strcat(compressed, ".ctx");
218  alloc = True;
219  }
220 
221  compressed_file = fopen(compressed, "wb");
222  if (!compressed_file) {
223  printf( "Could not open output file");
224  exit(1);
225  }
226  if (alloc) FREE(compressed);
227 
228  buildAlpha(origText, origTextLen);
229  printf ("Alphasize: %ld\n", alphasize);
230  printf("Algorithm %d\n", algorithm);
231 
232  setMaxCount();
233 
234  /* write magic number */
235  putc(MAGIC >> 8, compressed_file);
236  putc(MAGIC, compressed_file);
237  /* write # of parts */
238  putc(parts, compressed_file);
239 
242 
243  writeAlphabet(compressed_file);
244 
245  currentTextLen = 0;
246  for (part = 1; part <= parts; part++) {
247  printf("---------- part %d ---------------\n", part);
248  if (part != parts) {
249  partTextLen = floor(origTextLen / parts);
250  }
251  else {
252  partTextLen = origTextLen - (floor(origTextLen / parts) * (parts - 1));
253  }
254 
255  if (part > 1) {
256  prevText = text;
257  prevTree = stree;
258  }
259 
260  textlen = partTextLen;
262  reversestring(origText + currentTextLen, textlen, text);
263 
264  if (algorithm == UKKONEN) {
265  suffixTree_t tree = initSuffixTree();
266  buildSuffixTree(tree);
267  printf("Tree built\n");
268  pruneSuffixTree(tree);
269  stree = fsmSuffixTree(tree);
270  }
271  else {
272  stree = buildSTree();
273  printf("Tree built\n");
274  }
275 
276  /*if (part > 1) {
277  copyStatistics(prevTree, stree, prevText);
278  FREE(prevText);
279  freeFsmTree(prevTree);
280  }*/
281 
282  DEBUGCODE(printf("gamma hits: %d gamma Misses: %d\n", getHits(), getMisses()));
283  printf("height: %ld\n", getHeight(stree));
284 
285  /* write textlen */
286  for (i=3; i>=0; i--) {
287  writeByte(textlen >> (8 * i), compressed_file);
288  }
289  printf ("Textlen: %ld\n", textlen);
290  writeFsmTree(stree, compressed_file);
291  printf("FSM...\n");
292  makeFsm(stree);
293  DEBUGCODE(printFsmTree(stree));
294  printf("Encoding...\n");
295 
296  encode(stree, compressed_file, origText + currentTextLen, partTextLen, see);
297 
298  currentTextLen += partTextLen;
299  }
300 
301  FREE(text);
302  freeFsmTree(stree);
303 
304  flush_arithmetic_encoder(compressed_file);
305  flush_output_bitstream(compressed_file);
306 
307 #ifdef WIN32
308  freetextspace(origText, hndl);
309 #else
310  freetextspace(origText, origTextLen);
311 #endif
312 
313  fclose(compressed_file);
314 }
315 
316 
323 static void unzip(char *filename, char *output, BOOL see) {
324  FILE *output_file, *compressed_file;
325  int i, header, parts, part;
326  Uint textlen = 0;
327  decoderTree_t tree;
328 
330 
331  compressed_file = fopen(filename, "rb");
332  if (!compressed_file) {
333  perror( "Could not open input file");
334  exit(1);
335  }
336 
337  if (!output) {
338  output = strrchr(filename, '.');
339  if (output) {
340  *output = '\0';
341  }
342  else {
343  strcat(filename, ".out");
344  }
345  output = filename;
346  }
347 
348  output_file = fopen(output, "wb");
349  if (!output_file) {
350  perror( "Could not open output file");
351  exit(1);
352  }
353 
354  /* check magic */
355  header = getc(compressed_file) << 8;
356  header += getc(compressed_file);
357  if (header != MAGIC) {
358  fprintf(stderr, "Invalid compressed file\n");
359  exit(1);
360  }
361 
362  /* read parts */
363  parts = getc(compressed_file);
364 
366  initialize_arithmetic_decoder(compressed_file);
367 
368  readAlphabet(compressed_file);
369 
370  setMaxCount();
371 
372  for (part = 1; part <= parts; part++) {
373  printf("---------- part %d ---------------\n", part);
374  /* read textlen */
375  for (textlen=0, i=3; i>=0; i--) {
376  textlen += readByte(compressed_file) << (8 * i);
377  }
378 
379  tree = readDecoderTree(compressed_file);
380 
381  printf("Tree built\n");
382  printf("Textlen: %ld\n", textlen);
383  printf("FSM...\n");
384 
385  DEBUGCODE(printDecoderTree(tree));
386  makeDecoderFsm(tree);
387  DEBUGCODE(printDecoderTree(tree));
388 
389  printf("Decoding...\n");
390  decode(tree, textlen, compressed_file, output_file, see);
391  }
392  fclose(compressed_file);
393  fclose(output_file);
394 }
395 
396 
401 static void printUsage (char *progname) {
402  fprintf(stderr, "Usage: %s [options] input_file [output_file]\n\n", progname);
403  fprintf(stderr, "Options:\n");
404  fprintf(stderr, "\t-z: compress (default)\n");
405  fprintf(stderr, "\t-d: decompress (default if invoked as hpunzip)\n");
406  fprintf(stderr, "\t-s: use secondary espace estimation (SEE)\n");
407  fprintf(stderr, "\t-h: show this message\n");
408 
409  fprintf(stderr, "\nOptions for compression only:\n");
410  fprintf(stderr, "\t-k: use Kurtz algorithm (default)\n");
411  fprintf(stderr, "\t-u: use Ukkonnen algorithm (default with -b)\n");
412  fprintf(stderr, "\t-p <num>: number of parts to partition the file\n");
413 }
414 
415 
422 int main(int argc,char *argv[])
423 {
424  int i, algorithm = 0, parts = 1;
425  BOOL compress, see = True;
426  char *error = NULL, *pos;
427 
428 #ifdef WIN32
429  pos = strrchr(argv[0], '\\');
430 #else
431  pos = strrchr(argv[0], '/');
432 #endif
433 
434  if (!pos) {
435  pos = argv[0];
436  }
437  else {
438  pos++;
439  }
440  compress = !(strncmp(pos, "uncontext", 9) == 0);
441 
442  i = 1;
443  while (i < argc && argv[i][0] == '-') {
444  switch (argv[i][1]) {
445  case 'd':
446  compress = False;
447  break;
448  case 'z':
449  compress = True;
450  break;
451  case 'k':
452  algorithm = KURTZ;
453  break;
454  case 'u':
455  algorithm = UKKONEN;
456  break;
457  case 's':
458  see = True;
459  break;
460  case 'p':
461  i++;
462  parts = atoi(argv[i]);
463  assert(parts < 256);
464  break;
465  case 'h':
466  printUsage(argv[0]);
467  printf("\n");
468  return EXIT_SUCCESS;
469  default:
470  error = "Invalid option parameter";
471  }
472  i++;
473  }
474 
475  if (parts < 1) {
476  error = "Invalid number of parts";
477  }
478 
479  MAX_COUNT = 300;
480 
481  if (!error && argc > i) {
482  if (algorithm == 0) {
483  algorithm = KURTZ;
484  }
485 
486  if (compress) {
487  if (argc > i+1) {
488  zip(argv[i], argv[i+1], algorithm, parts, see);
489  }
490  else {
491  zip(argv[i], NULL, algorithm, parts, see);
492  }
493  }
494  else {
495  if (argc > i+1) {
496  unzip(argv[i], argv[i+1], see);
497  }
498  else {
499  unzip(argv[i], NULL, see);
500  }
501  }
502 
503  printf("\n");
504  return EXIT_SUCCESS;
505  }
506  else {
507  if (!error) {
508  error = "Not enough parameters";
509  }
510 
511  fprintf(stderr, "%s\n\n", error);
512  printUsage(argv[0]);
513 
514  printf("\n");
515  return EXIT_FAILURE;
516  }
517 }
void initialize_input_bitstream()
Definition: bitio.c:96
void initialize_output_bitstream()
Definition: bitio.c:48
int main(int argc, char *argv[])
Parses input parameters and calls the appropriate function.
Definition: main.c:422
void flush_output_bitstream(FILE *stream)
Definition: bitio.c:85
fsmTree_t buildSTree()
Builds a pruned fsm tree from the input text.
Definition: wotd.c:892
Suffix tree structure.
Definition: suffixTree.h:27
void buildAlpha(Uchar *text, const Uint textlen)
Reads input text and initializes alphabet variables.
Definition: alpha.c:21
void buildSuffixTree(suffixTree_t tree)
Builds a suffix tree based on the input string.
Definition: suffixTree.c:325
void initialize_arithmetic_decoder(FILE *stream)
Definition: coder.c:141
static int readByte(FILE *file)
Reads a byte using the arithmetic decoder.
Definition: main.c:71
void encode(fsmTree_t tree, FILE *compressedFile, const Uchar *text, const Uint textlen, const BOOL useSee)
Encode the input data into the output file using the tree as model.
Definition: encoder.c:126
Uint textlen
Length of the input text in the encoder.
Definition: text.h:23
unsigned short int scale
Definition: coder.h:24
void encode_symbol(FILE *stream, SYMBOL *s)
Definition: coder.c:54
static void unzip(char *filename, char *output, BOOL see)
Decompresses the data in an input file and writes it in a new file.
Definition: main.c:323
Definition: coder.h:21
void initialize_arithmetic_encoder()
Definition: coder.c:37
Uint getHeight(const fsmTree_t tree)
Definition: fsmTree.c:606
#define True
True boolean constant.
Definition: types.h:108
Uint alphasize
Alphabet size.
Definition: alpha.h:25
unsigned char Uchar
Unsigned char type.
Definition: types.h:48
void reversestring(Uchar *s, Uint len, Uchar *sreverse)
Reverses a string and returns the reverse in a new string.
Definition: reverse.c:41
static void printUsage(char *progname)
Prints program usage instructions in the standard error output.
Definition: main.c:401
Uchar * text
Input text to the encoder.
Definition: text.h:26
#define MAGIC
Magic number to detect valid ctx files.
Definition: main.c:49
static void zip(char *filename, char *compressed, BOOL algorithm, int parts, BOOL see)
Compresses the input text and writes the compressed data to a file.
Definition: main.c:194
#define KURTZ
Kurtz suffix tree contruction algorithm.
Definition: main.c:46
fsmTree_t fsmSuffixTree(suffixTree_t tree)
Transforms this tree in an equivalent fsm tree structure.
Definition: suffixTree.c:422
#define FREE(P)
Frees a block of memory and makes the pointer NULL.
Definition: spacedef.h:58
static void writeAlphabet(FILE *file)
Writes the alphabet to the output file.
Definition: main.c:88
static void writeByte(int byte, FILE *file)
Writes a byte to the output file using the arithmetic encoder.
Definition: main.c:56
#define CALLOC(S, T, N)
Allocs a new block of memory and sets all its bits to zero.
Definition: spacedef.h:72
unsigned long Uint
Unsigned int type.
Definition: types.h:54
#define BOOL
Boolean data type.
Definition: types.h:92
void pruneSuffixTree(suffixTree_t tree)
Prunes this suffix tree according to some cost function.
Definition: suffixTree.c:343
unsigned short int low_count
Definition: coder.h:22
suffixTree_t initSuffixTree()
Creates and initializes a new suffix tree structure instance.
Definition: suffixTree.c:298
Encoder context tree structure.
Definition: fsmTree.h:28
void writeFsmTree(const fsmTree_t tree, FILE *file)
Writes this tree into a file.
Definition: fsmTree.c:572
static short pos
Current position in the buffer.
Definition: fsmTree.c:35
Uint MAX_COUNT
Reset threshold.
Definition: reset.c:22
void flush_arithmetic_encoder(FILE *stream)
Definition: coder.c:108
void setMaxCount()
Sets the reset threshold.
Definition: reset.c:24
void makeDecoderFsm(decoderTree_t tree)
Calculates the FSM closure of this tree.
Definition: decoderTree.c:665
Uchar characters[UCHAR_MAX+1]
Characters in text in alphabetical order.
Definition: alpha.h:28
#define DEBUGCODE(S)
Definition: debug.h:32
#define False
False boolean constant.
Definition: types.h:100
void initDecoderTreeStack()
Definition: decoderTree.c:584
static void readAlphabet(FILE *file)
Reads the alphabet from the input file.
Definition: main.c:131
void makeFsm(fsmTree_t tree)
Calculates the FSM closure of this tree.
Definition: fsmTree.c:553
Uint alphaindex[UCHAR_MAX+1]
Index of each alphabet character.
Definition: alpha.h:31
void * file2String(char *name, Uint *textlen)
Opens a file and maps it into memory.
Definition: mapfile.c:237
unsigned short int high_count
Definition: coder.h:23
void freeFsmTree(fsmTree_t tree)
Deletes a fsm tree structure instance.
Definition: fsmTree.c:530
void freetextspace(Uchar *text, Uint textlen)
Frees the memory used by the mapping of a file.
Definition: mapfile.c:225
void remove_symbol_from_stream(FILE *stream, SYMBOL *s)
Definition: coder.c:161
long bit_ftell_output(FILE *stream)
Definition: bitio.c:148
Decoder context tree structure.
Definition: decoderTree.h:25
short int get_current_count(SYMBOL *s)
Definition: coder.c:124
decoderTree_t readDecoderTree(FILE *file)
Creates a new decoder tree reading it from a file.
Definition: decoderTree.c:683
#define UKKONEN
Ukkonen linear suffix tree contruction algorithm.
Definition: main.c:43
void decode(decoderTree_t tree, const Uint textlen, FILE *compressedFile, FILE *output, const BOOL useSee)
Decodes the compressed file data to the output file (for non-binary alphabet).
Definition: decoder.c:312