Context algorithm
Semi-predictive context algorithm implementation
 All Data Structures Files Functions Variables Typedefs Macros Pages
encoder.c
Go to the documentation of this file.
1 /* Copyright 2013 Jorge Merlino
2 
3  This file is part of Context.
4 
5  Context is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  Context is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with Context. If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <assert.h>
22 #include <math.h>
23 #include "encoder.h"
24 #include "alpha.h"
25 #include "debug.h"
26 #include "text.h"
27 #include "spacedef.h"
28 #include "arithmetic/coder.h"
29 #include "arithmetic/bitio.h"
30 #include "see.h"
31 #include "reset.h"
32 
33 #ifdef DEBUG
34 
35 static void printStats (fsmTree_t origTree, Uint *maskedChars, Uint i) {
36  int j;
37 
38  printf ("## ");
39  for(j=0; j<origTree->totalSyms; j++) {
40  printf("count[%d] = %ld %s", origTree->symbols[j], origTree->count[j],
41  (maskedChars[alphaindex[origTree->symbols[j]]] == i+1 ? "(masked) " : ""));
42  }
43  printf("Total: %ld %ld (%p)\n", origTree->totalCount, origTree->totalSyms, (void *)origTree);
44 }
45 
46 #endif
47 
55 static void fixParents (fsmTree_t tree, BOOL *deletedChars) {
56  Uint i;
57  fsmTree_t parTree = tree;
58  BOOL newChars;
59 
60  while (!isRootFsmTree(parTree)) {
61  parTree = parTree->parent->origin;
62  newChars = False;
63 
64  for (i=0; i<parTree->totalSyms && !newChars; i++) {
65  newChars |= parTree->count[i] > 1;
66  }
67 
68  if (!newChars) { /* there are no new symbols */
69  DEBUGCODE(printf("fixing %p\n", (void *)parTree));
70  parTree->totalCount = 0;
71  for (i=0; i<parTree->totalSyms; i++) {
72  if (deletedChars[alphaindex[parTree->symbols[i]]]) {
73  parTree->count[i] = 0;
74  }
75  else {
76  parTree->totalCount += parTree->count[i];
77  }
78  }
79 
80  parTree->totalCount *= 2;
81  }
82  else break;
83  }
84 }
85 
86 static void rescale (fsmTree_t tree) {
87  BOOL *charFlags, needToFix = False;
88  Uint numEscapes, i;
89  DEBUGCODE(printf("rescalo %p\n", (void *)tree));
90 
91  numEscapes = tree->totalCount;
92  tree->totalCount = 0;
93  CALLOC(charFlags, BOOL, alphasize);
94 
95  for (i=0; i<tree->totalSyms; i++) {
96  if (tree->count[i] > 0) {
97  numEscapes -= tree->count[i];
98  tree->count[i] = tree->count[i] >> 1;
99  if (tree->count[i] == 0) {
100  charFlags[alphaindex[tree->symbols[i]]] = True;
101  needToFix = True;
102  }
103  else {
104  tree->totalCount += tree->count[i];
105  }
106  }
107  }
108 
109  if (needToFix) {
110  fixParents(tree, charFlags);
111  }
112  FREE(charFlags)
113  numEscapes = numEscapes >> 1;
114  if (numEscapes == 0) numEscapes = 1;
115  tree->totalCount += numEscapes;
116 }
117 
118 
126 void encode (fsmTree_t tree, FILE *compressedFile, const Uchar *text, const Uint textlen, const BOOL useSee) {
127  SYMBOL s, escape;
128  Uint i, j, k, numMasked, low, pos, allCount, noMask, *maskedChars, state;
129  BOOL found;
130  fsmTree_t origTree;
131  Uchar sym;
132 
133  printf("MAX_COUNT: %ld\n", MAX_COUNT);
134  CALLOC(maskedChars, Uint, alphasize);
135 
136  if (useSee) {
137  initSee();
138  }
139 
140  for (i=0; i<textlen; i++) {
141  sym = text[i];
142  pos = GETINDEX(i);
143  DEBUGCODE(printf("index: %ld\n", i));
144  origTree = tree->origin;
145  numMasked = 0;
146  assert(origTree != NULL);
147 
148  if (origTree->totalCount > MAX_COUNT && origTree->used) {
149  rescale (origTree);
150  }
151  DEBUGCODE(printStats(origTree, maskedChars, i));
152  found = False;
153  do {
154  noMask = -1;
155  if (origTree->totalSyms > numMasked) { /* found a parent with more data */
156  low = j = allCount = 0;
157  do {
158  allCount += origTree->count[j];
159  if (maskedChars[alphaindex[origTree->symbols[j]]] != i+1) {
160  found = (origTree->symbols[j] == sym) && (origTree->count[j] > 0);
161  if (origTree->symbols[j] == sym && origTree->count[j] == 0) {
162  noMask = j;
163  }
164 
165  if (!found) {
166  low += origTree->count[j];
167  }
168  }
169  }
170  while (!found && ++j < origTree->totalSyms);
171 
172  if (found) {
173  s.low_count = low;
174  s.high_count = low += origTree->count[j];
175  /* compute the new scale */
176  for (k=j+1; k < origTree->totalSyms; k++) {
177  allCount += origTree->count[k];
178  if (maskedChars[alphaindex[origTree->symbols[k]]] != i+1) {
179  low += origTree->count[k];
180  }
181  }
182  s.scale = low + origTree->totalCount - allCount; /* low + escapes */
183 
184  if (low > 0 && useSee) {
185  state = getSeeStateEncoder(origTree, allCount, i, numMasked, text, alphasize);
186  if (state != -1) {
187  escape.scale = escape.high_count = See[state][1];
188  escape.low_count = See[state][0];
189 
190  DEBUGCODE(printf("--scale: %d diff: %d symbol: %d\n", escape.scale, escape.high_count - escape.low_count, -2));
191  encode_symbol(compressedFile, &escape);
192 
193  DEBUGCODE(printf("SEE encontrado: original %f, SEE %f, resultado: %s\n", (s.high_count - s.low_count) / (double)s.scale,
194  (escape.high_count - escape.low_count) / (double)escape.scale * (s.high_count - s.low_count) / (double)low,
195  ((escape.high_count - escape.low_count) / (double)escape.scale * (s.high_count - s.low_count) / (double)low) >
196  ((s.high_count - s.low_count) / (double)s.scale) ? "gana" : "pierde"));
197  s.scale = low;
198  updateSee(state, False, alphasize);
199  }
200  }
201 
202  DEBUGCODE(printf("++scale: %d diff: %d symbol: %d\n", s.scale, s.high_count - s.low_count, sym));
203 
204  encode_symbol(compressedFile, &s);
205  origTree->totalCount += 2;
206  origTree->count[j] += 2;
207  origTree->used = True;
208  }
209  else {
210  s.low_count = low;
211  s.high_count = s.scale = low + origTree->totalCount - allCount; /* low + escapes */
212 
213  if (origTree->totalCount > 0 && low > 0 && useSee) {
214  state = getSeeStateEncoder(origTree, allCount, i, numMasked, text, alphasize);
215  if (state != -1) {
216  escape.scale = See[state][1];
217  escape.low_count = 0;
218  escape.high_count = See[state][0];
219 
220  DEBUGCODE(printf("--scale: %d diff: %d symbol: %d\n", escape.scale, escape.high_count - escape.low_count, -1));
221  assert(See[state][1] > 0);
222  encode_symbol(compressedFile, &escape);
223 
224  DEBUGCODE(printf("SEE escape: original %f, SEE %f, resultado: %s\n", (s.high_count - s.low_count) / (double)s.scale,
225  escape.high_count / (double)escape.scale,
226  (escape.high_count / (double)escape.scale) > ((s.high_count - s.low_count) / (double)s.scale) ? "gana" : "pierde"));
227  s.scale = 0;
228  updateSee(state, True, alphasize);
229  }
230  }
231 
232  if (s.scale > 0) {
233  DEBUGCODE(printf("--scale: %d diff: %d symbol: %d\n", s.scale, s.high_count - s.low_count, -1));
234  encode_symbol(compressedFile, &s);
235  }
236 
237  for (j--; j != -1; j--) {
238  if (origTree->count[j] > 0 && (maskedChars[alphaindex[origTree->symbols[j]]] != i+1)) {
239  maskedChars[alphaindex[origTree->symbols[j]]] = i+1;
240  numMasked++;
241  }
242  }
243 
244  if (noMask == -1) {
245  addSymbol(origTree, sym);
246  }
247  else {
248  origTree->count[noMask] = 1;
249  origTree->totalCount++;
250  }
251  origTree->totalCount++; /* escape */
252  }
253  } /* if (origTree->totalSyms > numMasked) */
254  else {
255  addSymbol(origTree, sym);
256  origTree->totalCount++; /* escape */
257  DEBUGCODE(printf("--escape\n"));
258  }
259 
260  if (isRootFsmTree(origTree) && !found) {
261  found = True;
262  for (j=0, allCount=0, low=0; j<alphasize; j++) {
263  if (maskedChars[j] != i+1) {
264  allCount++;
265  if (j < pos) {
266  low++;
267  }
268  }
269  }
270 
271  /* do not reserve probability for symbols already seen */
272  s.scale = allCount;
273  s.low_count = low;
274  s.high_count = low + 1;
275  encode_symbol(compressedFile, &s);
276 
277  DEBUGCODE(printf("++scale: %d diff: %d symbol: %d\n", s.scale, s.high_count - s.low_count, sym));
278  }
279 
280  if (!found) {
281  origTree = origTree->parent->origin;
282 
283  if (origTree->totalCount > MAX_COUNT && origTree->used) {
284  rescale (origTree);
285  }
286  DEBUGCODE(printStats(origTree, maskedChars, i));
287 
288  /* search for a parent with more information */
289  while (!isRootFsmTree(origTree) && origTree->totalSyms == numMasked) {
290  addSymbol(origTree, sym);
291  origTree->totalCount++; /* escape */
292 
293  DEBUGCODE(printf("--escape\n"));
294  origTree = origTree->parent->origin;
295 
296  if (origTree->totalCount > MAX_COUNT && origTree->used) {
297  rescale (origTree);
298  }
299  DEBUGCODE(printStats(origTree, maskedChars, i));
300  DEBUGCODE(printf("--escape\n"));
301  }
302  }
303  }
304  while (!found);
305  DEBUGCODE(printf("\n"));
306  tree = tree->transitions[pos];
307  }
308  FREE(maskedChars);
309 }
void addSymbol(fsmTree_t tree, const Uchar sym)
Adds a new symbol to this tree node.
Definition: fsmTree.c:543
void encode(fsmTree_t tree, FILE *compressedFile, const Uchar *text, const Uint textlen, const BOOL useSee)
Encode the input data into the output file using the tree as model.
Definition: encoder.c:126
Uint textlen
Length of the input text in the encoder.
Definition: text.h:23
unsigned short int scale
Definition: coder.h:24
void encode_symbol(FILE *stream, SYMBOL *s)
Definition: coder.c:54
struct fsmTree ** transitions
List of FSM transitions from this state.
Definition: fsmTree.h:39
BOOL isRootFsmTree(const fsmTree_t tree)
Indicates if the parameter node is the root of the tree.
Definition: fsmTree.c:602
Definition: coder.h:21
#define True
True boolean constant.
Definition: types.h:108
Uint totalCount
Definition: fsmTree.h:29
Uint alphasize
Alphabet size.
Definition: alpha.h:25
unsigned char Uchar
Unsigned char type.
Definition: types.h:48
static void fixParents(fsmTree_t tree, BOOL *deletedChars)
Remove symbols from the statistics of the ancestors of this node in case that is necessary.
Definition: encoder.c:55
Uchar * text
Input text to the encoder.
Definition: text.h:26
BOOL used
Flag that indicates if this node has been used to encode a symbol.
Definition: fsmTree.h:45
Uint totalSyms
Definition: decoderTree.h:26
Uchar * symbols
Definition: fsmTree.h:37
#define FREE(P)
Frees a block of memory and makes the pointer NULL.
Definition: spacedef.h:58
static unsigned short int low
Definition: coder.c:27
struct fsmTree * origin
Pointer to the original node this one descends from.
Definition: fsmTree.h:39
#define CALLOC(S, T, N)
Allocs a new block of memory and sets all its bits to zero.
Definition: spacedef.h:72
unsigned long Uint
Unsigned int type.
Definition: types.h:54
#define BOOL
Boolean data type.
Definition: types.h:92
Uint * count
List containing the number of occurrences of each character in this state.
Definition: fsmTree.h:29
unsigned short int low_count
Definition: coder.h:22
Encoder context tree structure.
Definition: fsmTree.h:28
static short pos
Current position in the buffer.
Definition: fsmTree.c:35
Uint MAX_COUNT
Reset threshold.
Definition: reset.c:22
#define DEBUGCODE(S)
Definition: debug.h:32
#define False
False boolean constant.
Definition: types.h:100
int getSeeStateEncoder(fsmTree_t tree, Uint allCount, Uint pos, Uint numMasked, const Uchar *text, Uint alphasize)
Returns the contents of the SEE table for the encoder.
Definition: see.c:44
Uint totalSyms
Total number of symbols occuring at this state.
Definition: fsmTree.h:29
Uint alphaindex[UCHAR_MAX+1]
Index of each alphabet character.
Definition: alpha.h:31
Uint See[1<< 14][2]
SEE table structure.
Definition: see.h:27
unsigned short int high_count
Definition: coder.h:23
void initSee()
Initalizes the SEE table.
Definition: see.c:139
#define GETINDEX(N)
Returns the index of the Nth character in the input text.
Definition: alpha.h:38
struct fsmTree * parent
Pointer to the parent of this node.
Definition: fsmTree.h:39
void updateSee(Uint state, BOOL escape, Uint alphasize)
Updates the SEE table.
Definition: see.c:120
static void rescale(fsmTree_t tree)
Definition: encoder.c:86