Context algorithm
Semi-predictive context algorithm implementation
 All Data Structures Files Functions Variables Typedefs Macros Pages
suffixTree.c
Go to the documentation of this file.
1 /* Copyright 2013 Jorge Merlino
2 
3  This file is part of Context.
4 
5  Context is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  Context is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with Context. If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <stdlib.h>
20 #include "suffixTree.h"
21 #include "stack.h"
22 #include "alpha.h"
23 #include "text.h"
24 #include "gammaFunc.h"
25 #include "spacedef.h"
26 #include "debug.h"
27 
30 #define INFINITY -3
31 
33 #define BOTTOM -2
34 
36 #define ROOT -1
37 
43 #define GET_RIGHT(R, T)\
44  if (T->child) {\
45  R = T->child->left-1;\
46  }\
47  else {\
48  R = INFINITY;\
49  }\
50 
51 
60 static void canonize (suffixTree_t s, Uint k, Uint p, suffixTree_t *retS, Uint *retK) {
61  Uint kPrime, pPrime;
62  suffixTree_t sPrime;
63 
64  if (p == -1 || p < k) {
65  *retS = s;
66  *retK = k;
67  }
68  else {
69  sPrime = s;
70  if (s->left == BOTTOM) {
71  sPrime = s->child; /* ROOT is the t-transition of BOTTOM for every t */
72  }
73  else {
74  for (sPrime=sPrime->child; sPrime && text[sPrime->left]!=text[k]; sPrime=sPrime->sibling);
75  }
76  kPrime = sPrime->left;
77  GET_RIGHT(pPrime, sPrime);
78  while ((k <= p) && (pPrime - kPrime <= p - k)) {
79  k = k + pPrime - kPrime +1;
80  s = sPrime;
81  if (k <= p) {
82  for (sPrime=sPrime->child; sPrime && text[sPrime->left]!=text[k]; sPrime=sPrime->sibling);
83  kPrime = sPrime->left;
84  GET_RIGHT(pPrime, sPrime);
85  }
86  }
87  *retS = s;
88  *retK = k;
89  }
90 }
91 
92 
103  suffixTree_t sPrime = s, new;
104  Uint kPrime;
105 
106  if (p != -1 && k <= p) {
107  for (sPrime=sPrime->child; sPrime && text[sPrime->left]!=text[k]; sPrime=sPrime->sibling);
108  kPrime = sPrime->left;
109  if ((p+1 < textlen) && (text[kPrime+p-k+1] == text[p+1])) {
110  *r = s;
111  return True;
112  }
113  else {
114  /* split transition */
115  CALLOC(new, struct suffixTree, 1);
116  new->left = sPrime->left;
117  new->child = sPrime;
118  new->sibling = sPrime->sibling;
119  new->parent = sPrime->parent;
120 
121  sPrime->left = kPrime + p - k + 1;
122  sPrime->parent = new;
123  sPrime->sibling = NULL;
124 
125  /* fix the parent */
126  if (new->parent->child == sPrime) {
127  new->parent->child = new;
128  }
129  else {
130  for (s=new->parent->child; s->sibling != sPrime; s=s->sibling);
131  s->sibling = new;
132  }
133 
134  *r = new;
135  return False;
136  }
137  }
138  else { /* s is explicit */
139  *r = s;
140  if (s->left == BOTTOM) {
141  return True;
142  }
143  if (p+1 < textlen) {
144  for (sPrime=sPrime->child; sPrime && text[sPrime->left]!=text[p+1]; sPrime=sPrime->sibling);
145  }
146  else {
147  sPrime=NULL;
148  }
149  return sPrime != NULL;
150  }
151 }
152 
153 
159 static void addChild (suffixTree_t s, Uint i) {
160  suffixTree_t new;
161 
162  CALLOC(new, struct suffixTree, 1);
163  new->left = i;
164  new->parent = s;
165 
166  if (!s->child) {
167  s->child = new;
168  }
169  else {
170  for (s=s->child; s->sibling; s=s->sibling);
171  s->sibling = new;
172  }
173 }
174 
175 
184 static void update (suffixTree_t s, Uint k, Uint i, suffixTree_t *retS, Uint *retK) {
185  suffixTree_t oldr=NULL, r;
186 
187  while (!testAndSplit(s, k, i-1, &r)) {
188  /* add new leaf */
189  addChild(r, i);
190  if (oldr) {
191  /* create suffix link */
192  oldr->suffix = r;
193  }
194  oldr = r;
195  canonize(s->suffix, k, i-1, &s, &k);
196  }
197 
198  if (oldr && oldr->left!=ROOT) {
199  oldr->suffix = r;
200  }
201  *retS = s;
202  *retK = k;
203 }
204 
205 
210 static void pruneSubTree(suffixTree_t tree) {
211  Uint stacktop=0, stackalloc=0, *stack = NULL, treePtr;
212 
213  PUSHNODE((Uint)tree);
214  while(NOTSTACKEMPTY) {
215  POPNODE(treePtr);
216  tree = (suffixTree_t)treePtr;
217  if (tree->sibling) {
218  PUSHNODE((Uint)tree->sibling);
219  }
220  if (tree->child) {
221  PUSHNODE((Uint)tree->child);
222  }
223  freeSuffixTree(tree);
224  }
225  FREE(stack);
226 }
227 
228 #ifdef DEBUG
229 
234 static void printNode(suffixTree_t tree) {
235  Uint right;
236 
237  if (tree->left == BOTTOM) {
238  /* nothing */
239  }
240  else if (tree->left == ROOT) {
241  printf("root %p", tree);
242  }
243  else {
244  GET_RIGHT(right, tree);
245  if (right == INFINITY) right = textlen;
246  printf("%ld - %ld - %p", tree->left, right, tree);
247  }
248 }
249 
250 
256 static void printRec(suffixTree_t tree, Uint level) {
257  int i;
258 
259  if (level > 0) {
260  for (i=1; i<level; i++) {
261  printf("-");
262  }
263  printNode (tree);
264  printf("\n");
265  }
266 
267  level++;
268  if (tree->child) {
269  for (tree=tree->child; tree; tree=tree->sibling) {
270  printRec(tree, level);
271  }
272  }
273 }
274 
275 
281 static int treeSize(suffixTree_t tree) {
282  int sum = 1;
283 
284  if (tree->child) {
285  for (tree=tree->child; tree; tree=tree->sibling) {
286  sum += treeSize(tree);
287  }
288  }
289  return sum;
290 }
291 
292 #endif
293 
294 
299  suffixTree_t tree;
300 
301  CALLOC(tree, struct suffixTree, 1);
302  CALLOC(tree->child, struct suffixTree, 1);
303 
304  tree->left = BOTTOM;
305 
306  tree->child->left = ROOT;
307  tree->child->suffix = tree;
308  tree->child->parent = tree;
309 
310  return tree;
311 }
312 
313 
318  FREE(tree);
319 }
320 
321 
326  Uint k=0, i=0;
327  suffixTree_t s = tree->child; /* ROOT */
328 
329  while (i <= textlen) {
330  update(s, k, i, &s, &k);
331  canonize(s, k, i, &s, &k);
332  i++;
333  if (i%1000 == 0) printf("%ld/%ld\r", i, textlen);
334  }
335  printf("%ld/%ld\n", textlen, textlen);
336  DEBUGCODE(printf("Original tree size: %d\n", treeSize(tree)));
337 }
338 
339 
344  Uint stacktop=0, stackalloc=0, *stack = NULL, treePtr, length, i, right;
345  suffixTree_t child;
346  double est, auxx;
347  Uint * distinct;
348 
349  tree = tree->child; /* ROOT */
350  PUSHNODE((Uint)tree);
351  PUSHNODE(0);
352  while(NOTSTACKEMPTY) {
353  POPNODE(length);
354  POPNODE(treePtr);
355  tree = (suffixTree_t)treePtr;
356  if (!tree->child) { /* is a leaf */
357  tree->stats = allocStatistics();
358  if (tree->left > length) { /* this is not the full string prefix */
359  i = GETINDEX(tree->left-length-1);
360  tree->stats->count[i] = 1;
361  tree->stats->symbols[tree->stats->symbolCount++] = i;
362  }
363  /*tree->stats->cost = log2Alpha();*/
364  }
365  else { /* is a branching node */
366  for (child=tree->child; child && child->stats; child=child->sibling);
367  if (child) { /* there is one child to evaluate */
368  PUSHNODE((Uint)tree);
369  PUSHNODE(length);
370  PUSHNODE((Uint)child);
371  if (tree->left != ROOT) {
372  GET_RIGHT(right, tree);
373  PUSHNODE(length + right - tree->left + 1);
374  }
375  else {
376  PUSHNODE(0);
377  }
378  }
379  else { /* all children are already evaluated */
380  tree->stats = allocStatistics();
381  CALLOC(distinct, Uint, alphasize);
382 
383  for (child=tree->child; child; child=child->sibling) {
384  for (i=0; i<child->stats->symbolCount; i++) {
385  if (tree->stats->count[child->stats->symbols[i]] == 0) {
386  tree->stats->symbols[tree->stats->symbolCount++] = child->stats->symbols[i];
387  }
388  tree->stats->count[child->stats->symbols[i]] += child->stats->count[child->stats->symbols[i]];
389  distinct[child->stats->symbols[i]]++;
390  }
391  tree->stats->cost += child->stats->cost;
392  freeStatistics(child->stats);
393  }
394 
395  GET_RIGHT(right, tree);
396  tree->stats->cost += hAlpha() * alphasize * (right - tree->left + 1);
397 
398  auxx = escapeCost(tree->stats, distinct);
399  tree->stats->cost += auxx;
400 
401  free(distinct);
402 
403  est = nodeCost(tree->stats);
404 
405  if (est <= tree->stats->cost) { /* we have to prune */
406  tree->stats->cost = est;
407  pruneSubTree(tree->child);
408  tree->child = NULL;
409  }
410  }
411  }
412  }
413  freeStatistics(tree->stats);
414  FREE(stack);
415 }
416 
417 
423  Uint stacktop=0, stackalloc=0, *stack = NULL, sfxPtr, fsmPtr, pos, right;
424  fsmTree_t ret = initFsmTree(), fsmNode;
425 
426  if (tree->child->child) { /* if root has children */
427  PUSHNODE((Uint)tree->child->child);
428  PUSHNODE((Uint)ret); /* ROOT */
429  while(NOTSTACKEMPTY) {
430  POPNODE(fsmPtr);
431  POPNODE(sfxPtr);
432  fsmNode = (fsmTree_t)fsmPtr;
433  tree = (suffixTree_t)sfxPtr;
434  if (tree->left != textlen) { /* ignore $ leaves */
435  pos = GETINDEX(tree->left);
436  fsmNode->children[pos] = initFsmTree();
437  fsmNode->children[pos]->parent = fsmNode;
438  fsmNode->children[pos]->left = tree->left;
439  GET_RIGHT(right, tree);
440  if (right == INFINITY) {
441  fsmNode->children[pos]->right = tree->left; /* shorten leaf */
442  }
443  else {
444  fsmNode->children[pos]->right = right;
445  }
446  if (fsmNode->left != ROOT) {
447  fsmNode->children[pos]->length = fsmNode->length + fsmNode->right - fsmNode->left + 1;
448  }
449 
450  if (tree->sibling) {
451  PUSHNODE((Uint)tree->sibling);
452  PUSHNODE(fsmPtr);
453  }
454  if (tree->child) {
455  PUSHNODE((Uint)tree->child);
456  PUSHNODE((Uint)fsmNode->children[pos]);
457  }
458  }
459  FREE(tree);
460  }
461  FREE(stack);
462  }
463  return ret;
464 }
465 
466 #ifdef DEBUG
467 
471 void printSuffixTree(const suffixTree_t tree) {
472  printRec(tree, 0);
473 }
474 
475 #endif
Uint * count
List with the number of occurrences of each character in this state.
Definition: statistics.h:26
static void pruneSubTree(suffixTree_t tree)
Delete a subtree from the complete tree.
Definition: suffixTree.c:210
Suffix tree structure.
Definition: suffixTree.h:27
#define BOTTOM
Flag to indicate a node is bottom.
Definition: suffixTree.c:33
void buildSuffixTree(suffixTree_t tree)
Builds a suffix tree based on the input string.
Definition: suffixTree.c:325
struct suffixTree * suffixTree_t
Suffix tree structure.
double escapeCost(statistics_t stats, Uint *distinct)
Calculates the cost of escapes in a node.
Definition: gammaFunc.c:143
fsmTree_t initFsmTree()
Creates and initializes a new fsm tree structure instance.
Definition: fsmTree.c:487
struct suffixTree * suffix
Pointer to the node whose label is the tail of this node&#39;s label.
Definition: suffixTree.h:29
Uint textlen
Length of the input text in the encoder.
Definition: text.h:23
#define GET_RIGHT(R, T)
Gets the index of the rightmost character of this node label in the input string. ...
Definition: suffixTree.c:43
double hAlpha()
Returns the binary entropy of 1/alphasize.
Definition: gammaFunc.c:171
#define True
True boolean constant.
Definition: types.h:108
Uint alphasize
Alphabet size.
Definition: alpha.h:25
struct suffixTree * sibling
Pointer to the next sibling of this node.
Definition: suffixTree.h:29
double cost
Cost assigned to this subtree.
Definition: statistics.h:29
Uchar * text
Input text to the encoder.
Definition: text.h:26
#define POPNODE(N)
Pops an element out of the stack.
Definition: stack.h:51
static void update(suffixTree_t s, Uint k, Uint i, suffixTree_t *retS, Uint *retK)
Adds a new character of the input string to the evolving suffix tree.
Definition: suffixTree.c:184
fsmTree_t fsmSuffixTree(suffixTree_t tree)
Transforms this tree in an equivalent fsm tree structure.
Definition: suffixTree.c:422
#define FREE(P)
Frees a block of memory and makes the pointer NULL.
Definition: spacedef.h:58
#define CALLOC(S, T, N)
Allocs a new block of memory and sets all its bits to zero.
Definition: spacedef.h:72
unsigned long Uint
Unsigned int type.
Definition: types.h:54
#define BOOL
Boolean data type.
Definition: types.h:92
void pruneSuffixTree(suffixTree_t tree)
Prunes this suffix tree according to some cost function.
Definition: suffixTree.c:343
Uint symbolCount
Number of occuring symbols.
Definition: statistics.h:28
suffixTree_t initSuffixTree()
Creates and initializes a new suffix tree structure instance.
Definition: suffixTree.c:298
statistics_t allocStatistics()
Creates and initializes a new statistics structure instance.
Definition: statistics.c:29
Encoder context tree structure.
Definition: fsmTree.h:28
statistics_t stats
Pointer to the statistics for this node context.
Definition: suffixTree.h:33
static short pos
Current position in the buffer.
Definition: fsmTree.c:35
static Uint * stack
Definition: statistics.c:24
static Uint stackalloc
Definition: statistics.c:24
static Uint stacktop
Definition: statistics.c:24
void freeSuffixTree(suffixTree_t tree)
Deletes a suffix tree structure instance.
Definition: suffixTree.c:317
#define DEBUGCODE(S)
Definition: debug.h:32
struct fsmTree * fsmTree_t
Encoder context tree structure.
#define False
False boolean constant.
Definition: types.h:100
void freeStatistics(statistics_t st)
Deletes a statistics structure instance.
Definition: statistics.c:42
static void addChild(suffixTree_t s, Uint i)
Creates a new leaf on the tree.
Definition: suffixTree.c:159
Uchar * symbols
List of occuring symbols.
Definition: statistics.h:27
#define ROOT
Flag to indicate a node is the root.
Definition: suffixTree.c:36
#define INFINITY
Flag to indicate that the rightmost index of a label is the index of the last processed character of ...
Definition: suffixTree.c:30
#define GETINDEX(N)
Returns the index of the Nth character in the input text.
Definition: alpha.h:38
static BOOL testAndSplit(suffixTree_t s, Uint k, Uint p, suffixTree_t *r)
Tests if the input state is the end point. If it is not the end point, and it is not an explicit stat...
Definition: suffixTree.c:102
static void canonize(suffixTree_t s, Uint k, Uint p, suffixTree_t *retS, Uint *retK)
Given a reference for a (possibly implicit) state returns the canonical reference to the same state...
Definition: suffixTree.c:60
struct suffixTree * parent
Pointer to the parent of this node.
Definition: suffixTree.h:29
#define NOTSTACKEMPTY
Indicates if the stack is empty.
Definition: stack.h:30
#define PUSHNODE(N)
Pushes a new item to the stack.
Definition: stack.h:37
double nodeCost(statistics_t stats)
Calculates the cost of a node.
Definition: gammaFunc.c:123
struct suffixTree * child
Pointer to the first child of this node.
Definition: suffixTree.h:29
Uint left
Index of the leftmost character of this node label in the input string.
Definition: suffixTree.h:28