Context algorithm
Semi-predictive context algorithm implementation
 All Data Structures Files Functions Variables Typedefs Macros Pages
wotd.c
Go to the documentation of this file.
1 /* Copyright 2013 Jorge Merlino
2 
3  This file is part of Context.
4 
5  Context is free software: you can redistribute it and/or modify
6  it under the terms of the GNU General Public License as published by
7  the Free Software Foundation, either version 3 of the License, or
8  (at your option) any later version.
9 
10  Context is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with Context. If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <limits.h>
22 #include <string.h>
23 #include <math.h>
24 #include <assert.h>
25 #include <gsl/gsl_sf_gamma.h>
26 #include <gsl/gsl_math.h>
27 #include <sys/types.h>
28 #include "types.h"
29 #include "debug.h"
30 #include "spacedef.h"
31 #include "fsmTree.h"
32 #include "stack.h"
33 #include "alpha.h"
34 #include "gammaFunc.h"
35 #include "statistics.h"
36 #include "text.h"
37 
39 #define INTWORDSIZE (UintConst(1) << LOGWORDSIZE)
40 
42 #define FIRSTBIT (UintConst(1) << (INTWORDSIZE-1))
43 
45 #define SECONDBIT (FIRSTBIT >> 1)
46 
48 #define BRANCHWIDTH UintConst(3)
49 
51 #define MAX_HEIGHT 30
52 
58 #define NODEINDEX(N) ((Uint) ((N) - streetab))
59 
61 #define LEAFBIT FIRSTBIT
62 
64 #define RIGHTMOSTCHILDBIT SECONDBIT
65 
67 #define UNEVALUATEDBIT FIRSTBIT
68 
74 #define ISLEAF(P) ((*(P)) & LEAFBIT)
75 
81 #define ISRIGHTMOSTCHILD(P) ((*(P)) & RIGHTMOSTCHILDBIT)
82 
88 #define ISUNEVALUATED(P) ((*((P)+1)) & UNEVALUATEDBIT)
89 
96 #define GETLP(P) ((*(P)) & ~(LEAFBIT | RIGHTMOSTCHILDBIT))
97 
104 #define GETFIRSTCHILD(P) (*((P)+1))
105 
112 #define SETLP(P,LP) *(P) = (*(P) & RIGHTMOSTCHILDBIT) | (LP)
113 
119 #define SETFIRSTCHILD(P,C) *((P)+1) = C
120 
126 #define SETLEAF(P,L) *(P) = (L) | LEAFBIT
127 
133 #define GETSTATS(P) (*((P)+2))
134 
140 #define SETSTATS(P,C) *((P)+2) = C
141 
143 #define UNDEFREFERENCE (UINT_MAX)
144 
151 #define SUFFIXNUMBER(L) ((Uint) (*(L) - text))
152 
160 #define STOREBOUNDARIES(P,L,R) *(P) = (Uint) ((L) - suffixbase);\
161  *((P)+1) = ((R) - suffixbase) | UNEVALUATEDBIT
162 
168 #define GETLEFTBOUNDARY(P) (suffixbase + *(P))
169 
176 #define GETRIGHTBOUNDARY(P) (suffixbase + ((*((P)+1)) & ~UNEVALUATEDBIT))
177 
179 #define UNDEFINEDSUCC (UINT_MAX)
180 
182  **suffixes,
183  **suffixbase,
184  **sbuffer,
185  **sbufferspace = NULL,
186  **bound[UCHAR_MAX+1];
188 Uint occurrence[UCHAR_MAX+1],
189  *streetab = NULL,
190  streetabsize,
191  *nextfreeentry,
192  sbufferwidth,
194  suffixessize,
196  rootchildtab[UCHAR_MAX+1];
200 
201 
210 {
211  Uint width = (Uint) (right-left+1);
212 
213  if(width > (Uint) (left-suffixes))
214  {
215  if(width > sbufferwidth)
216  {
217  sbufferwidth = width;
219  }
220  return sbufferspace;
221  }
222  return left - width;
223 }
224 
225 
230 #define MAXSUCCSPACE (BRANCHWIDTH * alphasize + 1)
231 
232 
236 static void allocstreetab()
237 {
238  Uint tmpindex = NODEINDEX(nextfreeentry);
239  if (tmpindex + MAXSUCCSPACE >= streetabsize) {
242  /* update necessary, since streetab may have been moved. */
243  nextfreeentry = streetab + tmpindex;
244  }
245 }
246 
247 
255 static void sortByChar(Uchar **left, Uchar **right, Uint prefixlen)
256 {
257  Uchar **i, **j, **nextFree = sbuffer;
258  Uint a;
259 
260  if(*right + prefixlen == sentinel) /* shortest suffix is sentinel: skip */
261  {
262  *right += prefixlen;
263  right--;
264  }
265  for(i=left; i<=right; i++) /* determine size for each group */
266  {
267  *i += prefixlen; /* drop the common prefix */
268  occurrence[(Uint) **i]++;
269  }
270  for(i=left; i<=right; i++) /* determine right bound for each group */
271  {
272  a = (Uint) **i;
273  if(occurrence[a] > 0)
274  {
275  bound[a] = nextFree+occurrence[a]-1;
276  nextFree = bound[a]+1;
277  occurrence[a] = 0;
278  }
279  }
280  for(i=right; i>=left; i--) /* insert suffixes into buffer */
281  {
282  *(bound[(Uint) **i]--) = *i;
283  }
284  for(i=left,j=sbuffer; i<=right; i++,j++) /* copy grouped suffixes back */
285  {
286  *i = *j;
287  }
288 }
289 
290 
295 static void sortByChar0()
296 {
297  Uchar *cptr, **nextFree = suffixes;
298  Uint a;
299 
300  for(cptr=text; cptr < text+textlen; cptr++) /* determine size for each group */
301  {
302  occurrence[(Uint) *cptr]++;
303  }
304  for(cptr=characters; cptr < characters+alphasize; cptr++)
305  {
306  a = (Uint) *cptr;
307  bound[a] = nextFree+occurrence[a]-1;
308  nextFree = bound[a]+1;
309  occurrence[a] = 0;
310  }
311  for(cptr=text+textlen-1; cptr>=text; cptr--) /* insert suffixes into array */
312  {
313  *(bound[(Uint) *cptr]--) = cptr;
314  }
315  suffixes[textlen] = sentinel; /* suffix $ is the largest suffix */
316 }
317 
318 
325 static Uint grouplcp(Uchar **left,Uchar **right)
326 {
327  Uchar cmpchar, **i;
328  Uint j;
329 
330  for(j=UintConst(1); /* nothing */; j++)
331  {
332  if(*right+j == sentinel)
333  {
334  return j;
335  }
336  cmpchar = *(*left+j);
337  for(i=left+1; i<=right; i++)
338  {
339  if(*(*i+j) != cmpchar)
340  {
341  return j;
342  }
343  }
344  }
345 }
346 
347 
354 static Uint evalsuccedges(Uchar **left,Uchar **right)
355 {
356  Uchar firstchar, **r, **l;
357  Uint leafnum, firstbranch = UNDEFREFERENCE, *previousnode = NULL;
358  BOOL sentineledge = False;
359 
360  allocstreetab();
361  if(*right == sentinel)
362  {
363  right--; /* skip the smallest suffix */
364  sentineledge = True;
365  }
366  for(l=left; l<=right; l=r+1)
367  {
368  for(firstchar=**l,r=l; r<right && **(r+1)==firstchar; r++)
369  {
370  /* nothing */ ;
371  }
372  previousnode = nextfreeentry;
373  if(r > l) /* create branching node */
374  {
375  if(firstbranch == UNDEFREFERENCE)
376  {
377  firstbranch = NODEINDEX(nextfreeentry);
378  }
380  /* store l and r. resume later with this unevaluated node */
382  } else /* create leaf */
383  {
384  leafnum = SUFFIXNUMBER(l);
385  SETLEAF(nextfreeentry,leafnum);
386  nextfreeentry++;
387  }
388  }
389  if(sentineledge)
390  {
391  leafnum = SUFFIXNUMBER(right+1);
392  SETLEAF(nextfreeentry,leafnum);
393  previousnode = nextfreeentry++;
394  }
395  assert(previousnode != NULL);
396  *previousnode |= RIGHTMOSTCHILDBIT;
397  return firstbranch;
398 }
399 
408 {
409  Uchar firstchar, **r, **l;
410  Uint *rptr, leafnum, firstbranch = UNDEFREFERENCE;
411 
412  for(rptr = rootchildtab; rptr <= rootchildtab + UCHAR_MAX; rptr++)
413  {
414  *rptr = UNDEFINEDSUCC;
415  }
416  for(l=left; l<=right; l=r+1) /* first phase */
417  {
418  for(firstchar=**l,r=l; r<right && **(r+1)==firstchar; r++)
419  {
420  /* nothing */ ;
421  }
422  if(r > l) /* create branching node */
423  {
424  if(firstbranch == UNDEFREFERENCE)
425  {
426  firstbranch = NODEINDEX(nextfreeentry);
427  }
429  /* store l and r. resume later with this unevaluated branch node */
430  rootchildtab[firstchar] = NODEINDEX(nextfreeentry);
432  } else /* create leaf */
433  {
434  leafnum = SUFFIXNUMBER(l);
435  SETLEAF(nextfreeentry,leafnum);
436  rootchildtab[firstchar] = leafnum | LEAFBIT;
437  nextfreeentry++;
438  }
439  }
441  nextfreeentry++;
442  return firstbranch;
443 }
444 
445 
452 static Uint evaluatenodeeager(Uint node, Uint *length)
453 {
454  Uint prefixlen, *nodeptr, unusedsuffixes;
455  Uchar **left, **right;
456 
457  nodeptr = streetab + node;
458  left = GETLEFTBOUNDARY(nodeptr);
459  right = GETRIGHTBOUNDARY(nodeptr);
460  SETLP(nodeptr,SUFFIXNUMBER(left));
462 
463  unusedsuffixes = (Uint) (left - suffixes);
464  if(suffixessize > UintConst(10000) && unusedsuffixes > maxunusedsuffixes)
465  {
466  Uint tmpdiff, width = (Uint) (right - left + 1);
467  Uchar **i, **j;
468  for(i=left, j=suffixes; i<suffixes+suffixessize; i++, j++)
469  {
470  *j = *i; /* move remaining suffixes to the left */
471  }
472  suffixessize -= unusedsuffixes;
473  maxunusedsuffixes = suffixessize >> 1;
474  tmpdiff = (Uint) (suffixes - suffixbase);
475  REALLOC(suffixes,suffixes,Uchar *,suffixessize);
476  suffixbase = suffixes - (tmpdiff + unusedsuffixes);
477  left = suffixes;
478  right = suffixes + width - 1;
479  }
480  sbuffer = getsbufferspaceeager(left,right);
481  prefixlen = grouplcp(left,right);
482  *length = prefixlen;
483  sortByChar(left,right,prefixlen);
484  return evalsuccedges(left,right);
485 }
486 
487 
493 static Uint getnextbranch(Uint previousbranch)
494 {
495  Uint *nodeptr = streetab + previousbranch;
496 
497  if(ISRIGHTMOSTCHILD(nodeptr))
498  {
499  return UNDEFREFERENCE;
500  }
501  nodeptr += BRANCHWIDTH;
502  while(True)
503  {
504  if(ISLEAF(nodeptr))
505  {
506  if(ISRIGHTMOSTCHILD(nodeptr))
507  {
508  return UNDEFREFERENCE;
509  }
510  nodeptr++;
511  } else
512  {
513  return NODEINDEX(nodeptr);
514  }
515  }
516 }
517 
521 static void pruneRoot() {
522  Uint *nodeptr;
523  Uint pos, i, idx;
524  Uint end;
525  statistics_t stats, childStats;
526  double est, auxx;
527 
528  Uint * distinct;
529  CALLOC(distinct, Uint, alphasize);
530 
531  nodeptr = streetab;
532 
533  /* counters */
534  stats = getStatistics();
535 
536  do {
537  if (ISLEAF(nodeptr)) {
538  if (GETLP(nodeptr) > 0) { /* if false the previous character is outside of string */
539  pos = GETLP(nodeptr) - 1; /* position where leaf starts minus previous length */
540  idx = alphaindex[*(text+pos)];
541  if (stats->count[idx] == 0) {
542  stats->symbols[stats->symbolCount++] = idx;
543  }
544  stats->count[idx]++;
545  distinct[idx]++;
546  }
547  /*stats->cost += log2Alpha();*/
548  }
549  else {
550  childStats = (statistics_t)GETSTATS(nodeptr);
551  for (i=0; i<childStats->symbolCount; i++) {
552  if (stats->count[childStats->symbols[i]] == 0) {
553  stats->symbols[stats->symbolCount++] = childStats->symbols[i];
554  }
555  stats->count[childStats->symbols[i]] += childStats->count[childStats->symbols[i]];
556  distinct[childStats->symbols[i]]++;
557  }
558  stats->cost += childStats->cost;
559  returnStatistics(childStats);
560  }
561 
562  end = ISRIGHTMOSTCHILD(nodeptr);
563  if (!end) {
564  if (ISLEAF(nodeptr)) {
565  nodeptr++;
566  }
567  else {
568  nodeptr += BRANCHWIDTH;
569  }
570  }
571  } while (!end);
572 
573  stats->cost += hAlpha() * alphasize;
574 
575  auxx = escapeCost(stats, distinct);
576  stats->cost += auxx;
577  assert(auxx >= 0);
578 
579  free(distinct);
580 
581  est = nodeCost(stats);
582 
583  if (est <= stats->cost) { /* pruning needed */
584  nextfreeentry = 0; /* indicates an empty tree */
585  }
586  freeStatistics(stats);
587 }
588 
589 
596 static void prune(Uint node, Uint length, Uint branchLength) {
597  Uint *nodeptr, *nodeptrPar;
598  Uint pos, i, idx;
599  statistics_t stats=NULL, childStats;
600  Uint end;
601  double est, auxx;
602 
603  Uint * distinct;
604  CALLOC(distinct, Uint, alphasize);
605 
606  nodeptrPar = streetab + node;
607  nodeptr = streetab + GETFIRSTCHILD(nodeptrPar);
608 
609  do {
610  if (ISLEAF(nodeptr)) {
611  if (stats == NULL) {
612  stats = getStatistics();
613  }
614  if (GETLP(nodeptr) > length) { /* if false the previous character is outside the string */
615  pos = GETLP(nodeptr) - length - 1; /* position where the leaf begins minus previous length */
616  idx = alphaindex[*(text+pos)];
617  if (stats->count[idx] == 0) {
618  stats->symbols[stats->symbolCount++] = idx;
619  }
620  stats->count[idx]++;
621  distinct[idx]++;
622  }
623  }
624  else {
625  childStats = (statistics_t)GETSTATS(nodeptr);
626 
627  if (stats == NULL) {
628  stats = childStats;
629  for (i=0; i<stats->symbolCount; i++) {
630  distinct[stats->symbols[i]]++;
631  }
632  }
633  else {
634  for (i=0; i<childStats->symbolCount; i++) {
635  if (stats->count[childStats->symbols[i]] == 0) {
636  stats->symbols[stats->symbolCount++] = childStats->symbols[i];
637  }
638  stats->count[childStats->symbols[i]] += childStats->count[childStats->symbols[i]];
639  distinct[childStats->symbols[i]]++;
640  }
641  stats->cost += childStats->cost;
642  returnStatistics(childStats);
643  }
644  }
645 
646  end = ISRIGHTMOSTCHILD(nodeptr);
647  if (!end) {
648  if (ISLEAF(nodeptr)) {
649  nodeptr++;
650  }
651  else {
652  nodeptr += BRANCHWIDTH;
653  }
654  }
655  } while (!end);
656 
657  stats->cost += hAlpha() * alphasize * branchLength;
658 
659  auxx = escapeCost(stats, distinct);
660  stats->cost += auxx;
661  assert(auxx >= 0);
662 
663  free(distinct);
664  if ((length-branchLength) < MAX_HEIGHT) {
665  est = nodeCost(stats);
666  assert(est >= 0);
667 
668  if (est <= stats->cost) { /* pruning needed */
669  nextfreeentry = streetab + GETFIRSTCHILD(nodeptrPar);
670  SETFIRSTCHILD(nodeptrPar, UNDEFREFERENCE);
671  stats->cost = est;
672  }
673  }
674  else {
675  stats->cost = 0xFFFFFFFF;
676  nextfreeentry = streetab + GETFIRSTCHILD(nodeptrPar);
677  SETFIRSTCHILD(nodeptrPar, UNDEFREFERENCE);
678  }
679 
680  SETSTATS(nodeptrPar, (Uint)stats);
681 }
682 
688 static void prune2 (Uint node, Uint length) {
689  Uint *nodeptr, idx;
690  Uchar **left, **right, **i;
692 
693  stats = getStatistics();
694 
695  nodeptr = streetab + node;
696  left = GETLEFTBOUNDARY(nodeptr);
697  right = GETRIGHTBOUNDARY(nodeptr);
698 
699  for (i = left; i <= right; i++) {
700  if (SUFFIXNUMBER(i) > length) {
701  idx = alphaindex[text[SUFFIXNUMBER(i) - length - 1]];
702  if (stats->count[idx] == 0) {
703  stats->symbols[stats->symbolCount++] = idx;
704  }
705  stats->count[idx]++;
706  }
707  }
708  stats->cost = 0xFFFFFFFF;
709  SETSTATS(nodeptr, (Uint)stats);
710  SETFIRSTCHILD(nodeptr, UNDEFREFERENCE);
711 }
712 
716 static void evaluateeager() {
717  Uint firstbranch, nextbranch, node, newNode, length, branchLength, stacktop=0, stackalloc=0, *stack = NULL;
718 
719  sortByChar0();
720  firstbranch = evalrootsuccedges(suffixes,suffixes+textlen-1);
721 
722  if(firstbranch != UNDEFREFERENCE)
723  {
724  PUSHNODE(firstbranch);
725  PUSHNODE(0);
726  PUSHNODE(0);
727  while(NOTSTACKEMPTY)
728  {
729  POPNODE(branchLength);
730  POPNODE(length);
731  POPNODE(node);
732  while(node != UNDEFREFERENCE)
733  {
734  if (!ISUNEVALUATED(streetab+node)) { /* the node has already been evaluated */
735  prune(node, length, branchLength);
736  node = UNDEFREFERENCE; /* to continue the while */
737  }
738  else {
739  if((nextbranch = getnextbranch(node)) != UNDEFREFERENCE) {
740  PUSHNODE(nextbranch);
741  PUSHNODE(length); /* my parent's length */
742  PUSHNODE(0); /* does not matter, will be branchLenth when evaluated */
743  }
744 
745  if (length < MAX_HEIGHT) {
746  newNode = evaluatenodeeager(node, &branchLength);
747  if (newNode == UNDEFREFERENCE) { /* all children are leaves */
748  prune(node, length + branchLength, branchLength);
749  }
750  else {
751  PUSHNODE(node);
752  PUSHNODE(branchLength + length);
753  PUSHNODE(branchLength);
754  }
755  node = newNode;
756  length += branchLength; /* is the length of the father */
757  }
758  else {
759  prune2(node, length);
760  node = UNDEFREFERENCE;
761  }
762  }
763  }
764  }
765  FREE(stack);
766  }
767 }
768 
769 
773 static void inittree()
774 {
775  Uint i;
776 
781 
782  /* no partition */
783  suffixessize = textlen+1;
787 
788  sbufferwidth = 0;
789  maxsbufferwidth = textlen >> 8;
791  for(i=0; i<=UCHAR_MAX; i++)
792  {
793  occurrence[i] = 0;
794  }
795 }
796 
797 
801 static void wotd()
802 {
803  inittree();
804  evaluateeager();
805  FREE(suffixes);
806  FREE(sbufferspace);
807 }
808 
809 
816  Uint *nodeptr = streetab + node;
817 
818  if(ISRIGHTMOSTCHILD(nodeptr))
819  {
820  return UNDEFREFERENCE;
821  }
822  else {
823  if(ISLEAF(nodeptr)) {
824  return NODEINDEX(nodeptr + 1);
825  }
826  else {
827  return NODEINDEX(nodeptr + BRANCHWIDTH);
828  }
829  }
830 }
831 
832 
836 static fsmTree_t buildTree () {
837  Uint stacktop=0, stackalloc=0, *stack = NULL, sibling, child, pos, node, *nodeptr, parentptr;
838  fsmTree_t ret = initFsmTree(), parent;
839 
840  if (nextfreeentry == 0) { /* only root */
841  return ret;
842  }
843 
844  PUSHNODE(0);
845  PUSHNODE((Uint)ret);
846 
847  while(NOTSTACKEMPTY) {
848  POPNODE(parentptr);
849  POPNODE(node);
850  parent = (fsmTree_t)parentptr;
851  nodeptr = streetab + node;
852 
853  if (GETLP(nodeptr) != textlen) { /* ignore $ leaves */
854  pos = GETINDEX(GETLP(nodeptr));
855  parent->children[pos] = initFsmTree();
856  parent->children[pos]->parent = parent;
857  parent->children[pos]->left = GETLP(nodeptr);
858 
859  /* push the next sibling */
860  sibling = getnextsibling(node);
861  if (sibling != UNDEFREFERENCE) {
862  PUSHNODE(sibling);
863  PUSHNODE(parentptr);
864  }
865 
866  if (parent->left != -1) { /* != ROOT */
867  parent->children[pos]->length = parent->length + parent->right - parent->left + 1;
868  }
869  if (ISLEAF(nodeptr) || GETFIRSTCHILD(nodeptr) == UNDEFREFERENCE) { /* is a leaf or has been pruned */
870  parent->children[pos]->right = GETLP(nodeptr); /* shorten leaf */
871  }
872  else {
873  child = GETFIRSTCHILD(nodeptr);
874  parent->children[pos]->right = GETLP(streetab + child) - 1;
875 
876  /* push this child */
877  PUSHNODE(child);
878  PUSHNODE((Uint)parent->children[pos]);
879  }
880  }
881  }
882 
883  FREE(stack);
884  FREE(streetab);
885  return ret;
886 }
887 
888 
893 {
894  wotd();
895  /* as the representation has no root it has to be done specifically */
896  pruneRoot();
897  freeBuffer();
898 
899  return buildTree();
900 }
#define UNDEFINEDSUCC
Undefined successor.
Definition: wotd.c:179
Uint rootchildtab[UCHAR_MAX+1]
Constant time access to successors of root.
Definition: wotd.c:189
Uint maxsbufferwidth
Maximal number of elements in sbufferspace.
Definition: wotd.c:189
Uint * count
List with the number of occurrences of each character in this state.
Definition: statistics.h:26
#define GETLEFTBOUNDARY(P)
Given a pointer to an unevaluated node returns the stored left boundary.
Definition: wotd.c:168
fsmTree_t buildSTree()
Builds a pruned fsm tree from the input text.
Definition: wotd.c:892
#define GETRIGHTBOUNDARY(P)
Given a pointer to an unevaluated node returns the stored right boundary.
Definition: wotd.c:176
#define GETSTATS(P)
Returns the statistics for a node.
Definition: wotd.c:133
double escapeCost(statistics_t stats, Uint *distinct)
Calculates the cost of escapes in a node.
Definition: gammaFunc.c:143
fsmTree_t initFsmTree()
Creates and initializes a new fsm tree structure instance.
Definition: fsmTree.c:487
#define RIGHTMOSTCHILDBIT
Bit used to indicate that a node is the last (rightmost) child of a node.
Definition: wotd.c:64
Uchar * sentinel
Points to text[textlen] which is undefined.
Definition: wotd.c:181
#define BRANCHWIDTH
Number of array buckets per node.
Definition: wotd.c:48
#define SETSTATS(P, C)
Sets the statistics for a node.
Definition: wotd.c:140
Uchar ** sbufferspace
Space to be used by sbuffer.
Definition: wotd.c:185
struct statistics * statistics_t
Structure that saves statistics for each tree node in the encoder.
void freeBuffer()
Deletes all statistics stored in the buffer.
Definition: statistics.c:74
Uchar ** suffixbase
Pointers into suffixes are considered w.r.t. this pointer.
Definition: wotd.c:181
Uint textlen
Length of the input text in the encoder.
Definition: text.h:23
static void evaluateeager()
Main method to build and prune the tree.
Definition: wotd.c:716
static void sortByChar0()
Sorts lexicographically a portion of the suffixes array.
Definition: wotd.c:295
Uint * streetab
Array that holds the suffix tree representation.
Definition: wotd.c:189
Uchar ** sbuffer
Buffer to sort suffixes in sortByChar.
Definition: wotd.c:181
#define UintConst(N)
Unsigned integer constant.
Definition: types.h:78
double hAlpha()
Returns the binary entropy of 1/alphasize.
Definition: gammaFunc.c:171
Uint * nextfreeentry
Pointer to next unused element in streetab.
Definition: wotd.c:189
Uint suffixessize
Number of unprocessed suffixes.
Definition: wotd.c:189
#define True
True boolean constant.
Definition: types.h:108
Uint alphasize
Alphabet size.
Definition: alpha.h:25
unsigned char Uchar
Unsigned char type.
Definition: types.h:48
#define ISLEAF(P)
Indicates if a node of the tree is a leaf checking the corresponding bit.
Definition: wotd.c:74
#define SETFIRSTCHILD(P, C)
Sets the index in the tree array of the first child of this node.
Definition: wotd.c:119
struct suffixTree * sibling
Pointer to the next sibling of this node.
Definition: suffixTree.h:29
#define MAXSUCCSPACE
Maximum number of extra array nodes needed for the children of a node.
Definition: wotd.c:230
double cost
Cost assigned to this subtree.
Definition: statistics.h:29
Uchar * text
Input text to the encoder.
Definition: text.h:26
#define POPNODE(N)
Pops an element out of the stack.
Definition: stack.h:51
static Uint evalsuccedges(Uchar **left, Uchar **right)
Given the boundaries of an unevaluated node, evaluates it and reserves space for all its children...
Definition: wotd.c:354
static Uchar ** getsbufferspaceeager(Uchar **left, Uchar **right)
Does space management for the sbuffer array.
Definition: wotd.c:209
#define NODEINDEX(N)
Given a pointer to the tree array returns the index in the array of that pointer. ...
Definition: wotd.c:58
BOOL rootevaluated
Flag indicating that the root has been evaluated.
Definition: wotd.c:199
#define FREE(P)
Frees a block of memory and makes the pointer NULL.
Definition: spacedef.h:58
#define ISRIGHTMOSTCHILD(P)
Indicates if a node is the last (rightmost) child of its parent checking the corresponding bit...
Definition: wotd.c:81
static Uint evaluatenodeeager(Uint node, Uint *length)
Extracts the boundaries of an unevaluated node, evaluates it and creates all its children.
Definition: wotd.c:452
#define CALLOC(S, T, N)
Allocs a new block of memory and sets all its bits to zero.
Definition: spacedef.h:72
unsigned long Uint
Unsigned int type.
Definition: types.h:54
Uchar ** bound[UCHAR_MAX+1]
Pointers into sbuffer while sorting.
Definition: wotd.c:185
#define BOOL
Boolean data type.
Definition: types.h:92
Uint symbolCount
Number of occuring symbols.
Definition: statistics.h:28
#define UNDEFREFERENCE
Undefined reference.
Definition: wotd.c:143
statistics_t getStatistics()
Returns a new statistics structure, can be created or obtained from the buffer.
Definition: statistics.c:51
static void inittree()
Initializes the tree array structure and ancillary structures.
Definition: wotd.c:773
Encoder context tree structure.
Definition: fsmTree.h:28
statistics_t stats
Pointer to the statistics for this node context.
Definition: suffixTree.h:33
#define SUFFIXNUMBER(L)
Given a pointer to the input text returns the index in the array of such pointer. ...
Definition: wotd.c:151
static short pos
Current position in the buffer.
Definition: fsmTree.c:35
static Uint * stack
Definition: statistics.c:24
static Uint evalrootsuccedges(Uchar **left, Uchar **right)
Evaluates all the edges outgoing from the root of the tree.
Definition: wotd.c:407
Uint sbufferwidth
Number of elements in sbufferspace.
Definition: wotd.c:189
static Uint stackalloc
Definition: statistics.c:24
static Uint stacktop
Definition: statistics.c:24
Uchar characters[UCHAR_MAX+1]
Characters in text in alphabetical order.
Definition: alpha.h:28
Structure that saves statistics for each tree node in the encoder.
Definition: statistics.h:25
#define SETLP(P, LP)
Sets the index in the input string of leftmost character of this node label.
Definition: wotd.c:112
static void pruneRoot()
Specifically tests if it is necessary to prune the tree at the root and does it if necessary ...
Definition: wotd.c:521
static Uint grouplcp(Uchar **left, Uchar **right)
Finds the longest common prefix (LCP) length of a number of consecutive prefixes. ...
Definition: wotd.c:325
struct fsmTree * fsmTree_t
Encoder context tree structure.
static void wotd()
Builds a pruned context tree.
Definition: wotd.c:801
#define False
False boolean constant.
Definition: types.h:100
void returnStatistics(statistics_t st)
Puts a statistics structure that is no longer used into the buffer.
Definition: statistics.c:66
#define SETLEAF(P, L)
Adds a new leaf to the tree.
Definition: wotd.c:126
static void sortByChar(Uchar **left, Uchar **right, Uint prefixlen)
Sorts lexicographically a portion of the suffixes array.
Definition: wotd.c:255
#define GETFIRSTCHILD(P)
Returns the index in the tree array of the first child of this node.
Definition: wotd.c:104
void freeStatistics(statistics_t st)
Deletes a statistics structure instance.
Definition: statistics.c:42
Uint alphaindex[UCHAR_MAX+1]
Index of each alphabet character.
Definition: alpha.h:31
Uchar * symbols
List of occuring symbols.
Definition: statistics.h:27
static fsmTree_t buildTree()
Builds a new fsm tree from the pruned tree array representation.
Definition: wotd.c:836
Uint occurrence[UCHAR_MAX+1]
Number of occurrences of each character.
Definition: wotd.c:188
#define REALLOC(V, S, T, N)
Resizes a block of memory and prints an error message if necessary.
Definition: spacedef.h:43
#define STOREBOUNDARIES(P, L, R)
Store the boundaries of the portion of the text that needs to be read when evaluating this node...
Definition: wotd.c:160
#define LEAFBIT
Bit used to indicate that a node is a leaf.
Definition: wotd.c:61
static void prune2(Uint node, Uint length)
Prunes a node inconditionally.
Definition: wotd.c:688
Uint streetabsize
Number of integers in the allocated streetab memory.
Definition: wotd.c:189
#define GETINDEX(N)
Returns the index of the Nth character in the input text.
Definition: alpha.h:38
#define ISUNEVALUATED(P)
Indicates if a node has not been evaluated checking the corresponding bit.
Definition: wotd.c:88
static void prune(Uint node, Uint length, Uint branchLength)
Evaluates the cost function to decide if we have to prune this node and does it if necessary...
Definition: wotd.c:596
struct suffixTree * parent
Pointer to the parent of this node.
Definition: suffixTree.h:29
#define NOTSTACKEMPTY
Indicates if the stack is empty.
Definition: stack.h:30
Uint maxunusedsuffixes
When reached, then move and halve space for suffixes.
Definition: wotd.c:189
Uint getnextsibling(Uint node)
Returns the next sibling of a node.
Definition: wotd.c:815
static void allocstreetab()
Enlarges the tree array structure if the current free space is not enough for the evaluation of a new...
Definition: wotd.c:236
#define PUSHNODE(N)
Pushes a new item to the stack.
Definition: stack.h:37
double nodeCost(statistics_t stats)
Calculates the cost of a node.
Definition: gammaFunc.c:123
static Uint getnextbranch(Uint previousbranch)
Returns the next brother of this node that is not a leaf.
Definition: wotd.c:493
Uchar ** suffixes
Array of pointers to suffixes of t.
Definition: wotd.c:181
struct suffixTree * child
Pointer to the first child of this node.
Definition: suffixTree.h:29
#define MAX_HEIGHT
Maximum allowed height for the tree.
Definition: wotd.c:51
Uint left
Index of the leftmost character of this node label in the input string.
Definition: suffixTree.h:28
#define GETLP(P)
Returns the index in the input string of the leftmost character in this node label.
Definition: wotd.c:96