25 #include <gsl/gsl_sf_gamma.h>
26 #include <gsl/gsl_math.h>
27 #include <sys/types.h>
39 #define INTWORDSIZE (UintConst(1) << LOGWORDSIZE)
42 #define FIRSTBIT (UintConst(1) << (INTWORDSIZE-1))
45 #define SECONDBIT (FIRSTBIT >> 1)
48 #define BRANCHWIDTH UintConst(3)
58 #define NODEINDEX(N) ((Uint) ((N) - streetab))
61 #define LEAFBIT FIRSTBIT
64 #define RIGHTMOSTCHILDBIT SECONDBIT
67 #define UNEVALUATEDBIT FIRSTBIT
74 #define ISLEAF(P) ((*(P)) & LEAFBIT)
81 #define ISRIGHTMOSTCHILD(P) ((*(P)) & RIGHTMOSTCHILDBIT)
88 #define ISUNEVALUATED(P) ((*((P)+1)) & UNEVALUATEDBIT)
96 #define GETLP(P) ((*(P)) & ~(LEAFBIT | RIGHTMOSTCHILDBIT))
104 #define GETFIRSTCHILD(P) (*((P)+1))
112 #define SETLP(P,LP) *(P) = (*(P) & RIGHTMOSTCHILDBIT) | (LP)
119 #define SETFIRSTCHILD(P,C) *((P)+1) = C
126 #define SETLEAF(P,L) *(P) = (L) | LEAFBIT
133 #define GETSTATS(P) (*((P)+2))
140 #define SETSTATS(P,C) *((P)+2) = C
143 #define UNDEFREFERENCE (UINT_MAX)
151 #define SUFFIXNUMBER(L) ((Uint) (*(L) - text))
160 #define STOREBOUNDARIES(P,L,R) *(P) = (Uint) ((L) - suffixbase);\
161 *((P)+1) = ((R) - suffixbase) | UNEVALUATEDBIT
168 #define GETLEFTBOUNDARY(P) (suffixbase + *(P))
176 #define GETRIGHTBOUNDARY(P) (suffixbase + ((*((P)+1)) & ~UNEVALUATEDBIT))
179 #define UNDEFINEDSUCC (UINT_MAX)
186 **
bound[UCHAR_MAX+1];
230 #define MAXSUCCSPACE (BRANCHWIDTH * alphasize + 1)
265 for(i=left; i<=right; i++)
270 for(i=left; i<=right; i++)
276 nextFree =
bound[a]+1;
280 for(i=right; i>=
left; i--)
284 for(i=left,j=
sbuffer; i<=right; i++,j++)
308 nextFree =
bound[a]+1;
311 for(cptr=
text+textlen-1; cptr>=
text; cptr--)
336 cmpchar = *(*left+j);
337 for(i=left+1; i<=right; i++)
339 if(*(*i+j) != cmpchar)
356 Uchar firstchar, **r, **l;
366 for(l=left; l<=right; l=r+1)
368 for(firstchar=**l,r=l; r<right && **(r+1)==firstchar; r++)
395 assert(previousnode != NULL);
409 Uchar firstchar, **r, **l;
416 for(l=left; l<=right; l=r+1)
418 for(firstchar=**l,r=l; r<right && **(r+1)==firstchar; r++)
454 Uint prefixlen, *nodeptr, unusedsuffixes;
466 Uint tmpdiff, width = (
Uint) (right - left + 1);
472 suffixessize -= unusedsuffixes;
538 if (
GETLP(nodeptr) > 0) {
539 pos =
GETLP(nodeptr) - 1;
541 if (stats->
count[idx] == 0) {
556 distinct[childStats->
symbols[i]]++;
583 if (est <= stats->cost) {
597 Uint *nodeptr, *nodeptrPar;
614 if (
GETLP(nodeptr) > length) {
615 pos =
GETLP(nodeptr) - length - 1;
617 if (stats->
count[idx] == 0) {
634 for (i=0; i<childStats->symbolCount; i++) {
635 if (stats->
count[childStats->symbols[i]] == 0) {
638 stats->
count[childStats->symbols[i]] += childStats->count[childStats->symbols[i]];
639 distinct[childStats->symbols[i]]++;
641 stats->
cost += childStats->cost;
668 if (est <= stats->cost) {
675 stats->
cost = 0xFFFFFFFF;
699 for (i = left; i <= right; i++) {
702 if (stats->
count[idx] == 0) {
708 stats->
cost = 0xFFFFFFFF;
735 prune(node, length, branchLength);
748 prune(node, length + branchLength, branchLength);
756 length += branchLength;
791 for(i=0; i<=UCHAR_MAX; i++)
#define UNDEFINEDSUCC
Undefined successor.
Uint rootchildtab[UCHAR_MAX+1]
Constant time access to successors of root.
Uint maxsbufferwidth
Maximal number of elements in sbufferspace.
Uint * count
List with the number of occurrences of each character in this state.
#define GETLEFTBOUNDARY(P)
Given a pointer to an unevaluated node returns the stored left boundary.
fsmTree_t buildSTree()
Builds a pruned fsm tree from the input text.
#define GETRIGHTBOUNDARY(P)
Given a pointer to an unevaluated node returns the stored right boundary.
#define GETSTATS(P)
Returns the statistics for a node.
double escapeCost(statistics_t stats, Uint *distinct)
Calculates the cost of escapes in a node.
fsmTree_t initFsmTree()
Creates and initializes a new fsm tree structure instance.
#define RIGHTMOSTCHILDBIT
Bit used to indicate that a node is the last (rightmost) child of a node.
Uchar * sentinel
Points to text[textlen] which is undefined.
#define BRANCHWIDTH
Number of array buckets per node.
#define SETSTATS(P, C)
Sets the statistics for a node.
Uchar ** sbufferspace
Space to be used by sbuffer.
struct statistics * statistics_t
Structure that saves statistics for each tree node in the encoder.
void freeBuffer()
Deletes all statistics stored in the buffer.
Uchar ** suffixbase
Pointers into suffixes are considered w.r.t. this pointer.
Uint textlen
Length of the input text in the encoder.
static void evaluateeager()
Main method to build and prune the tree.
static void sortByChar0()
Sorts lexicographically a portion of the suffixes array.
Uint * streetab
Array that holds the suffix tree representation.
Uchar ** sbuffer
Buffer to sort suffixes in sortByChar.
#define UintConst(N)
Unsigned integer constant.
double hAlpha()
Returns the binary entropy of 1/alphasize.
Uint * nextfreeentry
Pointer to next unused element in streetab.
Uint suffixessize
Number of unprocessed suffixes.
#define True
True boolean constant.
Uint alphasize
Alphabet size.
unsigned char Uchar
Unsigned char type.
#define ISLEAF(P)
Indicates if a node of the tree is a leaf checking the corresponding bit.
#define SETFIRSTCHILD(P, C)
Sets the index in the tree array of the first child of this node.
struct suffixTree * sibling
Pointer to the next sibling of this node.
#define MAXSUCCSPACE
Maximum number of extra array nodes needed for the children of a node.
double cost
Cost assigned to this subtree.
Uchar * text
Input text to the encoder.
#define POPNODE(N)
Pops an element out of the stack.
static Uint evalsuccedges(Uchar **left, Uchar **right)
Given the boundaries of an unevaluated node, evaluates it and reserves space for all its children...
static Uchar ** getsbufferspaceeager(Uchar **left, Uchar **right)
Does space management for the sbuffer array.
#define NODEINDEX(N)
Given a pointer to the tree array returns the index in the array of that pointer. ...
BOOL rootevaluated
Flag indicating that the root has been evaluated.
#define FREE(P)
Frees a block of memory and makes the pointer NULL.
#define ISRIGHTMOSTCHILD(P)
Indicates if a node is the last (rightmost) child of its parent checking the corresponding bit...
static Uint evaluatenodeeager(Uint node, Uint *length)
Extracts the boundaries of an unevaluated node, evaluates it and creates all its children.
#define CALLOC(S, T, N)
Allocs a new block of memory and sets all its bits to zero.
unsigned long Uint
Unsigned int type.
Uchar ** bound[UCHAR_MAX+1]
Pointers into sbuffer while sorting.
#define BOOL
Boolean data type.
Uint symbolCount
Number of occuring symbols.
#define UNDEFREFERENCE
Undefined reference.
statistics_t getStatistics()
Returns a new statistics structure, can be created or obtained from the buffer.
static void inittree()
Initializes the tree array structure and ancillary structures.
Encoder context tree structure.
statistics_t stats
Pointer to the statistics for this node context.
#define SUFFIXNUMBER(L)
Given a pointer to the input text returns the index in the array of such pointer. ...
static short pos
Current position in the buffer.
static Uint evalrootsuccedges(Uchar **left, Uchar **right)
Evaluates all the edges outgoing from the root of the tree.
Uint sbufferwidth
Number of elements in sbufferspace.
Uchar characters[UCHAR_MAX+1]
Characters in text in alphabetical order.
Structure that saves statistics for each tree node in the encoder.
#define SETLP(P, LP)
Sets the index in the input string of leftmost character of this node label.
static void pruneRoot()
Specifically tests if it is necessary to prune the tree at the root and does it if necessary ...
static Uint grouplcp(Uchar **left, Uchar **right)
Finds the longest common prefix (LCP) length of a number of consecutive prefixes. ...
struct fsmTree * fsmTree_t
Encoder context tree structure.
static void wotd()
Builds a pruned context tree.
#define False
False boolean constant.
void returnStatistics(statistics_t st)
Puts a statistics structure that is no longer used into the buffer.
#define SETLEAF(P, L)
Adds a new leaf to the tree.
static void sortByChar(Uchar **left, Uchar **right, Uint prefixlen)
Sorts lexicographically a portion of the suffixes array.
#define GETFIRSTCHILD(P)
Returns the index in the tree array of the first child of this node.
void freeStatistics(statistics_t st)
Deletes a statistics structure instance.
Uint alphaindex[UCHAR_MAX+1]
Index of each alphabet character.
Uchar * symbols
List of occuring symbols.
static fsmTree_t buildTree()
Builds a new fsm tree from the pruned tree array representation.
Uint occurrence[UCHAR_MAX+1]
Number of occurrences of each character.
#define REALLOC(V, S, T, N)
Resizes a block of memory and prints an error message if necessary.
#define STOREBOUNDARIES(P, L, R)
Store the boundaries of the portion of the text that needs to be read when evaluating this node...
#define LEAFBIT
Bit used to indicate that a node is a leaf.
static void prune2(Uint node, Uint length)
Prunes a node inconditionally.
Uint streetabsize
Number of integers in the allocated streetab memory.
#define GETINDEX(N)
Returns the index of the Nth character in the input text.
#define ISUNEVALUATED(P)
Indicates if a node has not been evaluated checking the corresponding bit.
static void prune(Uint node, Uint length, Uint branchLength)
Evaluates the cost function to decide if we have to prune this node and does it if necessary...
struct suffixTree * parent
Pointer to the parent of this node.
#define NOTSTACKEMPTY
Indicates if the stack is empty.
Uint maxunusedsuffixes
When reached, then move and halve space for suffixes.
Uint getnextsibling(Uint node)
Returns the next sibling of a node.
static void allocstreetab()
Enlarges the tree array structure if the current free space is not enough for the evaluation of a new...
#define PUSHNODE(N)
Pushes a new item to the stack.
double nodeCost(statistics_t stats)
Calculates the cost of a node.
static Uint getnextbranch(Uint previousbranch)
Returns the next brother of this node that is not a leaf.
Uchar ** suffixes
Array of pointers to suffixes of t.
struct suffixTree * child
Pointer to the first child of this node.
#define MAX_HEIGHT
Maximum allowed height for the tree.
Uint left
Index of the leftmost character of this node label in the input string.
#define GETLP(P)
Returns the index in the input string of the leftmost character in this node label.