Context algorithm
Semi-predictive context algorithm implementation
 All Data Structures Files Functions Variables Typedefs Macros Pages
wotd.c File Reference
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <gsl/gsl_sf_gamma.h>
#include <gsl/gsl_math.h>
#include <sys/types.h>
#include "types.h"
#include "debug.h"
#include "spacedef.h"
#include "fsmTree.h"
#include "stack.h"
#include "alpha.h"
#include "gammaFunc.h"
#include "statistics.h"
#include "text.h"

Go to the source code of this file.

Macros

#define INTWORDSIZE   (UintConst(1) << LOGWORDSIZE)
 Number of bits in Uint. More...
 
#define FIRSTBIT   (UintConst(1) << (INTWORDSIZE-1))
 Most significative bit of a Uint. More...
 
#define SECONDBIT   (FIRSTBIT >> 1)
 Second most significative bit of a Uint. More...
 
#define BRANCHWIDTH   UintConst(3)
 Number of array buckets per node. More...
 
#define MAX_HEIGHT   30
 Maximum allowed height for the tree. More...
 
#define NODEINDEX(N)   ((Uint) ((N) - streetab))
 Given a pointer to the tree array returns the index in the array of that pointer. More...
 
#define LEAFBIT   FIRSTBIT
 Bit used to indicate that a node is a leaf. More...
 
#define RIGHTMOSTCHILDBIT   SECONDBIT
 Bit used to indicate that a node is the last (rightmost) child of a node. More...
 
#define UNEVALUATEDBIT   FIRSTBIT
 Bit used to indicate that a node has not been evaluated. More...
 
#define ISLEAF(P)   ((*(P)) & LEAFBIT)
 Indicates if a node of the tree is a leaf checking the corresponding bit. More...
 
#define ISRIGHTMOSTCHILD(P)   ((*(P)) & RIGHTMOSTCHILDBIT)
 Indicates if a node is the last (rightmost) child of its parent checking the corresponding bit. More...
 
#define ISUNEVALUATED(P)   ((*((P)+1)) & UNEVALUATEDBIT)
 Indicates if a node has not been evaluated checking the corresponding bit. More...
 
#define GETLP(P)   ((*(P)) & ~(LEAFBIT | RIGHTMOSTCHILDBIT))
 Returns the index in the input string of the leftmost character in this node label. More...
 
#define GETFIRSTCHILD(P)   (*((P)+1))
 Returns the index in the tree array of the first child of this node. More...
 
#define SETLP(P, LP)   *(P) = (*(P) & RIGHTMOSTCHILDBIT) | (LP)
 Sets the index in the input string of leftmost character of this node label. More...
 
#define SETFIRSTCHILD(P, C)   *((P)+1) = C
 Sets the index in the tree array of the first child of this node. More...
 
#define SETLEAF(P, L)   *(P) = (L) | LEAFBIT
 Adds a new leaf to the tree. More...
 
#define GETSTATS(P)   (*((P)+2))
 Returns the statistics for a node. More...
 
#define SETSTATS(P, C)   *((P)+2) = C
 Sets the statistics for a node. More...
 
#define UNDEFREFERENCE   (UINT_MAX)
 Undefined reference. More...
 
#define SUFFIXNUMBER(L)   ((Uint) (*(L) - text))
 Given a pointer to the input text returns the index in the array of such pointer. More...
 
#define STOREBOUNDARIES(P, L, R)
 Store the boundaries of the portion of the text that needs to be read when evaluating this node. More...
 
#define GETLEFTBOUNDARY(P)   (suffixbase + *(P))
 Given a pointer to an unevaluated node returns the stored left boundary. More...
 
#define GETRIGHTBOUNDARY(P)   (suffixbase + ((*((P)+1)) & ~UNEVALUATEDBIT))
 Given a pointer to an unevaluated node returns the stored right boundary. More...
 
#define UNDEFINEDSUCC   (UINT_MAX)
 Undefined successor. More...
 
#define MAXSUCCSPACE   (BRANCHWIDTH * alphasize + 1)
 Maximum number of extra array nodes needed for the children of a node. More...
 

Functions

static Uchar ** getsbufferspaceeager (Uchar **left, Uchar **right)
 Does space management for the sbuffer array. More...
 
static void allocstreetab ()
 Enlarges the tree array structure if the current free space is not enough for the evaluation of a new node. More...
 
static void sortByChar (Uchar **left, Uchar **right, Uint prefixlen)
 Sorts lexicographically a portion of the suffixes array. More...
 
static void sortByChar0 ()
 Sorts lexicographically a portion of the suffixes array. More...
 
static Uint grouplcp (Uchar **left, Uchar **right)
 Finds the longest common prefix (LCP) length of a number of consecutive prefixes. More...
 
static Uint evalsuccedges (Uchar **left, Uchar **right)
 Given the boundaries of an unevaluated node, evaluates it and reserves space for all its children. More...
 
static Uint evalrootsuccedges (Uchar **left, Uchar **right)
 Evaluates all the edges outgoing from the root of the tree. More...
 
static Uint evaluatenodeeager (Uint node, Uint *length)
 Extracts the boundaries of an unevaluated node, evaluates it and creates all its children. More...
 
static Uint getnextbranch (Uint previousbranch)
 Returns the next brother of this node that is not a leaf. More...
 
static void pruneRoot ()
 Specifically tests if it is necessary to prune the tree at the root and does it if necessary . More...
 
static void prune (Uint node, Uint length, Uint branchLength)
 Evaluates the cost function to decide if we have to prune this node and does it if necessary. More...
 
static void prune2 (Uint node, Uint length)
 Prunes a node inconditionally. More...
 
static void evaluateeager ()
 Main method to build and prune the tree. More...
 
static void inittree ()
 Initializes the tree array structure and ancillary structures. More...
 
static void wotd ()
 Builds a pruned context tree. More...
 
Uint getnextsibling (Uint node)
 Returns the next sibling of a node. More...
 
static fsmTree_t buildTree ()
 Builds a new fsm tree from the pruned tree array representation. More...
 
fsmTree_t buildSTree ()
 Builds a pruned fsm tree from the input text. More...
 

Variables

Ucharsentinel
 Points to text[textlen] which is undefined. More...
 
Uchar ** suffixes
 Array of pointers to suffixes of t. More...
 
Uchar ** suffixbase
 Pointers into suffixes are considered w.r.t. this pointer. More...
 
Uchar ** sbuffer
 Buffer to sort suffixes in sortByChar. More...
 
Uchar ** sbufferspace = NULL
 Space to be used by sbuffer. More...
 
Uchar ** bound [UCHAR_MAX+1]
 Pointers into sbuffer while sorting. More...
 
Uint occurrence [UCHAR_MAX+1]
 Number of occurrences of each character. More...
 
Uintstreetab = NULL
 Array that holds the suffix tree representation. More...
 
Uint streetabsize
 Number of integers in the allocated streetab memory. More...
 
Uintnextfreeentry
 Pointer to next unused element in streetab. More...
 
Uint sbufferwidth
 Number of elements in sbufferspace. More...
 
Uint maxsbufferwidth
 Maximal number of elements in sbufferspace. More...
 
Uint suffixessize
 Number of unprocessed suffixes. More...
 
Uint maxunusedsuffixes
 When reached, then move and halve space for suffixes. More...
 
Uint rootchildtab [UCHAR_MAX+1]
 Constant time access to successors of root. More...
 
BOOL rootevaluated
 Flag indicating that the root has been evaluated. More...
 

Macro Definition Documentation

#define BRANCHWIDTH   UintConst(3)

Number of array buckets per node.

If more information is needed per node this value should be adjusted accordingly.

Definition at line 48 of file wotd.c.

#define FIRSTBIT   (UintConst(1) << (INTWORDSIZE-1))

Most significative bit of a Uint.

Definition at line 42 of file wotd.c.

#define GETFIRSTCHILD (   P)    (*((P)+1))

Returns the index in the tree array of the first child of this node.

This is the second integer of the three reserved for each internal node.

Parameters
[in]Ppointer to the first index of a node in the tree array.
Returns
the index of the first child.

Definition at line 104 of file wotd.c.

#define GETLEFTBOUNDARY (   P)    (suffixbase + *(P))

Given a pointer to an unevaluated node returns the stored left boundary.

Parameters
[in]Ppointer to the first index of a node in the tree array.
Returns
the left boundary.

Definition at line 168 of file wotd.c.

#define GETLP (   P)    ((*(P)) & ~(LEAFBIT | RIGHTMOSTCHILDBIT))

Returns the index in the input string of the leftmost character in this node label.

First removes the all bit flags.

Parameters
Ppointer to the first index of a node in the tree array.
Returns
the index of the leftmost character.

Definition at line 96 of file wotd.c.

#define GETRIGHTBOUNDARY (   P)    (suffixbase + ((*((P)+1)) & ~UNEVALUATEDBIT))

Given a pointer to an unevaluated node returns the stored right boundary.

This is stored in the second integer of the three reserved for each internal node.

Parameters
[in]Ppointer to the first index of a node in the tree array.
Returns
the right boundary.

Definition at line 176 of file wotd.c.

#define GETSTATS (   P)    (*((P)+2))

Returns the statistics for a node.

This is the third integer of the three reserved for each internal node.

Parameters
[in]Ppointer to the first index of a node in the tree array.
Returns
pointer to the node statistics.

Definition at line 133 of file wotd.c.

#define INTWORDSIZE   (UintConst(1) << LOGWORDSIZE)

Number of bits in Uint.

Definition at line 39 of file wotd.c.

#define ISLEAF (   P)    ((*(P)) & LEAFBIT)

Indicates if a node of the tree is a leaf checking the corresponding bit.

Parameters
[in]Ppointer to the first index on a node in the tree array.
Returns
True if the node is a leaf.

Definition at line 74 of file wotd.c.

#define ISRIGHTMOSTCHILD (   P)    ((*(P)) & RIGHTMOSTCHILDBIT)

Indicates if a node is the last (rightmost) child of its parent checking the corresponding bit.

Parameters
[in]Ppointer to the first index of a node in the tree array.
Returns
True if the node is the last child.

Definition at line 81 of file wotd.c.

#define ISUNEVALUATED (   P)    ((*((P)+1)) & UNEVALUATEDBIT)

Indicates if a node has not been evaluated checking the corresponding bit.

Parameters
[in]Ppointer to the first index of a node in the tree array.
Returns
True if the node has not been evaluated.

Definition at line 88 of file wotd.c.

#define LEAFBIT   FIRSTBIT

Bit used to indicate that a node is a leaf.

Definition at line 61 of file wotd.c.

#define MAX_HEIGHT   30

Maximum allowed height for the tree.

Definition at line 51 of file wotd.c.

#define MAXSUCCSPACE   (BRANCHWIDTH * alphasize + 1)

Maximum number of extra array nodes needed for the children of a node.

The extra 1 is for the $ edge that always leads to a leaf.

Definition at line 230 of file wotd.c.

#define NODEINDEX (   N)    ((Uint) ((N) - streetab))

Given a pointer to the tree array returns the index in the array of that pointer.

Parameters
[in]Npointer to the tree array.
Returns
index of the pointer in the array.

Definition at line 58 of file wotd.c.

#define RIGHTMOSTCHILDBIT   SECONDBIT

Bit used to indicate that a node is the last (rightmost) child of a node.

Definition at line 64 of file wotd.c.

#define SECONDBIT   (FIRSTBIT >> 1)

Second most significative bit of a Uint.

Definition at line 45 of file wotd.c.

#define SETFIRSTCHILD (   P,
 
)    *((P)+1) = C

Sets the index in the tree array of the first child of this node.

Parameters
[in]Ppointer to the first index of a node in the tree array.
[in]Cindex of the first child.

Definition at line 119 of file wotd.c.

#define SETLEAF (   P,
 
)    *(P) = (L) | LEAFBIT

Adds a new leaf to the tree.

Parameters
[out]Ppointer to the tree array where the new leaf will be set.
[in]Lindex in the input string of the leftmost character of this leaf label.

Definition at line 126 of file wotd.c.

#define SETLP (   P,
  LP 
)    *(P) = (*(P) & RIGHTMOSTCHILDBIT) | (LP)

Sets the index in the input string of leftmost character of this node label.

Preserves the rightmost child bit setting.

Parameters
[in]Ppointer to the first index of a node in the tree array.
[in]LPnew index of the leftmost character.

Definition at line 112 of file wotd.c.

#define SETSTATS (   P,
 
)    *((P)+2) = C

Sets the statistics for a node.

Parameters
[in]Ppointer to the first index of a node in the tree array.
[in]Cpointer to the node statistics.

Definition at line 140 of file wotd.c.

#define STOREBOUNDARIES (   P,
  L,
 
)
Value:
*(P) = (Uint) ((L) - suffixbase);\
*((P)+1) = ((R) - suffixbase) | UNEVALUATEDBIT
#define UNEVALUATEDBIT
Bit used to indicate that a node has not been evaluated.
Definition: wotd.c:67
Uchar ** suffixbase
Pointers into suffixes are considered w.r.t. this pointer.
Definition: wotd.c:181
unsigned long Uint
Unsigned int type.
Definition: types.h:54

Store the boundaries of the portion of the text that needs to be read when evaluating this node.

This includes setting the unevaluated bit for that node.

Parameters
[in]Ppointer to the first index of a node in the tree array.
[in]Lthe left boundary.
[in]Rthe right boundary.

Definition at line 160 of file wotd.c.

#define SUFFIXNUMBER (   L)    ((Uint) (*(L) - text))

Given a pointer to the input text returns the index in the array of such pointer.

This is used as the start position of a certain suffix.

Parameters
[in]Lpointer to the input text
Returns
the index of the pointed char in the array.

Definition at line 151 of file wotd.c.

#define UNDEFINEDSUCC   (UINT_MAX)

Undefined successor.

Definition at line 179 of file wotd.c.

#define UNDEFREFERENCE   (UINT_MAX)

Undefined reference.

Definition at line 143 of file wotd.c.

#define UNEVALUATEDBIT   FIRSTBIT

Bit used to indicate that a node has not been evaluated.

Definition at line 67 of file wotd.c.

Function Documentation

static void allocstreetab ( )
static

Enlarges the tree array structure if the current free space is not enough for the evaluation of a new node.

Definition at line 236 of file wotd.c.

fsmTree_t buildSTree ( void  )

Builds a pruned fsm tree from the input text.

Returns
a new fsm tree.

Definition at line 892 of file wotd.c.

static fsmTree_t buildTree ( )
static

Builds a new fsm tree from the pruned tree array representation.

Returns
a new fsm tree.

Definition at line 836 of file wotd.c.

static Uint evalrootsuccedges ( Uchar **  left,
Uchar **  right 
)
static

Evaluates all the edges outgoing from the root of the tree.

It is a specialization of evaluatesuccedges for the root node.

Parameters
[in]leftthe left boundary of the portion of the text that needs to be read for evaluation.
[in]rightthe right boundary of the portion of the text that needs to be read for evaluation.
Returns
the index in the tree array of the first branching child of the now evaluated node.
See Also
evaluatesuccedges

Definition at line 407 of file wotd.c.

static Uint evalsuccedges ( Uchar **  left,
Uchar **  right 
)
static

Given the boundaries of an unevaluated node, evaluates it and reserves space for all its children.

Parameters
[in]leftthe left boundary of the portion of the text that needs to be read for evaluation.
[in]rightthe right boundary of the portion of the text that needs to be read for evaluation.
Returns
the index in the tree array of the first branching child of the now evaluated node.

Definition at line 354 of file wotd.c.

static void evaluateeager ( )
static

Main method to build and prune the tree.

Definition at line 716 of file wotd.c.

static Uint evaluatenodeeager ( Uint  node,
Uint length 
)
static

Extracts the boundaries of an unevaluated node, evaluates it and creates all its children.

Parameters
[in]nodeindex of the node to evaluate in the tree array representation.
[out]lengthlength of the label of this node.
Returns
the index in the tree array of the first branching child of the now evaluated node.

Definition at line 452 of file wotd.c.

static Uint getnextbranch ( Uint  previousbranch)
static

Returns the next brother of this node that is not a leaf.

Parameters
[in]previousbranchindex of the original node in the tree array representation.
Returns
the index of the next branching brother of the node in the tree array representation.

Definition at line 493 of file wotd.c.

Uint getnextsibling ( Uint  node)

Returns the next sibling of a node.

Parameters
[in]nodeindex of the node in the tree array.
Returns
index of the first sibling in the tree array.

Definition at line 815 of file wotd.c.

static Uchar** getsbufferspaceeager ( Uchar **  left,
Uchar **  right 
)
static

Does space management for the sbuffer array.

If there is unused space in the suffixes buffer it is used here, otherwise the sbufferspace array is used (and possibly enlarged if neccesary).

Parameters
[in]leftleft boundary of the portion of the suffixes that will be processed.
[in]rightright boundary of the portion of the suffixes that will be processed.
Returns
a pointer to the selected memory area.

Definition at line 209 of file wotd.c.

static Uint grouplcp ( Uchar **  left,
Uchar **  right 
)
static

Finds the longest common prefix (LCP) length of a number of consecutive prefixes.

Parameters
[in]leftleft boundary of the portion of the suffixes that will be examined.
[in]rightright boundary of the portion of the suffixes that will be examined.
Returns
the LCP length.

Definition at line 325 of file wotd.c.

static void inittree ( )
static

Initializes the tree array structure and ancillary structures.

Definition at line 773 of file wotd.c.

static void prune ( Uint  node,
Uint  length,
Uint  branchLength 
)
static

Evaluates the cost function to decide if we have to prune this node and does it if necessary.

Parameters
[in]nodeindex of the node in the array tree.
[in]lengthlength of the label of this node (from the root of the tree).
[in]branchLengthlength of the label of this node alone.

Definition at line 596 of file wotd.c.

static void prune2 ( Uint  node,
Uint  length 
)
static

Prunes a node inconditionally.

Parameters
[in]nodeindex of the node in the array tree
[in]lengthlength of the label of this node (from the root of the tree).

Definition at line 688 of file wotd.c.

static void pruneRoot ( )
static

Specifically tests if it is necessary to prune the tree at the root and does it if necessary .

Definition at line 521 of file wotd.c.

static void sortByChar ( Uchar **  left,
Uchar **  right,
Uint  prefixlen 
)
static

Sorts lexicographically a portion of the suffixes array.

Parameters
[in]leftleft boundary of the portion of the suffixes that will be sorted.
[in]rightright boundary of the portion of the suffixes that will be sorted.
[in]prefixlenall pointers to the prefixes are advanced this number of chars because it is known that are all equal for all suffixes.

Definition at line 255 of file wotd.c.

static void sortByChar0 ( )
static

Sorts lexicographically a portion of the suffixes array.

It is used the first time to sort the whole array because it can use the suffixes array instead of sbuffer.

Definition at line 295 of file wotd.c.

static void wotd ( )
static

Builds a pruned context tree.

Definition at line 801 of file wotd.c.

Variable Documentation

Uchar ** bound[UCHAR_MAX+1]

Pointers into sbuffer while sorting.

Definition at line 185 of file wotd.c.

Uint maxsbufferwidth

Maximal number of elements in sbufferspace.

Definition at line 189 of file wotd.c.

Uint maxunusedsuffixes

When reached, then move and halve space for suffixes.

Definition at line 189 of file wotd.c.

Uint * nextfreeentry

Pointer to next unused element in streetab.

Definition at line 189 of file wotd.c.

Uint occurrence[UCHAR_MAX+1]

Number of occurrences of each character.

Definition at line 188 of file wotd.c.

Uint rootchildtab[UCHAR_MAX+1]

Constant time access to successors of root.

Definition at line 189 of file wotd.c.

BOOL rootevaluated

Flag indicating that the root has been evaluated.

Definition at line 199 of file wotd.c.

Uchar ** sbuffer

Buffer to sort suffixes in sortByChar.

Definition at line 181 of file wotd.c.

Uchar ** sbufferspace = NULL

Space to be used by sbuffer.

Definition at line 185 of file wotd.c.

Uint sbufferwidth

Number of elements in sbufferspace.

Definition at line 189 of file wotd.c.

Uchar* sentinel

Points to text[textlen] which is undefined.

Definition at line 181 of file wotd.c.

Uint * streetab = NULL

Array that holds the suffix tree representation.

Definition at line 189 of file wotd.c.

Uint streetabsize

Number of integers in the allocated streetab memory.

Definition at line 189 of file wotd.c.

Uchar ** suffixbase

Pointers into suffixes are considered w.r.t. this pointer.

Definition at line 181 of file wotd.c.

Uchar ** suffixes

Array of pointers to suffixes of t.

Definition at line 181 of file wotd.c.

Uint suffixessize

Number of unprocessed suffixes.

Definition at line 189 of file wotd.c.