Context algorithm
Semi-predictive context algorithm implementation
|
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <gsl/gsl_sf_gamma.h>
#include <gsl/gsl_math.h>
#include <sys/types.h>
#include "types.h"
#include "debug.h"
#include "spacedef.h"
#include "fsmTree.h"
#include "stack.h"
#include "alpha.h"
#include "gammaFunc.h"
#include "statistics.h"
#include "text.h"
Go to the source code of this file.
Macros | |
#define | INTWORDSIZE (UintConst(1) << LOGWORDSIZE) |
Number of bits in Uint. More... | |
#define | FIRSTBIT (UintConst(1) << (INTWORDSIZE-1)) |
Most significative bit of a Uint. More... | |
#define | SECONDBIT (FIRSTBIT >> 1) |
Second most significative bit of a Uint. More... | |
#define | BRANCHWIDTH UintConst(3) |
Number of array buckets per node. More... | |
#define | MAX_HEIGHT 30 |
Maximum allowed height for the tree. More... | |
#define | NODEINDEX(N) ((Uint) ((N) - streetab)) |
Given a pointer to the tree array returns the index in the array of that pointer. More... | |
#define | LEAFBIT FIRSTBIT |
Bit used to indicate that a node is a leaf. More... | |
#define | RIGHTMOSTCHILDBIT SECONDBIT |
Bit used to indicate that a node is the last (rightmost) child of a node. More... | |
#define | UNEVALUATEDBIT FIRSTBIT |
Bit used to indicate that a node has not been evaluated. More... | |
#define | ISLEAF(P) ((*(P)) & LEAFBIT) |
Indicates if a node of the tree is a leaf checking the corresponding bit. More... | |
#define | ISRIGHTMOSTCHILD(P) ((*(P)) & RIGHTMOSTCHILDBIT) |
Indicates if a node is the last (rightmost) child of its parent checking the corresponding bit. More... | |
#define | ISUNEVALUATED(P) ((*((P)+1)) & UNEVALUATEDBIT) |
Indicates if a node has not been evaluated checking the corresponding bit. More... | |
#define | GETLP(P) ((*(P)) & ~(LEAFBIT | RIGHTMOSTCHILDBIT)) |
Returns the index in the input string of the leftmost character in this node label. More... | |
#define | GETFIRSTCHILD(P) (*((P)+1)) |
Returns the index in the tree array of the first child of this node. More... | |
#define | SETLP(P, LP) *(P) = (*(P) & RIGHTMOSTCHILDBIT) | (LP) |
Sets the index in the input string of leftmost character of this node label. More... | |
#define | SETFIRSTCHILD(P, C) *((P)+1) = C |
Sets the index in the tree array of the first child of this node. More... | |
#define | SETLEAF(P, L) *(P) = (L) | LEAFBIT |
Adds a new leaf to the tree. More... | |
#define | GETSTATS(P) (*((P)+2)) |
Returns the statistics for a node. More... | |
#define | SETSTATS(P, C) *((P)+2) = C |
Sets the statistics for a node. More... | |
#define | UNDEFREFERENCE (UINT_MAX) |
Undefined reference. More... | |
#define | SUFFIXNUMBER(L) ((Uint) (*(L) - text)) |
Given a pointer to the input text returns the index in the array of such pointer. More... | |
#define | STOREBOUNDARIES(P, L, R) |
Store the boundaries of the portion of the text that needs to be read when evaluating this node. More... | |
#define | GETLEFTBOUNDARY(P) (suffixbase + *(P)) |
Given a pointer to an unevaluated node returns the stored left boundary. More... | |
#define | GETRIGHTBOUNDARY(P) (suffixbase + ((*((P)+1)) & ~UNEVALUATEDBIT)) |
Given a pointer to an unevaluated node returns the stored right boundary. More... | |
#define | UNDEFINEDSUCC (UINT_MAX) |
Undefined successor. More... | |
#define | MAXSUCCSPACE (BRANCHWIDTH * alphasize + 1) |
Maximum number of extra array nodes needed for the children of a node. More... | |
Functions | |
static Uchar ** | getsbufferspaceeager (Uchar **left, Uchar **right) |
Does space management for the sbuffer array. More... | |
static void | allocstreetab () |
Enlarges the tree array structure if the current free space is not enough for the evaluation of a new node. More... | |
static void | sortByChar (Uchar **left, Uchar **right, Uint prefixlen) |
Sorts lexicographically a portion of the suffixes array. More... | |
static void | sortByChar0 () |
Sorts lexicographically a portion of the suffixes array. More... | |
static Uint | grouplcp (Uchar **left, Uchar **right) |
Finds the longest common prefix (LCP) length of a number of consecutive prefixes. More... | |
static Uint | evalsuccedges (Uchar **left, Uchar **right) |
Given the boundaries of an unevaluated node, evaluates it and reserves space for all its children. More... | |
static Uint | evalrootsuccedges (Uchar **left, Uchar **right) |
Evaluates all the edges outgoing from the root of the tree. More... | |
static Uint | evaluatenodeeager (Uint node, Uint *length) |
Extracts the boundaries of an unevaluated node, evaluates it and creates all its children. More... | |
static Uint | getnextbranch (Uint previousbranch) |
Returns the next brother of this node that is not a leaf. More... | |
static void | pruneRoot () |
Specifically tests if it is necessary to prune the tree at the root and does it if necessary . More... | |
static void | prune (Uint node, Uint length, Uint branchLength) |
Evaluates the cost function to decide if we have to prune this node and does it if necessary. More... | |
static void | prune2 (Uint node, Uint length) |
Prunes a node inconditionally. More... | |
static void | evaluateeager () |
Main method to build and prune the tree. More... | |
static void | inittree () |
Initializes the tree array structure and ancillary structures. More... | |
static void | wotd () |
Builds a pruned context tree. More... | |
Uint | getnextsibling (Uint node) |
Returns the next sibling of a node. More... | |
static fsmTree_t | buildTree () |
Builds a new fsm tree from the pruned tree array representation. More... | |
fsmTree_t | buildSTree () |
Builds a pruned fsm tree from the input text. More... | |
Variables | |
Uchar * | sentinel |
Points to text[textlen] which is undefined. More... | |
Uchar ** | suffixes |
Array of pointers to suffixes of t. More... | |
Uchar ** | suffixbase |
Pointers into suffixes are considered w.r.t. this pointer. More... | |
Uchar ** | sbuffer |
Buffer to sort suffixes in sortByChar. More... | |
Uchar ** | sbufferspace = NULL |
Space to be used by sbuffer. More... | |
Uchar ** | bound [UCHAR_MAX+1] |
Pointers into sbuffer while sorting. More... | |
Uint | occurrence [UCHAR_MAX+1] |
Number of occurrences of each character. More... | |
Uint * | streetab = NULL |
Array that holds the suffix tree representation. More... | |
Uint | streetabsize |
Number of integers in the allocated streetab memory. More... | |
Uint * | nextfreeentry |
Pointer to next unused element in streetab. More... | |
Uint | sbufferwidth |
Number of elements in sbufferspace. More... | |
Uint | maxsbufferwidth |
Maximal number of elements in sbufferspace. More... | |
Uint | suffixessize |
Number of unprocessed suffixes. More... | |
Uint | maxunusedsuffixes |
When reached, then move and halve space for suffixes. More... | |
Uint | rootchildtab [UCHAR_MAX+1] |
Constant time access to successors of root. More... | |
BOOL | rootevaluated |
Flag indicating that the root has been evaluated. More... | |
#define BRANCHWIDTH UintConst(3) |
#define FIRSTBIT (UintConst(1) << (INTWORDSIZE-1)) |
#define GETFIRSTCHILD | ( | P | ) | (*((P)+1)) |
#define GETLEFTBOUNDARY | ( | P | ) | (suffixbase + *(P)) |
#define GETLP | ( | P | ) | ((*(P)) & ~(LEAFBIT | RIGHTMOSTCHILDBIT)) |
#define GETRIGHTBOUNDARY | ( | P | ) | (suffixbase + ((*((P)+1)) & ~UNEVALUATEDBIT)) |
#define GETSTATS | ( | P | ) | (*((P)+2)) |
#define INTWORDSIZE (UintConst(1) << LOGWORDSIZE) |
#define ISLEAF | ( | P | ) | ((*(P)) & LEAFBIT) |
#define ISRIGHTMOSTCHILD | ( | P | ) | ((*(P)) & RIGHTMOSTCHILDBIT) |
#define ISUNEVALUATED | ( | P | ) | ((*((P)+1)) & UNEVALUATEDBIT) |
#define LEAFBIT FIRSTBIT |
#define MAXSUCCSPACE (BRANCHWIDTH * alphasize + 1) |
#define RIGHTMOSTCHILDBIT SECONDBIT |
#define SECONDBIT (FIRSTBIT >> 1) |
#define SETFIRSTCHILD | ( | P, | |
C | |||
) | *((P)+1) = C |
#define SETLEAF | ( | P, | |
L | |||
) | *(P) = (L) | LEAFBIT |
#define SETLP | ( | P, | |
LP | |||
) | *(P) = (*(P) & RIGHTMOSTCHILDBIT) | (LP) |
#define SETSTATS | ( | P, | |
C | |||
) | *((P)+2) = C |
#define STOREBOUNDARIES | ( | P, | |
L, | |||
R | |||
) |
Store the boundaries of the portion of the text that needs to be read when evaluating this node.
This includes setting the unevaluated bit for that node.
[in] | P | pointer to the first index of a node in the tree array. |
[in] | L | the left boundary. |
[in] | R | the right boundary. |
#define UNEVALUATEDBIT FIRSTBIT |
|
static |
fsmTree_t buildSTree | ( | void | ) |
|
static |
Evaluates all the edges outgoing from the root of the tree.
It is a specialization of evaluatesuccedges for the root node.
[in] | left | the left boundary of the portion of the text that needs to be read for evaluation. |
[in] | right | the right boundary of the portion of the text that needs to be read for evaluation. |
Given the boundaries of an unevaluated node, evaluates it and reserves space for all its children.
[in] | left | the left boundary of the portion of the text that needs to be read for evaluation. |
[in] | right | the right boundary of the portion of the text that needs to be read for evaluation. |
|
static |
Extracts the boundaries of an unevaluated node, evaluates it and creates all its children.
[in] | node | index of the node to evaluate in the tree array representation. |
[out] | length | length of the label of this node. |
Does space management for the sbuffer array.
If there is unused space in the suffixes buffer it is used here, otherwise the sbufferspace array is used (and possibly enlarged if neccesary).
[in] | left | left boundary of the portion of the suffixes that will be processed. |
[in] | right | right boundary of the portion of the suffixes that will be processed. |
Finds the longest common prefix (LCP) length of a number of consecutive prefixes.
[in] | left | left boundary of the portion of the suffixes that will be examined. |
[in] | right | right boundary of the portion of the suffixes that will be examined. |
|
static |
Evaluates the cost function to decide if we have to prune this node and does it if necessary.
[in] | node | index of the node in the array tree. |
[in] | length | length of the label of this node (from the root of the tree). |
[in] | branchLength | length of the label of this node alone. |
|
static |
Sorts lexicographically a portion of the suffixes array.
[in] | left | left boundary of the portion of the suffixes that will be sorted. |
[in] | right | right boundary of the portion of the suffixes that will be sorted. |
[in] | prefixlen | all pointers to the prefixes are advanced this number of chars because it is known that are all equal for all suffixes. |
|
static |
Uchar ** bound[UCHAR_MAX+1] |
Uint maxsbufferwidth |
Uint maxunusedsuffixes |
Uint * nextfreeentry |
Uint occurrence[UCHAR_MAX+1] |
Uint rootchildtab[UCHAR_MAX+1] |
BOOL rootevaluated |
Uint * streetab = NULL |
Uint streetabsize |
Uchar ** suffixbase |