// wbnf.cpp - Android社区 - https://www.androidos.net.cn/

/*
 ******************************************************************************
 * Copyright (C) 2005-2007, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>

#include "wbnf.h"
    
// Most of this code is meant to test the test code. It's a self test.
// Normally this isn't run.
#define TEST_WBNF_TEST 0

///////////////////////////////////////////////////////////
//
// Constants and the most basic helper classes
//
    
static const char DIGIT_CHAR[] = "0123456789";
static const char WHITE_SPACE[] = {'\t', ' ', '\r', '\n', 0};
static const char ALPHABET[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
static const char SPECIAL[] = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";

// Reports whether c occurs in the NUL-terminated character table `list`.
// The terminating NUL is never treated as a member, so c == 0 yields FALSE.
static inline UBool isInList(const char c /*in*/, const char list[] /*in*/){
    for (const char * cursor = list; *cursor != 0; ++cursor){
        if (*cursor == c){
            return TRUE;
        }
    }
    return FALSE;
}
// Character-class predicates: each one is a membership test against one of
// the constant tables declared above.
static inline UBool isDigit(char c) {
    return isInList(c, DIGIT_CHAR);
}
static inline UBool isWhiteSpace(char c) {
    return isInList(c, WHITE_SPACE);
}
static inline UBool isAlphabet(char c) {
    return isInList(c, ALPHABET);
}
static inline UBool isSpecialAsciiChar(char c) {
    return isInList(c, SPECIAL);
}

///////////////////////////////////////////////////////////
//
// Helper classes
//
    
class Buffer_byte{
// Utility class: a heap-backed byte array that grows on append.
// Every byte that has not been written by append() is zero, both right after
// construction and after reset(). Reads through buffer() are not bounds checked.
// The class is non-copyable (copy operations are declared but never defined).

typedef char byte;
    byte * start;      // first byte of the owned allocation
    byte * current;    // one past the last appended byte
    int buffer_size;   // allocated size; unit is byte
public:
    // Number of bytes appended since construction or the last reset().
    inline int content_size(){return static_cast<int>(current - start);}

private:
    // Grows the allocation by add_size bytes and zero-fills the new tail.
    // Aborts with a diagnostic when memory is exhausted: continuing would
    // otherwise write through a pointer derived from NULL.
    inline void expand(int add_size = 100){ // size unit is byte
        const int used = content_size();
        const int new_size = buffer_size + add_size;

        // Keep the old pointer until realloc() is known to have succeeded.
        byte * grown = (byte *) realloc(start, new_size);
        if (grown == NULL){
            fprintf(stderr, "Buffer_byte: out of memory (%d bytes requested)\n", new_size);
            abort();
        }
        start = grown;              // realloc may have moved the block
        current = start + used;

        memset(current, 0, add_size);
        buffer_size = new_size;
    }

    // Makes sure at least `size` bytes are allocated.
    inline void expand_to(int size){
        const int shortfall = size - buffer_size;
        if (shortfall > 0) {
            expand(shortfall);  // simply expand, no block alignment
        }
    }
    Buffer_byte(const Buffer_byte &);
    Buffer_byte & operator = (const Buffer_byte &);
public:
    Buffer_byte():start(NULL),current(start),buffer_size(0){
        expand();
    }
    ~Buffer_byte(){
        free(start);
    }

    // Forgets the content and zeroes the whole allocation; capacity is kept.
    inline void reset(){
        if (start != NULL) {
            memset(start, 0, buffer_size);
        }
        current = start;
    }

    // Appends `size` bytes copied from c. A NULL source or a non-positive
    // size is a no-op. c must not point into this buffer's own storage,
    // because growing the buffer may move it.
    inline void append(const void * c, int size){ // size unit is byte
        if (c == NULL || size <= 0) {
            return;
        }
        expand_to(content_size() + size);
        memcpy(current, c, size);
        current = current + size;
    }

    // Start of the storage; invalidated by any append() that has to grow it.
    byte * buffer(){
        return start;
    }
};
    
/*
  The class(es) try to work like a built-in array, so they overload these two operators
    operator type *();
    type & operator[];
  The first is used for automatic type conversion, the latter is used to select a member.

A small trick is that the class does not overload the address-of operator. This
  behavior is different from a built-in array, but it gives us the opportunity
  to get the address of the class itself.
*/
//template<typename type>
//    class BUFFER{
//       typedef BUFFER name;
// BUFFER(type, name) defines class `name`, a typed view over a Buffer_byte
// whose elements are `type` values stored back to back.
//   reset()            - drop all elements
//   append(value)      - copy one element to the end
//   append_array(p, n) - copy n elements starting at p to the end
//   operator[] / operator type* - array-like access, no bounds check
//   content_size()     - number of stored elements
// reset() and the append functions return *this so calls can be chained.
#define BUFFER(type, name)\
    class name {\
        Buffer_byte buf;\
    public:\
        int content_size(){return buf.content_size() / sizeof(type);}\
        operator type *(){return (type *) buf.buffer();}\
        type & operator [] (int index) {\
            type * base = (type *) buf.buffer();\
            return base[index];\
        }\
        name & reset() {\
            buf.reset();\
            return *this;\
        }\
        name & append(type value) {\
            buf.append(&value, sizeof(type));\
            return *this;\
        }\
        name & append_array(const type * values, int count) {\
            buf.append(values, sizeof(type)*count);\
            return *this;\
        }\
    }

class Pick{
/* Abstract base of every language generator element. */
public:
    // Generates one string according to the syntax this element represents.
    // The result is a null-terminated c-string whose buffer is owned by the
    // callee, so the caller must not free it.
    virtual const char* next() = 0;

    // Virtual so that elements can be destroyed through a Pick pointer.
    virtual ~Pick(){}
};

//typedef BUFFER<char> Buffer_char;
//typedef BUFFER<int> Buffer_int;
//typedef BUFFER<Pick *> Buffer_pPick;
// Concrete buffer classes generated by the BUFFER macro.
BUFFER(char, Buffer_char);      // growable array of char
BUFFER(int, Buffer_int);        // growable array of int
BUFFER(Pick *, Buffer_pPick);   // growable array of Pick pointers

class SymbolTable{
/* Helper class.
 * A mapping table between a 'variable name' and its 'active Pick object'.
 * Entry i is described by names[i] (offset of the name inside name_buffer)
 * and refs[i] (the Pick bound to the name, or NULL while it has none).
 */
private:
    Buffer_char  name_buffer;   // var names storage space (null-terminated, back to back)
    Buffer_int   names;         // points to name (offset in name_buffer)
    Buffer_pPick refs;          // points to Pick

    // Linear search for var_name; returns its entry index, or -1 when absent.
    // var_name must not be NULL.
    int get_index(const char *const var_name){
        const int count = names.content_size();
        for (int i = 0; i < count; i++){
            if (strcmp(var_name, name_buffer + names[i]) == 0){
                return i;
            }
        }
        return -1;
    }

public:
    // EMPTY   - the name passed in was NULL
    // NO_VAR  - the name is not in the table
    // NO_REF  - the name is known but has no Pick bound to it
    // HAS_REF - the name is known and bound to a Pick
    enum RESULT {EMPTY, NO_VAR, NO_REF, HAS_REF};

    // Looks var_name up. When the result is HAS_REF and ref is not NULL,
    // *ref receives the bound Pick; otherwise *ref is left untouched.
    RESULT find(const char *const var_name /*[in] c-string*/, Pick * * ref = NULL /*[out] Pick* */){
        if (!var_name) return EMPTY; // NULL name

        const int i = get_index(var_name);
        if (i == -1){
            return NO_VAR;   // new name
        }
        if (!refs[i]){  // exist name, no ref
            return NO_REF;
        }
        if (ref) {
            *ref = refs[i];
        }
        return HAS_REF;   // exist name, has ref
    }

    // Registers var_name and/or binds it to var_ref.
    //  - NULL name: ignored.
    //  - new name: added with var_ref (which may be NULL) as its binding.
    //  - known name: rebound only when var_ref is not NULL. The previous
    //    Pick, if any, is not deleted here.
    void put(const char *const var_name, Pick *const var_ref = NULL){
        // Check for NULL before searching: get_index() hands the name to strcmp().
        if (!var_name) {
            return;
        }

        const int i = get_index(var_name);
        if (i == -1){    // new name
            const int offset = name_buffer.content_size();
            name_buffer.append_array(var_name, strlen(var_name) + 1);
            names.append(offset);
            refs.append(var_ref);
        } else if (var_ref){    // exist name: link definition with variable
            refs[i] = var_ref;
        }
    }

    // TRUE when every registered name is bound to a Pick.
    UBool is_complete(){
        const int n = names.content_size();
        for (int i = 0; i < n; ++i){
            if (refs[i] == NULL){
                return FALSE;
            }
        }
        return TRUE;
    }

    // Empties the table and deletes every bound Pick.
    void reset(){
        names.reset();
        name_buffer.reset();

        // release memory here
        const int s = refs.content_size();
        for (int i = 0; i < s; i++){
            delete refs[i]; // TOFIX: point alias/recursion problem
        }
        refs.reset();
    }

    ~SymbolTable(){
        reset();
    }
};

/*
// Document of class Escaper
//
// ATTENTION: 
// From http://icu-project.org/userguide/Collate_Customization.html.
// We get the precedence of escape/quote operations
//
//     (highest) 1. backslash               \
//               2. two single quotes       ''
//               3. quoting                 ' '
//
// ICU Collation should accept following as the same string.
//
// 1)  'ab'c        _
// 2)  a\bc          \
// 3)  a'b'\c        |- They are equal.
// 4)  abc          _/
//
// From "two single quotes", we have following deductions
//    D1. empty quoting is illegal. (obviously)
//    D2. no contact operation between two quotings   
//              '.''.'      is not ..   it is .'.
//    D3. "two single quotes" cannot contact two quoting simultaneously
//              '..''''.'   is not ..'. it is ..''.
//       NOTICE:
//        "two single quotes" can contact before one quoting
//              '''.'       is '.
//        "two single quotes" can literally contact after one quoting
//        But, from syntax, it's one quoting including a "two single quotes"
//              '.'''       is .'
//    D4. "two single quotes" cannot solely be included in quoting
//              ''''        is not '    it is ''
//       NOTICE:  These are legal
//              '.''.'      is .'.
//              '.'''       is .'
//
//                 decision
//                    /\
//                   /__\
//      output buffer    input buffer
// 
// To make our decision (within an atomic operation) without regard to the input and output buffers,
// the following calling patterns (within an atomic operation) shall be avoided
//
//    P1 open_quoting()  then close_quoting()    (direct violation)   D1
//    P2 close_quoting() then open_quoting()     (direct violation)   D2
//    P3 empty open_quoting()                    (indirect violation) D1, D4
//    P4 empty close_quoting()                   (indirect violation) D2, D3
//    P5 open_quoting()  then two single quotes  (indirect violation) D4
//    P6 close_quoting() then two single quotes  (indirect violation) D3
//
// two single quotes escaping will not open_ or close_ quoting()
// This choice will not lose any quoting forms.
//
// For open_quoting(), 
// we may get this form quoting     '''         P5
// It may raise a bug               ''''x
// If we expect
//      '''.'       let the next char open the quoting
//      '.''.'      the quoting is already opened by preceding char
//
// For close_quoting()
// we will get this form quoting    '.'''       P6
// It may raise a bug               '.''''.'
// If we expect          
//      '.'''\.     let the next char close the quoting
//      '.''''.'    the expectation is wrong!  using  '.'\''.' instead
//
// It is hard work to re-adjust the generation opportunities for the various escaping forms,
// so we simply ignore the problem.
*/
// Rewrites a literal string into an escaped spelling built from backslash
// escapes, single-quote quoting and doubled single quotes. Which form is used
// can be fixed at construction or left to rand(); the random decisions are
// taken again for every input character.
class Escaper{
public:
    // YES / NO force an option on or off; RAND decides with rand() on every use.
    enum CHOICE {YES, NO, RAND};
    // Which escaping mechanism(s) may be used; RAND_ESC picks one of the
    // first three with rand() for each character.
    enum ESCAPE_FORM {BSLASH_ONLY, QUOTE_ONLY, QUOTE_AND_BSLAH, RAND_ESC};
private:
    class Bool{ // A wrapper class for CHOICE, to auto adapter UBool class
        private:
            const CHOICE tag;
        public:
            Bool(CHOICE flag=RAND):tag(flag){}
            // Each conversion of a RAND-tagged Bool consumes one rand() value.
            operator UBool() {   // conversion operator
                return tag == RAND ? rand()%2 : tag == YES;
                //if (tag == RAND){
                //    return rand()%2 == 1;
                //} else {
                //    return tag == YES ? TRUE : FALSE;
                //}
            }
    };
public:
    // escapeLiteral   - whether characters that need no escaping get escaped anyway
    // twoQuotesEscape - whether a single quote is written as two single quotes
    // escapeForm      - which escaping mechanism(s) to use
    Escaper(CHOICE escapeLiteral = RAND,
        CHOICE twoQuotesEscape = RAND,
        ESCAPE_FORM escapeForm = RAND_ESC):
        escape_form(escapeForm),
        escape_literal(escapeLiteral),
        two_quotes_escape(twoQuotesEscape),
        is_quoting(FALSE){}
private:
    Buffer_char str;            // output under construction
    ESCAPE_FORM escape_form;
    Bool escape_literal;
    Bool two_quotes_escape;
    UBool quote_escape;         // set by set_options(); not initialized by the constructor
    UBool bslash_escape;        // set by set_options(); not initialized by the constructor
    UBool is_quoting;           // TRUE while an opening quote has been emitted but not closed

// Chooses quote_escape / bslash_escape for the next character, drawing a
// random form when escape_form is RAND_ESC.
void set_options(){
        ESCAPE_FORM t = escape_form == RAND_ESC ? (ESCAPE_FORM) (rand()%3) : escape_form;
        switch (t){
                case BSLASH_ONLY :
                    bslash_escape = TRUE; quote_escape = FALSE; break;
                case QUOTE_ONLY:
                    bslash_escape = FALSE;quote_escape = TRUE;  break;
                case QUOTE_AND_BSLAH:
                    bslash_escape = TRUE; quote_escape = TRUE;  break;
                default:
                    ;// error
        }
    }

// Clears the output and the quoting state.
void reset(){
        str.reset();
        is_quoting = FALSE;
    }

// Emits an opening quote unless one is already open.
inline void open_quoting(){ 
        if(is_quoting){
            // do nothing
        } else {
            str.append('\'');
            is_quoting = TRUE;
        }
    }
    // Emits a closing quote if a quote is currently open.
    inline void close_quoting(){
        if(is_quoting){
            str.append('\'');
            is_quoting = FALSE;
        } else {
            // do nothing
        }
    }

// str  [in]    null-terminated c-string
    // Escapes and appends every character of strToAppend in turn.
    void append(const char * strToAppend){
        for(;*strToAppend != 0; strToAppend++){
            append(*strToAppend);
        }
    }

// Escapes and appends one character. The options are re-drawn first, so
// consecutive characters may use different escaping forms.
inline void append(const char c){
        set_options();

if (c == '\\'){
            // A backslash is always written as two backslashes, inside or outside quotes.
            quote_escape ? open_quoting() : close_quoting();
            //bslash_escape always true here
            str.append('\\');
            str.append('\\');
        } else if (c == '\''){
            if (two_quotes_escape){     // quoted using two single quotes
                // See documents in anonymous.design
                // The quoting state is deliberately left untouched here.
                str.append('\'');
                str.append('\'');
            } else{
                quote_escape ? open_quoting() : close_quoting();
                //bslash_escape always true here
                str.append('\\');
                str.append('\'');
            }
        } else if (isSpecialAsciiChar(c) || isWhiteSpace(c)){
            // Special and white-space characters are always escaped in some form.
            quote_escape  ? open_quoting()   : close_quoting();
            if (bslash_escape) str.append('\\');
            str.append(c);
        } else { //if (isAlphabet(c) || isDigit(c) || TRUE){ // treat others as literal
            if (escape_literal){
                quote_escape  ? open_quoting()   : close_quoting();
                if (bslash_escape)  str.append('\\');
                str.append(c);
            } else {
                // Plain characters written unescaped must sit outside any quote.
                close_quoting();
                str.append(c);
            }
        }
    }

public:
    // Return a null-terminate c-string. The buffer is owned by callee.
    // The returned text is the escaped form of `literal`; it is overwritten
    // by the next call on the same Escaper.
    char * operator()(const char * literal /*c-string*/){
        str.reset();
        for(;*literal != 0; literal++){
            append(*literal);
        }
        close_quoting();    // P4 exception, to close whole quoting
        return str;
    }
};
    
class WeightedRand{
// Returns a random number in [0, size), where size is the number of weights.
// Every number has its own chance (aka weight) of being selected.
// The class is non-copyable (copy operations are declared but never defined).
private:
    Buffer_int weights;   // weights[i] is the weight of number i
    double total;         // sum of all weights
    WeightedRand(const WeightedRand &);
    WeightedRand & operator = (const WeightedRand &);
public:
    // Builds `size` weights. The first ones are copied from weight_list (when
    // given); any remaining positions get DEFAULT_WEIGHT. Surplus entries in
    // weight_list are ignored. A negative size is treated as zero instead of
    // being passed on as a negative element count.
    WeightedRand(Buffer_int * weight_list = NULL, int size = 0){
        int supplied = 0;
        if (weight_list != NULL){
            supplied = weight_list->content_size();
            if (supplied > size) supplied = size;
            if (supplied < 0) supplied = 0;
            weights.append_array( (*weight_list), supplied);
        }
        for (int i = supplied; i < size; ++i) weights.append(DEFAULT_WEIGHT);

        total = 0;
        const int c = weights.content_size();
        for (int i = 0; i < c; ++i){
            total += weights[i];
        }
    }

    // Adds one more selectable number with the given weight.
    void append(int weight){
        weights.append(weight);
        total += weight;
    }

    // Give a random number with the consideration of weight.
    // Every random number is associated with a weight.
    // It identifies the chance to be selected,
    // larger weight has more chance to be selected.
    //
    //
    //  ______________________   every slot has equal chance
    //
    //  [____][_][___][______]   each item has different slots, hence different chance
    //
    //
    //  The algorithm is illustrated by the preceding figure.
    //  First, a slot is selected by rand(). Then we translate the slot to the corresponding item.
    //
    //  Returns 0 when there are no weights at all.
    int next(){
        // get a random in [0,1]
        const double reference_mark = (double)rand() / (double)RAND_MAX;

        // get the slot's index, 0 <= mark <= total;
        double mark = total * reference_mark;

        // Translate the slot to the corresponding item. The scan stops at the
        // last item at the latest, so the result is always a valid index even
        // if the weights are negative or do not add up as expected.
        const int last = weights.content_size() - 1;
        int i = 0;
        for (; i < last; ++i){
            mark -= weights[i];  // 0 <= mark <= total
            if (mark <= 0)
                break;
        }
        return i;
    }
};
    
///////////////////////////////////////////////////////////
//
// The parser result nodes
//
    
class Literal : public Pick {
/* A generator element that always produces the same fixed text. */
public:
    // Keeps a private copy of s, terminating null included.
    Literal(const char * s /*c-string*/){
        const int bytes = static_cast<int>(strlen(s) + 1);
        str.append_array(s, bytes);
    }

    // Yields the stored text; the buffer stays owned by this object.
    virtual const char* next(){
        return str;
    }
private:
    Buffer_char str; //null-terminated c-string
};

class Variable : public Pick {
public:
    Variable(SymbolTable * symbols, const char * varName, Pick * varRef = NULL){
        this->var_name.append_array(varName, strlen(varName) + 1);
        if ((symbol_table = symbols)){
            symbol_table->put(varName, varRef);
        }
    }

operator const char *(){
        return var_name;
    }

virtual const char* next(){
        if (symbol_table){
            Pick * var_ref = NULL;
            symbol_table->find(var_name, &var_ref);
            if (var_ref) {
                return var_ref->next();
            }
        }
        return "";  // dumb string
    }
private:
    Buffer_char var_name;
    SymbolTable * symbol_table;
};

class Quote : public Pick{
/* Wraps another element and returns its output in escaped form. The escaper
   is configured with Escaper::NO, Escaper::NO and Escaper::BSLASH_ONLY. */
public:
    Quote(Pick & base)
        : item(base),
          e(Escaper::NO, Escaper::NO, Escaper::BSLASH_ONLY){
    }

    // Generates the wrapped element's next string and escapes it.
    virtual const char* next(){
        const char * raw = item.next();
        return e(raw);
    }
private:
    Pick & item;        // wrapped element, not owned
    Buffer_char str;
    Escaper e;
};

class Morph : public Pick{
/*
The difference between morph and an arbitrary random string is that 
a morph changes slowly. When we build collation rules, for example, 
it is a much better test if the strings we use are all in the same 
'neighborhood'; they share many common characters.
*/
public:
    Morph(Pick & base):item(base){}

// Generates the wrapped element's next string. The first call returns it
// unchanged; later calls return a random blend of it with the string the
// wrapped element produced on the previous call.
virtual const char* next(){
        current.reset();
        const char * s = item.next();
        current.append_array(s, strlen(s) + 1);
        if  (last.content_size() == 0) {
            // First call: nothing to blend with yet.
            str.reset();
            last.reset();
            str.append_array(current, current.content_size());
            last.append_array(current, current.content_size());
        } else {
            morph();
        }
        return str;
    }
private:
    Pick & item;            // wrapped element, not owned
    Buffer_char str;        // result returned by next()
    Buffer_char last;       // string produced by item on the previous call (with its null)
    Buffer_char current;    // string produced by item on this call (with its null)

char * p_last;              // read position inside last, used while morphing
    char * p_curr;          // read position inside current, used while morphing

// Copies one character from current to str, unless current is exhausted.
void copy_curr(){
        if (*p_curr) {
            str.append(*p_curr);
            p_curr++;
        }
    }

// Copies one character from last to str, unless last is exhausted.
void copy_last(){
        if (*p_last) {
            str.append(*p_last);
            p_last++;
        }
    }

// copy 0, 1, or 2 character(s) to str
    void copy(){
        // Five choices; the first weighs DEFAULT_WEIGHT * 10, the others DEFAULT_WEIGHT.
        // The generator is shared by all Morph objects because it is static.
        static WeightedRand wr(& Buffer_int().append(DEFAULT_WEIGHT * 10), 5);

switch (wr.next()){
            case 0: // copy last  -- has 10 times chance than others
                copy_last();
                break;
            case 1: // copy both
                copy_curr();
                copy_last();
                break;
            case 2: // copy both
                copy_last();
                copy_curr();
                break;
            case 3:
                copy_curr();
                break;
            case 4:  // copy nothing
                break;
            default:
                // ASSERT(FALSE);
                ;
        }
    }

// Builds str as a blend of last and current whose length is chosen at random
// between the shorter and the longer of the two, then remembers current as
// the new last.
void morph(void){
        int min = strlen(last);
        int max = strlen(current);
        if (min > max){
            int temp  = min;
            min = max;
            max = temp;
        }

int len = min + rand()%(max - min + 1); // min + [0, diff]
        p_curr = current;
        p_last = last;
        str.reset();

// Interleave characters while both inputs still have some left.
for (; str.content_size()<len && *p_curr && *p_last;){
            copy(); // copy 0, 1, or 2 character(s) to str
        }

if (str.content_size() == len) {
            str.append(0);
            final();
            return;
        }

if (str.content_size() > len) { // if the last copy copied two characters
            str[len]=0;
            final();
            return;
        }

// str.content_size() < len
        // One input ran out; take the rest from the other one.
        if (*p_last) {
            for (; str.content_size() < len; copy_last());
        } else if (*p_curr){
            for (; str.content_size() < len; copy_curr());
        }

// Pad with random characters taken from last if str is still too short.
int last_len = last.content_size();
        for (;str.content_size() < len;){
            str.append(last[rand()%last_len]);
        }
        str.append(0);
        final();
    }

// Stores current as the reference string for the next call.
void final(){
        last.reset();
        last.append_array(current, current.content_size());
    }
};

class Sequence : public Pick {
/* Concatenation: produces the outputs of all its items, in order. */
public:
    // Asks every item for its next string and joins the results.
    virtual const char* next(){
        str.reset();
        const int count = items.content_size();
        int i = 0;
        while (i < count){
            const char * piece = items[i]->next();
            str.append_array(piece, strlen(piece));
            ++i;
        }
        str.append(0); // terminal null
        return str;
    }

    // Adds node as the last item. The destructor deletes it, so node is
    // expected to come from the heap.
    void append (Pick * node){
        items.append(node);
    }

    virtual ~Sequence(){
        const int count = items.content_size();
        for (int i = 0; i < count; ++i){
            // The items are assumed to be heap objects; nothing here can verify that.
            delete items[i]; // TOFIX: point alias/recursion problem
            items[i] = NULL;
        }
    }
private:
    Buffer_pPick items;
    Buffer_char  str; //null-terminated c-string
};

class Repeat : public Pick {
/* Repetition: produces the output of one item a random number of times. */
private:
    Pick * item;        // repeated element, deleted by the destructor
    Buffer_char str;    // null-terminated result of the last next()
    WeightedRand wr;    // one weight per possible count, starting at min
    int min;
    int max;

    // Draws a count: min plus the weighted random offset.
    int select_a_count(){
        const int offset = wr.next();
        return min + offset;
    }
public:
    // Concatenates a freshly drawn number of outputs of the item.
    virtual const char* next(){
        str.reset();
        int remaining = select_a_count();
        while (remaining > 0){
            const char * piece = item->next();
            str.append_array(piece, strlen(piece));
            --remaining;
        }
        str.append(0);
        return str;
    }

    // base     - element to repeat; ownership passes to this object
    // minCount - smallest count
    // maxCount - largest count
    // weights  - optional weights for the counts; see WeightedRand
    Repeat(Pick * base, int minCount =0, int maxCount = 1, Buffer_int * weights = NULL)
        : item(base),
          wr(weights, maxCount-minCount +1),
          min(minCount),
          max(maxCount) {
    }

    virtual ~Repeat(){
        delete item;  // TOFIX: point alias/recursion problem
        item = NULL;
    }
};

class Alternation : public Pick {
/* Choice: produces the output of one of its items, picked by weight. */
public:
    // Picks an item with the weighted random generator and returns a copy
    // of that item's next string.
    virtual const char* next(){
        str.reset();
        const int chosen = wr.next();
        const char * text = items[chosen]->next();
        str.append_array(text, strlen(text) + 1);
        return str;
    }

    virtual ~Alternation(){
        const int count = items.content_size();
        for (int i = 0; i < count; ++i){
            delete items[i];  // TOFIX: point alias/recursion problem
            items[i] = NULL;
        }
    }

    // Adds node as a further alternative with the given weight and returns
    // *this so that calls can be chained. The destructor deletes node.
    Alternation & append (Pick * node, int weight = DEFAULT_WEIGHT){
        items.append(node);
        wr.append(weight);
        return *this;
    }
private:
    Buffer_pPick items;
    Buffer_char str; // null-terminated c-string
    WeightedRand wr;
};
    
///////////////////////////////////////////////////////////
//
// The parser
//

// Token kinds produced by Scanner::getNextToken(). STRING, VAR and NUMBER
// carry text in Scanner::token; STREAM_END marks the end of the input; ERROR
// marks input the scanner could not tokenize. The remaining kinds stand for the
// single characters ? * + { } ( ) ; = , | @ ~ % in that order.
enum TokenType {STRING, VAR, NUMBER, STREAM_END, ERROR, QUESTION, STAR, PLUS, LBRACE, RBRACE, LPAR, RPAR, SEMI, EQ, COMMA, BAR, AT, WAVE, PERCENT};

// Splits a null-terminated source text into tokens, one per call to
// getNextToken(). The text of the latest token is available in `token` and
// its kind in `tokenType`.
class Scanner{
friend int DumpScanner(Scanner & s, UBool dumb);
private:
    const char * source;    // start of the text being scanned
    const char * working;   // next character to read
    const char * history; // for debug
    enum StateType {START, IN_NUM, IN_VAR_FIRST, IN_VAR, IN_QUOTE, IN_QUOTE_BSLASH, IN_BSLASH, IN_STRING, DONE};
    StateType state;
    // Finishes the current token with type t. The character that ended the
    // token is pushed back so that the next call reads it again.
    void terminated(TokenType t){
        working--;       // return the peeked character
        tokenType = t;
        token.append(0); // close buffer
        state = DONE;
    }
public:
    // the buffer of "source" is owned by caller
    Scanner(const char *src/*[in] c-string*/ = NULL):source(src){
        working = src;
        history = working;
        state = DONE;
        tokenType = ERROR;
    }

//void setSource(const char *const src /*[in] c-string*/){
    //    *(&const_cast<const char *>(source)) = src;
    //}

Buffer_char token;          // text of the latest token, null-terminated
    TokenType tokenType;    // kind of the latest token

// Scans the next token, stores its text in `token` and returns its kind
// (also stored in `tokenType`). Leading white space is skipped. At the end
// of the input every call returns STREAM_END.
// NOTE(review): a null character met in any state other than START ends the
// token as ERROR, so a number, variable or string that is directly followed
// by the end of the input is reported as ERROR -- confirm this is intended.
TokenType getNextToken(){
        token.reset();
        state = START;
        history = working; // for debug
        while (state != DONE){
            char c = *working++;
            if (c == 0 && state != START){//avoid buffer overflow. for IN_QUOE, IN_ESCAPE
                terminated(ERROR);
                break; // while
            }
            switch(state){
                case START:
                    tokenType = ERROR;
                    // Single-character tokens and the end of the input.
                    switch(c){
                        case '?'  : tokenType = QUESTION; break;
                        case '*'  : tokenType = STAR; break;
                        case '+'  : tokenType = PLUS; break;
                        case '{'  : tokenType = LBRACE; break;
                        case '}'  : tokenType = RBRACE; break;
                        case '('  : tokenType = LPAR; break;
                        case ')'  : tokenType = RPAR; break;
                        case ';'  : tokenType = SEMI; break;
                        case '='  : tokenType = EQ; break;
                        case ','  : tokenType = COMMA; break;
                        case '|'  : tokenType = BAR; break;
                        case '@'  : tokenType = AT; break;
                        case '~'  : tokenType = WAVE; break;
                        case '%'  : tokenType = PERCENT; break;
                        case 0    : tokenType = STREAM_END; working-- /*avoid buffer overflow*/; break;
                    }
                    if (tokenType != ERROR){
                        token.append(c);
                        token.append(0);
                        state = DONE;
                        break; // START
                    }
                    // Otherwise the character starts a multi-character token.
                    switch(c){
                        case '$'  : state = IN_VAR_FIRST; token.append(c); break;
                        case '\'' : state = IN_QUOTE;     break;
                        case '\\' : state = IN_BSLASH;    break;
                        default:
                            if (isWhiteSpace(c)){    // state = START;   //do nothing
                            } else if (isDigit(c)){     state = IN_NUM;    token.append(c);
                            } else if (isAlphabet(c)){  state = IN_STRING; token.append(c);
                            } else {terminated(ERROR);}
                    }
                    break;//START
                case IN_NUM:
                    if (isDigit(c)){
                        token.append(c);
                    } else {
                        terminated(NUMBER);
                    }
                    break;//IN_NUM
                case IN_VAR_FIRST:
                    // The character right after '$' must be a letter.
                    if (isAlphabet(c)){
                        token.append(c);
                        state = IN_VAR;
                    } else {
                        terminated(ERROR);
                    }
                    break; // IN_VAR_FISRT
                case IN_VAR:
                    if (isAlphabet(c) || isDigit(c)){
                        token.append(c);
                    } else {
                        terminated(VAR);
                    }
                    break;//IN_VAR
                case IN_STRING:
                    // About the scanner's behavior for STRING, AT, and ESCAPE:
                    // All of them can be contacted with each other. 
                    // This means the scanner will eat up as much as possible strings
                    //   (STRING, AT, and ESCAPE) at one time, with no regard of their
                    //   combining sequence.
                    //
                    if (c == '\''){
                        state = IN_QUOTE; // the first time we see single quote
                    } else if (c =='\\'){ // back slash character
                        state = IN_BSLASH;
                    } else if (isAlphabet(c) || isDigit(c)){
                        token.append(c);
                    } else{
                        terminated(STRING);
                    }
                    break;//IN_STRING
                case IN_QUOTE:
                    if (c == '\''){ // the second time we see single quote
                        state = IN_STRING; // see document in IN_STRING
                    } else if ( c== '\\') { // backslah escape in quote
                        state = IN_QUOTE_BSLASH;
                    } else {
                        token.append(c);  // eat up everything, includes back slash
                    }
                    break;//IN_QUOTE
                case IN_QUOTE_BSLASH:
                case IN_BSLASH:
                    // c is the character following a backslash.
                    switch (c){
                        case 'n'  : token.append('\n'); break;
                        case 'r'  : token.append('\r'); break;
                        case 't'  : token.append('\t'); break;
                        case '\'' : token.append('\''); break;
                        case '\\' : token.append('\\'); break;
                        default: token.append(c); // unknown escaping, treat it as literal
                    }
                    // Return to the state the backslash was met in.
                    if (state == IN_BSLASH){
                        state = IN_STRING; // see document in IN_STRING
                    } else { // state == IN_QUOTE_BSLASH
                        state = IN_QUOTE;
                    }
                    break;//IN_BSLASH
                case DONE:  /* should never happen */
                default:
                    working--;
                    tokenType = ERROR;
                    state = DONE;
                    break;
            }//switch(state) 
        }//while (state != DONE)

return tokenType;
    }
};//class Scanner

class Parser{
friend UBool TestParser();
friend class TestParserT;
friend class LanguageGenerator_impl;
private:
    Scanner s;
    TokenType & token;
    int min_max;   // for the evil infinite
    
    UBool match(TokenType expected){
        if (token == expected) {
            token = s.getNextToken();
            return TRUE;
        } else {
            //s.dumpCurrentPoint();
            return FALSE;
        }
    }

UBool weight(int & value){
        if (token == NUMBER){
            int temp = atoi(s.token);
            match(NUMBER);
            if (match(PERCENT)){
                value = temp;
                return TRUE;
            }
        }
        return FALSE;
    }

UBool repeat (Pick* &node /*in,out*/){
        if (node == NULL) return FALSE;

int count = -2;
        int min = -2;
        int max = -2;
        UBool question = FALSE;
        switch (token){
            case QUESTION:
                match(QUESTION);
                min = 0;
                max = 1;
                count = 2;
                question = TRUE;
                break;
            case STAR:
                match(STAR);
                min = 0;
                max = -1;
                count = -1;
                break;
            case PLUS:
                match(PLUS);
                min = 1;
                max = -1;
                count = -1;
                break;
            case LBRACE:
                match(LBRACE);
                if (token != NUMBER){
                    return FALSE;
                }else {
                    min = atoi(s.token);
                    match(NUMBER);
                    if (token == RBRACE){
                        match(RBRACE);
                        max = min;
                        count = 1;
                    } else if (token == COMMA) {
                        match(COMMA);
                        if (token == RBRACE){
                            match(RBRACE);
                            max = -1;
                            count = -1;
                        } else if (token == NUMBER) {
                            max = atoi(s.token);
                            match(NUMBER);
                            count = max - min + 1;
                            if (!match(RBRACE)) {
                                return FALSE;
                            }
                        } else {
                            return FALSE;
                        }
                    } else {
                        return FALSE;
                    }
                }
                break;
            default:
                return FALSE;
        }

if (count == -2 || min == -2 || max == -2){
            //ASSERT(FALSE);
            return FALSE;
        }

// eat up following weights
        Buffer_int weights;
        int w;
        while (weight(w)){
            weights.append(w);
        }

// for the evil infinite
        min_max = min_max > min ? min_max : min;
        min_max = min_max > max ? min_max : max;
        if (min_max > PSEUDO_INFINIT){
            return FALSE; // PSEUDO_INFINIT is less than the real maximum
        }
        if (max == -1){ // the evil infinite
            max = PSEUDO_INFINIT;
        }
        // for the strange question mark
        if (question && weights.content_size() > 0){
            Buffer_int w2;
            w2.append(DEFAULT_WEIGHT - weights[0]).append(weights[0]);
            node = new Repeat(node,min,max,&w2);
            return TRUE;
        }
        node = new Repeat(node,min,max,&weights);
        return TRUE;
    }

UBool core(Pick* &node /*out*/){
        // Parses one primary element: a parenthesised definition, a variable
        // reference, or a string literal. The caller must pass node == NULL;
        // on success node receives the newly built element.
        if (node != NULL) {
            return FALSE;   // precondition: node == NULL
        }

        if (token == LPAR) {
            match(LPAR);
            if (!defination(node)) {
                return FALSE;
            }
            return match(RPAR) ? TRUE : FALSE;
        }

        if (token == VAR) {
            // s.token is read before match(VAR) is called on the token.
            node = new Variable(&symbols, s.token);
            match(VAR);
            return TRUE;
        }

        if (token == STRING) {
            node = new Literal(s.token);
            match(STRING);
            return TRUE;
        }

        // Any other token cannot start a core element.
        return FALSE;
    }
    UBool modified(Pick* &node /*out*/){
        // Parses a core element followed by any number of postfix modifiers.
        // A WAVE token wraps the node in a Morph, an AT token wraps it in a
        // Quote, and QUESTION/STAR/PLUS/LBRACE hand the node to repeat().
        // Parsing stops successfully at any token that may follow a modified
        // element; every other token is an error.
        if (node != NULL) {
            return FALSE;   // precondition: node == NULL
        }
        if (!core(node)) {
            return FALSE;
        }

        for (;;) {
            if (token == WAVE) {
                match(WAVE);
                node = new Morph(*node);
                continue;
            }
            if (token == AT) {
                match(AT);
                node = new Quote(*node);
                continue;
            }
            if (token == QUESTION || token == STAR || token == PLUS || token == LBRACE) {
                if (!repeat(node)) {
                    return FALSE;
                }
                continue;
            }

            // No modifier left: accept only a legal follower token.
            return (token == SEMI       // rule definition closed
                 || token == RPAR       // within parenthesis (core closed)
                 || token == BAR        // in alternation
                 || token == NUMBER     // in alternation, with weight
                 || token == LPAR       // in sequence
                 || token == VAR        // in sequence
                 || token == STRING)    // in sequence
                 ? TRUE : FALSE;
        }
    }

UBool sequence_list(Pick* &node /*in,out*/){
        // Builds a Sequence whose first item is the already-parsed node and
        // whose further items are parsed with modified(). On success node is
        // replaced by the new Sequence; on failure the Sequence is deleted.
        if (node == NULL) {
            return FALSE;   // precondition: node != NULL
        }

        Sequence* seq = new Sequence();
        Pick* pending = node;       // parsed, but not yet appended to seq
        UBool parsed_ok = TRUE;

        while (parsed_ok && (token == VAR || token == STRING || token == LPAR)) {
            seq->append(pending);
            pending = NULL;
            parsed_ok = modified(pending);
        }

        // A sequence may only be closed by ';', ')' or an alternation bar.
        if (parsed_ok && (token == SEMI || token == RPAR || token == BAR)) {
            seq->append(pending);
            node = seq;
            return TRUE;
        }

        delete seq;
        return FALSE;
}

UBool sequence(Pick* &node /*out*/){
        // Parses one modified element and, when another element starts right
        // after it, extends the result into a full sequence.
        if (node != NULL) {
            return FALSE;   // precondition: node == NULL
        }
        if (!modified(node)) {
            return FALSE;
        }

        if (token != VAR && token != STRING && token != LPAR) {
            return TRUE;    // just a single modified element
        }
        return sequence_list(node);
    }

UBool alternation_list(Pick* &node /*in,out*/){
        // Builds an Alternation whose first branch is the already-parsed node.
        // Each branch may be followed by a weight; branches are separated by
        // BAR. On success node is replaced by the Alternation; on failure the
        // Alternation is deleted.
        if (node == NULL) {
            return FALSE;   // precondition: node != NULL
        }

        Alternation* alt = new Alternation();
        Pick* pending = node;           // branch parsed but not yet appended
        int w = DEFAULT_WEIGHT;
        UBool parsed_ok = TRUE;

        while (token == NUMBER || token == BAR) {
            if (token == NUMBER) {
                if (!weight(w)) {
                    parsed_ok = FALSE;
                    break;
                }
                if (token != BAR) {
                    break;      // the last item, or an error caught below
                }
                // otherwise: a middle item, go on
            }
            match(BAR);
            alt->append(pending, w);

            pending = NULL;
            w = DEFAULT_WEIGHT;
            if (!sequence(pending)) {
                parsed_ok = FALSE;
                break;
            }
        }

        // An alternation may only be closed by ';' or ')'.
        if (parsed_ok && (token == SEMI || token == RPAR)) {
            alt->append(pending, w);
            node = alt;
            return TRUE;
        }

        delete alt;
        return FALSE;
    }

UBool alternation(Pick* &node /*out*/){
        // 'sequence' binds tighter than 'alternation': parse one sequence
        // first, then promote it to an alternation only if a bar or a weight
        // follows.
        if (node != NULL) {
            return FALSE;   // precondition: node == NULL
        }
        if (!sequence(node)) {
            return FALSE;
        }

        if (token != BAR && token != NUMBER) {
            return TRUE;    // just a sequence
        }
        return alternation_list(node);  // a real alternation, create it
    }

UBool defination(Pick* &node /*out*/){
        // A definition is simply an alternation; node must be NULL on entry.
        if (node == NULL) {
            return alternation(node);
        }
        return FALSE;
    }

UBool rule(){
        // Parses one rule of the form  VAR EQ definition SEMI  and records the
        // parsed definition in the symbol table under the variable's name.
        if (token != VAR) {
            return FALSE;
        }

        // Copy the name (including its terminator) before the token is matched.
        Buffer_char rule_name;
        rule_name.append_array(s.token, strlen(s.token) + 1);
        match(VAR);

        if (!match(EQ)) {
            return FALSE;
        }

        Pick * body = NULL;
        if (!defination(body)) {
            return FALSE;
        }

        symbols.put(rule_name, body);
        return match(SEMI);
    }
public:
    UBool rules(){
        // Resets the symbol table, then parses rules until rule() fails.
        // The input is accepted only if that failure happened exactly at the
        // end of the token stream.
        symbols.reset();
        token = s.getNextToken();

        while (rule()){
            // keep consuming rules
        }

        //s.dumpCurrentPoint();   // (was available on the failure path)
        return token == STREAM_END ? TRUE : FALSE;
    }

public:
    SymbolTable symbols;

// Builds a parser over the given grammar text: the scanner member s is
// constructed from source and token is initialised from s.tokenType.
// min_max starts at -2; repeat() raises it to the largest min/max bound seen.
// NOTE(review): -2 looks like the "not set yet" sentinel also used by repeat() - confirm.
Parser(const char *const source):s(source), token(s.tokenType){
        min_max = -2;
    }
    // Public entry point: parses the whole grammar by delegating to rules()
    // and returns its result.
    UBool parse(){
        return rules();
    }

}; // class Parser

///////////////////////////////////////////////////////////
//
// 
//

int DumpScanner(Scanner & s, UBool dump = TRUE){
    // Returns the offset of s.history from the start of s.source. When dump
    // is set, also prints the whole source followed by its three slices:
    // [source, history), [history, working) and [working, end).
    const int total_len = strlen(s.source);
    const int error_start_offset = s.history - s.source;

    if (!dump){
        return error_start_offset;
    }

    printf("\n=================== DumpScanner ================\n");
    fwrite(s.source, total_len, 1, stdout);

    printf("\n-----parsed-------------------------------------\n");
    fwrite(s.source, s.history - s.source, 1, stdout);

    printf("\n-----current------------------------------------\n");
    fwrite(s.history, s.working - s.history, 1, stdout);

    printf("\n-----unparsed-----------------------------------\n");
    fwrite(s.working, (s.source + total_len - s.working), 1, stdout);

    printf("\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n");
    return error_start_offset;
}

class LanguageGenerator_impl{
public:
    LanguageGenerator_impl(const char *const bnf_definition, const char *const top_node)
        :par(bnf_definition), top_node_name(top_node){
        srand((unsigned)time( NULL ));
    }

LanguageGenerator::PARSE_RESULT parseBNF(UBool debug = TRUE){
        if (par.parse()){
            if (par.symbols.find(top_node_name, &top_node_ref) == SymbolTable::HAS_REF) {
                if (par.symbols.is_complete()) {
                    return LanguageGenerator::OK;
                } else {
                    if (debug) printf("The bnf definition is incomplete.\n");
                    return LanguageGenerator::INCOMPLETE;
                }
            } else {
                if (debug) printf("No top node is found.\n");
                return LanguageGenerator::NO_TOP_NODE;
            }
        } else {
            if(debug) {
                printf("The bnf definition is wrong\n");
                DumpScanner(par.s, TRUE);
            }
            return LanguageGenerator::BNF_DEF_WRONG;
        }
    }
    const char * next(){
        return top_node_ref->next();
    }

private:
    Parser par;
    const char *const top_node_name;
    Pick * top_node_ref;
};

// Creates a generator with no grammar loaded: lang_gen stays NULL until
// parseBNF() succeeds.
LanguageGenerator::LanguageGenerator():lang_gen(NULL){
}

// Releases the implementation object, if any (deleting NULL is harmless).
LanguageGenerator::~LanguageGenerator(){
    delete lang_gen;
}

LanguageGenerator::PARSE_RESULT LanguageGenerator::parseBNF(const char *const bnf_definition /*in*/, const char *const top_node/*in*/, UBool debug){
    // Replace any previously loaded grammar; delete on NULL is a no-op.
    delete lang_gen;
    lang_gen = new LanguageGenerator_impl(bnf_definition, top_node);

    const PARSE_RESULT result = lang_gen->parseBNF(debug);
    if (result != OK){
        // Keep no implementation object around for a grammar that failed.
        delete lang_gen;
        lang_gen = NULL;
    }
    return result;
}
const char *LanguageGenerator::next(){ // Return a null-terminated c-string. The buffer is owned by callee.
    // Without a successfully parsed grammar there is nothing to generate.
    return lang_gen ? lang_gen->next() : "";
}

///////////////////////////////////////////////////////////
//
// The test code for WBNF
//

// Runs the test function `fun` (called with no arguments) and prints
// "Pass: <name>" when it returns non-zero, "FAILED: !!! <name> !!!" otherwise.
// Wrapped in do { } while (0) so that `CALL(x);` is a single statement and can
// be used safely as the body of an unbraced if/else.
#define CALL(fun) \
    do { \
        if (fun()){ \
            printf("Pass: " #fun "\n");\
        } else { \
            printf("FAILED: !!! " #fun " !!!\n"); \
        } \
    } while (0)

// Prints a banner naming `fun`, then `times` strings obtained from
// `var.next()`, one per line, then a closing rule.
// `var` and `times` are parenthesised so that arbitrary expressions can be
// passed, and the body is wrapped in do { } while (0) so that `DUMP_R(...);`
// is a single statement, safe inside an unbraced if/else.
#define DUMP_R(fun, var, times) \
    do { \
        printf("\n========= " #fun " =============\n"); \
        for (int i=0; i<(times); i++) { \
            const char * t = (var).next();\
            fwrite(t,strlen(t),1,stdout); \
            printf("\n");   \
        }   \
        printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"); \
    } while (0)

#if TEST_WBNF_TEST    
static UBool TestQuote(){
    // Visual check only: prints the raw literal once and then 20 renderings of
    // it through Quote. Always returns FALSE, so CALL() reports it as failed.
    const char *const sample = "This ' A !,z| qq [] .new\tline";
    //const char *const str_r = "This \\' A '!,'z'|' qq '[]' '.'new\tline";
    //
    // The expected strings below have to be written with C escaping, which
    // makes them hard to read; the trailing comment shows the plain text.
    //
    //const char *const s1    =   "ab'c";
    //const char (* s1_r1) [] = { "ab''c",    // ab''c
    //                            "ab\\'c",   // ab\'c
    //                           };//
    //
    // .      '.'     \.
    // ..     \.\.    '.'\.   '.'\.   '..'    // '.''.'  wrong
    //
    //const char *const s2    =   "a..'.b";       // a..'.b
    //const char (*s2_r) []   = { "a'..''.'b"     // a'..''.'b
    //                           ,"a'..\\'.'b"    // a'..\'.'b
    //                           ,"a'..'\\''.'b"  // a'..'\''.'b
    //                          };//
    //
    //const char *const s3    =   "a..\\.b";      // a..\.b
    //const char (*s3_r) []   = { "a'..\\\\.'b"   // a'..\\.'b
    //                           ,"a'..'\\\\'.'b" // a'..'\\'.'b
    //                          };//
    //
    // (no concatenation of quotings, no choice: the output must be compact)

    srand((unsigned)time( NULL ));

    //Escaper l(Escaper::NO, Escaper::NO, Escaper::RAND_ESC);
    Pick *literal = new Literal(sample);
    Quote quoted(*literal);

    DUMP_R(TestQuote, (*literal), 1);
    DUMP_R(TestQuote, quoted, 20);
    return FALSE;
}
static UBool TestLiteral(){
    // A Literal is expected to hand back exactly the text it was built from.
    const char * expected = "test string99.";
    Literal lit(expected);
    const char * produced = lit.next();
    return strcmp(expected, produced) == 0;
}

static UBool TestSequence(){
    // A Sequence of two literals is expected to produce their concatenation.
    Sequence parts;
    parts.append(new Literal("abc "));
    parts.append(new Literal(", s"));

    const char * joined = parts.next();
    return strcmp(joined, "abc , s") == 0;
}
static UBool TestAlternation(){
    // Visual check only: dumps 50 picks from a weighted alternation.
    // Always returns FALSE.
    srand((unsigned)time( NULL ));

    Alternation choices;
    choices.append(new Literal("aaa_10%"),10);
    choices.append(new Literal("bbb_0%"),0);
    choices.append(new Literal("ccc_10%"),10);
    choices.append(new Literal("ddddddd_50%"),50);

    DUMP_R(TestAlternation, choices, 50);

    return FALSE;
}

static UBool TestBuffer(){
    // Visual check only: appends three ints and prints them back, one per
    // line. Always returns FALSE.
    Buffer_int values;
    values.append(1).append(0).append(5);

    const int count = values.content_size();
    for (int idx = 0; idx < count; ++idx){
        printf("%d\n", values[idx]);
    }
    return FALSE;
}

// Visual check only: prints 50 values drawn from a WeightedRand.
// Always returns FALSE.
static UBool TestWeightedRand(){
    srand((unsigned)time( NULL ));
    Buffer_int t;   // only used by the commented-out variant below
    t.append(1).append(0).append(5);
    // NOTE(review): the weights come from a temporary Buffer_int that is
    // destroyed at the end of this declaration, and 4 is passed although three
    // weights are appended - confirm WeightedRand copies what it needs and
    // what the second argument means.
    WeightedRand wr(&Buffer_int().append(10).append(0).append(50),4);
//    WeightedRand wr(&t,3);
    for (int i=0; i< 50; ++i){
        printf("%d\n", wr.next());
    }
    return FALSE;
}

// Visual check only: dumps 50 outputs from each of three Repeat nodes
// (bounds 1..5, bounds 1..3 with explicit weights, bounds 5..5).
// Always returns FALSE.
static UBool TestRepeat(){
    srand((unsigned)time( NULL ));
    Repeat rep(new Literal("aaa1-5 "), 1, 5);
    DUMP_R(TestRepeat, rep, 50);

// NOTE(review): the weights are passed as the address of a temporary
// Buffer_int that dies at the end of this declaration - confirm Repeat
// copies them rather than keeping the pointer.
Repeat r2(new Literal("b{1,3}1%0%5% "), 1, 3, &Buffer_int().append(1).append(0).append(5));
    DUMP_R(TestRepeat, r2, 50);

Repeat r3(new Literal("aaa5-5 "), 5, 5);
    DUMP_R(TestRepeat, r3, 50);

return FALSE;
}

static UBool TestVariable(){
    // Binds three variables to literals in three different ways (constructor
    // argument, put() with the Variable, put() with the name) and checks that
    // each variable produces the same text as its literal.
    SymbolTable tab;

    Pick * first = new Literal("string1");
    Variable var1(&tab, "x", first);

    Variable var2(&tab, "y");
//    tab.put(var2, first); // TOFIX: point alias/recursion problem
    Pick * second = new Literal("string2");
    tab.put(var2, second);

    Pick * third = new Literal("string3");
    Variable var3(&tab, "z");
    tab.put("z", third);

    return strcmp(var1.next(), first->next()) == 0
        && strcmp(var2.next(), second->next()) == 0
        && strcmp(var3.next(), third->next()) == 0;
}

static UBool TestSymbolTable(){
    // Exercises every SymbolTable::find() result, then checks that reset()
    // forgets previously stored names.
    Literal * lit1 = new Literal("string1");
    Literal * lit2 = new Literal("string2");

    SymbolTable table;
    table.put("abc", lit1);
    table.put("$aaa", lit2);
//    table.put("alias", lit1);  // TOFIX: point alias/recursion problem
    table.put("bbb");

    UBool pass = table.find(NULL) == SymbolTable::EMPTY
              && table.find("ccc") == SymbolTable::NO_VAR
              && table.find("bbb") == SymbolTable::NO_REF
              && table.find("abc") == SymbolTable::HAS_REF
              && table.find("$aaa") == SymbolTable::HAS_REF;

    table.reset();
    pass = pass && table.find("abc") == SymbolTable::NO_VAR;
    return pass;
}

// Tokenises a fixed rule with Scanner and compares every token text against
// the expected list. Passes only if all tokens match and the stream ends
// exactly after the last expected token.
static UBool TestScanner(void){
    //const char str1[] = "$root = $command{0,5} $reset $mostRules{1,20};";
    //const char str1_r[][20] = {"$root", "=", "$command", "{", "0", ",", "5", "}",
    //    "$reset", "$mostRules", "{", "1", ",", "20", "}", ";"};

    const char str2[] = "$p2 =(\\\\ $s $string $s)? 25%;";
    const char str2_r[][20] = {"$p2", "=", "(", "\\", "$s", "$string", "$s", ")", "?", "25", "%", ";"};

    const char *str = str2;
    const char (*str_r)[20] = str2_r;
    int tokenNum = sizeof(str2_r)/sizeof(char[20]);

    Scanner t(str);
    UBool pass = TRUE;
    t.getNextToken();
    int i = 0;
    while (pass){
        if (t.tokenType == STREAM_END){
            pass = i == tokenNum;   // every expected token must have been seen
            break;//while
        } else if (t.tokenType == ERROR){
            pass = FALSE;
            break;//while
        } else if (i >= tokenNum){
            // The scanner produced more tokens than expected: fail instead of
            // reading past the end of the expected-token table.
            pass = FALSE;
            break;//while
        } else {
            pass = strcmp( &(t.token[0]), str_r[i++]) == 0;
            t.getNextToken();
        }
    }

    //const char ts[] = "$commandList = '['"
    //" ( alternate ' ' $alternateOptions"
    //" | backwards ' 2'"
    //" | normalization ' ' $onoff "
    //" | caseLevel ' ' $onoff "
    //" | hiraganaQ ' ' $onoff"
    //" | caseFirst ' ' $caseFirstOptions"
    //" | strength ' ' $strengthOptions"
    //" ) ']';" ;

    //Scanner t2(ts);
    //pass = TRUE;
    //do {
    //    t2.getNextToken();
    //    if (t2.tokenType == ERROR){
    //        DumpScanner(t2);
    //        return FALSE;
    //    }
    //}while (t.tokenType != STREAM_END);

    return pass;
}

class TestParserT {
public:
    // Parses str and checks the outcome against the expectation.
    //   exp_error_offset == -1 : the grammar must parse successfully.
    //   otherwise              : parsing must fail, and the offset reported by
    //                            DumpScanner() must equal exp_error_offset.
    // dump is forwarded to DumpScanner() to control whether it prints.
    UBool operator () (const char *const str, const int exp_error_offset = -1, const UBool dump = TRUE){
        Parser par(str);

        if (!par.rules()){
            return DumpScanner(par.s, dump) == exp_error_offset;
        }

        if (exp_error_offset != -1){
            // Parsed fine although a failure was expected.
            DumpScanner(par.s, dump);
            return FALSE;
        }
        return TRUE;
    }
};

// Runs the parser over a list of small grammars. A call with only a string
// expects a successful parse; a call with an offset expects a failure whose
// DumpScanner() offset equals that number (FALSE suppresses the dump output).
// Because of the `pass && ...` chaining, nothing after the first failing
// case is executed.
UBool TestParser(){
    TestParserT test;

UBool pass = TRUE;
    pass = pass && test ("$s = ' ' ? 50%;");
    pass = pass && test("$x = ($var {1,2}) 3%;");         // legal
    pass = pass && test("$x = $var {1,2} 3% | b 4%;");    // legal
    pass = pass && test("$x = $var {1,2} 3%;");           // legal
    pass = pass && test("$m = $c ? 2% 4% | $r 5% | $n 25%;"); // legal
    pass = pass && test("$a = b ? 2% | c 5%;");               // legal
    pass = pass && test("$x = A B 5% C 10% | D;", 8, FALSE);  // illegal 5%
    pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc
    pass = pass && test("$x = (b 5%) (c 6%);");               // legal
    pass = pass && test("$x = (b 5%) c 6%;", 13, FALSE);      // illegal 6%
    pass = pass && test("$x = b 5% (c 6%);", 9, FALSE);       // illegal (c 6%)
    pass = pass && test("$x = b 5% c 6%;", 9, FALSE);         // illegal c 6%
    pass = pass && test("$x = b 5%;");                        // legal
    pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc
    pass = pass && test("$x = a | b  | c 4% | d 5%;");        // legal
    pass = pass && test("$s = ' ' ? 50% abc;");               // legal
    pass = pass && test("$s =  a | c d | e f;");              // legal
    pass = pass && test( "$z = q 0% | p 1% | r 100%;");         // legal How to check parsed tree??

// Individual rules taken from the collation grammar below.
pass = pass && test("$s = ' ' ? 50%;");
    pass = pass && test("$relationList = '<' | '<<' |  ';' | '<<<' | ',' | '=';");
    pass = pass && test("$p1 = ($string $s '|' $s)? 25%;");
    pass = pass && test("$p2 = (\\\\ $s $string $s)? 25%;");
    pass = pass && test("$rel2 = $p1 $string $s $p2;");
    pass = pass && test("$relation = $relationList $s ($rel1 | $rel2) $crlf;");
    pass = pass && test("$command = $commandList $crlf;");
    pass = pass && test("$reset = '&' $s ($beforeList $s)? 10% ($positionList 100% | $string 10%) $crlf;");
    pass = pass && test("$mostRules = $command 1% | $reset 5% | $relation 25%;");
    pass = pass && test("$root = $command{0,5} $reset $mostRules{1,20};");

// The same rules (plus a few more) parsed as one multi-rule grammar.
const char collationBNF[] =
    "$s = ' '? 50%;" 
    "$crlf = '\r\n';"

"$beforeList = '[before ' ('1' | '2' | '3') ']';"

"$relationList = ("
    "   '<'"
    " | '<<'"
    " | ';'" 
    " | '<<<'"
    " | ','" 
    " | '='"
    ");"
    "$string = $magic;" 
    "$rel1 = '[variable top]' $s;" 
    "$p1 = ($string $s '|' $s)? 25%;" 
    "$p2 = (\\\\ $s $string $s)? 25%;" 
    "$rel2 = $p1 $string $s $p2;" 
    "$relation = $relationList $s ($rel1 | $rel2) $crlf;"

"$reset = '&' $s ($beforeList $s)? 10% ($positionList 1% | $string 10%) $crlf;" 
    "$mostRules = $command 1% | $reset 5% | $relation 25%;"
    "$root = $command{0,5} $reset $mostRules{1,20};"
    ;
    
    pass = pass && test(collationBNF);

return pass;
}

static UBool TestMorph(){
    // Visual check only: builds an alternation of the letters a..o, repeats it
    // exactly five times, wraps the result in a Morph and dumps 100 outputs.
    // Always returns FALSE.
    srand((unsigned)time( NULL ));

    Alternation * letters = new Alternation();

    (*letters)
        .append(new Literal("a")).append(new Literal("b")).append(new Literal("c"))
        .append(new Literal("d")).append(new Literal("e")).append(new Literal("f"))
        .append(new Literal("g")).append(new Literal("h")).append(new Literal("i"))
        .append(new Literal("j")).append(new Literal("k")).append(new Literal("l"))
        .append(new Literal("m")).append(new Literal("n")).append(new Literal("o"))
        ;

    Repeat * five_letters = new Repeat( letters ,5,5 );
    Morph morphed( *five_letters);

//    DUMP_R(TestMorph,(*five_letters),20);
    DUMP_R(TestMorph,morphed,100);

    return FALSE;
}

#endif

// Parses a one-rule grammar through the public LanguageGenerator API and, on
// success, dumps 20 generated strings.
// Returns TRUE when the grammar parses with result OK, FALSE otherwise.
static UBool TestLanguageGenerator(){
    //LanguageGenerator g;
    //const char *const s = "$s = p 0% | q 1%;";
    //g.parseBNF(s, "$s");
    //UBool pass = strcmp("q", g.next()) == 0;

    const char *const def =
        //"$a = $b;"
        //"$b = $c;"
        //"$c = $t;"
        //"$t = abc $z{1,2};"
        //"$k = a | b | c | d | e | f | g ;"
        //"$z = q 0% | p 1% | r 1%;"
        "$x = a ? 0%;"
        ; // end of string

    LanguageGenerator g;
    // Keep the parse result in its own enum type: it is not a boolean, and
    // returning it as the pass flag would report success as failure whenever
    // OK has the value 0.
    const LanguageGenerator::PARSE_RESULT result = g.parseBNF(def, "$x", TRUE);
////    LanguageGenerator g(collationBNF, "$root", "$magic", new MagicNode());

    if (result != LanguageGenerator::OK) return FALSE;

    DUMP_R(TestLanguageGenerator, g, 20);
    return TRUE;
}

// Entry point of the WBNF self test: seeds rand() from the current time and
// runs the enabled checks through CALL(), which prints pass/fail per test.
// Only TestLanguageGenerator is enabled; the commented-out tests are defined
// only when TEST_WBNF_TEST is non-zero.
void TestWbnf(void){
    srand((unsigned)time( NULL ));

//CALL(TestLiteral);
    //CALL(TestSequence);
    //CALL(TestSymbolTable);
    //CALL(TestVariable);

//TestRepeat();
    //TestAlternation();
    //TestMorph();

//TestQuote();
    //TestBuffer();
    //TestWeightedRand();

//CALL(TestScanner);
    //CALL(TestParser);
    CALL(TestLanguageGenerator);
}

// C++ source | 1772 lines | 52.32 KB

/*
 ******************************************************************************
 * Copyright (C) 2005-2007, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>

#include "wbnf.h"
    
// Most of this code is meant to test the test code. It's a self test.
// Normally this isn't run.
#define TEST_WBNF_TEST 0

///////////////////////////////////////////////////////////
//
// Constants and the most basic helper classes
//
    
// Character classes, each stored as a NUL-terminated list so that isInList()
// can scan it. WHITE_SPACE is written as a brace list and therefore carries
// its terminator explicitly.
static const char DIGIT_CHAR[] = "0123456789";
static const char WHITE_SPACE[] = {'\t', ' ', '\r', '\n', 0};
static const char ALPHABET[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
static const char SPECIAL[] = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";

// Returns TRUE when c occurs in the NUL-terminated list, FALSE otherwise.
// The scan stops at the terminator, so c == 0 always yields FALSE.
static inline UBool isInList(const char c /*in*/, const char list[] /*in*/){
    const char * cursor = list;
    while (*cursor != 0 && *cursor != c) {
        ++cursor;
    }
    return (*cursor != 0) ? TRUE : FALSE;
}
// Character-class predicates: each one tests membership in the matching list.
static inline UBool isDigit(char c) {
    return isInList(c, DIGIT_CHAR);
}
static inline UBool isWhiteSpace(char c) {
    return isInList(c, WHITE_SPACE);
}
static inline UBool isAlphabet(char c) {
    return isInList(c, ALPHABET);
}
static inline UBool isSpecialAsciiChar(char c) {
    return isInList(c, SPECIAL);
}

    

///////////////////////////////////////////////////////////
//
// Helper classes
//
    
class Buffer_byte{
// Utility class, can be treated as an auto expanded array of bytes.
// Appending grows the storage as needed; reads through buffer() are not
// bounds checked. Copying is disabled. Running out of memory while growing
// terminates the program.

    typedef char byte;
    byte * start;       // beginning of the allocated storage
    byte * current;     // one past the last appended byte
    int buffer_size;    // allocated size; size unit is byte
public:
    // Number of bytes appended so far.
    inline int content_size(){return current - start;} // size unit is byte

private:
    // Grows the storage by add_size bytes and zero-fills add_size bytes
    // starting at the end of the current content.
    inline void expand(int add_size = 100){ // size unit is byte
        int new_size = buffer_size + add_size;
        int cs_snap = content_size();

        // realloc may move the block, and on failure it returns NULL while
        // leaving the old block allocated. Check before overwriting `start`
        // so a failure neither leaks the old block nor writes through NULL.
        byte * grown = (byte *) realloc(start, new_size);
        if (grown == NULL) {
            fprintf(stderr, "Buffer_byte: out of memory (requested %d bytes)\n", new_size);
            abort();
        }
        start = grown;
        current = start + cs_snap;

        memset(current, 0, add_size);
        buffer_size = new_size;
    }

    // Makes sure at least `size` bytes are allocated.
    inline void expand_to(int size){
        int r = size - buffer_size;
        if (r > 0) {
            expand(r);  // simply expand, no block alignment
        }
    }
    Buffer_byte(const Buffer_byte &);               // not copyable
    Buffer_byte & operator = (const Buffer_byte &); // not assignable
public:
    Buffer_byte():start(NULL),current(start),buffer_size(0){
        expand();
    }
    ~Buffer_byte(){
        free(start);
    }

    // Zeroes the whole storage and empties the buffer; the storage is kept.
    inline void reset(){
        if (start != NULL) {
            memset(start, 0, buffer_size);
        }
        current = start;
    }

    // Using memory copy method to append a C array to buffer,
    inline void append(const void * c, int size){ // size unit is byte
        expand_to(content_size() + size) ;
        memcpy(current, c, size);
        current = current + size;
    }

    // Start of the stored bytes; the pointer may change after any append().
    byte * buffer(){
        return start;
    }
};
    
/*
  The class(es) try to work like a built-in array, so they overload these two operators
    operator type *();
    type & operator[];
  The first is used for automatic type conversion, the latter is used to select a member.

  A small trick is that the class does not overload the address-of operator. This
  behavior is different from a built-in array, but it gives us the opportunity
  to get the address of the class itself.
*/
//template<typename type>
//    class BUFFER{
//       typedef BUFFER name;
// BUFFER(type, name) declares a class `name`: a growable array of `type`
// stored in a Buffer_byte.
//   reset()                 empties the buffer
//   append(c)               appends one element (copied byte-wise)
//   append_array(p, size)   appends `size` elements starting at p
//   operator[] (i)          element access, no bounds check
//   operator type *()       pointer to the first element
//   content_size()          number of elements stored
// reset/append/append_array return *this so calls can be chained.
#define BUFFER(type, name)\
    class name {\
    private:\
       Buffer_byte buf;\
    public:\
        name & reset() {buf.reset(); return *this;}\
        name & append(type c) {buf.append(&c, sizeof(type)); return *this;}\
        name & append_array(const type * p, int size) {buf.append(p, sizeof(type)*size); return *this;}\
        type & operator [] (int i) { return ((type *) buf.buffer())[i];}\
        operator type *(){return (type *) buf.buffer();} \
        int content_size(){return buf.content_size() / sizeof(type);}\
    }
    

class Pick{
/* Pick is the basic element of the language generator. */
public:
    // Generates a string according to the syntax this element stands for.
    // Returns a null-terminated C string; the buffer is owned by the callee.
    virtual const char* next() = 0;

    // Virtual so that elements can be deleted through a Pick pointer.
    virtual ~Pick(){}
};

//typedef BUFFER<char> Buffer_char;
//typedef BUFFER<int> Buffer_int;
//typedef BUFFER<Pick *> Buffer_pPick;
// Concrete buffer classes generated by the BUFFER macro: growable arrays of
// char, int and Pick pointers respectively.
BUFFER(char, Buffer_char);
BUFFER(int, Buffer_int);
BUFFER(Pick *, Buffer_pPick);

class SymbolTable{
/* Helper class.
* It's a mapping table between 'variable name' and its 'active Pick object'.
* A name may be present without a Pick (declared but not yet defined).
* reset() and the destructor delete every stored Pick.
*/
private:
    Buffer_char  name_buffer;   // var names storage space (each name NUL-terminated)

    Buffer_int   names;         // points to name (offset in name_buffer)
    Buffer_pPick refs;          // points to Pick (NULL when the name has no definition)

    // Linear search for var_name; returns its index or -1.
    // var_name must not be NULL (it is passed straight to strcmp).
    int get_index(const char *const var_name){
        int len = names.content_size();
        for (int i=0; i< len; i++){
            if (strcmp(var_name, name_buffer + names[i]) == 0){
                return i;
            }
        }
        return -1;
    }

public:
    enum RESULT {EMPTY, NO_VAR, NO_REF, HAS_REF};

    // Looks up var_name.
    //   EMPTY   : var_name is NULL
    //   NO_VAR  : the name is unknown
    //   NO_REF  : the name is known but has no Pick
    //   HAS_REF : the name has a Pick; it is stored in *ref when ref != NULL
    RESULT find(const char *const var_name /*[in] c-string*/, Pick * * ref = NULL /*[out] Pick* */){
        if (!var_name) return EMPTY; // NULL name

        int i = get_index(var_name);
        if (i == -1){
            return NO_VAR;   // new name
        }
        if (!refs[i]){  // exist name, no ref
            return NO_REF;
        } else {
            if (ref) {
                *ref = refs[i];
            }
            return HAS_REF;   // exist name, has ref
        }
    }

    // Registers var_name and/or links it with var_ref.
    //   NULL name             : ignored
    //   new name              : stored together with var_ref (which may be NULL)
    //   known name, no Pick   : linked with var_ref
    //   known name, has Pick  : replaced only when var_ref is non-NULL; the
    //                           previous Pick is not deleted here
    void put(const char *const var_name, Pick *const var_ref = NULL){
        // Check for NULL before any lookup: get_index() hands the name to
        // strcmp, which must never receive a NULL pointer.
        if (!var_name){
            return;
        }

        int i = get_index(var_name);
        if (i == -1){    // new name
            int offset = name_buffer.content_size();
            name_buffer.append_array(var_name, strlen(var_name) + 1);
            names.append(offset);
            refs.append(var_ref);
            return;
        }

        if (!refs[i] || var_ref){
            refs[i] = var_ref;    // link definition with variable
        }
    }

    // TRUE when every known name has a Pick.
    UBool is_complete(){
        int n = names.content_size();
        for (int i=0; i<n; ++i){
            if (refs[i] == NULL){
                return FALSE;
            }
        }
        return TRUE;
    }

    // Forgets all names and deletes all stored Picks.
    void reset(){
        names.reset();
        name_buffer.reset();

        // release memory here
        int s = refs.content_size();
        for (int i=0; i < s; i++){
            delete refs[i]; // TOFIX: point alias/recursion problem
        }
        refs.reset();
    }

    ~SymbolTable(){
        reset();
    }
};


/*
// Document of class Escaper
//
// ATTENTION: 
// From http://icu-project.org/userguide/Collate_Customization.html.
// We get the precedence of escape/quote operations
//
//     (highest) 1. backslash               \
//               2. two single quotes       ''
//               3. quoting                 ' '
//
// ICU Collation should accept following as the same string.
//
// 1)  'ab'c        _
// 2)  a\bc          \
// 3)  a'b'\c        |- They are equal.
// 4)  abc          _/
//
// From "two single quotes", we have following deductions
//    D1. empty quoting is illegal. (obviously)
//    D2. no contact operation between two quotings   
//              '.''.'      is not ..   it is .'.
//    D3. "two single quotes" cannot contact two quoting simultaneously
//              '..''''.'   is not ..'. it is ..''.
//       NOTICE:
//        "two single quotes" can contact before one quoting
//              '''.'       is '.
//        "two single quotes" can literally contact after one quoting
//        But, from syntax, it's one quoting including a "two single quotes"
//              '.'''       is .'
//    D4. "two single quotes" cannot solely be included in quoting
//              ''''        is not '    it is ''
//       NOTICE:  These are legal
//              '.''.'      is .'.
//              '.'''       is .'
//
//                 decision
//                    /\
//                   /__\
//      output buffer    input buffer
// 
// To make our decision (within an atomic operation) without caring about the input and output buffers,
// the following calling patterns (within an atomic operation) shall be avoided
//
//    P1 open_quoting()  then close_quoting()    (direct violation)   D1
//    P2 close_quoting() then open_quoting()     (direct violation)   D2
//    P3 empty open_quoting()                    (indirect violation) D1, D4
//    P4 empty close_quoting()                   (indirect violation) D2, D3
//    P5 open_quoting()  then two single quotes  (indirect violation) D4
//    P6 close_quoting() then two single quotes  (indirect violation) D3
//
// two single quotes escaping will not open_ or close_ quoting()
// The choice will not lose some quoting forms.
//
// For open_quoting(), 
// we may get this form quoting     '''         P5
// It may raise a bug               ''''x
// If we expect
//      '''.'       let the next char open the quoting
//      '.''.'      the quoting is already opened by preceding char
//
// For close_quoting()
// we will get this form quoting    '.'''       P6
// It may raise a bug               '.''''.'
// If we expect          
//      '.'''\.     let the next char close the quoting
//      '.''''.'    the expectation is wrong!  using  '.'\''.' instead
//
// It's a hard work to re-adjust generation opportunity for various escaping form.
// We just simply ignore it.
*/
class Escaper{
// Rewrites a plain c-string into an escaped spelling that uses back slashes,
// single-quote quoting, or a mix of both. The quoting rules and the calling
// patterns that are deliberately avoided are described in the design notes
// that precede this class.
public:
    enum CHOICE {YES, NO, RAND};
    enum ESCAPE_FORM {BSLASH_ONLY, QUOTE_ONLY, QUOTE_AND_BSLAH, RAND_ESC};
private:
    // Adapts a CHOICE to UBool. A RAND choice is drawn again with rand()
    // every time the object is converted.
    class Bool{
        private:
            const CHOICE tag;
        public:
            Bool(CHOICE flag=RAND):tag(flag){}
            operator UBool() {
                if (tag == RAND){
                    return rand()%2;
                }
                return tag == YES;
            }
    };
public:
    // escapeLiteral   - whether ordinary (non special) characters get escaped too
    // twoQuotesEscape - whether a single quote is written as two single quotes
    // escapeForm      - which escaping mechanism(s) may be used
    Escaper(CHOICE escapeLiteral = RAND,
        CHOICE twoQuotesEscape = RAND,
        ESCAPE_FORM escapeForm = RAND_ESC):
        escape_form(escapeForm),
        escape_literal(escapeLiteral),
        two_quotes_escape(twoQuotesEscape),
        is_quoting(FALSE){}
private:
    Buffer_char str;            // the escaped output being built
    ESCAPE_FORM escape_form;
    Bool escape_literal;
    Bool two_quotes_escape;
    UBool quote_escape;         // refreshed by set_options() for every character
    UBool bslash_escape;        // refreshed by set_options() for every character
    UBool is_quoting;           // TRUE while an opened single quote is pending

    // Decide which mechanism(s) the next character uses. With RAND_ESC one
    // of the three concrete forms is drawn with rand().
    void set_options(){
        ESCAPE_FORM form = escape_form;
        if (form == RAND_ESC){
            form = (ESCAPE_FORM) (rand()%3);
        }

        if (form == BSLASH_ONLY){
            bslash_escape = TRUE;
            quote_escape = FALSE;
        } else if (form == QUOTE_ONLY){
            bslash_escape = FALSE;
            quote_escape = TRUE;
        } else if (form == QUOTE_AND_BSLAH){
            bslash_escape = TRUE;
            quote_escape = TRUE;
        }
        // any other value is an error; the flags are left untouched
    }

    void reset(){
        str.reset();
        is_quoting = FALSE;
    }

    // Emit the opening quote unless a quoting is already open.
    inline void open_quoting(){
        if(!is_quoting){
            str.append('\'');
            is_quoting = TRUE;
        }
    }

    // Emit the closing quote if a quoting is currently open.
    inline void close_quoting(){
        if(is_quoting){
            str.append('\'');
            is_quoting = FALSE;
        }
    }

    // Bring the quoting state in line with the current quote_escape flag.
    inline void apply_quoting_mode(){
        if (quote_escape){
            open_quoting();
        } else {
            close_quoting();
        }
    }

    // str  [in]    null-terminated c-string
    void append(const char * strToAppend){
        while (*strToAppend != 0){
            append(*strToAppend);
            strToAppend++;
        }
    }

    // Append one character in escaped form.
    inline void append(const char c){
        set_options();

        if (c == '\\'){
            // a back slash is always written as two back slashes
            apply_quoting_mode();
            str.append('\\');
            str.append('\\');
            return;
        }

        if (c == '\''){
            if (two_quotes_escape){
                // escaped as two single quotes; this neither opens nor
                // closes a quoting (see the design notes before this class)
                str.append('\'');
                str.append('\'');
            } else {
                // a single quote is always back-slash escaped here
                apply_quoting_mode();
                str.append('\\');
                str.append('\'');
            }
            return;
        }

        // Special characters and white space are always escaped; any other
        // character is escaped only when escape_literal says so. The
        // short-circuit keeps escape_literal from being consulted for
        // special characters and white space.
        if (isSpecialAsciiChar(c) || isWhiteSpace(c) || escape_literal){
            apply_quoting_mode();
            if (bslash_escape){
                str.append('\\');
            }
            str.append(c);
        } else {
            close_quoting();
            str.append(c);
        }
    }

public:
    // Return a null-terminate c-string. The buffer is owned by callee.
    char * operator()(const char * literal /*c-string*/){
        str.reset();
        while (*literal != 0){
            append(*literal);
            literal++;
        }
        close_quoting();    // P4 exception, to close whole quoting
        return str;
    }
};
    
class WeightedRand{
// Return a random number in [0, size)
// Every number has different chance (aka weight) to be selected.
private:
    Buffer_int weights;     // one weight per selectable number
    double total;           // sum of all weights
    // not copyable
    WeightedRand(const WeightedRand &);
    WeightedRand & operator = (const WeightedRand &);
public:
    // weight_list - optional initial weights; may be NULL
    // size        - how many numbers can be selected. Missing weights are
    //               filled with DEFAULT_WEIGHT, surplus weights are ignored.
    //               A negative size is treated as 0.
    WeightedRand(Buffer_int * weight_list = NULL, int size = 0){
        if (size < 0){
            // a negative count must never reach append_array()
            size = 0;
        }
        if ( weight_list == NULL){
            for (int i=0; i<size; ++i) weights.append(DEFAULT_WEIGHT);
        } else {
            int s = weight_list->content_size();
            if (s < size){
                weights.append_array( (*weight_list),s);
                for (int i=s; i<size; ++i) weights.append(DEFAULT_WEIGHT);
            } else { // s >= size
                weights.append_array( (*weight_list),size);
            }
        }
        total = 0;
        int c = weights.content_size();
        for (int i=0; i<c; ++i){
            total += weights[i];
        }
    }

    // Add one more selectable number with the given weight.
    void append(int weight){
        weights.append(weight);
        total += weight;
    }

    // Give a random number with the consideration of weight.
    // Every random number is associated with a weight.
    // It identifies the chance to be selected,
    // larger weight has more chance to be selected.
    //
    //
    //  ______________________   every slot has equal chance
    //
    //  [____][_][___][______]   each item has different slots, hence different chance
    //
    //
    //  The algorithms to generate the number is illustrated by preceding figure.
    //  First, a slot is selected by rand(). Then we translate the slot to corresponding item.
    //
    //  Returns 0 when no weight has been registered.
    int next(){
        // get a random in [0,1]
        double reference_mark = (double)rand() / (double)RAND_MAX;

        // get the slot's index, 0 <= mark <= total;
        double mark = total * reference_mark;

        // Translate the slot to corresponding item. The scan stops at the
        // last item at the latest, so the index never leaves the table.
        const int last = weights.content_size() - 1;
        int i = 0;
        while (i < last){
            mark -= weights[i];
            if (mark <= 0)
                break;
            i++;
        }
        return i;
    }
};
    
///////////////////////////////////////////////////////////
//
// The parser result nodes
//
    
class Literal : public Pick {
// A node that yields the same fixed text on every call.
private:
    Buffer_char str; //null-terminated c-string
public:
    // Copies s, including its terminating null, into the node.
    Literal(const char * s /*c-string*/){
        const size_t length_with_null = strlen(s) + 1;
        str.append_array(s, length_with_null);
    }

    // Returns the stored text. The buffer is owned by this node.
    virtual const char* next(){
        return str;
    }
};

class Variable : public Pick {
public:
    Variable(SymbolTable * symbols, const char * varName, Pick * varRef = NULL){
        this->var_name.append_array(varName, strlen(varName) + 1);
        if ((symbol_table = symbols)){
            symbol_table->put(varName, varRef);
        }
    }

    operator const char *(){
        return var_name;
    }

    virtual const char* next(){
        if (symbol_table){
            Pick * var_ref = NULL;
            symbol_table->find(var_name, &var_ref);
            if (var_ref) {
                return var_ref->next();
            }
        }
        return "";  // dumb string
    }
private:
    Buffer_char var_name;
    SymbolTable * symbol_table;
};

class Quote : public Pick{
// Wraps another node and escapes whatever that node generates. The escaper
// is configured to use back slashes only, to leave ordinary characters
// alone, and never to use the two-single-quotes form.
public:
    Quote(Pick & base)
        :item(base),
         e(Escaper::NO, Escaper::NO, Escaper::BSLASH_ONLY){
    }

    // Returns the escaped form of the wrapped node's next string. The
    // buffer is owned by the escaper.
    virtual const char* next(){
        const char * raw = item.next();
        return e(raw);
    }
private:
    Pick & item;
    Buffer_char str;
    Escaper e;
};


class Morph : public Pick{
/*
The difference between morph and an arbitrary random string is that
a morph changes slowly. When we build collation rules, for example,
it is a much better test if the strings we use are all in the same
'neighborhood'; they share many common characters.
*/
public:
    Morph(Pick & base):item(base){}

    // Fetches a fresh string from the wrapped node. The very first call
    // returns that string unchanged; later calls return a blend of the
    // fresh string and the one fetched by the previous call.
    virtual const char* next(){
        current.reset();
        const char * fresh = item.next();
        current.append_array(fresh, strlen(fresh) + 1);

        if (last.content_size() != 0) {
            morph();
            return str;
        }

        // nothing remembered yet: hand out the fresh string and remember it
        str.reset();
        last.reset();
        str.append_array(current, current.content_size());
        last.append_array(current, current.content_size());
        return str;
    }
private:
    Pick & item;
    Buffer_char str;        // the string handed out
    Buffer_char last;       // the string fetched by the previous call
    Buffer_char current;    // the string fetched by this call

    char * p_last;          // read position inside 'last'
    char * p_curr;          // read position inside 'current'

    // Move one character from 'current' to str, unless 'current' is used up.
    void copy_curr(){
        if (*p_curr == 0) {
            return;
        }
        str.append(*p_curr);
        p_curr++;
    }

    // Move one character from 'last' to str, unless 'last' is used up.
    void copy_last(){
        if (*p_last == 0) {
            return;
        }
        str.append(*p_last);
        p_last++;
    }

    // copy 0, 1, or 2 character(s) to str
    void copy(){
        static WeightedRand wr(& Buffer_int().append(DEFAULT_WEIGHT * 10), 5);

        const int choice = wr.next();
        if (choice == 0){           // copy last  -- has 10 times chance than others
            copy_last();
        } else if (choice == 1){    // copy both, current first
            copy_curr();
            copy_last();
        } else if (choice == 2){    // copy both, last first
            copy_last();
            copy_curr();
        } else if (choice == 3){    // copy current only
            copy_curr();
        }
        // choice == 4: copy nothing; any other value: ASSERT(FALSE)
    }

    // Build str as a blend of 'last' and 'current', then remember
    // 'current' for the next round.
    void morph(void){
        int shorter = strlen(last);
        int longer = strlen(current);
        if (shorter > longer){
            int swapped = shorter;
            shorter = longer;
            longer = swapped;
        }

        // target length: shorter + [0, difference]
        int len = shorter + rand()%(longer - shorter + 1);
        p_curr = current;
        p_last = last;
        str.reset();

        // interleave while both sources still have characters
        while (str.content_size() < len && *p_curr && *p_last){
            copy(); // copy 0, 1, or 2 character(s) to str
        }

        if (str.content_size() == len) {
            str.append(0);
            final();
            return;
        }

        if (str.content_size() > len) { // if the last copy copied two characters
            str[len]=0;
            final();
            return;
        }

        // str.content_size() < len: drain whichever source is left
        if (*p_last) {
            while (str.content_size() < len){
                copy_last();
            }
        } else if (*p_curr){
            while (str.content_size() < len){
                copy_curr();
            }
        }

        // still short: pad with randomly picked elements of 'last'
        int last_len = last.content_size();
        while (str.content_size() < len){
            str.append(last[rand()%last_len]);
        }
        str.append(0);
        final();
    }

    // Remember 'current' as the new 'last'.
    void final(){
        last.reset();
        last.append_array(current, current.content_size());
    }
};

class Sequence : public Pick {
// A node that concatenates the output of its child nodes, in the order in
// which they were appended. The children are deleted with the sequence.
public:
    // Returns the concatenation of one next() string from every child.
    // The buffer is owned by this node.
    virtual const char* next(){
        str.reset();
        const int count = items.content_size();
        int index = 0;
        while (index < count){
            const char * piece = items[index]->next();
            str.append_array(piece, strlen(piece));
            ++index;
        }
        str.append(0); // terminal null
        return str;
    }

    // Adds a child at the end of the sequence.
    void append (Pick * node){
        items.append(node);
    }

    virtual ~Sequence(){
        // The children are assumed to live on the heap; nothing here can
        // verify that.
        const int count = items.content_size();
        int index = 0;
        while (index < count){
            delete items[index]; // TOFIX: point alias/recursion problem
            items[index] = NULL;
            ++index;
        }
    }
private:
    Buffer_pPick items;
    Buffer_char  str; //null-terminated c-string
};

class Repeat : public Pick {
// A node that repeats its child a randomly chosen number of times. The
// count is minCount plus a weighted random offset. The child is deleted
// with this node.
private:
    Pick * item;
    Buffer_char str;
    WeightedRand wr;    // one slot per possible count
    int min;
    int max;

    // Pick how many times the child is repeated this round.
    int select_a_count(){
        const int offset = wr.next();
        return min + offset;
    }
public:
    // Returns the concatenation of the child's output. The buffer is
    // owned by this node.
    virtual const char* next(){
        str.reset();
        int remaining = select_a_count();
        while (remaining > 0){
            const char * piece = item->next();
            str.append_array(piece, strlen(piece));
            --remaining;
        }
        str.append(0);
        return str;
    }

    // base     - the node to repeat
    // minCount - smallest repeat count
    // maxCount - largest repeat count
    // weights  - optional weights for the counts minCount .. maxCount
    Repeat(Pick * base, int minCount =0, int maxCount = 1, Buffer_int * weights = NULL):
        item(base),
        wr(weights, maxCount-minCount +1),
        min(minCount),
        max(maxCount) {
    }

    virtual ~Repeat(){
        delete item;  // TOFIX: point alias/recursion problem
        item = NULL;
    }
};


class Alternation : public Pick {
// A node that yields the output of one of its children, picked by
// weighted random choice. The children are deleted with this node.
public:
    // Returns one next() string of a randomly selected child. The buffer
    // is owned by this node.
    virtual const char* next(){
        str.reset();
        const int chosen = wr.next();
        const char * text = items[chosen]->next();
        str.append_array(text, strlen(text) + 1);
        return str;
    }

    virtual ~Alternation(){
        const int count = items.content_size();
        int index = 0;
        while (index < count){
            delete items[index];  // TOFIX: point alias/recursion problem
            items[index] = NULL;
            ++index;
        }
    }

    // Adds a child with its weight; returns *this so calls can be chained.
    Alternation & append (Pick * node, int weight = DEFAULT_WEIGHT){
        items.append(node);
        wr.append(weight);
        return *this;
    }
private:
    Buffer_pPick items;
    Buffer_char str; // null-terminated c-string
    WeightedRand wr;
};
    
///////////////////////////////////////////////////////////
//
// The parser
//

// The kinds of token the scanner can report.
enum TokenType {
    STRING,         // literal text
    VAR,            // '$' followed by a name
    NUMBER,         // a run of digits
    STREAM_END,     // end of the source text
    ERROR,          // anything the scanner does not accept
    QUESTION,       // '?'
    STAR,           // '*'
    PLUS,           // '+'
    LBRACE,         // '{'
    RBRACE,         // '}'
    LPAR,           // '('
    RPAR,           // ')'
    SEMI,           // ';'
    EQ,             // '='
    COMMA,          // ','
    BAR,            // '|'
    AT,             // '@'
    WAVE,           // '~'
    PERCENT         // '%'
};

// Splits the grammar source text into tokens, one token per call to
// getNextToken(). The text and the type of the most recent token are
// exposed through the public members 'token' and 'tokenType'.
class Scanner{
friend int DumpScanner(Scanner & s, UBool dumb);
private:
    const char * source;  // start of the text being scanned (owned by the caller)
    const char * working; // next character to read
    const char * history; // for debug: where the scan of the current token started
    enum StateType {START, IN_NUM, IN_VAR_FIRST, IN_VAR, IN_QUOTE, IN_QUOTE_BSLASH, IN_BSLASH, IN_STRING, DONE};
    StateType state;

    // Finish the current token with type t. The character that was just
    // read does not belong to the token, so it is given back first.
    void terminated(TokenType t){
        working--;       // return the peeked character
        tokenType = t;
        token.append(0); // close buffer
        state = DONE;
    }
public:
    // the buffer of "source" is owned by caller
    Scanner(const char *src/*[in] c-string*/ = NULL):source(src){
        working = src;
        history = working;
        state = DONE;
        tokenType = ERROR;
    }

    //void setSource(const char *const src /*[in] c-string*/){
    //    *(&const_cast<const char *>(source)) = src;
    //}

    Buffer_char token;    // text of the most recently scanned token
    TokenType tokenType;  // type of the most recently scanned token

    // Scan the next token, store its text in 'token' and its type in
    // 'tokenType', and return the type.
    //
    // Reaching the end of the text in any state other than START, i.e.
    // in the middle of a token, is reported as ERROR.
    TokenType getNextToken(){
        token.reset();
        state = START;
        history = working; // for debug
        while (state != DONE){
            char c = *working++;
            if (c == 0 && state != START){//avoid buffer overflow. for IN_QUOE, IN_ESCAPE
                terminated(ERROR);
                break; // while
            }
            switch(state){
                case START:
                    tokenType = ERROR;
                    // single character tokens, and the end of the text
                    switch(c){
                        case '?'  : tokenType = QUESTION; break;
                        case '*'  : tokenType = STAR; break;
                        case '+'  : tokenType = PLUS; break;
                        case '{'  : tokenType = LBRACE; break;
                        case '}'  : tokenType = RBRACE; break;
                        case '('  : tokenType = LPAR; break;
                        case ')'  : tokenType = RPAR; break;
                        case ';'  : tokenType = SEMI; break;
                        case '='  : tokenType = EQ; break;
                        case ','  : tokenType = COMMA; break;
                        case '|'  : tokenType = BAR; break;
                        case '@'  : tokenType = AT; break;
                        case '~'  : tokenType = WAVE; break;
                        case '%'  : tokenType = PERCENT; break;
                        case 0    : tokenType = STREAM_END; working-- /*avoid buffer overflow*/; break;
                    }
                    if (tokenType != ERROR){
                        token.append(c);
                        token.append(0);
                        state = DONE;
                        break; // START
                    }
                    // first character of a multi character token
                    switch(c){
                        case '$'  : state = IN_VAR_FIRST; token.append(c); break;
                        case '\'' : state = IN_QUOTE;     break;
                        case '\\' : state = IN_BSLASH;    break;
                        default:
                            if (isWhiteSpace(c)){    // state = START;   //do nothing
                            } else if (isDigit(c)){     state = IN_NUM;    token.append(c);
                            } else if (isAlphabet(c)){  state = IN_STRING; token.append(c);
                            } else {terminated(ERROR);}
                    }
                    break;//START
                case IN_NUM:
                    // a number is a run of digits
                    if (isDigit(c)){
                        token.append(c);
                    } else {
                        terminated(NUMBER);
                    }
                    break;//IN_NUM
                case IN_VAR_FIRST:
                    // the character right after '$' must be a letter
                    if (isAlphabet(c)){
                        token.append(c);
                        state = IN_VAR;
                    } else {
                        terminated(ERROR);
                    }
                    break; // IN_VAR_FISRT
                case IN_VAR:
                    // the rest of a variable name: letters and digits
                    if (isAlphabet(c) || isDigit(c)){
                        token.append(c);
                    } else {
                        terminated(VAR);
                    }
                    break;//IN_VAR
                case IN_STRING:
                    // About the scanner's behavior for STRING, AT, and ESCAPE:
                    // All of them can be contacted with each other.
                    // This means the scanner will eat up as much as possible strings
                    //   (STRING, AT, and ESCAPE) at one time, with no regard of their
                    //   combining sequence.
                    //
                    if (c == '\''){
                        state = IN_QUOTE; // the first time we see single quote
                    } else if (c =='\\'){ // back slash character
                        state = IN_BSLASH;
                    } else if (isAlphabet(c) || isDigit(c)){
                        token.append(c);
                    } else{
                        terminated(STRING);
                    }
                    break;//IN_STRING
                case IN_QUOTE:
                    if (c == '\''){ // the second time we see single quote
                        state = IN_STRING; // see document in IN_STRING
                    } else if ( c== '\\') { // backslah escape in quote
                        state = IN_QUOTE_BSLASH;
                    } else {
                        token.append(c);  // eat up everything else
                    }
                    break;//IN_QUOTE
                case IN_QUOTE_BSLASH:
                case IN_BSLASH:
                    // the character after a back slash
                    switch (c){
                        case 'n'  : token.append('\n'); break;
                        case 'r'  : token.append('\r'); break;
                        case 't'  : token.append('\t'); break;
                        case '\'' : token.append('\''); break;
                        case '\\' : token.append('\\'); break;
                        default: token.append(c); // unknown escaping, treat it as literal
                    }
                    // go back to the state the back slash was met in
                    if (state == IN_BSLASH){
                        state = IN_STRING; // see document in IN_STRING
                    } else { // state == IN_QUOTE_BSLASH
                        state = IN_QUOTE;
                    }
                    break;//IN_BSLASH
                case DONE:  /* should never happen */
                default:
                    working--;
                    tokenType = ERROR;
                    state = DONE;
                    break;
            }//switch(state)
        }//while (state != DONE)

        return tokenType;
    }
};//class Scanner

// Recursive descent parser for the grammar text. Every successfully parsed
// rule is stored in 'symbols' under the rule's variable name.
//
// Every parsing function returns TRUE on success and FALSE on failure.
// Functions whose node parameter is marked "out" require it to be NULL on
// entry; functions marked "in,out" require it to be non-NULL.
class Parser{
friend UBool TestParser();
friend class TestParserT;
friend class LanguageGenerator_impl;
private:
    Scanner s;
    TokenType & token;  // alias of s.tokenType: the type of the current token
    int min_max;   // for the evil infinite: the largest repeat bound seen so far

    // Consume the current token if it has the expected type.
    UBool match(TokenType expected){
        if (token == expected) {
            token = s.getNextToken();
            return TRUE;
        } else {
            //s.dumpCurrentPoint();
            return FALSE;
        }
    }

    // Parse "NUMBER %". On success the number is stored in value.
    // NOTE: the NUMBER stays consumed even when no '%' follows it.
    UBool weight(int & value){
        if (token == NUMBER){
            int temp = atoi(s.token);
            match(NUMBER);
            if (match(PERCENT)){
                value = temp;
                return TRUE;
            }
        }
        return FALSE;
    }

    // Parse a repeat operator ( ?  *  +  {n}  {n,}  {n,m} ) together with
    // the weights that follow it, and wrap node into a Repeat.
    UBool repeat (Pick* &node /*in,out*/){
        if (node == NULL) return FALSE;

        // -2 marks "not set yet"; for max and count, -1 stands for "no upper limit"
        int count = -2;
        int min = -2;
        int max = -2;
        UBool question = FALSE;
        switch (token){
            case QUESTION:
                match(QUESTION);
                min = 0;
                max = 1;
                count = 2;
                question = TRUE;
                break;
            case STAR:
                match(STAR);
                min = 0;
                max = -1;
                count = -1;
                break;
            case PLUS:
                match(PLUS);
                min = 1;
                max = -1;
                count = -1;
                break;
            case LBRACE:
                match(LBRACE);
                if (token != NUMBER){
                    return FALSE;
                }else {
                    min = atoi(s.token);
                    match(NUMBER);
                    if (token == RBRACE){
                        match(RBRACE);
                        max = min;
                        count = 1;
                    } else if (token == COMMA) {
                        match(COMMA);
                        if (token == RBRACE){
                            match(RBRACE);
                            max = -1;
                            count = -1;
                        } else if (token == NUMBER) {
                            max = atoi(s.token);
                            match(NUMBER);
                            if (max < min) {
                                // A reversed range such as {5,3} would give the
                                // Repeat node a zero or negative number of
                                // count slots; reject it here.
                                return FALSE;
                            }
                            count = max - min + 1;
                            if (!match(RBRACE)) {
                                return FALSE;
                            }
                        } else {
                            return FALSE;
                        }
                    } else {
                        return FALSE;
                    }
                }
                break;
            default:
                return FALSE;
        }

        if (count == -2 || min == -2 || max == -2){
            //ASSERT(FALSE);
            return FALSE;
        }

        // eat up following weights
        Buffer_int weights;
        int w;
        while (weight(w)){
            weights.append(w);
        }

        // for the evil infinite
        min_max = min_max > min ? min_max : min;
        min_max = min_max > max ? min_max : max;
        if (min_max > PSEUDO_INFINIT){
            return FALSE; // PSEUDO_INFINIT is less than the real maximum
        }
        if (max == -1){ // the evil infinite
            max = PSEUDO_INFINIT;
        }
        // for the strange question mark: a single weight after '?' is the
        // weight of "present"; "absent" gets the rest of DEFAULT_WEIGHT
        if (question && weights.content_size() > 0){
            Buffer_int w2;
            w2.append(DEFAULT_WEIGHT - weights[0]).append(weights[0]);
            node = new Repeat(node,min,max,&w2);
            return TRUE;
        }
        node = new Repeat(node,min,max,&weights);
        return TRUE;
    }

    // Parse a parenthesized definition, a variable, or a string.
    UBool core(Pick* &node /*out*/){
        if (node != NULL) return FALSE; //assert node == NULL

        switch(token){
            case LPAR:
                match(LPAR);
                if(defination(node) && match(RPAR)){
                    return TRUE;
                }
                return FALSE;
            case VAR:
                node = new Variable(&symbols, s.token);
                match(VAR);
                return TRUE;
            case STRING:
                node = new Literal(s.token);
                match(STRING);
                return TRUE;
            default:
                return FALSE;
        }
    }

    // Parse a core followed by any number of modifiers:
    // '~' (morph), '@' (quote), or a repeat operator.
    UBool modified(Pick* &node /*out*/){
        if (node != NULL) return FALSE; //assert node == NULL

        if (!core(node)) {
            return FALSE;
        }

        for (;;){
            switch(token){
                case WAVE:
                    match(WAVE);
                    node = new Morph(*node);
                    break;
                case AT:
                    match(AT);
                    node = new Quote(*node);
                    break;
                case QUESTION:
                case STAR:
                case PLUS:
                case LBRACE:
                    if (!repeat(node)) return FALSE;
                    break;
                case SEMI:      // rule definiation closed
                case RPAR:      // within parenthesis (core closed)
                case BAR:       // in alternation
                case NUMBER:    // in alternation, with weight
                case LPAR:      // in sequence
                case VAR:       // in sequence
                case STRING:    // in sequence
                    return TRUE;
                default:
                    return FALSE;
            }
        }
    }


    // Parse the remaining items of a sequence whose first item is node, and
    // replace node with the complete Sequence. On failure the sequence
    // built so far is deleted, including the items already appended to it.
    UBool sequence_list(Pick* &node /*in,out*/){
        if (node == NULL) return FALSE; // assert node != NULL

        Sequence* seq = new Sequence();
        Pick * n = node;

        while (token == VAR || token == STRING || token == LPAR){
            seq->append(n);
            n = NULL;
            if (modified(n)){
                // go on
            } else {
                goto FAIL;
            }
        }

        if (token == SEMI || token == RPAR || token == BAR){
            seq->append(n);
            node = seq;
            return TRUE;
        }
FAIL:
        delete seq;
        return FALSE;

    }

    // Parse one modified item, or a sequence of them.
    UBool sequence(Pick* &node /*out*/){
        if (node != NULL) return FALSE; //assert node == NULL

        if (!modified(node)) {
            return FALSE;
        }

        if (token == VAR || token == STRING || token == LPAR){
            return sequence_list(node);
        } else {
            return TRUE; // just a modified
        }
    }

    // Parse the remaining branches of an alternation whose first branch is
    // node, and replace node with the complete Alternation. A branch may be
    // followed by a weight. On failure the alternation built so far is
    // deleted, including the branches already appended to it.
    UBool alternation_list(Pick* &node /*in,out*/){
        if (node == NULL) return FALSE; // assert node != NULL

        Alternation * alt = new Alternation();
        Pick * n = node;
        int w = DEFAULT_WEIGHT;

        while (token == NUMBER || token == BAR){
            if(token == NUMBER) {
                if (weight(w)){
                    if (token == BAR){
                        // the middle item, go on
                    } else {
                        // the last item or encounter error
                        break; //while
                    }
                } else {
                    goto FAIL;
                }
            } // else token == BAR
            match(BAR);
            alt->append(n,w);

            n = NULL;
            w = DEFAULT_WEIGHT;
            if (sequence(n)){
                // go on
            } else {
                goto FAIL;
            }
        }

        if (token == SEMI || token == RPAR) {
            alt->append(n,w);
            node = alt;
            return TRUE;
        }
FAIL:
        delete alt;
        return FALSE;
    }

    // Parse one sequence, or an alternation of sequences.
    UBool alternation(Pick* &node /*out*/){
        if (node != NULL) return FALSE; //assert node == NULL

        // 'sequence' has higher precedence than 'alternation'
        if (!sequence(node)){
            return FALSE;
        }

        if (token == BAR || token == NUMBER){ // find a real alternation1, create it.
            return alternation_list(node);
        } else {
            return TRUE;    // just a sequence_old
        }
    }


    // Parse the right-hand side of a rule.
    UBool defination(Pick* &node /*out*/){
        if (node != NULL) return FALSE; //assert node == NULL
        return alternation(node);
    }

    // Parse one rule, "VAR = definition ;", and store the definition in
    // the symbol table under the variable's name.
    UBool rule(){
        if (token == VAR){
            Buffer_char name;
            name.append_array(s.token, strlen(s.token) + 1);
            match(VAR);

            if (match(EQ)){
                Pick * t = NULL;
                if(defination(t)){
                    symbols.put(name, t);
                    return match(SEMI);
                }
            }
        }
        return FALSE;
    }
public:
    // Parse every rule of the source text. Returns TRUE only if the whole
    // text was consumed. The symbol table is emptied first.
    UBool rules(){
        symbols.reset();
        token = s.getNextToken();
        while (rule()){
        }
        if (token == STREAM_END){
            return TRUE;
        } else {
            //s.dumpCurrentPoint();
            return FALSE;
        }
    }

public:
    SymbolTable symbols;    // the rules parsed so far, by variable name

    // The buffer of "source" is owned by the caller.
    Parser(const char *const source):s(source), token(s.tokenType){
        min_max = -2;
    }

    // Same as rules().
    UBool parse(){
        return rules();
    }

}; // class Parser
        

///////////////////////////////////////////////////////////
//
// 
//

// Prints the scanner's position inside its source text: the whole text,
// the part before the current token, the current token's span, and the
// part not read yet. Nothing is printed when dump is FALSE.
//
// Returns the offset, from the start of the source text, at which the scan
// of the current token started.
int DumpScanner(Scanner & s, UBool dump = TRUE){
    const int len = strlen(s.source);
    const int error_start_offset = s.history - s.source;
    if (!dump){
        return error_start_offset;
    }

    printf("\n=================== DumpScanner ================\n");
    fwrite(s.source, len, 1, stdout);

    printf("\n-----parsed-------------------------------------\n");
    fwrite(s.source, s.history - s.source, 1, stdout);

    printf("\n-----current------------------------------------\n");
    fwrite(s.history, s.working - s.history, 1, stdout);

    printf("\n-----unparsed-----------------------------------\n");
    fwrite(s.working, (s.source + len - s.working), 1, stdout);

    printf("\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n");
    return error_start_offset;
}

class LanguageGenerator_impl{
public:
    LanguageGenerator_impl(const char *const bnf_definition, const char *const top_node)
        :par(bnf_definition), top_node_name(top_node){
        srand((unsigned)time( NULL ));
    }

    LanguageGenerator::PARSE_RESULT parseBNF(UBool debug = TRUE){
        if (par.parse()){
            if (par.symbols.find(top_node_name, &top_node_ref) == SymbolTable::HAS_REF) {
                if (par.symbols.is_complete()) {
                    return LanguageGenerator::OK;
                } else {
                    if (debug) printf("The bnf definition is incomplete.\n");
                    return LanguageGenerator::INCOMPLETE;
                }
            } else {
                if (debug) printf("No top node is found.\n");
                return LanguageGenerator::NO_TOP_NODE;
            }
        } else {
            if(debug) {
                printf("The bnf definition is wrong\n");
                DumpScanner(par.s, TRUE);
            }
            return LanguageGenerator::BNF_DEF_WRONG;
        }
    }
    const char * next(){
        return top_node_ref->next();
    }

private:
    Parser par;
    const char *const top_node_name;
    Pick * top_node_ref;
};

// Starts out without an implementation object; parseBNF() creates one.
LanguageGenerator::LanguageGenerator()
    :lang_gen(NULL)
{
}

// Releases the implementation object, if there is one.
LanguageGenerator::~LanguageGenerator()
{
    delete lang_gen;
}

// Replaces any previously loaded grammar with bnf_definition and parses it.
// When parsing does not end with OK, the generator is left without a
// grammar. Returns the parse result.
LanguageGenerator::PARSE_RESULT LanguageGenerator::parseBNF(const char *const bnf_definition /*in*/, const char *const top_node/*in*/, UBool debug){
    delete lang_gen;    // deleting a null pointer does nothing
    lang_gen = new LanguageGenerator_impl(bnf_definition, top_node);

    const PARSE_RESULT result = lang_gen->parseBNF(debug);
    if (result != OK){
        delete lang_gen;
        lang_gen = NULL;
    }
    return result;
}
// Return a null-terminated c-string. The buffer is owned by callee.
// Without a loaded grammar the result is an empty string.
const char *LanguageGenerator::next(){
    return lang_gen ? lang_gen->next() : "";
}

///////////////////////////////////////////////////////////
//
// The test code for WBNF
//

// Calls the test function 'fun' (no arguments) and prints "Pass: <name>"
// when it returns a true value, "FAILED: !!! <name> !!!" otherwise.
// The expansion is a bare if/else statement, not wrapped in do { } while (0).
#define CALL(fun) \
    if (fun()){ \
        printf("Pass: " #fun "\n");\
    } else { \
        printf("FAILED: !!! " #fun " !!!\n"); \
    }

// Prints a header naming 'fun', then 'times' strings obtained from
// var.next(), one per line, then a closing line.
// 'var' is pasted in front of ".next()" as is, so a compound expression
// has to be parenthesized by the caller.
#define DUMP_R(fun, var, times) \
    {printf("\n========= " #fun " =============\n"); \
    for (int i=0; i<times; i++) { \
        const char * t = var.next();\
        fwrite(t,strlen(t),1,stdout); \
        printf("\n");   \
    }   \
    printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n");}



#if TEST_WBNF_TEST    
// Visual check of Quote: dumps the plain literal once and then 20 results of
// Quote wrapped around it. Nothing is asserted; the function always returns
// FALSE and the printed output is meant to be inspected by hand.
static UBool TestQuote(){
    const char *const str = "This ' A !,z| qq [] .new\tline";
    //const char *const str_r = "This \\' A '!,'z'|' qq '[]' '.'new\tline";
    ////
    //// :(  we must quote our string to following C syntax
    ////     cannot type the literal here, it makes our code rather human unreadable
    ////     very very uncomfortable!
    ////
    ///* 
    //*/

    //const char *const s1    =   "ab'c";
    //const char (* s1_r1) [] = { "ab''c",    // ab''c
    //                            "ab\\'c",   // ab\'c
    //                           };//
    ///*
    // .      '.'     \.
    // ..     \.\.    '.'\.   '.'\.   '..'    // '.''.'  wrong
    //*/

    //const char *const s2    =   "a..'.b";       // a..'.b
    //const char (*s2_r) []   = { "a'..''.'b"     // a'..''.'b   
    //                           ,"a'..\\'.'b"    // a'..\'.'b
    //                           ,"a'..'\\''.'b"  // a'..'\''.'b
    //                          };//

    //const char *const s3    =   "a..\\.b";      // a..\.b
    //const char (*s3_r) []   = { "a'..\\\\.'b"   // a'..\\.'b
    //                           ,"a'..'\\\\'.'b" // a'..'\\'.'b
    //                          };//

    //                            // no concat operation, no choice, must be compact

    srand((unsigned)time( NULL ));

    //Escaper l(Escaper::NO, Escaper::NO, Escaper::RAND_ESC);
    Pick *plain = new Literal(str);
    Quote quoted(*plain);

    // First the unquoted text for reference, then the quoted variants.
    DUMP_R(TestQuote, (*plain), 1);
    DUMP_R(TestQuote, quoted, 20);
    return FALSE;
}
// A Literal must hand back exactly the text it was constructed with.
static UBool TestLiteral(){
    const char * expected = "test string99.";
    Literal lit(expected);
    const char * produced = lit.next();
    return strcmp(expected, produced) == 0;
}

// A Sequence of two literals must yield their texts joined in append order.
static UBool TestSequence(){
    Sequence seq;
    seq.append(new Literal("abc "));
    seq.append(new Literal(", s"));

    const char * joined = seq.next();
    return strcmp(joined, "abc , s") == 0;
}
// Visual check of Alternation: registers four literal branches with the
// weights 10, 0, 10 and 50 and dumps 50 picks. Nothing is asserted; the
// function always returns FALSE.
static UBool TestAlternation(){
    // Branch text and the weight passed to append(), in registration order.
    static const struct {
        const char * text;
        int          weight;
    } branches[] = {
        { "aaa_10%",     10 },
        { "bbb_0%",       0 },
        { "ccc_10%",     10 },
        { "ddddddd_50%", 50 }
    };
    const int branch_count = (int)(sizeof(branches) / sizeof(branches[0]));

    srand((unsigned)time( NULL ));
    Alternation alt;
    for (int k = 0; k < branch_count; ++k){
        alt.append(new Literal(branches[k].text), branches[k].weight);
    }

    DUMP_R(TestAlternation, alt, 50);

    return FALSE;
}

// Visual check of Buffer_int: appends 1, 0, 5 and prints every element, one
// per line. Nothing is asserted; the function always returns FALSE.
static UBool TestBuffer(){
    Buffer_int values;
    values.append(1).append(0).append(5);

    const int count = values.content_size();
    int idx = 0;
    while (idx < count){
        printf("%d\n", values[idx]);
        ++idx;
    }
    return FALSE;
}

// Visual check of WeightedRand: prints 50 draws from a generator built with
// the weight list 10, 0, 50 and a size of 4. Nothing is asserted; the function
// always returns FALSE.
static UBool TestWeightedRand(){
    srand((unsigned)time( NULL ));

    // Only needed by the commented-out alternative constructor call below.
    Buffer_int spare;
    spare.append(1).append(0).append(5);

    // NOTE(review): the weight list is a temporary that is destroyed at the end
    // of this statement; presumably WeightedRand copies what it needs in its
    // constructor -- confirm.
    WeightedRand wr(&Buffer_int().append(10).append(0).append(50),4);
//    WeightedRand wr(&spare,3);
    for (int remaining = 50; remaining > 0; --remaining){
        printf("%d\n", wr.next());
    }
    return FALSE;
}

// Visual check of Repeat: dumps 50 results each for a 1..5 repetition, a 1..3
// repetition given an explicit weight list, and a fixed 5..5 repetition.
// Nothing is asserted; the function always returns FALSE.
static UBool TestRepeat(){
    srand((unsigned)time( NULL ));

    Repeat one_to_five(new Literal("aaa1-5 "), 1, 5);
    DUMP_R(TestRepeat, one_to_five, 50);

    // NOTE(review): the weight list is a temporary that is gone once this
    // statement ends; presumably Repeat copies it -- confirm.
    Repeat weighted(new Literal("b{1,3}1%0%5% "), 1, 3, &Buffer_int().append(1).append(0).append(5));
    DUMP_R(TestRepeat, weighted, 50);

    Repeat exactly_five(new Literal("aaa5-5 "), 5, 5);
    DUMP_R(TestRepeat, exactly_five, 50);

    return FALSE;
}

// Checks that a Variable produces the same text as the Pick bound to it, for
// three ways of binding: through the Variable constructor, through
// SymbolTable::put() with the Variable, and through put() with the name.
static UBool TestVariable(){
    SymbolTable symbols;

    // Bound at construction time.
    Pick * bound_x = new Literal("string1");
    Variable x(&symbols, "x", bound_x);

    // Bound afterwards, using the Variable itself as key.
    Variable y(&symbols, "y");
//    symbols.put(y, bound_x); // TOFIX: point alias/recursion problem
    Pick * bound_y = new Literal("string2");
    symbols.put(y, bound_y);

    // Bound afterwards, using the variable name as key.
    Pick * bound_z = new Literal("string3");
    Variable z(&symbols, "z");
    symbols.put("z", bound_z);

    // Stop at the first mismatch, exactly like a short-circuit && chain.
    if (strcmp(x.next(), bound_x->next()) != 0){
        return FALSE;
    }
    if (strcmp(y.next(), bound_y->next()) != 0){
        return FALSE;
    }
    return strcmp(z.next(), bound_z->next()) == 0;
}

// Checks the status SymbolTable::find() reports for a NULL name, an unknown
// name, a name registered without a node, and names registered with a node,
// then checks that reset() makes a previously known name unknown again.
static UBool TestSymbolTable(){
    Literal * first  = new Literal("string1");
    Literal * second = new Literal("string2");
    SymbolTable table;
    table.put("abc", first);
    table.put("$aaa", second);
//    table.put("alias", first);  // TOFIX: point alias/recursion problem
    table.put("bbb");

    const UBool before_reset =
           table.find(NULL)   == SymbolTable::EMPTY
        && table.find("ccc")  == SymbolTable::NO_VAR
        && table.find("bbb")  == SymbolTable::NO_REF
        && table.find("abc")  == SymbolTable::HAS_REF
        && table.find("$aaa") == SymbolTable::HAS_REF;

    // reset() runs regardless of the outcome above.
    table.reset();
    return before_reset && table.find("abc") == SymbolTable::NO_VAR;
}


// Tokenizes one rule with Scanner and compares every token, in order, with a
// table of expected token texts. Passes only when all tokens match and the
// stream ends after exactly the expected number of tokens. A scanner ERROR
// token, a mismatch, or a surplus token fails the test.
static UBool TestScanner(void){
    //const char str1[] = "$root = $command{0,5} $reset $mostRules{1,20};";
    //const char str1_r[][20] = {"$root", "=", "$command", "{", "0", ",", "5", "}", 
    //    "$reset", "$mostRules", "{", "1", ",", "20", "}", ";"};

    const char str2[] = "$p2 =(\\\\ $s $string $s)? 25%;";
    const char str2_r[][20] = {"$p2", "=", "(", "\\", "$s", "$string", "$s", ")", "?", "25", "%", ";"};

    const char *str = str2;
    const char (*str_r)[20] = str2_r;
    int tokenNum = sizeof(str2_r)/sizeof(char[20]);

    Scanner t(str);
    UBool pass = TRUE;
    t.getNextToken();
    int i = 0;
    while (pass){
        if (t.tokenType == STREAM_END){
            // Every expected token must have been consumed by now.
            pass = (i == tokenNum);
            break;//while
        } else if (t.tokenType == ERROR){
            pass = FALSE;
            break;//while
        } else if (i >= tokenNum){
            // The scanner produced more tokens than expected. Fail here
            // instead of indexing past the end of the expected-token table.
            pass = FALSE;
            break;//while
        } else {
            pass = strcmp( &(t.token[0]), str_r[i++]) == 0;
            t.getNextToken();
        }
    }

    //const char ts[] = "$commandList = '['"
    //" ( alternate ' ' $alternateOptions"
    //" | backwards ' 2'"
    //" | normalization ' ' $onoff "
    //" | caseLevel ' ' $onoff "
    //" | hiraganaQ ' ' $onoff"
    //" | caseFirst ' ' $caseFirstOptions"
    //" | strength ' ' $strengthOptions"
    //" ) ']';" ;

    //Scanner t2(ts);
    //pass = TRUE;
    //do {
    //    t2.getNextToken();
    //    if (t2.tokenType == ERROR){
    //        DumpScanner(t2);
    //        return FALSE;
    //    }
    //}while (t.tokenType != STREAM_END);

    return pass;
}

// Functor that runs Parser::rules() over one grammar text and compares the
// outcome with the expectation.
//   str               grammar text to parse
//   exp_error_offset  -1 when the text is expected to parse; otherwise the
//                     value DumpScanner() is expected to return for the failed
//                     parse (presumably the error position -- confirm)
//   dump              forwarded to DumpScanner()
// Returns TRUE when the outcome matches the expectation.
class TestParserT {
public:
UBool operator () (const char *const str, const int exp_error_offset = -1, const UBool dump = TRUE){
    Parser par(str);

    if (!par.rules()){
        // Parsing failed: acceptable only if DumpScanner reports the expected value.
        return DumpScanner(par.s, dump) == exp_error_offset;
    }
    if (exp_error_offset == -1){
        return TRUE;
    }
    // Parsing succeeded although a failure was expected.
    DumpScanner(par.s,dump);
    return FALSE;
}
};

// Feeds a list of single rules, followed by one complete collation grammar,
// through TestParserT. Cases run in order and the first failing case stops the
// run. Returns TRUE only when every case behaves as expected.
UBool TestParser(){
    // rule text, expected error offset (-1 = must parse), dump flag
    static const struct {
        const char * rule;
        int          error_offset;
        UBool        dump;
    } cases[] = {
        { "$s = ' ' ? 50%;",                        -1, TRUE  },
        { "$x = ($var {1,2}) 3%;",                  -1, TRUE  },  // legal
        { "$x = $var {1,2} 3% | b 4%;",             -1, TRUE  },  // legal
        { "$x = $var {1,2} 3%;",                    -1, TRUE  },  // legal
        { "$m = $c ? 2% 4% | $r 5% | $n 25%;",      -1, TRUE  },  // legal
        { "$a = b ? 2% | c 5%;",                    -1, TRUE  },  // legal
        { "$x = A B 5% C 10% | D;",                  8, FALSE },  // illegal 5%
        { "$x = aa 45% | bb 5% cc;",                19, FALSE },  // illegal cc
        { "$x = (b 5%) (c 6%);",                    -1, TRUE  },  // legal
        { "$x = (b 5%) c 6%;",                      13, FALSE },  // illegal 6%
        { "$x = b 5% (c 6%);",                       9, FALSE },  // illegal (c 6%)
        { "$x = b 5% c 6%;",                         9, FALSE },  // illegal c 6%
        { "$x = b 5%;",                             -1, TRUE  },  // legal
        { "$x = aa 45% | bb 5% cc;",                19, FALSE },  // illegal cc
        { "$x = a | b  | c 4% | d 5%;",             -1, TRUE  },  // legal
        { "$s = ' ' ? 50% abc;",                    -1, TRUE  },  // legal
        { "$s =  a | c d | e f;",                   -1, TRUE  },  // legal
        { "$z = q 0% | p 1% | r 100%;",             -1, TRUE  },  // legal How to check parsed tree??

        { "$s = ' ' ? 50%;",                        -1, TRUE  },
        { "$relationList = '<' | '<<' |  ';' | '<<<' | ',' | '=';", -1, TRUE },
        { "$p1 = ($string $s '|' $s)? 25%;",        -1, TRUE  },
        { "$p2 = (\\\\ $s $string $s)? 25%;",       -1, TRUE  },
        { "$rel2 = $p1 $string $s $p2;",            -1, TRUE  },
        { "$relation = $relationList $s ($rel1 | $rel2) $crlf;", -1, TRUE },
        { "$command = $commandList $crlf;",         -1, TRUE  },
        { "$reset = '&' $s ($beforeList $s)? 10% ($positionList 100% | $string 10%) $crlf;", -1, TRUE },
        { "$mostRules = $command 1% | $reset 5% | $relation 25%;", -1, TRUE },
        { "$root = $command{0,5} $reset $mostRules{1,20};", -1, TRUE }
    };
    const int case_count = (int)(sizeof(cases) / sizeof(cases[0]));

    const char collationBNF[] =
    "$s = ' '? 50%;"
    "$crlf = '\r\n';"

    "$alternateOptions = non'-'ignorable | shifted;"
    "$onoff = on | off;"
    "$caseFirstOptions = off | upper | lower;"
    "$strengthOptions = '1' | '2' | '3' | '4' | 'I';"
    "$commandList = '['"
    " ( alternate ' ' $alternateOptions"
    " | backwards ' 2'"
    " | normalization ' ' $onoff "
    " | caseLevel ' ' $onoff "
    " | hiraganaQ ' ' $onoff"
    " | caseFirst ' ' $caseFirstOptions"
    " | strength ' ' $strengthOptions"
    " ) ']';"
    "$command = $commandList $crlf;"

    "$ignorableTypes = (tertiary | secondary | primary) ' ' ignorable;"
    "$allTypes = variable | regular | implicit | trailing | $ignorableTypes;"
    "$positionList = '[' (first | last) ' ' $allTypes ']';"

    "$beforeList = '[before ' ('1' | '2' | '3') ']';"

    "$relationList = ("
    "   '<'"
    " | '<<'"
    " | ';'"
    " | '<<<'"
    " | ','"
    " | '='"
    ");"
    "$string = $magic;"
    "$rel1 = '[variable top]' $s;"
    "$p1 = ($string $s '|' $s)? 25%;"
    "$p2 = (\\\\ $s $string $s)? 25%;"
    "$rel2 = $p1 $string $s $p2;"
    "$relation = $relationList $s ($rel1 | $rel2) $crlf;"

    "$reset = '&' $s ($beforeList $s)? 10% ($positionList 1% | $string 10%) $crlf;"
    "$mostRules = $command 1% | $reset 5% | $relation 25%;"
    "$root = $command{0,5} $reset $mostRules{1,20};"
    ;

    TestParserT test;

    // Single-rule cases, in order; bail out on the first failure.
    for (int k = 0; k < case_count; ++k){
        if (!test(cases[k].rule, cases[k].error_offset, cases[k].dump)){
            return FALSE;
        }
    }

    // Finally the complete grammar, which must parse.
    if (!test(collationBNF)){
        return FALSE;
    }
    return TRUE;
}

// Visual check of Morph: builds an Alternation over the literals "a".."o",
// wraps it in a Repeat(5,5) and dumps 100 results of a Morph on top of that.
// Nothing is asserted; the function always returns FALSE.
static UBool TestMorph(){
    static const char *const letters[] = {
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"
    };
    const int letter_count = (int)(sizeof(letters) / sizeof(letters[0]));

    srand((unsigned)time( NULL ));

    Alternation * alt = new Alternation();
    for (int k = 0; k < letter_count; ++k){
        alt->append(new Literal(letters[k]));
    }

    Repeat * rep = new Repeat( alt ,5,5 );
    Morph m( *rep);

//    DUMP_R(TestMorph,(*rep),20);
    DUMP_R(TestMorph,m,100);

    return FALSE;
}

#endif

// Self test for the public LanguageGenerator class: parses a one-rule grammar
// with "$x" as top node (debug output enabled) and dumps 20 generated strings
// for visual inspection.
// Returns TRUE when parseBNF() reports OK and FALSE otherwise. The generated
// strings are only printed, not verified.
static UBool TestLanguageGenerator(){
    //LanguageGenerator g;
    //const char *const s = "$s = p 0% | q 1%;";
    //g.parseBNF(s, "$s");
    //UBool pass = strcmp("q", g.next()) == 0;

    const char *const def = 
        //"$a = $b;"
        //"$b = $c;"
        //"$c = $t;"
        //"$t = abc $z{1,2};"
        //"$k = a | b | c | d | e | f | g ;"
        //"$z = q 0% | p 1% | r 1%;"
        "$x = a ? 0%;"
        ; // end of string
//    const char * s = "abczz";
//
//
    LanguageGenerator g;
    // Keep the parse result in its own enum type. It is a status code, not a
    // boolean, so the verdict returned below must not depend on the numeric
    // value that OK happens to have.
    const LanguageGenerator::PARSE_RESULT result = g.parseBNF(def, "$x",TRUE);
////    LanguageGenerator g(collationBNF, "$root", "$magic", new MagicNode());
//  
    if (result != LanguageGenerator::OK) return FALSE;
    
    DUMP_R(TestLanguageGenerator, g, 20);
    return TRUE;

    ////UBool pass = strcmp(s,r) == 0;

    //if (pass){
    //    printf("TestRandomLanguageGenerator passed.\n");
    //} else {
    //    printf("TestRandomLanguageGenerator FAILED!!!\n");
    //}
    //return pass;
}

// Entry point of the WBNF self test: seeds rand() from the current time and
// runs the enabled tests through CALL. Only TestLanguageGenerator is enabled;
// the commented-out entries name the other self tests.
void TestWbnf(void){
    const unsigned seed = (unsigned)time( NULL );
    srand(seed);

    //CALL(TestLiteral);
    //CALL(TestSequence);
    //CALL(TestSymbolTable);
    //CALL(TestVariable);

    //TestRepeat();
    //TestAlternation();
    //TestMorph();

    //TestQuote();
    //TestBuffer();
    //TestWeightedRand();

    //CALL(TestScanner);
    //CALL(TestParser);
    CALL(TestLanguageGenerator);
}

// (Web-page footer captured together with the source; not part of wbnf.cpp: "Log in to enjoy more benefits.")