// Copyright (C) 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 1999-2011, International Business Machines Corporation * and others. All Rights Reserved. ********************************************************************** * Date Name Description * 11/17/99 aliu Creation. ********************************************************************** */ #ifndef RBT_PARS_H #define RBT_PARS_H #include "unicode/utypes.h" #if !UCONFIG_NO_TRANSLITERATION #ifdef __cplusplus #include "unicode/uobject.h" #include "unicode/parseerr.h" #include "unicode/unorm.h" #include "rbt.h" #include "hash.h" #include "uvector.h" U_NAMESPACE_BEGIN class TransliterationRuleData; class UnicodeFunctor; class ParseData; class RuleHalf; class ParsePosition; class StringMatcher; class TransliteratorParser : public UMemory { public: /** * A Vector of TransliterationRuleData objects, one for each discrete group * of rules in the rule set */ UVector dataVector; /** * PUBLIC data member. * A Vector of UnicodeStrings containing all of the ID blocks in the rule set */ UVector idBlockVector; /** * PUBLIC data member containing the parsed compound filter, if any. */ UnicodeSet* compoundFilter; private: /** * The current data object for which we are parsing rules */ TransliterationRuleData* curData; UTransDirection direction; /** * Parse error information. */ UParseError parseError; /** * Temporary symbol table used during parsing. */ ParseData* parseData; /** * Temporary vector of matcher variables. When parsing is complete, this * is copied into the array data.variables. As with data.variables, * element 0 corresponds to character data.variablesBase. */ UVector variablesVector; /** * Temporary table of variable names. When parsing is complete, this is * copied into data.variableNames. */ Hashtable variableNames; /** * String of standins for segments. Used during the parsing of a single * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds * to StringMatcher object segmentObjects.elementAt(0), etc. */ UnicodeString segmentStandins; /** * Vector of StringMatcher objects for segments. Used during the * parsing of a single rule. * segmentStandins.charAt(0) is the standin for "$1" and corresponds * to StringMatcher object segmentObjects.elementAt(0), etc. */ UVector segmentObjects; /** * The next available stand-in for variables. This starts at some point in * the private use area (discovered dynamically) and increments up toward * <code>variableLimit</code>. At any point during parsing, available * variables are <code>variableNext..variableLimit-1</code>. */ UChar variableNext; /** * The last available stand-in for variables. This is discovered * dynamically. At any point during parsing, available variables are * <code>variableNext..variableLimit-1</code>. */ UChar variableLimit; /** * When we encounter an undefined variable, we do not immediately signal * an error, in case we are defining this variable, e.g., "$a = [a-z];". * Instead, we save the name of the undefined variable, and substitute * in the placeholder char variableLimit - 1, and decrement * variableLimit. */ UnicodeString undefinedVariableName; /** * The stand-in character for the 'dot' set, represented by '.' in * patterns. This is allocated the first time it is needed, and * reused thereafter. */ UChar dotStandIn; public: /** * Constructor. */ TransliteratorParser(UErrorCode &statusReturn); /** * Destructor. */ ~TransliteratorParser(); /** * Parse the given string as a sequence of rules, separated by newline * characters ('\n'), and cause this object to implement those rules. Any * previous rules are discarded. Typically this method is called exactly * once after construction. * * Parse the given rules, in the given direction. After this call * returns, query the public data members for results. The caller * owns the 'data' and 'compoundFilter' data members after this * call returns. * @param rules rules, separated by ';' * @param direction either FORWARD or REVERSE. * @param pe Struct to recieve information on position * of error if an error is encountered * @param ec Output param set to success/failure code. */ void parse(const UnicodeString& rules, UTransDirection direction, UParseError& pe, UErrorCode& ec); /** * Return the compound filter parsed by parse(). Caller owns result. * @return the compound filter parsed by parse(). */ UnicodeSet* orphanCompoundFilter(); private: /** * Return a representation of this transliterator as source rules. * @param rules Output param to receive the rules. * @param direction either FORWARD or REVERSE. */ void parseRules(const UnicodeString& rules, UTransDirection direction, UErrorCode& status); /** * MAIN PARSER. Parse the next rule in the given rule string, starting * at pos. Return the index after the last character parsed. Do not * parse characters at or after limit. * * Important: The character at pos must be a non-whitespace character * that is not the comment character. * * This method handles quoting, escaping, and whitespace removal. It * parses the end-of-rule character. It recognizes context and cursor * indicators. Once it does a lexical breakdown of the rule at pos, it * creates a rule object and adds it to our rule list. * @param rules Output param to receive the rules. * @param pos the starting position. * @param limit pointer past the last character of the rule. * @return the index after the last character parsed. */ int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); /** * Set the variable range to [start, end] (inclusive). * @param start the start value of the range. * @param end the end value of the range. */ void setVariableRange(int32_t start, int32_t end, UErrorCode& status); /** * Assert that the given character is NOT within the variable range. * If it is, return FALSE. This is neccesary to ensure that the * variable range does not overlap characters used in a rule. * @param ch the given character. * @return True, if the given character is NOT within the variable range. */ UBool checkVariableRange(UChar32 ch) const; /** * Set the maximum backup to 'backup', in response to a pragma * statement. * @param backup the new value to be set. */ void pragmaMaximumBackup(int32_t backup); /** * Begin normalizing all rules using the given mode, in response * to a pragma statement. * @param mode the given mode. */ void pragmaNormalizeRules(UNormalizationMode mode); /** * Return true if the given rule looks like a pragma. * @param pos offset to the first non-whitespace character * of the rule. * @param limit pointer past the last character of the rule. * @return true if the given rule looks like a pragma. */ static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit); /** * Parse a pragma. This method assumes resemblesPragma() has * already returned true. * @param pos offset to the first non-whitespace character * of the rule. * @param limit pointer past the last character of the rule. * @return the position index after the final ';' of the pragma, * or -1 on failure. */ int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); /** * Called by main parser upon syntax error. Search the rule string * for the probable end of the rule. Of course, if the error is that * the end of rule marker is missing, then the rule end will not be found. * In any case the rule start will be correctly reported. * @param parseErrorCode error code. * @param msg error description. * @param start position of first character of current rule. * @return start position of first character of current rule. */ int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start, UErrorCode& status); /** * Parse a UnicodeSet out, store it, and return the stand-in character * used to represent it. * * @param rule the rule for UnicodeSet. * @param pos the position in pattern at which to start parsing. * @return the stand-in character used to represent it. */ UChar parseSet(const UnicodeString& rule, ParsePosition& pos, UErrorCode& status); /** * Generate and return a stand-in for a new UnicodeFunctor. Store * the matcher (adopt it). * @param adopted the UnicodeFunctor to be adopted. * @return a stand-in for a new UnicodeFunctor. */ UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status); /** * Return the standin for segment seg (1-based). * @param seg the given segment. * @return the standIn character for the given segment. */ UChar getSegmentStandin(int32_t seg, UErrorCode& status); /** * Set the object for segment seg (1-based). * @param seg the given segment. * @param adopted the StringMatcher to be adopted. */ void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status); /** * Return the stand-in for the dot set. It is allocated the first * time and reused thereafter. * @return the stand-in for the dot set. */ UChar getDotStandIn(UErrorCode& status); /** * Append the value of the given variable name to the given * UnicodeString. * @param name the variable name to be appended. * @param buf the given UnicodeString to append to. */ void appendVariableDef(const UnicodeString& name, UnicodeString& buf, UErrorCode& status); /** * Glue method to get around access restrictions in C++. */ /*static Transliterator* createBasicInstance(const UnicodeString& id, const UnicodeString* canonID);*/ friend class RuleHalf; // Disallowed methods; no impl. /** * Copy constructor */ TransliteratorParser(const TransliteratorParser&); /** * Assignment operator */ TransliteratorParser& operator=(const TransliteratorParser&); }; U_NAMESPACE_END #endif /* #ifdef __cplusplus */ /** * Strip/convert the following from the transliterator rules: * comments * newlines * white space at the beginning and end of a line * unescape \u notation * * The target must be equal in size as the source. * @internal */ U_CAPI int32_t utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status); #endif /* #if !UCONFIG_NO_TRANSLITERATION */ #endif