// Copyright 2012 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef V8_JSREGEXP_H_ #define V8_JSREGEXP_H_ #include "allocation.h" #include "assembler.h" #include "zone-inl.h" namespace v8 { namespace internal { class NodeVisitor; class RegExpCompiler; class RegExpMacroAssembler; class RegExpNode; class RegExpTree; class RegExpImpl { public: // Whether V8 is compiled with native regexp support or not. static bool UsesNativeRegExp() { #ifdef V8_INTERPRETED_REGEXP return false; #else return true; #endif } // Creates a regular expression literal in the old space. // This function calls the garbage collector if necessary. static Handle<Object> CreateRegExpLiteral(Handle<JSFunction> constructor, Handle<String> pattern, Handle<String> flags, bool* has_pending_exception); // Returns a string representation of a regular expression. // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4. // This function calls the garbage collector if necessary. static Handle<String> ToString(Handle<Object> value); // Parses the RegExp pattern and prepares the JSRegExp object with // generic data and choice of implementation - as well as what // the implementation wants to store in the data field. // Returns false if compilation fails. static Handle<Object> Compile(Handle<JSRegExp> re, Handle<String> pattern, Handle<String> flags); // See ECMA-262 section 15.10.6.2. // This function calls the garbage collector if necessary. static Handle<Object> Exec(Handle<JSRegExp> regexp, Handle<String> subject, int index, Handle<JSArray> lastMatchInfo); // Prepares a JSRegExp object with Irregexp-specific data. static void IrregexpInitialize(Handle<JSRegExp> re, Handle<String> pattern, JSRegExp::Flags flags, int capture_register_count); static void AtomCompile(Handle<JSRegExp> re, Handle<String> pattern, JSRegExp::Flags flags, Handle<String> match_pattern); static Handle<Object> AtomExec(Handle<JSRegExp> regexp, Handle<String> subject, int index, Handle<JSArray> lastMatchInfo); enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 }; // Prepare a RegExp for being executed one or more times (using // IrregexpExecOnce) on the subject. // This ensures that the regexp is compiled for the subject, and that // the subject is flat. // Returns the number of integer spaces required by IrregexpExecOnce // as its "registers" argument. If the regexp cannot be compiled, // an exception is set as pending, and this function returns negative. static int IrregexpPrepare(Handle<JSRegExp> regexp, Handle<String> subject); // Execute a regular expression once on the subject, starting from // character "index". // If successful, returns RE_SUCCESS and set the capture positions // in the first registers. // If matching fails, returns RE_FAILURE. // If execution fails, sets a pending exception and returns RE_EXCEPTION. static IrregexpResult IrregexpExecOnce(Handle<JSRegExp> regexp, Handle<String> subject, int index, Vector<int> registers); // Execute an Irregexp bytecode pattern. // On a successful match, the result is a JSArray containing // captured positions. On a failure, the result is the null value. // Returns an empty handle in case of an exception. static Handle<Object> IrregexpExec(Handle<JSRegExp> regexp, Handle<String> subject, int index, Handle<JSArray> lastMatchInfo); // Array index in the lastMatchInfo array. static const int kLastCaptureCount = 0; static const int kLastSubject = 1; static const int kLastInput = 2; static const int kFirstCapture = 3; static const int kLastMatchOverhead = 3; // Direct offset into the lastMatchInfo array. static const int kLastCaptureCountOffset = FixedArray::kHeaderSize + kLastCaptureCount * kPointerSize; static const int kLastSubjectOffset = FixedArray::kHeaderSize + kLastSubject * kPointerSize; static const int kLastInputOffset = FixedArray::kHeaderSize + kLastInput * kPointerSize; static const int kFirstCaptureOffset = FixedArray::kHeaderSize + kFirstCapture * kPointerSize; // Used to access the lastMatchInfo array. static int GetCapture(FixedArray* array, int index) { return Smi::cast(array->get(index + kFirstCapture))->value(); } static void SetLastCaptureCount(FixedArray* array, int to) { array->set(kLastCaptureCount, Smi::FromInt(to)); } static void SetLastSubject(FixedArray* array, String* to) { array->set(kLastSubject, to); } static void SetLastInput(FixedArray* array, String* to) { array->set(kLastInput, to); } static void SetCapture(FixedArray* array, int index, int to) { array->set(index + kFirstCapture, Smi::FromInt(to)); } static int GetLastCaptureCount(FixedArray* array) { return Smi::cast(array->get(kLastCaptureCount))->value(); } // For acting on the JSRegExp data FixedArray. static int IrregexpMaxRegisterCount(FixedArray* re); static void SetIrregexpMaxRegisterCount(FixedArray* re, int value); static int IrregexpNumberOfCaptures(FixedArray* re); static int IrregexpNumberOfRegisters(FixedArray* re); static ByteArray* IrregexpByteCode(FixedArray* re, bool is_ascii); static Code* IrregexpNativeCode(FixedArray* re, bool is_ascii); // Limit the space regexps take up on the heap. In order to limit this we // would like to keep track of the amount of regexp code on the heap. This // is not tracked, however. As a conservative approximation we track the // total regexp code compiled including code that has subsequently been freed // and the total executable memory at any point. static const int kRegExpExecutableMemoryLimit = 16 * MB; static const int kRegWxpCompiledLimit = 1 * MB; private: static String* last_ascii_string_; static String* two_byte_cached_string_; static bool CompileIrregexp(Handle<JSRegExp> re, bool is_ascii); static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii); // Set the subject cache. The previous string buffer is not deleted, so the // caller should ensure that it doesn't leak. static void SetSubjectCache(String* subject, char* utf8_subject, int uft8_length, int character_position, int utf8_position); // A one element cache of the last utf8_subject string and its length. The // subject JS String object is cached in the heap. We also cache a // translation between position and utf8 position. static char* utf8_subject_cache_; static int utf8_length_cache_; static int utf8_position_; static int character_position_; }; // Represents the location of one element relative to the intersection of // two sets. Corresponds to the four areas of a Venn diagram. enum ElementInSetsRelation { kInsideNone = 0, kInsideFirst = 1, kInsideSecond = 2, kInsideBoth = 3 }; // Represents the relation of two sets. // Sets can be either disjoint, partially or fully overlapping, or equal. class SetRelation BASE_EMBEDDED { public: // Relation is represented by a bit saying whether there are elements in // one set that is not in the other, and a bit saying that there are elements // that are in both sets. // Location of an element. Corresponds to the internal areas of // a Venn diagram. enum { kInFirst = 1 << kInsideFirst, kInSecond = 1 << kInsideSecond, kInBoth = 1 << kInsideBoth }; SetRelation() : bits_(0) {} ~SetRelation() {} // Add the existence of objects in a particular void SetElementsInFirstSet() { bits_ |= kInFirst; } void SetElementsInSecondSet() { bits_ |= kInSecond; } void SetElementsInBothSets() { bits_ |= kInBoth; } // Check the currently known relation of the sets (common functions only, // for other combinations, use value() to get the bits and check them // manually). // Sets are completely disjoint. bool Disjoint() { return (bits_ & kInBoth) == 0; } // Sets are equal. bool Equals() { return (bits_ & (kInFirst | kInSecond)) == 0; } // First set contains second. bool Contains() { return (bits_ & kInSecond) == 0; } // Second set contains first. bool ContainedIn() { return (bits_ & kInFirst) == 0; } bool NonTrivialIntersection() { return (bits_ == (kInFirst | kInSecond | kInBoth)); } int value() { return bits_; } private: int bits_; }; class CharacterRange { public: CharacterRange() : from_(0), to_(0) { } // For compatibility with the CHECK_OK macro CharacterRange(void* null) { ASSERT_EQ(NULL, null); } //NOLINT CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) { } static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges); static Vector<const uc16> GetWordBounds(); static inline CharacterRange Singleton(uc16 value) { return CharacterRange(value, value); } static inline CharacterRange Range(uc16 from, uc16 to) { ASSERT(from <= to); return CharacterRange(from, to); } static inline CharacterRange Everything() { return CharacterRange(0, 0xFFFF); } bool Contains(uc16 i) { return from_ <= i && i <= to_; } uc16 from() const { return from_; } void set_from(uc16 value) { from_ = value; } uc16 to() const { return to_; } void set_to(uc16 value) { to_ = value; } bool is_valid() { return from_ <= to_; } bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; } bool IsSingleton() { return (from_ == to_); } void AddCaseEquivalents(ZoneList<CharacterRange>* ranges, bool is_ascii); static void Split(ZoneList<CharacterRange>* base, Vector<const uc16> overlay, ZoneList<CharacterRange>** included, ZoneList<CharacterRange>** excluded); // Whether a range list is in canonical form: Ranges ordered by from value, // and ranges non-overlapping and non-adjacent. static bool IsCanonical(ZoneList<CharacterRange>* ranges); // Convert range list to canonical form. The characters covered by the ranges // will still be the same, but no character is in more than one range, and // adjacent ranges are merged. The resulting list may be shorter than the // original, but cannot be longer. static void Canonicalize(ZoneList<CharacterRange>* ranges); // Check how the set of characters defined by a CharacterRange list relates // to the set of word characters. List must be in canonical form. static SetRelation WordCharacterRelation(ZoneList<CharacterRange>* ranges); // Takes two character range lists (representing character sets) in canonical // form and merges them. // The characters that are only covered by the first set are added to // first_set_only_out. the characters that are only in the second set are // added to second_set_only_out, and the characters that are in both are // added to both_sets_out. // The pointers to first_set_only_out, second_set_only_out and both_sets_out // should be to empty lists, but they need not be distinct, and may be NULL. // If NULL, the characters are dropped, and if two arguments are the same // pointer, the result is the union of the two sets that would be created // if the pointers had been distinct. // This way, the Merge function can compute all the usual set operations: // union (all three out-sets are equal), intersection (only both_sets_out is // non-NULL), and set difference (only first_set is non-NULL). static void Merge(ZoneList<CharacterRange>* first_set, ZoneList<CharacterRange>* second_set, ZoneList<CharacterRange>* first_set_only_out, ZoneList<CharacterRange>* second_set_only_out, ZoneList<CharacterRange>* both_sets_out); // Negate the contents of a character range in canonical form. static void Negate(ZoneList<CharacterRange>* src, ZoneList<CharacterRange>* dst); static const int kStartMarker = (1 << 24); static const int kPayloadMask = (1 << 24) - 1; private: uc16 from_; uc16 to_; }; // A set of unsigned integers that behaves especially well on small // integers (< 32). May do zone-allocation. class OutSet: public ZoneObject { public: OutSet() : first_(0), remaining_(NULL), successors_(NULL) { } OutSet* Extend(unsigned value); bool Get(unsigned value); static const unsigned kFirstLimit = 32; private: // Destructively set a value in this set. In most cases you want // to use Extend instead to ensure that only one instance exists // that contains the same values. void Set(unsigned value); // The successors are a list of sets that contain the same values // as this set and the one more value that is not present in this // set. ZoneList<OutSet*>* successors() { return successors_; } OutSet(uint32_t first, ZoneList<unsigned>* remaining) : first_(first), remaining_(remaining), successors_(NULL) { } uint32_t first_; ZoneList<unsigned>* remaining_; ZoneList<OutSet*>* successors_; friend class Trace; }; // A mapping from integers, specified as ranges, to a set of integers. // Used for mapping character ranges to choices. class DispatchTable : public ZoneObject { public: class Entry { public: Entry() : from_(0), to_(0), out_set_(NULL) { } Entry(uc16 from, uc16 to, OutSet* out_set) : from_(from), to_(to), out_set_(out_set) { } uc16 from() { return from_; } uc16 to() { return to_; } void set_to(uc16 value) { to_ = value; } void AddValue(int value) { out_set_ = out_set_->Extend(value); } OutSet* out_set() { return out_set_; } private: uc16 from_; uc16 to_; OutSet* out_set_; }; class Config { public: typedef uc16 Key; typedef Entry Value; static const uc16 kNoKey; static const Entry NoValue() { return Value(); } static inline int Compare(uc16 a, uc16 b) { if (a == b) return 0; else if (a < b) return -1; else return 1; } }; void AddRange(CharacterRange range, int value); OutSet* Get(uc16 value); void Dump(); template <typename Callback> void ForEach(Callback* callback) { return tree()->ForEach(callback); } private: // There can't be a static empty set since it allocates its // successors in a zone and caches them. OutSet* empty() { return &empty_; } OutSet empty_; ZoneSplayTree<Config>* tree() { return &tree_; } ZoneSplayTree<Config> tree_; }; #define FOR_EACH_NODE_TYPE(VISIT) \ VISIT(End) \ VISIT(Action) \ VISIT(Choice) \ VISIT(BackReference) \ VISIT(Assertion) \ VISIT(Text) #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ VISIT(Disjunction) \ VISIT(Alternative) \ VISIT(Assertion) \ VISIT(CharacterClass) \ VISIT(Atom) \ VISIT(Quantifier) \ VISIT(Capture) \ VISIT(Lookahead) \ VISIT(BackReference) \ VISIT(Empty) \ VISIT(Text) #define FORWARD_DECLARE(Name) class RegExp##Name; FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) #undef FORWARD_DECLARE class TextElement { public: enum Type {UNINITIALIZED, ATOM, CHAR_CLASS}; TextElement() : type(UNINITIALIZED) { } explicit TextElement(Type t) : type(t), cp_offset(-1) { } static TextElement Atom(RegExpAtom* atom); static TextElement CharClass(RegExpCharacterClass* char_class); int length(); Type type; union { RegExpAtom* u_atom; RegExpCharacterClass* u_char_class; } data; int cp_offset; }; class Trace; struct NodeInfo { NodeInfo() : being_analyzed(false), been_analyzed(false), follows_word_interest(false), follows_newline_interest(false), follows_start_interest(false), at_end(false), visited(false) { } // Returns true if the interests and assumptions of this node // matches the given one. bool Matches(NodeInfo* that) { return (at_end == that->at_end) && (follows_word_interest == that->follows_word_interest) && (follows_newline_interest == that->follows_newline_interest) && (follows_start_interest == that->follows_start_interest); } // Updates the interests of this node given the interests of the // node preceding it. void AddFromPreceding(NodeInfo* that) { at_end |= that->at_end; follows_word_interest |= that->follows_word_interest; follows_newline_interest |= that->follows_newline_interest; follows_start_interest |= that->follows_start_interest; } bool HasLookbehind() { return follows_word_interest || follows_newline_interest || follows_start_interest; } // Sets the interests of this node to include the interests of the // following node. void AddFromFollowing(NodeInfo* that) { follows_word_interest |= that->follows_word_interest; follows_newline_interest |= that->follows_newline_interest; follows_start_interest |= that->follows_start_interest; } void ResetCompilationState() { being_analyzed = false; been_analyzed = false; } bool being_analyzed: 1; bool been_analyzed: 1; // These bits are set of this node has to know what the preceding // character was. bool follows_word_interest: 1; bool follows_newline_interest: 1; bool follows_start_interest: 1; bool at_end: 1; bool visited: 1; }; class SiblingList { public: SiblingList() : list_(NULL) { } int length() { return list_ == NULL ? 0 : list_->length(); } void Ensure(RegExpNode* parent) { if (list_ == NULL) { list_ = new ZoneList<RegExpNode*>(2); list_->Add(parent); } } void Add(RegExpNode* node) { list_->Add(node); } RegExpNode* Get(int index) { return list_->at(index); } private: ZoneList<RegExpNode*>* list_; }; // Details of a quick mask-compare check that can look ahead in the // input stream. class QuickCheckDetails { public: QuickCheckDetails() : characters_(0), mask_(0), value_(0), cannot_match_(false) { } explicit QuickCheckDetails(int characters) : characters_(characters), mask_(0), value_(0), cannot_match_(false) { } bool Rationalize(bool ascii); // Merge in the information from another branch of an alternation. void Merge(QuickCheckDetails* other, int from_index); // Advance the current position by some amount. void Advance(int by, bool ascii); void Clear(); bool cannot_match() { return cannot_match_; } void set_cannot_match() { cannot_match_ = true; } struct Position { Position() : mask(0), value(0), determines_perfectly(false) { } uc16 mask; uc16 value; bool determines_perfectly; }; int characters() { return characters_; } void set_characters(int characters) { characters_ = characters; } Position* positions(int index) { ASSERT(index >= 0); ASSERT(index < characters_); return positions_ + index; } uint32_t mask() { return mask_; } uint32_t value() { return value_; } private: // How many characters do we have quick check information from. This is // the same for all branches of a choice node. int characters_; Position positions_[4]; // These values are the condensate of the above array after Rationalize(). uint32_t mask_; uint32_t value_; // If set to true, there is no way this quick check can match at all. // E.g., if it requires to be at the start of the input, and isn't. bool cannot_match_; }; class RegExpNode: public ZoneObject { public: RegExpNode() : first_character_set_(NULL), trace_count_(0) { } virtual ~RegExpNode(); virtual void Accept(NodeVisitor* visitor) = 0; // Generates a goto to this node or actually generates the code at this point. virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0; // How many characters must this node consume at a minimum in order to // succeed. If we have found at least 'still_to_find' characters that // must be consumed there is no need to ask any following nodes whether // they are sure to eat any more characters. The not_at_start argument is // used to indicate that we know we are not at the start of the input. In // this case anchored branches will always fail and can be ignored when // determining how many characters are consumed on success. virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start) = 0; // Emits some quick code that checks whether the preloaded characters match. // Falls through on certain failure, jumps to the label on possible success. // If the node cannot make a quick check it does nothing and returns false. bool EmitQuickCheck(RegExpCompiler* compiler, Trace* trace, bool preload_has_checked_bounds, Label* on_possible_success, QuickCheckDetails* details_return, bool fall_through_on_failure); // For a given number of characters this returns a mask and a value. The // next n characters are anded with the mask and compared with the value. // A comparison failure indicates the node cannot match the next n characters. // A comparison success indicates the node may match. virtual void GetQuickCheckDetails(QuickCheckDetails* details, RegExpCompiler* compiler, int characters_filled_in, bool not_at_start) = 0; static const int kNodeIsTooComplexForGreedyLoops = -1; virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } Label* label() { return &label_; } // If non-generic code is generated for a node (i.e. the node is not at the // start of the trace) then it cannot be reused. This variable sets a limit // on how often we allow that to happen before we insist on starting a new // trace and generating generic code for a node that can be reused by flushing // the deferred actions in the current trace and generating a goto. static const int kMaxCopiesCodeGenerated = 10; NodeInfo* info() { return &info_; } void AddSibling(RegExpNode* node) { siblings_.Add(node); } // Static version of EnsureSibling that expresses the fact that the // result has the same type as the input. template <class C> static C* EnsureSibling(C* node, NodeInfo* info, bool* cloned) { return static_cast<C*>(node->EnsureSibling(info, cloned)); } SiblingList* siblings() { return &siblings_; } void set_siblings(SiblingList* other) { siblings_ = *other; } // Return the set of possible next characters recognized by the regexp // (or a safe subset, potentially the set of all characters). ZoneList<CharacterRange>* FirstCharacterSet(); // Compute (if possible within the budget of traversed nodes) the // possible first characters of the input matched by this node and // its continuation. Returns the remaining budget after the computation. // If the budget is spent, the result is negative, and the cached // first_character_set_ value isn't set. virtual int ComputeFirstCharacterSet(int budget); // Get and set the cached first character set value. ZoneList<CharacterRange>* first_character_set() { return first_character_set_; } void set_first_character_set(ZoneList<CharacterRange>* character_set) { first_character_set_ = character_set; } protected: enum LimitResult { DONE, CONTINUE }; static const int kComputeFirstCharacterSetFail = -1; LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace); // Returns a sibling of this node whose interests and assumptions // match the ones in the given node info. If no sibling exists NULL // is returned. RegExpNode* TryGetSibling(NodeInfo* info); // Returns a sibling of this node whose interests match the ones in // the given node info. The info must not contain any assertions. // If no node exists a new one will be created by cloning the current // node. The result will always be an instance of the same concrete // class as this node. RegExpNode* EnsureSibling(NodeInfo* info, bool* cloned); // Returns a clone of this node initialized using the copy constructor // of its concrete class. Note that the node may have to be pre- // processed before it is on a usable state. virtual RegExpNode* Clone() = 0; private: static const int kFirstCharBudget = 10; Label label_; NodeInfo info_; SiblingList siblings_; ZoneList<CharacterRange>* first_character_set_; // This variable keeps track of how many times code has been generated for // this node (in different traces). We don't keep track of where the // generated code is located unless the code is generated at the start of // a trace, in which case it is generic and can be reused by flushing the // deferred operations in the current trace and generating a goto. int trace_count_; }; // A simple closed interval. class Interval { public: Interval() : from_(kNone), to_(kNone) { } Interval(int from, int to) : from_(from), to_(to) { } Interval Union(Interval that) { if (that.from_ == kNone) return *this; else if (from_ == kNone) return that; else return Interval(Min(from_, that.from_), Max(to_, that.to_)); } bool Contains(int value) { return (from_ <= value) && (value <= to_); } bool is_empty() { return from_ == kNone; } int from() { return from_; } int to() { return to_; } static Interval Empty() { return Interval(); } static const int kNone = -1; private: int from_; int to_; }; class SeqRegExpNode: public RegExpNode { public: explicit SeqRegExpNode(RegExpNode* on_success) : on_success_(on_success) { } RegExpNode* on_success() { return on_success_; } void set_on_success(RegExpNode* node) { on_success_ = node; } private: RegExpNode* on_success_; }; class ActionNode: public SeqRegExpNode { public: enum Type { SET_REGISTER, INCREMENT_REGISTER, STORE_POSITION, BEGIN_SUBMATCH, POSITIVE_SUBMATCH_SUCCESS, EMPTY_MATCH_CHECK, CLEAR_CAPTURES }; static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success); static ActionNode* IncrementRegister(int reg, RegExpNode* on_success); static ActionNode* StorePosition(int reg, bool is_capture, RegExpNode* on_success); static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success); static ActionNode* BeginSubmatch(int stack_pointer_reg, int position_reg, RegExpNode* on_success); static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg, int restore_reg, int clear_capture_count, int clear_capture_from, RegExpNode* on_success); static ActionNode* EmptyMatchCheck(int start_register, int repetition_register, int repetition_limit, RegExpNode* on_success); virtual void Accept(NodeVisitor* visitor); virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start); virtual void GetQuickCheckDetails(QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in, bool not_at_start) { return on_success()->GetQuickCheckDetails( details, compiler, filled_in, not_at_start); } Type type() { return type_; } // TODO(erikcorry): We should allow some action nodes in greedy loops. virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } virtual ActionNode* Clone() { return new ActionNode(*this); } virtual int ComputeFirstCharacterSet(int budget); private: union { struct { int reg; int value; } u_store_register; struct { int reg; } u_increment_register; struct { int reg; bool is_capture; } u_position_register; struct { int stack_pointer_register; int current_position_register; int clear_register_count; int clear_register_from; } u_submatch; struct { int start_register; int repetition_register; int repetition_limit; } u_empty_match_check; struct { int range_from; int range_to; } u_clear_captures; } data_; ActionNode(Type type, RegExpNode* on_success) : SeqRegExpNode(on_success), type_(type) { } Type type_; friend class DotPrinter; }; class TextNode: public SeqRegExpNode { public: TextNode(ZoneList<TextElement>* elms, RegExpNode* on_success) : SeqRegExpNode(on_success), elms_(elms) { } TextNode(RegExpCharacterClass* that, RegExpNode* on_success) : SeqRegExpNode(on_success), elms_(new ZoneList<TextElement>(1)) { elms_->Add(TextElement::CharClass(that)); } virtual void Accept(NodeVisitor* visitor); virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start); virtual void GetQuickCheckDetails(QuickCheckDetails* details, RegExpCompiler* compiler, int characters_filled_in, bool not_at_start); ZoneList<TextElement>* elements() { return elms_; } void MakeCaseIndependent(bool is_ascii); virtual int GreedyLoopTextLength(); virtual TextNode* Clone() { TextNode* result = new TextNode(*this); result->CalculateOffsets(); return result; } void CalculateOffsets(); virtual int ComputeFirstCharacterSet(int budget); private: enum TextEmitPassType { NON_ASCII_MATCH, // Check for characters that can't match. SIMPLE_CHARACTER_MATCH, // Case-dependent single character check. NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs. CASE_CHARACTER_MATCH, // Case-independent single character check. CHARACTER_CLASS_MATCH // Character class. }; static bool SkipPass(int pass, bool ignore_case); static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH; static const int kLastPass = CHARACTER_CLASS_MATCH; void TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, bool preloaded, Trace* trace, bool first_element_checked, int* checked_up_to); int Length(); ZoneList<TextElement>* elms_; }; class AssertionNode: public SeqRegExpNode { public: enum AssertionNodeType { AT_END, AT_START, AT_BOUNDARY, AT_NON_BOUNDARY, AFTER_NEWLINE, // Types not directly expressible in regexp syntax. // Used for modifying a boundary node if its following character is // known to be word and/or non-word. AFTER_NONWORD_CHARACTER, AFTER_WORD_CHARACTER }; static AssertionNode* AtEnd(RegExpNode* on_success) { return new AssertionNode(AT_END, on_success); } static AssertionNode* AtStart(RegExpNode* on_success) { return new AssertionNode(AT_START, on_success); } static AssertionNode* AtBoundary(RegExpNode* on_success) { return new AssertionNode(AT_BOUNDARY, on_success); } static AssertionNode* AtNonBoundary(RegExpNode* on_success) { return new AssertionNode(AT_NON_BOUNDARY, on_success); } static AssertionNode* AfterNewline(RegExpNode* on_success) { return new AssertionNode(AFTER_NEWLINE, on_success); } virtual void Accept(NodeVisitor* visitor); virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start); virtual void GetQuickCheckDetails(QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in, bool not_at_start); virtual int ComputeFirstCharacterSet(int budget); virtual AssertionNode* Clone() { return new AssertionNode(*this); } AssertionNodeType type() { return type_; } void set_type(AssertionNodeType type) { type_ = type; } private: AssertionNode(AssertionNodeType t, RegExpNode* on_success) : SeqRegExpNode(on_success), type_(t) { } AssertionNodeType type_; }; class BackReferenceNode: public SeqRegExpNode { public: BackReferenceNode(int start_reg, int end_reg, RegExpNode* on_success) : SeqRegExpNode(on_success), start_reg_(start_reg), end_reg_(end_reg) { } virtual void Accept(NodeVisitor* visitor); int start_register() { return start_reg_; } int end_register() { return end_reg_; } virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start); virtual void GetQuickCheckDetails(QuickCheckDetails* details, RegExpCompiler* compiler, int characters_filled_in, bool not_at_start) { return; } virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); } virtual int ComputeFirstCharacterSet(int budget); private: int start_reg_; int end_reg_; }; class EndNode: public RegExpNode { public: enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS }; explicit EndNode(Action action) : action_(action) { } virtual void Accept(NodeVisitor* visitor); virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start) { return 0; } virtual void GetQuickCheckDetails(QuickCheckDetails* details, RegExpCompiler* compiler, int characters_filled_in, bool not_at_start) { // Returning 0 from EatsAtLeast should ensure we never get here. UNREACHABLE(); } virtual EndNode* Clone() { return new EndNode(*this); } private: Action action_; }; class NegativeSubmatchSuccess: public EndNode { public: NegativeSubmatchSuccess(int stack_pointer_reg, int position_reg, int clear_capture_count, int clear_capture_start) : EndNode(NEGATIVE_SUBMATCH_SUCCESS), stack_pointer_register_(stack_pointer_reg), current_position_register_(position_reg), clear_capture_count_(clear_capture_count), clear_capture_start_(clear_capture_start) { } virtual void Emit(RegExpCompiler* compiler, Trace* trace); private: int stack_pointer_register_; int current_position_register_; int clear_capture_count_; int clear_capture_start_; }; class Guard: public ZoneObject { public: enum Relation { LT, GEQ }; Guard(int reg, Relation op, int value) : reg_(reg), op_(op), value_(value) { } int reg() { return reg_; } Relation op() { return op_; } int value() { return value_; } private: int reg_; Relation op_; int value_; }; class GuardedAlternative { public: explicit GuardedAlternative(RegExpNode* node) : node_(node), guards_(NULL) { } void AddGuard(Guard* guard); RegExpNode* node() { return node_; } void set_node(RegExpNode* node) { node_ = node; } ZoneList<Guard*>* guards() { return guards_; } private: RegExpNode* node_; ZoneList<Guard*>* guards_; }; class AlternativeGeneration; class ChoiceNode: public RegExpNode { public: explicit ChoiceNode(int expected_size) : alternatives_(new ZoneList<GuardedAlternative>(expected_size)), table_(NULL), not_at_start_(false), being_calculated_(false) { } virtual void Accept(NodeVisitor* visitor); void AddAlternative(GuardedAlternative node) { alternatives()->Add(node); } ZoneList<GuardedAlternative>* alternatives() { return alternatives_; } DispatchTable* GetTable(bool ignore_case); virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start); int EatsAtLeastHelper(int still_to_find, int recursion_depth, RegExpNode* ignore_this_node, bool not_at_start); virtual void GetQuickCheckDetails(QuickCheckDetails* details, RegExpCompiler* compiler, int characters_filled_in, bool not_at_start); virtual ChoiceNode* Clone() { return new ChoiceNode(*this); } bool being_calculated() { return being_calculated_; } bool not_at_start() { return not_at_start_; } void set_not_at_start() { not_at_start_ = true; } void set_being_calculated(bool b) { being_calculated_ = b; } virtual bool try_to_emit_quick_check_for_alternative(int i) { return true; } protected: int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative); ZoneList<GuardedAlternative>* alternatives_; private: friend class DispatchTableConstructor; friend class Analysis; void GenerateGuard(RegExpMacroAssembler* macro_assembler, Guard* guard, Trace* trace); int CalculatePreloadCharacters(RegExpCompiler* compiler, bool not_at_start); void EmitOutOfLineContinuation(RegExpCompiler* compiler, Trace* trace, GuardedAlternative alternative, AlternativeGeneration* alt_gen, int preload_characters, bool next_expects_preload); DispatchTable* table_; // If true, this node is never checked at the start of the input. // Allows a new trace to start with at_start() set to false. bool not_at_start_; bool being_calculated_; }; class NegativeLookaheadChoiceNode: public ChoiceNode { public: explicit NegativeLookaheadChoiceNode(GuardedAlternative this_must_fail, GuardedAlternative then_do_this) : ChoiceNode(2) { AddAlternative(this_must_fail); AddAlternative(then_do_this); } virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start); virtual void GetQuickCheckDetails(QuickCheckDetails* details, RegExpCompiler* compiler, int characters_filled_in, bool not_at_start); // For a negative lookahead we don't emit the quick check for the // alternative that is expected to fail. This is because quick check code // starts by loading enough characters for the alternative that takes fewest // characters, but on a negative lookahead the negative branch did not take // part in that calculation (EatsAtLeast) so the assumptions don't hold. virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; } virtual int ComputeFirstCharacterSet(int budget); }; class LoopChoiceNode: public ChoiceNode { public: explicit LoopChoiceNode(bool body_can_be_zero_length) : ChoiceNode(2), loop_node_(NULL), continue_node_(NULL), body_can_be_zero_length_(body_can_be_zero_length) { } void AddLoopAlternative(GuardedAlternative alt); void AddContinueAlternative(GuardedAlternative alt); virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual int EatsAtLeast(int still_to_find, int recursion_depth, bool not_at_start); virtual void GetQuickCheckDetails(QuickCheckDetails* details, RegExpCompiler* compiler, int characters_filled_in, bool not_at_start); virtual int ComputeFirstCharacterSet(int budget); virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); } RegExpNode* loop_node() { return loop_node_; } RegExpNode* continue_node() { return continue_node_; } bool body_can_be_zero_length() { return body_can_be_zero_length_; } virtual void Accept(NodeVisitor* visitor); private: // AddAlternative is made private for loop nodes because alternatives // should not be added freely, we need to keep track of which node // goes back to the node itself. void AddAlternative(GuardedAlternative node) { ChoiceNode::AddAlternative(node); } RegExpNode* loop_node_; RegExpNode* continue_node_; bool body_can_be_zero_length_; }; // There are many ways to generate code for a node. This class encapsulates // the current way we should be generating. In other words it encapsulates // the current state of the code generator. The effect of this is that we // generate code for paths that the matcher can take through the regular // expression. A given node in the regexp can be code-generated several times // as it can be part of several traces. For example for the regexp: // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code // to match foo is generated only once (the traces have a common prefix). The // code to store the capture is deferred and generated (twice) after the places // where baz has been matched. class Trace { public: // A value for a property that is either known to be true, know to be false, // or not known. enum TriBool { UNKNOWN = -1, FALSE = 0, TRUE = 1 }; class DeferredAction { public: DeferredAction(ActionNode::Type type, int reg) : type_(type), reg_(reg), next_(NULL) { } DeferredAction* next() { return next_; } bool Mentions(int reg); int reg() { return reg_; } ActionNode::Type type() { return type_; } private: ActionNode::Type type_; int reg_; DeferredAction* next_; friend class Trace; }; class DeferredCapture : public DeferredAction { public: DeferredCapture(int reg, bool is_capture, Trace* trace) : DeferredAction(ActionNode::STORE_POSITION, reg), cp_offset_(trace->cp_offset()), is_capture_(is_capture) { } int cp_offset() { return cp_offset_; } bool is_capture() { return is_capture_; } private: int cp_offset_; bool is_capture_; void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } }; class DeferredSetRegister : public DeferredAction { public: DeferredSetRegister(int reg, int value) : DeferredAction(ActionNode::SET_REGISTER, reg), value_(value) { } int value() { return value_; } private: int value_; }; class DeferredClearCaptures : public DeferredAction { public: explicit DeferredClearCaptures(Interval range) : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) { } Interval range() { return range_; } private: Interval range_; }; class DeferredIncrementRegister : public DeferredAction { public: explicit DeferredIncrementRegister(int reg) : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { } }; Trace() : cp_offset_(0), actions_(NULL), backtrack_(NULL), stop_node_(NULL), loop_label_(NULL), characters_preloaded_(0), bound_checked_up_to_(0), flush_budget_(100), at_start_(UNKNOWN) { } // End the trace. This involves flushing the deferred actions in the trace // and pushing a backtrack location onto the backtrack stack. Once this is // done we can start a new trace or go to one that has already been // generated. void Flush(RegExpCompiler* compiler, RegExpNode* successor); int cp_offset() { return cp_offset_; } DeferredAction* actions() { return actions_; } // A trivial trace is one that has no deferred actions or other state that // affects the assumptions used when generating code. There is no recorded // backtrack location in a trivial trace, so with a trivial trace we will // generate code that, on a failure to match, gets the backtrack location // from the backtrack stack rather than using a direct jump instruction. We // always start code generation with a trivial trace and non-trivial traces // are created as we emit code for nodes or add to the list of deferred // actions in the trace. The location of the code generated for a node using // a trivial trace is recorded in a label in the node so that gotos can be // generated to that code. bool is_trivial() { return backtrack_ == NULL && actions_ == NULL && cp_offset_ == 0 && characters_preloaded_ == 0 && bound_checked_up_to_ == 0 && quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN; } TriBool at_start() { return at_start_; } void set_at_start(bool at_start) { at_start_ = at_start ? TRUE : FALSE; } Label* backtrack() { return backtrack_; } Label* loop_label() { return loop_label_; } RegExpNode* stop_node() { return stop_node_; } int characters_preloaded() { return characters_preloaded_; } int bound_checked_up_to() { return bound_checked_up_to_; } int flush_budget() { return flush_budget_; } QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; } bool mentions_reg(int reg); // Returns true if a deferred position store exists to the specified // register and stores the offset in the out-parameter. Otherwise // returns false. bool GetStoredPosition(int reg, int* cp_offset); // These set methods and AdvanceCurrentPositionInTrace should be used only on // new traces - the intention is that traces are immutable after creation. void add_action(DeferredAction* new_action) { ASSERT(new_action->next_ == NULL); new_action->next_ = actions_; actions_ = new_action; } void set_backtrack(Label* backtrack) { backtrack_ = backtrack; } void set_stop_node(RegExpNode* node) { stop_node_ = node; } void set_loop_label(Label* label) { loop_label_ = label; } void set_characters_preloaded(int count) { characters_preloaded_ = count; } void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; } void set_flush_budget(int to) { flush_budget_ = to; } void set_quick_check_performed(QuickCheckDetails* d) { quick_check_performed_ = *d; } void InvalidateCurrentCharacter(); void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler); private: int FindAffectedRegisters(OutSet* affected_registers); void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register, OutSet& affected_registers, OutSet* registers_to_pop, OutSet* registers_to_clear); void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register, OutSet& registers_to_pop, OutSet& registers_to_clear); int cp_offset_; DeferredAction* actions_; Label* backtrack_; RegExpNode* stop_node_; Label* loop_label_; int characters_preloaded_; int bound_checked_up_to_; QuickCheckDetails quick_check_performed_; int flush_budget_; TriBool at_start_; }; class NodeVisitor { public: virtual ~NodeVisitor() { } #define DECLARE_VISIT(Type) \ virtual void Visit##Type(Type##Node* that) = 0; FOR_EACH_NODE_TYPE(DECLARE_VISIT) #undef DECLARE_VISIT virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); } }; // Node visitor used to add the start set of the alternatives to the // dispatch table of a choice node. class DispatchTableConstructor: public NodeVisitor { public: DispatchTableConstructor(DispatchTable* table, bool ignore_case) : table_(table), choice_index_(-1), ignore_case_(ignore_case) { } void BuildTable(ChoiceNode* node); void AddRange(CharacterRange range) { table()->AddRange(range, choice_index_); } void AddInverse(ZoneList<CharacterRange>* ranges); #define DECLARE_VISIT(Type) \ virtual void Visit##Type(Type##Node* that); FOR_EACH_NODE_TYPE(DECLARE_VISIT) #undef DECLARE_VISIT DispatchTable* table() { return table_; } void set_choice_index(int value) { choice_index_ = value; } protected: DispatchTable* table_; int choice_index_; bool ignore_case_; }; // Assertion propagation moves information about assertions such as // \b to the affected nodes. For instance, in /.\b./ information must // be propagated to the first '.' that whatever follows needs to know // if it matched a word or a non-word, and to the second '.' that it // has to check if it succeeds a word or non-word. In this case the // result will be something like: // // +-------+ +------------+ // | . | | . | // +-------+ ---> +------------+ // | word? | | check word | // +-------+ +------------+ class Analysis: public NodeVisitor { public: Analysis(bool ignore_case, bool is_ascii) : ignore_case_(ignore_case), is_ascii_(is_ascii), error_message_(NULL) { } void EnsureAnalyzed(RegExpNode* node); #define DECLARE_VISIT(Type) \ virtual void Visit##Type(Type##Node* that); FOR_EACH_NODE_TYPE(DECLARE_VISIT) #undef DECLARE_VISIT virtual void VisitLoopChoice(LoopChoiceNode* that); bool has_failed() { return error_message_ != NULL; } const char* error_message() { ASSERT(error_message_ != NULL); return error_message_; } void fail(const char* error_message) { error_message_ = error_message; } private: bool ignore_case_; bool is_ascii_; const char* error_message_; DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis); }; struct RegExpCompileData { RegExpCompileData() : tree(NULL), node(NULL), simple(true), contains_anchor(false), capture_count(0) { } RegExpTree* tree; RegExpNode* node; bool simple; bool contains_anchor; Handle<String> error; int capture_count; }; class RegExpEngine: public AllStatic { public: struct CompilationResult { explicit CompilationResult(const char* error_message) : error_message(error_message), code(HEAP->the_hole_value()), num_registers(0) {} CompilationResult(Object* code, int registers) : error_message(NULL), code(code), num_registers(registers) {} const char* error_message; Object* code; int num_registers; }; static CompilationResult Compile(RegExpCompileData* input, bool ignore_case, bool multiline, Handle<String> pattern, bool is_ascii); static void DotPrint(const char* label, RegExpNode* node, bool ignore_case); }; class OffsetsVector { public: inline OffsetsVector(int num_registers, Isolate* isolate) : offsets_vector_length_(num_registers) { if (offsets_vector_length_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { vector_ = NewArray<int>(offsets_vector_length_); } else { vector_ = isolate->jsregexp_static_offsets_vector(); } } inline ~OffsetsVector() { if (offsets_vector_length_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { DeleteArray(vector_); vector_ = NULL; } } inline int* vector() { return vector_; } inline int length() { return offsets_vector_length_; } static const int kStaticOffsetsVectorSize = 50; private: static Address static_offsets_vector_address(Isolate* isolate) { return reinterpret_cast<Address>(isolate->jsregexp_static_offsets_vector()); } int* vector_; int offsets_vector_length_; friend class ExternalReference; }; } } // namespace v8::internal #endif // V8_JSREGEXP_H_