/* * Copyright (C) 2009 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef RegexInterpreter_h #define RegexInterpreter_h #include <wtf/Platform.h> #if ENABLE(YARR) #include <wtf/unicode/Unicode.h> #include "RegexParser.h" #include "RegexPattern.h" namespace JSC { namespace Yarr { class ByteDisjunction; struct ByteTerm { enum Type { TypeBodyAlternativeBegin, TypeBodyAlternativeDisjunction, TypeBodyAlternativeEnd, TypeAlternativeBegin, TypeAlternativeDisjunction, TypeAlternativeEnd, TypeSubpatternBegin, TypeSubpatternEnd, TypeAssertionBOL, TypeAssertionEOL, TypeAssertionWordBoundary, TypePatternCharacterOnce, TypePatternCharacterFixed, TypePatternCharacterGreedy, TypePatternCharacterNonGreedy, TypePatternCasedCharacterOnce, TypePatternCasedCharacterFixed, TypePatternCasedCharacterGreedy, TypePatternCasedCharacterNonGreedy, TypeCharacterClass, TypeBackReference, TypeParenthesesSubpattern, TypeParenthesesSubpatternOnceBegin, TypeParenthesesSubpatternOnceEnd, TypeParentheticalAssertionBegin, TypeParentheticalAssertionEnd, TypeCheckInput, } type; bool invertOrCapture; union { struct { union { UChar patternCharacter; struct { UChar lo; UChar hi; } casedCharacter; CharacterClass* characterClass; unsigned subpatternId; }; union { ByteDisjunction* parenthesesDisjunction; unsigned parenthesesWidth; }; QuantifierType quantityType; unsigned quantityCount; } atom; struct { int next; int end; } alternative; unsigned checkInputCount; }; unsigned frameLocation; int inputPosition; ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) : frameLocation(frameLocation) { switch (quantityType) { case QuantifierFixedCount: type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; break; case QuantifierGreedy: type = ByteTerm::TypePatternCharacterGreedy; break; case QuantifierNonGreedy: type = ByteTerm::TypePatternCharacterNonGreedy; break; } atom.patternCharacter = ch; atom.quantityType = quantityType; atom.quantityCount = quantityCount; inputPosition = inputPos; } ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) : frameLocation(frameLocation) { switch (quantityType) { case QuantifierFixedCount: type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; break; case QuantifierGreedy: type = ByteTerm::TypePatternCasedCharacterGreedy; break; case QuantifierNonGreedy: type = ByteTerm::TypePatternCasedCharacterNonGreedy; break; } atom.casedCharacter.lo = lo; atom.casedCharacter.hi = hi; atom.quantityType = quantityType; atom.quantityCount = quantityCount; inputPosition = inputPos; } ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) : type(ByteTerm::TypeCharacterClass) , invertOrCapture(invert) { atom.characterClass = characterClass; atom.quantityType = QuantifierFixedCount; atom.quantityCount = 1; inputPosition = inputPos; } ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool invertOrCapture, int inputPos) : type(type) , invertOrCapture(invertOrCapture) { atom.subpatternId = subpatternId; atom.parenthesesDisjunction = parenthesesInfo; atom.quantityType = QuantifierFixedCount; atom.quantityCount = 1; inputPosition = inputPos; } ByteTerm(Type type, bool invert = false) : type(type) , invertOrCapture(invert) { atom.quantityType = QuantifierFixedCount; atom.quantityCount = 1; } ByteTerm(Type type, unsigned subpatternId, bool invertOrCapture, int inputPos) : type(type) , invertOrCapture(invertOrCapture) { atom.subpatternId = subpatternId; atom.quantityType = QuantifierFixedCount; atom.quantityCount = 1; inputPosition = inputPos; } static ByteTerm BOL(int inputPos) { ByteTerm term(TypeAssertionBOL); term.inputPosition = inputPos; return term; } static ByteTerm CheckInput(unsigned count) { ByteTerm term(TypeCheckInput); term.checkInputCount = count; return term; } static ByteTerm EOL(int inputPos) { ByteTerm term(TypeAssertionEOL); term.inputPosition = inputPos; return term; } static ByteTerm WordBoundary(bool invert, int inputPos) { ByteTerm term(TypeAssertionWordBoundary, invert); term.inputPosition = inputPos; return term; } static ByteTerm BackReference(unsigned subpatternId, int inputPos) { return ByteTerm(TypeBackReference, subpatternId, false, inputPos); } static ByteTerm BodyAlternativeBegin() { ByteTerm term(TypeBodyAlternativeBegin); term.alternative.next = 0; term.alternative.end = 0; return term; } static ByteTerm BodyAlternativeDisjunction() { ByteTerm term(TypeBodyAlternativeDisjunction); term.alternative.next = 0; term.alternative.end = 0; return term; } static ByteTerm BodyAlternativeEnd() { ByteTerm term(TypeBodyAlternativeEnd); term.alternative.next = 0; term.alternative.end = 0; return term; } static ByteTerm AlternativeBegin() { ByteTerm term(TypeAlternativeBegin); term.alternative.next = 0; term.alternative.end = 0; return term; } static ByteTerm AlternativeDisjunction() { ByteTerm term(TypeAlternativeDisjunction); term.alternative.next = 0; term.alternative.end = 0; return term; } static ByteTerm AlternativeEnd() { ByteTerm term(TypeAlternativeEnd); term.alternative.next = 0; term.alternative.end = 0; return term; } static ByteTerm SubpatternBegin() { return ByteTerm(TypeSubpatternBegin); } static ByteTerm SubpatternEnd() { return ByteTerm(TypeSubpatternEnd); } bool invert() { return invertOrCapture; } bool capture() { return invertOrCapture; } }; class ByteDisjunction : public FastAllocBase { public: ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) : m_numSubpatterns(numSubpatterns) , m_frameSize(frameSize) { } Vector<ByteTerm> terms; unsigned m_numSubpatterns; unsigned m_frameSize; }; struct BytecodePattern : FastAllocBase { BytecodePattern(ByteDisjunction* body, Vector<ByteDisjunction*> allParenthesesInfo, RegexPattern& pattern) : m_body(body) , m_ignoreCase(pattern.m_ignoreCase) , m_multiline(pattern.m_multiline) { newlineCharacterClass = pattern.newlineCharacterClass(); wordcharCharacterClass = pattern.wordcharCharacterClass(); m_allParenthesesInfo.append(allParenthesesInfo); m_userCharacterClasses.append(pattern.m_userCharacterClasses); // 'Steal' the RegexPattern's CharacterClasses! We clear its // array, so that it won't delete them on destruction. We'll // take responsibility for that. pattern.m_userCharacterClasses.clear(); } ~BytecodePattern() { deleteAllValues(m_allParenthesesInfo); deleteAllValues(m_userCharacterClasses); } OwnPtr<ByteDisjunction> m_body; bool m_ignoreCase; bool m_multiline; CharacterClass* newlineCharacterClass; CharacterClass* wordcharCharacterClass; private: Vector<ByteDisjunction*> m_allParenthesesInfo; Vector<CharacterClass*> m_userCharacterClasses; }; BytecodePattern* byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false); int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output); } } // namespace JSC::Yarr #endif #endif // RegexInterpreter_h