普通文本  |  172行  |  4 KB

// Copyright 2007 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Test prog.cc, compile.cc

#include <string>
#include <vector>
#include "util/test.h"
#include "re2/regexp.h"
#include "re2/prog.h"

// Defines a string flag named `show` with an empty default and the help
// text "regular expression to compile and dump".
// NOTE(review): nothing in the visible part of this file reads the flag;
// confirm whether it is still consumed somewhere before relying on it.
DEFINE_string(show, "", "regular expression to compile and dump");

namespace re2 {

// Simple input/output tests checking that
// the regexp compiles to the expected code.
// These are just to sanity check the basic implementation.
// The real confidence tests happen by testing the NFA/DFA
// that run the compiled code.

// One table-driven test case: a regular expression paired with the exact
// program listing that compiling it is expected to produce.
struct Test {
  // Pattern text passed to Regexp::Parse.
  const char* regexp;
  // Expected program listing; compared verbatim against Prog::Dump().
  const char* code;
};

// Test table consumed by TEST(TestRegexpCompileToProg, Simple).
// Each entry is { regexp, expected dump }. The expected dump is compared
// byte-for-byte with the output of Prog::Dump(), so the order of the
// listing lines and the trailing "\n" on every line are significant.
// Byte ranges in the listings are written in hexadecimal
// (for example, 61 is 'a' and 0a is '\n').
static Test tests[] = {
  // Single literals and concatenation.
  { "a",
    "1. byte [61-61] -> 2\n"
    "2. match! 0\n" },
  { "ab",
    "1. byte [61-61] -> 2\n"
    "2. byte [62-62] -> 3\n"
    "3. match! 0\n" },
  // Alternation of non-adjacent bytes is expected to keep an alt instruction.
  { "a|c",
    "3. alt -> 1 | 2\n"
    "1. byte [61-61] -> 4\n"
    "2. byte [63-63] -> 4\n"
    "4. match! 0\n" },
  // "a|b" and "[ab]" are expected to produce the same single byte range.
  { "a|b",
    "1. byte [61-62] -> 2\n"
    "2. match! 0\n" },
  { "[ab]",
    "1. byte [61-62] -> 2\n"
    "2. match! 0\n" },
  // Repetition operators. Each '?'-suffixed form differs from its
  // counterpart in the order of the two alt targets.
  { "a+",
    "1. byte [61-61] -> 2\n"
    "2. alt -> 1 | 3\n"
    "3. match! 0\n" },
  { "a+?",
    "1. byte [61-61] -> 2\n"
    "2. alt -> 3 | 1\n"
    "3. match! 0\n" },
  { "a*",
    "2. alt -> 1 | 3\n"
    "1. byte [61-61] -> 2\n"
    "3. match! 0\n" },
  { "a*?",
    "2. alt -> 3 | 1\n"
    "3. match! 0\n"
    "1. byte [61-61] -> 2\n" },
  { "a?",
    "2. alt -> 1 | 3\n"
    "1. byte [61-61] -> 3\n"
    "3. match! 0\n" },
  { "a??",
    "2. alt -> 3 | 1\n"
    "3. match! 0\n"
    "1. byte [61-61] -> 3\n" },
  // A counted repetition is expected to be unrolled into four byte matches.
  { "a{4}",
    "1. byte [61-61] -> 2\n"
    "2. byte [61-61] -> 3\n"
    "3. byte [61-61] -> 4\n"
    "4. byte [61-61] -> 5\n"
    "5. match! 0\n" },
  // A capturing group adds capture instructions around the byte match;
  // a non-capturing group is expected to compile exactly like plain "a".
  { "(a)",
    "2. capture 2 -> 1\n"
    "1. byte [61-61] -> 3\n"
    "3. capture 3 -> 4\n"
    "4. match! 0\n" },
  { "(?:a)",
    "1. byte [61-61] -> 2\n"
    "2. match! 0\n" },
  // The empty regexp is expected to compile to a lone match instruction.
  { "",
    "2. match! 0\n" },
  // Dot and negated classes: the expected ranges skip byte 0a ('\n').
  { ".",
    "3. alt -> 1 | 2\n"
    "1. byte [00-09] -> 4\n"
    "2. byte [0b-ff] -> 4\n"
    "4. match! 0\n" },
  { "[^ab]",
    "5. alt -> 3 | 4\n"
    "3. alt -> 1 | 2\n"
    "4. byte [63-ff] -> 6\n"
    "1. byte [00-09] -> 6\n"
    "2. byte [0b-60] -> 6\n"
    "6. match! 0\n" },
  // An upper/lower pair is expected to become one "byte/i" instruction.
  { "[Aa]",
    "1. byte/i [61-61] -> 2\n"
    "2. match! 0\n" },
};

// Compiles every entry of tests[] and checks that the dumped program text
// equals the expected listing exactly. Failures are logged and counted
// rather than stopping at the first one, so a single run reports every
// entry that does not parse, does not compile, or compiles differently.
TEST(TestRegexpCompileToProg, Simple) {
  int failed = 0;
  for (int idx = 0; idx < arraysize(tests); idx++) {
    const re2::Test& tc = tests[idx];

    Regexp* parsed =
        Regexp::Parse(tc.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
    if (parsed == NULL) {
      LOG(ERROR) << "Cannot parse: " << tc.regexp;
      failed++;
      continue;
    }

    Prog* compiled = parsed->CompileToProg(0);
    if (compiled != NULL) {
      // Compiling the same regexp with an argument of 1 must yield no
      // program. NOTE(review): presumably the argument is a memory budget
      // and 1 is too small for any program -- confirm in re2/regexp.h.
      CHECK(parsed->CompileToProg(1) == NULL);

      string dump = compiled->Dump();
      if (dump != tc.code) {
        LOG(ERROR) << "Incorrect compiled code for: " << tc.regexp;
        LOG(ERROR) << "Want:\n" << tc.code;
        LOG(ERROR) << "Got:\n" << dump;
        failed++;
      }
      delete compiled;
    } else {
      LOG(ERROR) << "Cannot compile: " << tc.regexp;
      failed++;
    }

    // Both the success and the compile-failure paths end by calling
    // Decref() on the parsed regexp.
    parsed->Decref();
  }
  EXPECT_EQ(failed, 0);
}

// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
// Once, erroneously split between 0x3f and 0x40 because it is
// a 6-bit boundary.
//
// Entry i is an inclusive [lo, hi] range of byte values; the ByteRanges
// test expects every byte in that range to map to bytemap class i, and
// expects the number of classes to equal the number of entries.
// The ranges are listed in ascending order and tile 0x00-0xFF with no gaps
// or overlaps, so every possible byte value is checked. The range that
// follows '\n' starts at 0x0B: starting it any later would leave the bytes
// in between outside every range and therefore never verified.
static struct UTF8ByteRange {
  int lo;  // First byte value of the range (inclusive).
  int hi;  // Last byte value of the range (inclusive).
} utf8ranges[] = {
  { 0x00, 0x09 },
  { 0x0A, 0x0A },  // '\n', the one byte that dot excludes.
  { 0x0B, 0x7F },
  { 0x80, 0x8F },
  { 0x90, 0x9F },
  { 0xA0, 0xBF },
  { 0xC0, 0xC1 },
  { 0xC2, 0xDF },
  { 0xE0, 0xE0 },
  { 0xE1, 0xEF },
  { 0xF0, 0xF0 },
  { 0xF1, 0xF3 },
  { 0xF4, 0xF4 },
  { 0xF5, 0xFF },
};

// Parses "." with the PerlX flags only, compiles it, and checks the
// program's byte map against utf8ranges: the number of byte classes must
// equal the number of table entries, and every byte inside entry k must be
// assigned class k.
TEST(TestCompile, ByteRanges) {
  Regexp* dot = Regexp::Parse(".", Regexp::PerlX, NULL);
  EXPECT_TRUE(dot != NULL);

  Prog* compiled = dot->CompileToProg(0);
  EXPECT_TRUE(compiled != NULL);

  EXPECT_EQ(compiled->bytemap_range(), arraysize(utf8ranges));
  for (int cls = 0; cls < arraysize(utf8ranges); cls++) {
    const int first = utf8ranges[cls].lo;
    const int last = utf8ranges[cls].hi;
    // Both bounds are inclusive.
    for (int byte = first; byte <= last; byte++) {
      EXPECT_EQ(compiled->bytemap()[byte], cls) << " byte " << byte;
    }
  }

  delete compiled;
  dot->Decref();
}

}  // namespace re2