普通文本  |  265行  |  7.62 KB

// Copyright 2008 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Regular expression generator: generates all possible
// regular expressions within parameters (see regexp_generator.h for details).

// The regexp generator first generates a sequence of commands in a simple
// postfix language.  Each command in the language is a string,
// like "a" or "%s*" or "%s|%s".
//
// To evaluate a command, enough arguments are popped from the value stack to
// plug into the %s slots.  Then the result is pushed onto the stack.
// For example, the command sequence
//      a b %s%s c
// results in the stack
//      ab c
//
// GeneratePostfix generates all possible command sequences.
// Then RunPostfix turns each sequence into a regular expression
// and passes the regexp to HandleRegexp.

#include <string.h>
#include <string>
#include <stack>
#include <vector>
#include "util/test.h"
#include "re2/testing/regexp_generator.h"

namespace re2 {

// Returns a vector of the egrep regexp operators.
const vector<string>& RegexpGenerator::EgrepOps() {
  static const char *ops[] = {
    "%s%s",
    "%s|%s",
    "%s*",
    "%s+",
    "%s?",
    "%s\\C*",
  };
  static vector<string> v(ops, ops + arraysize(ops));
  return v;
}

RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
                                 const vector<string>& atoms,
                                 const vector<string>& ops)
    : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) {
  // Degenerate case.
  if (atoms_.size() == 0)
    maxatoms_ = 0;
  if (ops_.size() == 0)
    maxops_ = 0;
}

// Generates all possible regular expressions (within the parameters),
// calling HandleRegexp for each one.
void RegexpGenerator::Generate() {
  vector<string> postfix;
  GeneratePostfix(&postfix, 0, 0, 0);
}

// Generates random regular expressions, calling HandleRegexp for each one.
void RegexpGenerator::GenerateRandom(int32 seed, int n) {
  ACMRandom acm(seed);
  acm_ = &acm;

  for (int i = 0; i < n; i++) {
    vector<string> postfix;
    GenerateRandomPostfix(&postfix, 0, 0, 0);
  }

  acm_ = NULL;
}

// Counts and returns the number of occurrences of "%s" in s.
static int CountArgs(const string& s) {
  const char *p = s.c_str();
  int n = 0;
  while ((p = strstr(p, "%s")) != NULL) {
    p += 2;
    n++;
  }
  return n;
}

// Generates all possible postfix command sequences.
// Each sequence is handed off to RunPostfix to generate a regular expression.
// The arguments are:
//   post:  the current postfix sequence
//   nstk:  the number of elements that would be on the stack after executing
//          the sequence
//   ops:   the number of operators used in the sequence
//   atoms: the number of atoms used in the sequence
// For example, if post were ["a", "b", "%s%s", "c"],
// then nstk = 2, ops = 1, atoms = 3.
//
// The initial call should be GeneratePostfix([empty vector], 0, 0, 0).
//
void RegexpGenerator::GeneratePostfix(vector<string>* post, int nstk,
                                      int ops, int atoms) {
  if (nstk == 1)
    RunPostfix(*post);

  // Early out: if used too many operators or can't
  // get back down to a single expression on the stack
  // using binary operators, give up.
  if (ops + nstk - 1 > maxops_)
    return;

  // Add atoms if there is room.
  if (atoms < maxatoms_) {
    for (int i = 0; i < atoms_.size(); i++) {
      post->push_back(atoms_[i]);
      GeneratePostfix(post, nstk + 1, ops, atoms + 1);
      post->pop_back();
    }
  }

  // Add operators if there are enough arguments.
  if (ops < maxops_) {
    for (int i = 0; i < ops_.size(); i++) {
      const string& fmt = ops_[i];
      int nargs = CountArgs(fmt);
      if (nargs <= nstk) {
        post->push_back(fmt);
        GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
        post->pop_back();
      }
    }
  }
}

// Generates a random postfix command sequence.
// Stops and returns true once a single sequence has been generated.
bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk,
                                            int ops, int atoms) {
  for (;;) {
    // Stop if we get to a single element, but only sometimes.
    if (nstk == 1 && acm_->Uniform(maxatoms_ + 1 - atoms) == 0) {
      RunPostfix(*post);
      return true;
    }

    // Early out: if used too many operators or can't
    // get back down to a single expression on the stack
    // using binary operators, give up.
    if (ops + nstk - 1 > maxops_)
      return false;

    // Add operators if there are enough arguments.
    if (ops < maxops_ && acm_->Uniform(2) == 0) {
      const string& fmt = ops_[acm_->Uniform(ops_.size())];
      int nargs = CountArgs(fmt);
      if (nargs <= nstk) {
        post->push_back(fmt);
        bool ret = GenerateRandomPostfix(post, nstk - nargs + 1,
                                         ops + 1, atoms);
        post->pop_back();
        if (ret)
          return true;
      }
    }

    // Add atoms if there is room.
    if (atoms < maxatoms_ && acm_->Uniform(2) == 0) {
      post->push_back(atoms_[acm_->Uniform(atoms_.size())]);
      bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
      post->pop_back();
      if (ret)
        return true;
    }
  }
}

// Interprets the postfix command sequence to create a regular expression
// passed to HandleRegexp.  The results of operators like %s|%s are wrapped
// in (?: ) to avoid needing to maintain a precedence table.
void RegexpGenerator::RunPostfix(const vector<string>& post) {
  stack<string> regexps;
  for (int i = 0; i < post.size(); i++) {
    switch (CountArgs(post[i])) {
      default:
        LOG(FATAL) << "Bad operator: " << post[i];
      case 0:
        regexps.push(post[i]);
        break;
      case 1: {
        string a = regexps.top();
        regexps.pop();
        regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")");
        break;
      }
      case 2: {
        string b = regexps.top();
        regexps.pop();
        string a = regexps.top();
        regexps.pop();
        regexps.push("(?:" +
                     StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) +
                     ")");
        break;
      }
    }
  }

  if (regexps.size() != 1) {
    // Internal error - should never happen.
    printf("Bad regexp program:\n");
    for (int i = 0; i < post.size(); i++) {
      printf("  %s\n", CEscape(post[i]).c_str());
    }
    printf("Stack after running program:\n");
    while (!regexps.empty()) {
      printf("  %s\n", CEscape(regexps.top()).c_str());
      regexps.pop();
    }
    LOG(FATAL) << "Bad regexp program.";
  }

  HandleRegexp(regexps.top());
  HandleRegexp("^(?:" + regexps.top() + ")$");
  HandleRegexp("^(?:" + regexps.top() + ")");
  HandleRegexp("(?:" + regexps.top() + ")$");
}

// Split s into an vector of strings, one for each UTF-8 character.
vector<string> Explode(const StringPiece& s) {
  vector<string> v;

  for (const char *q = s.begin(); q < s.end(); ) {
    const char* p = q;
    Rune r;
    q += chartorune(&r, q);
    v.push_back(string(p, q - p));
  }

  return v;
}

// Split string everywhere a substring is found, returning
// vector of pieces.
vector<string> Split(const StringPiece& sep, const StringPiece& s) {
  vector<string> v;

  if (sep.size() == 0)
    return Explode(s);

  const char *p = s.begin();
  for (const char *q = s.begin(); q + sep.size() <= s.end(); q++) {
    if (StringPiece(q, sep.size()) == sep) {
      v.push_back(string(p, q - p));
      p = q + sep.size();
      q = p - 1;  // -1 for ++ in loop
      continue;
    }
  }
  if (p < s.end())
    v.push_back(string(p, s.end() - p));
  return v;
}

}  // namespace re2