普通文本  |  147行  |  3.51 KB

#!/usr/bin/python
# coding=utf-8
#
# Copyright 2008 The RE2 Authors.  All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

# See unicode_casefold.h for description of case folding tables.

"""Generate C++ table for Unicode case folding."""

import unicode, sys

_header = """
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
// make_unicode_casefold.py >unicode_casefold.cc

#include "re2/unicode_casefold.h"

namespace re2 {

"""

_trailer = """

} // namespace re2

"""

def _Delta(a, b):
  """Compute the delta for b - a.  Even/odd and odd/even
     are handled specially, as described above."""
  if a+1 == b:
    if a%2 == 0:
      return 'EvenOdd'
    else:
      return 'OddEven'
  if a == b+1:
    if a%2 == 0:
      return 'OddEven'
    else:
      return 'EvenOdd'
  return b - a

def _AddDelta(a, delta):
  """Return a + delta, handling EvenOdd and OddEven specially."""
  if type(delta) == int:
    return a+delta
  if delta == 'EvenOdd':
    if a%2 == 0:
      return a+1
    else:
      return a-1
  if delta == 'OddEven':
    if a%2 == 1:
      return a+1
    else:
      return a-1
  print >>sys.stderr, "Bad Delta: ", delta
  raise "Bad Delta"

def _MakeRanges(pairs):
  """Turn a list like [(65,97), (66, 98), ..., (90,122)]
     into [(65, 90, +32)]."""
  ranges = []
  last = -100

  def evenodd(last, a, b, r):
    if a != last+1 or b != _AddDelta(a, r[2]):
      return False
    r[1] = a
    return True

  def evenoddpair(last, a, b, r):
    if a != last+2:
      return False
    delta = r[2]
    d = delta
    if type(delta) is not str:
      return False
    if delta.endswith('Skip'):
      d = delta[:-4]
    else:
      delta = d + 'Skip'
    if b != _AddDelta(a, d):
      return False
    r[1] = a
    r[2] = delta
    return True

  for a, b in pairs:
    if ranges and evenodd(last, a, b, ranges[-1]):
      pass
    elif ranges and evenoddpair(last, a, b, ranges[-1]):
      pass
    else:
      ranges.append([a, a, _Delta(a, b)])
    last = a
  return ranges

# The maximum size of a case-folding group.
# Case folding is implemented in parse.cc by a recursive process
# with a recursion depth equal to the size of the largest
# case-folding group, so it is important that this bound be small.
# The current tables have no group bigger than 4.
# If there are ever groups bigger than 10 or so, it will be
# time to rework the code in parse.cc.
MaxCasefoldGroup = 4

def main():
  lowergroups, casegroups = unicode.CaseGroups()
  foldpairs = []
  seen = {}
  for c in casegroups:
    if len(c) > MaxCasefoldGroup:
      raise unicode.Error("casefold group too long: %s" % (c,))
    for i in range(len(c)):
      if c[i-1] in seen:
        raise unicode.Error("bad casegroups %d -> %d" % (c[i-1], c[i]))
      seen[c[i-1]] = True
      foldpairs.append([c[i-1], c[i]])

  lowerpairs = []
  for lower, group in lowergroups.iteritems():
    for g in group:
      if g != lower:
        lowerpairs.append([g, lower])

  def printpairs(name, foldpairs):
    foldpairs.sort()
    foldranges = _MakeRanges(foldpairs)
    print "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges))
    print "CaseFold unicode_%s[] = {" % (name,)
    for lo, hi, delta in foldranges:
      print "\t{ %d, %d, %s }," % (lo, hi, delta)
    print "};"
    print "int num_unicode_%s = %d;" % (name, len(foldranges),)
    print ""

  print _header
  printpairs("casefold", foldpairs)
  printpairs("tolower", lowerpairs)
  print _trailer

if __name__ == '__main__':
  main()