#!/usr/bin/python
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
"""Generate C++ tables for Unicode Script and Category groups."""
import sys
import unicode
_header = """
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
// make_unicode_groups.py >unicode_groups.cc
#include "re2/unicode_groups.h"
namespace re2 {
"""
_trailer = """
} // namespace re2
"""
n16 = 0
n32 = 0
def MakeRanges(codes):
"""Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]]"""
ranges = []
last = -100
for c in codes:
if c == last+1:
ranges[-1][1] = c
else:
ranges.append([c, c])
last = c
return ranges
def PrintRanges(type, name, ranges):
"""Print the ranges as an array of type named name."""
print "static %s %s[] = {" % (type, name,)
for lo, hi in ranges:
print "\t{ %d, %d }," % (lo, hi)
print "};"
# def PrintCodes(type, name, codes):
# """Print the codes as an array of type named name."""
# print "static %s %s[] = {" % (type, name,)
# for c in codes:
# print "\t%d," % (c,)
# print "};"
def PrintGroup(name, codes):
"""Print the data structures for the group of codes.
Return a UGroup literal for the group."""
# See unicode_groups.h for a description of the data structure.
# Split codes into 16-bit ranges and 32-bit ranges.
range16 = MakeRanges([c for c in codes if c < 65536])
range32 = MakeRanges([c for c in codes if c >= 65536])
# Pull singleton ranges out of range16.
# code16 = [lo for lo, hi in range16 if lo == hi]
# range16 = [[lo, hi] for lo, hi in range16 if lo != hi]
global n16
global n32
n16 += len(range16)
n32 += len(range32)
ugroup = "{ \"%s\", +1" % (name,)
# if len(code16) > 0:
# PrintCodes("uint16", name+"_code16", code16)
# ugroup += ", %s_code16, %d" % (name, len(code16))
# else:
# ugroup += ", 0, 0"
if len(range16) > 0:
PrintRanges("URange16", name+"_range16", range16)
ugroup += ", %s_range16, %d" % (name, len(range16))
else:
ugroup += ", 0, 0"
if len(range32) > 0:
PrintRanges("URange32", name+"_range32", range32)
ugroup += ", %s_range32, %d" % (name, len(range32))
else:
ugroup += ", 0, 0"
ugroup += " }"
return ugroup
def main():
print _header
ugroups = []
for name, codes in unicode.Categories().iteritems():
ugroups.append(PrintGroup(name, codes))
for name, codes in unicode.Scripts().iteritems():
ugroups.append(PrintGroup(name, codes))
print "// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32)
print "UGroup unicode_groups[] = {";
ugroups.sort()
for ug in ugroups:
print "\t%s," % (ug,)
print "};"
print "int num_unicode_groups = %d;" % (len(ugroups),)
print _trailer
if __name__ == '__main__':
main()