#!/usr/bin/env python
from __future__ import print_function
import sys, os, re, difflib, unicodedata, errno, cgi
from itertools import *
# One-character line prefixes used in diff output.  The symbol at index i
# marks lines that come from the i-th input being compared.
diff_symbols = "-+=*&^%$#@!~/"
# Highlight colors, indexed in parallel with diff_symbols.
diff_colors = ['red', 'green', 'blue']
def codepoints(s):
    """Return an iterator over the integer code point of every item in `s`.

    This is the plain version; on "narrow" Python 2 builds the module
    replaces it with a surrogate-aware variant.
    """
    chars = iter(s)
    return (ord(ch) for ch in chars)
try:
    # Python 2: the builtin exists, keep it under the same module-level name.
    unichr = unichr

    if sys.maxunicode < 0x10FFFF:
        # Workarounds for Python 2 "narrow" builds with UCS2-only support.

        _narrow_unichr = unichr

        def unichr(i):
            """
            Return the unicode character whose Unicode code is the integer 'i'.
            The valid range is 0 to 0x10FFFF inclusive.

            >>> _narrow_unichr(0xFFFF + 1)
            Traceback (most recent call last):
              File "<stdin>", line 1, in ?
            ValueError: unichr() arg not in range(0x10000) (narrow Python build)
            >>> unichr(0xFFFF + 1) == u'\U00010000'
            True
            >>> unichr(1114111) == u'\U0010FFFF'
            True
            >>> unichr(0x10FFFF + 1)
            Traceback (most recent call last):
              File "<stdin>", line 1, in ?
            ValueError: unichr() arg not in range(0x110000)
            """
            try:
                return _narrow_unichr(i)
            except ValueError:
                # Outside the BMP: build a "\UXXXXXXXX" escape and let the
                # unicode-escape codec produce the character.
                try:
                    escape_str = "\\U" + hex(i)[2:].zfill(8)
                    return escape_str.decode("unicode-escape")
                except UnicodeDecodeError:
                    raise ValueError('unichr() arg not in range(0x110000)')

        def codepoints(s):
            """Yield code points of `s`, joining surrogate pairs.

            A surrogate that is not part of a valid high/low pair is
            reported as U+FFFC.
            """
            lead = None  # pending high surrogate, if any
            for ch in s:
                value = ord(ch)
                is_trail = 0xDC00 <= value <= 0xDFFF
                if is_trail and lead:
                    yield 0x10000 + (lead - 0xD800) * 0x400 + (value - 0xDC00)
                    lead = None
                    continue
                if is_trail:
                    # Low surrogate with no preceding high surrogate.
                    yield 0xFFFC
                    continue
                if lead:
                    # High surrogate that was not followed by a low one.
                    yield 0xFFFC
                    lead = None
                if 0xD800 <= value <= 0xDBFF:
                    lead = value
                else:
                    yield value
            if lead:
                yield 0xFFFC

except NameError:
    # Python 3: chr() already covers the full Unicode range.
    unichr = chr
# Make the name `unicode` available on both major Python versions: keep the
# builtin when it exists, otherwise alias it to `str`.
try:
    unicode = unicode
except NameError:
    unicode = str
def tounicode(s, encoding='ascii', errors='strict'):
    """Return `s` as a unicode string.

    Values that already are instances of `unicode` are returned unchanged;
    anything else is decoded with the given `encoding` and `errors` policy.
    """
    if isinstance(s, unicode):
        return s
    return s.decode(encoding, errors)
class ColorFormatter:
    """Namespace of output formatters used to highlight diff output.

    Every nested formatter class exposes the same four static methods:
    start_color(c), end_color(), escape(s) and newline().  Auto() selects
    one of them from command-line style flags.
    """

    class Null:
        """Formatter that emits no markup at all."""

        @staticmethod
        def start_color(c):
            return ''

        @staticmethod
        def end_color():
            return ''

        @staticmethod
        def escape(s):
            return s

        @staticmethod
        def newline():
            return '\n'

    class ANSI:
        """Formatter that emits ANSI terminal escape sequences."""

        @staticmethod
        def start_color(c):
            # Raises KeyError for any color other than the three listed.
            return {
                'red': '\033[41;37;1m',
                'green': '\033[42;37;1m',
                'blue': '\033[44;37;1m',
            }[c]

        @staticmethod
        def end_color():
            return '\033[m'

        @staticmethod
        def escape(s):
            return s

        @staticmethod
        def newline():
            return '\n'

    class HTML:
        """Formatter that emits HTML <span> markup."""

        @staticmethod
        def start_color(c):
            return '<span style="background:%s">' % c

        @staticmethod
        def end_color():
            return '</span>'

        @staticmethod
        def escape(s):
            # cgi.escape() was removed in Python 3.8.  html.escape() with
            # quote=False escapes exactly the same characters (&, <, >);
            # fall back to cgi on interpreters that lack the html module.
            try:
                from html import escape as _escape
            except ImportError:
                from cgi import escape as _escape
                return _escape(s)
            return _escape(s, quote=False)

        @staticmethod
        def newline():
            return '<br/>\n'

    @staticmethod
    def Auto(argv = [], out = sys.stdout):
        """Pick a formatter class based on flags found in `argv`.

        Recognized flags are removed from `argv` in place (first occurrence
        only).  When several flags are present, the one checked last wins;
        the check order is --format, --format=ansi, --format=html,
        --no-format.  Without any flag the ANSI formatter is returned.
        `out` is accepted but not consulted.
        """
        # The shared default list is never mutated: remove() is only called
        # for flags that are present, and the default is empty.
        chosen = ColorFormatter.ANSI
        for flag, formatter in (
            ("--format", ColorFormatter.ANSI),
            ("--format=ansi", ColorFormatter.ANSI),
            ("--format=html", ColorFormatter.HTML),
            ("--no-format", ColorFormatter.Null),
        ):
            if flag in argv:
                argv.remove(flag)
                chosen = formatter
        return chosen
class DiffColorizer:
    """Adds intra-line highlighting to diff-style input.

    Lines are recognized by their first character, which must be one of
    `symbols`; the text of the first two symbols' lines is compared token by
    token and the differing tokens are wrapped in the formatter's color
    markup.
    """

    # Splits a line into runs of word characters, each followed by at most
    # one non-word character.  The character class previously read
    # "a-za-z", which left uppercase letters out of the word class and
    # split identifiers at every capital letter.
    diff_regex = re.compile(r'([a-zA-Z0-9_]*)([^a-zA-Z0-9_]?)')

    def __init__(self, formatter, colors=diff_colors, symbols=diff_symbols):
        """Store the formatter plus the parallel `colors`/`symbols` tables."""
        self.formatter = formatter
        self.colors = colors
        self.symbols = symbols

    def colorize_lines(self, lines):
        """Return the highlighted form of a pair of lines.

        `lines` holds two entries (either may be None/empty).  The result is
        a list with one output line per non-empty side, each prefixed with
        its diff symbol and terminated by the formatter's newline.
        """
        lines = (l if l else '' for l in lines)
        # One token per line so difflib can compare token sequences.
        ss = [self.diff_regex.sub(r'\1\n\2\n', l).splitlines(True) for l in lines]
        oo = ["", ""]          # accumulated output per side
        st = [False, False]    # whether a color span is currently open per side
        for l in difflib.Differ().compare(*ss):
            if l[0] == '?':
                # Differ's hint lines carry no content.
                continue
            if l[0] == ' ':
                # Common token: close any open span, then emit on both sides.
                for i in range(2):
                    if st[i]:
                        oo[i] += self.formatter.end_color()
                        st[i] = False
                oo = [o + self.formatter.escape(l[2:]) for o in oo]
                continue
            if l[0] in self.symbols:
                # Token unique to one side: open a span there if needed.
                i = self.symbols.index(l[0])
                if not st[i]:
                    oo[i] += self.formatter.start_color(self.colors[i])
                    st[i] = True
                oo[i] += self.formatter.escape(l[2:])
                continue

        for i in range(2):
            if st[i]:
                oo[i] += self.formatter.end_color()
                st[i] = False

        # Drop the newlines that were inserted to separate tokens.
        oo = [o.replace('\n', '') for o in oo]
        return [s1 + s2 + self.formatter.newline()
                for (s1, s2) in zip(self.symbols, oo) if s2]

    def colorize_diff(self, f):
        """Yield output lines for every line of the diff iterable `f`.

        Lines that do not start with a diff symbol are escaped and passed
        through.  Symbol lines are buffered per side and flushed through
        colorize_lines() once both sides are filled, when a side would be
        overwritten, or at end of input.
        """
        lines = [None, None]
        for l in f:
            if l[0] not in self.symbols:
                yield self.formatter.escape(l).replace('\n', self.formatter.newline())
                continue
            i = self.symbols.index(l[0])
            if lines[i]:
                # Flush: this side already holds an unpaired line.
                for line in self.colorize_lines(lines):
                    yield line
                lines = [None, None]
            lines[i] = l[1:]
            if all(lines):
                # Flush: both sides are present.
                for line in self.colorize_lines(lines):
                    yield line
                lines = [None, None]
        if any(lines):
            # Flush whatever is left at end of input.
            for line in self.colorize_lines(lines):
                yield line
class ZipDiffer:
    """Line-by-line ("zipped") comparison of several line iterables."""

    @staticmethod
    def diff_files(files, symbols=diff_symbols):
        """Write a positional diff of `files` to standard output.

        The inputs are read in lockstep.  When the current line is equal in
        all inputs it is written once, prefixed with a space; otherwise each
        non-empty line is written prefixed with the symbol at its input's
        index.  Inputs that run out early contribute None, which is skipped.

        An IOError other than EPIPE is reported on stderr and terminates the
        process with status 1; EPIPE is ignored silently.
        """
        # izip_longest only exists on Python 2; Python 3 calls it zip_longest.
        try:
            from itertools import izip_longest as zip_longest
        except ImportError:
            from itertools import zip_longest

        files = tuple(files)  # in case it's a generator, copy it
        try:
            for lines in zip_longest(*files):
                if all(lines[0] == line for line in lines[1:]):
                    sys.stdout.writelines([" ", lines[0]])
                    continue

                for i, l in enumerate(lines):
                    if l:
                        sys.stdout.writelines([symbols[i], l])
        except IOError as e:
            if e.errno != errno.EPIPE:
                print("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
                sys.exit(1)
class DiffFilters:
    """Generators that filter a diff stream."""

    @staticmethod
    def filter_failures(f):
        """Yield only the lines that belong to test cases that did not pass.

        Test cases are grouped with DiffHelpers.separate_test_cases() and
        judged with DiffHelpers.test_passed().
        """
        for _key, group in DiffHelpers.separate_test_cases(f):
            case_lines = list(group)
            if DiffHelpers.test_passed(case_lines):
                continue
            for entry in case_lines:
                yield entry
class Stat:
    """Running tally: number of tests added and the sum of their `freq`."""

    def __init__(self):
        self.count, self.freq = 0, 0

    def add(self, test):
        """Account for one more test; `test` must expose a `freq` attribute."""
        self.count = self.count + 1
        self.freq = self.freq + test.freq
class Stats:
    """Pass/fail tallies for a set of tests, with basic statistics."""

    def __init__(self):
        self.passed = Stat()
        self.failed = Stat()
        self.total = Stat()

    def add(self, test):
        """Record `test` in the total and in the passed or failed tally."""
        self.total.add(test)
        bucket = self.passed if test.passed else self.failed
        bucket.add(test)

    def mean(self):
        """Fraction of recorded tests that passed."""
        return float(self.passed.count) / self.total.count

    def variance(self):
        """Pass fraction multiplied by fail fraction."""
        pass_ratio = float(self.passed.count) / self.total.count
        fail_ratio = float(self.failed.count) / self.total.count
        return pass_ratio * fail_ratio

    def stddev(self):
        """Square root of variance()."""
        return self.variance() ** .5

    def zscore(self, population):
        """Calculate the standard score.

        Population is the Stats for population.
        Self is Stats for sample.
        Returns larger absolute value if sample is highly unlikely to be random.
        Anything outside of -3..+3 is very unlikely to be random.

        See: http://en.wikipedia.org/wiki/Standard_score"""
        deviation = self.mean() - population.mean()
        return deviation / population.stddev()
class DiffSinks:
    """Consumers that read a whole diff stream and print a summary."""

    @staticmethod
    def print_stat(f):
        """Print how many test cases in `f` passed and failed.

        Empty input is reported as zero tests with a 0% failure rate
        instead of raising ZeroDivisionError.
        """
        passed = 0
        failed = 0
        # XXX port to Stats, but that would really slow us down here
        for key, lines in DiffHelpers.separate_test_cases(f):
            if DiffHelpers.test_passed(lines):
                passed += 1
            else:
                failed += 1
        total = passed + failed
        percent = 100. * failed / total if total else 0.
        print("%d out of %d tests passed. %d failed (%g%%)" % (passed, total, failed, percent))

    @staticmethod
    def print_ngrams(f, ns=(1,2,3)):
        """Print a z-score line for every frequently failing code point n-gram.

        For each n in `ns`, all n-grams of each test's `unicodes` are
        tallied; n-grams seen in at least 30 failed tests are printed with
        their z-score relative to the overall pass rate.
        """
        gens = tuple(Ngram.generator(n) for n in ns)
        allstats = Stats()
        allgrams = {}
        for key, lines in DiffHelpers.separate_test_cases(f):
            test = Test(lines)
            allstats.add(test)

            # A test case without a <...> line leaves unicodes as None.
            unicodes = test.unicodes or ()
            for gen in gens:
                for ngram in gen(unicodes):
                    if ngram not in allgrams:
                        allgrams[ngram] = Stats()
                    allgrams[ngram].add(test)

        # dict.items() works on both Python 2 and 3 (iteritems() is 2-only).
        importantgrams = {}
        for ngram, stats in allgrams.items():
            if stats.failed.count >= 30:  # for statistical reasons
                importantgrams[ngram] = stats
        allgrams = importantgrams
        del importantgrams

        for ngram, stats in allgrams.items():
            print("zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore(allstats), stats.failed.count, stats.passed.count, ','.join("U+%04X" % u for u in ngram)))
class Test:
    """One test case parsed from its diff lines.

    Each line starts with a one-character diff symbol, optionally followed
    by "identifier: ", and then a payload enclosed in brackets:
    (...) is stored as `text`, <...> is parsed into `unicodes`, and [...]
    is stored as `glyphs`.  The test counts as passed only if every line's
    symbol is a space.
    """

    def __init__(self, lines):
        self.freq = 1
        self.passed = True
        self.identifier = None
        self.text = None
        self.unicodes = None
        self.glyphs = None
        for l in lines:
            if not l:
                # Nothing to inspect; avoids IndexError on l[0].
                continue
            symbol = l[0]
            if symbol != ' ':
                self.passed = False

            i = 1
            if ':' in l:
                i = l.index(':')
                if not self.identifier:
                    self.identifier = l[1:i]
                i = i + 2  # Skip colon and space
            j = -1
            if l[j] == '\n':
                j -= 1
            if i >= len(l) + j:
                # Too short to hold an opening and a closing bracket.
                continue
            brackets = l[i] + l[j]
            # Slice up to the closing bracket at index j.  The previous
            # fixed "-2" cut one character too many when the line had no
            # trailing newline.
            l = l[i + 1:j]
            if brackets == '()':
                self.text = l
            elif brackets == '<>':
                self.unicodes = Unicode.parse(l)
            elif brackets == '[]':
                # XXX we don't handle failed tests here
                self.glyphs = l
class DiffHelpers:
    """Helpers for grouping diff lines into test cases and judging them."""

    @staticmethod
    def separate_test_cases(f):
        '''Group consecutive lines of f by identifier.

        A line has an identifier when a colon appears after its first
        character; the identifier is the text between the first character
        and the first colon.  Lines without one are keyed by the whole line.
        Returns the itertools.groupby iterator of (key, lines) pairs.'''

        def case_key(line):
            if ':' not in line[1:]:
                return line
            return line[1:line.index(':')]

        return groupby(f, key=case_key)

    @staticmethod
    def test_passed(lines):
        """Return True when the test case made of `lines` counts as passed."""
        lines = list(lines)
        # XXX This is a hack, but does the job for now.
        # A '+' line mentioning any of these strings makes the case pass
        # regardless of the other lines.
        forgiving_markers = (
            "space+0|space+0",
            "uni25CC",
            "dottedcircle",
            "glyph0",
            "gid0",
            "notdef",
        )
        for marker in forgiving_markers:
            if any(marker in l for l in lines if l[0] == '+'):
                return True
        return all(l[0] == ' ' for l in lines)
class FilterHelpers:
    """Adapters that turn a line-filtering generator into a printing function."""

    @staticmethod
    def filter_printer_function(filter_callback):
        """Return a function that print()s every item `filter_callback` yields."""
        def printer(f):
            for item in filter_callback(f):
                print(item)
        return printer

    @staticmethod
    def filter_printer_function_no_newline(filter_callback):
        """Like filter_printer_function, but writes items to stdout verbatim."""
        def printer(f):
            emit = sys.stdout.writelines
            for item in filter_callback(f):
                emit([item])
        return printer
class Ngram:
    """Factory for sliding-window n-gram generators."""

    @staticmethod
    def generator(n):
        """Return a generator function producing all n-grams of an iterable.

        The returned function yields each run of `n` consecutive items as a
        tuple and carries the window size in its `n` attribute.
        """
        def gen(f):
            window = []
            for item in f:
                window.append(item)
                if len(window) != n:
                    continue
                yield tuple(window)
                del window[0]  # slide the window forward by one item
        gen.n = n
        return gen
class UtilMains:
    """Command-line drivers that feed sys.argv (or stdin) to a callback."""

    @staticmethod
    def _handle_io_error(e):
        # Shared IOError policy: a broken pipe is ignored silently, anything
        # else is reported on stderr and ends the process with status 1.
        if e.errno == errno.EPIPE:
            return
        print("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
        sys.exit(1)

    @staticmethod
    def process_multiple_files(callback, mnemonic = "FILE"):
        """Call `callback` with an open file for every command-line argument.

        Without arguments, '-' (standard input) is used.  "--help" prints a
        usage line and exits with status 1.
        """
        if "--help" in sys.argv:
            print("Usage: %s %s..." % (sys.argv[0], mnemonic))
            sys.exit(1)

        try:
            names = sys.argv[1:] or ['-']
            for name in names:
                callback(FileHelpers.open_file_or_stdin(name))
        except IOError as e:
            UtilMains._handle_io_error(e)

    @staticmethod
    def process_multiple_args(callback, mnemonic):
        """Call `callback` once per command-line argument.

        With no arguments, or with "--help", a usage line is printed and the
        process exits with status 1.
        """
        wants_usage = len(sys.argv) == 1 or "--help" in sys.argv
        if wants_usage:
            print("Usage: %s %s..." % (sys.argv[0], mnemonic))
            sys.exit(1)

        try:
            for arg in sys.argv[1:]:
                callback(arg)
        except IOError as e:
            UtilMains._handle_io_error(e)

    @staticmethod
    def filter_multiple_strings_or_stdin(callback, mnemonic, \
                                         separator = " ", \
                                         concat_separator = False):
        """Print `callback` applied to each argument, or to each stdin line.

        With arguments, the results are joined with `separator` and printed
        on one line; if `concat_separator` is not False the arguments are
        first joined into a single string with it.  Without arguments, each
        line of standard input (minus its trailing newline) is converted and
        printed on its own line.
        """
        if "--help" in sys.argv:
            print("Usage:\n %s %s...\nor:\n %s\n\nWhen called with no arguments, input is read from standard input." \
                  % (sys.argv[0], mnemonic, sys.argv[0]))
            sys.exit(1)

        try:
            if len(sys.argv) != 1:
                args = sys.argv[1:]
                if concat_separator != False:
                    args = [concat_separator.join(args)]
                print(separator.join(callback(x) for x in args))
            else:
                while True:
                    line = sys.stdin.readline()
                    if not len(line):
                        break
                    if line[-1] == '\n':
                        line = line[:-1]
                    print(callback(line))
        except IOError as e:
            UtilMains._handle_io_error(e)
class Unicode:
    """Conversions between text, code point lists and readable names."""

    @staticmethod
    def decode(s):
        """Return the code points of `s` as a "U+XXXX,U+XXXX" string.

        Non-unicode input is decoded as UTF-8 first.
        """
        return u','.join("U+%04X" % cp for cp in codepoints(tounicode(s, 'utf-8')))

    @staticmethod
    def parse(s):
        """Parse a string of hexadecimal code points into a list of ints.

        "0x"/"0X" prefixes and the punctuation/letters commonly wrapped
        around code points (U+, <>, {}, commas, ...) are treated as
        separators.
        """
        s = re.sub(r"0[xX]", " ", s)
        s = re.sub(r"[<+>{},;&#\\xXuUnNiI\n\t]", " ", s)
        return [int(x, 16) for x in s.split()]

    @staticmethod
    def encode(s):
        """Turn a code point description (see parse) into the text itself.

        On Python 2 the result is UTF-8 encoded bytes; on Python 3 a str.
        """
        s = u''.join(unichr(x) for x in Unicode.parse(s))
        if sys.version_info[0] == 2:
            s = s.encode('utf-8')
        return s

    # Abbreviations substituted for the full character name.
    shorthands = {
        "ZERO WIDTH NON-JOINER": "ZWNJ",
        "ZERO WIDTH JOINER": "ZWJ",
        "NARROW NO-BREAK SPACE": "NNBSP",
        "COMBINING GRAPHEME JOINER": "CGJ",
        "LEFT-TO-RIGHT MARK": "LRM",
        "RIGHT-TO-LEFT MARK": "RLM",
        "LEFT-TO-RIGHT EMBEDDING": "LRE",
        "RIGHT-TO-LEFT EMBEDDING": "RLE",
        "POP DIRECTIONAL FORMATTING": "PDF",
        "LEFT-TO-RIGHT OVERRIDE": "LRO",
        "RIGHT-TO-LEFT OVERRIDE": "RLO",
    }

    @staticmethod
    def pretty_name(u):
        """Return a short readable name for the single character `u`.

        Characters without a Unicode name yield "XXX".
        """
        try:
            s = unicodedata.name(u)
        except ValueError:
            return "XXX"
        s = re.sub(".* LETTER ", "", s)
        s = re.sub(".* VOWEL SIGN (.*)", r"\1-MATRA", s)
        s = re.sub(".* SIGN ", "", s)
        s = re.sub(".* COMBINING ", "", s)
        # Names such as "DEVANAGARI SIGN VIRAMA" have already been reduced
        # to a bare "VIRAMA" by the SIGN rule above, so the prefix before
        # VIRAMA must be optional for this branch to be reachable for them.
        if re.match("(.* )?VIRAMA", s):
            s = "HALANT"
        if s in Unicode.shorthands:
            s = Unicode.shorthands[s]
        return s

    @staticmethod
    def pretty_names(s):
        """Return the pretty names of the code points in `s`, joined by " + ".

        On Python 2 the result is UTF-8 encoded bytes; on Python 3 a str,
        matching what encode() does.
        """
        s = re.sub(r"[<+>\\uU]", " ", s)
        s = re.sub(r"0[xX]", " ", s)
        chars = [unichr(int(x, 16)) for x in re.split('[, \n]', s) if len(x)]
        names = u' + '.join(Unicode.pretty_name(x) for x in chars)
        # Only Python 2 needs an explicit encode; doing it unconditionally
        # handed bytes to callers on Python 3.
        if sys.version_info[0] == 2:
            names = names.encode('utf-8')
        return names
class FileHelpers:
    """Small helpers for opening input files."""

    @staticmethod
    def open_file_or_stdin(f):
        """Return sys.stdin for '-', otherwise `f` opened for reading.

        The caller owns the returned file object.
        """
        if f == '-':
            return sys.stdin
        # open() exists on both Python 2 and 3; the file() builtin is
        # Python 2 only.
        return open(f)
class Manifest:
    """Reading and generating per-directory MANIFEST listings."""

    @staticmethod
    def read(s, strict = True):
        """Yield the normalized paths of all files reachable from `s`.

        A plain file yields itself.  A directory is expanded through its
        MANIFEST file, one entry per line, recursively.  When `s` or a
        directory's MANIFEST is missing, strict mode prints a message to
        stderr and exits with status 1; otherwise nothing is yielded.
        """
        if not os.path.exists(s):
            if strict:
                print("%s: %s does not exist" % (sys.argv[0], s), file=sys.stderr)
                sys.exit(1)
            return

        s = os.path.normpath(s)

        if not os.path.isdir(s):
            yield s
            return

        manifest = os.path.join(s, "MANIFEST")
        try:
            # open() instead of the Python 2 only file(); the handle is
            # closed as soon as the entries have been read.
            with open(manifest) as m:
                items = [x.strip() for x in m.readlines()]
        except IOError:
            if strict:
                print("%s: %s does not exist" % (sys.argv[0], manifest), file=sys.stderr)
                sys.exit(1)
            return

        for f in items:
            if not f:
                # A blank line would resolve to the directory itself and
                # recurse without end.
                continue
            # NOTE(review): `strict` is not forwarded to nested entries, as
            # before -- confirm whether that is intended.
            for p in Manifest.read(os.path.join(s, f)):
                yield p

    @staticmethod
    def update_recursive(s):
        """(Re)generate a MANIFEST file in `s` and every directory below it.

        Each MANIFEST lists the directory's files, then its subdirectories,
        both sorted, excluding a fixed set of bookkeeping names.
        """
        excluded = ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]

        # os.walk() already descends into whatever is left in `dirnames`
        # (top-down mode honors in-place edits), so no explicit recursion is
        # needed; the former extra recursive call regenerated every subtree
        # repeatedly.
        for dirpath, dirnames, filenames in os.walk(s, followlinks=True):

            for f in excluded:
                if f in dirnames:
                    dirnames.remove(f)
                if f in filenames:
                    filenames.remove(f)
            dirnames.sort()
            filenames.sort()
            ms = os.path.join(dirpath, "MANIFEST")
            print(" GEN %s" % ms)
            with open(ms, "w") as m:
                for f in filenames:
                    print(f, file=m)
                for f in dirnames:
                    print(f, file=m)
# This module only provides helpers; running it directly does nothing.
if __name__ == '__main__':
    pass