#!/usr/bin/python -u
#
# Original script modified in November 2003 to take advantage of
# the character-validation range routines, and updated to the
# current Unicode information (Version 4.0.1)
#
# NOTE: there is an 'alias' facility for blocks which are not present in
#       the current release, but are needed for ABI compatibility.  This
#       must be accomplished MANUALLY!  Please see the comments below under
#       'blockAliases'
#
# The script reads the two files named in 'sources' from the current
# directory and writes include/libxml/xmlunicode.h and xmlunicode.c.
# It only uses constructs that are valid on both Python 2.6+ and Python 3.
#
import sys
import string  # no longer used; kept so the file's import list is unchanged
import time

webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," +
    "SupplementaryPrivateUseArea-B",
]

# A range table is only produced for a category that has MORE than
# minTableSize ranges; categories with minTableSize ranges or fewer get
# inline comparisons instead.
minTableSize = 8

(blockfile, catfile) = sources.split()

# Paths of the two generated files, relative to the current directory.
_HEADER_PATH = "include/libxml/xmlunicode.h"
_OUTPUT_PATH = "xmlunicode.c"

# Opening text of the generated header; formatted with
# (webpage, date, sources).
_HEADER_PROLOGUE = """/*
 * Summary: Unicode character APIs
 * Description: API for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Author: Daniel Veillard
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

"""

# Opening text of the generated C file, up to and including the opening
# brace of the block name table; formatted with (webpage, date, sources).
_OUTPUT_PROLOGUE = """/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */

typedef struct {
    const char *rangename;
    xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
    const xmlUnicodeRange *table;
    int numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static const xmlUnicodeRange xmlUnicodeBlocks[] = {
"""

# The two name-table descriptors and the binary-search lookup routine;
# formatted with (number of blocks, number of categories).
_LOOKUP_TEMPLATE = """static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
 * xmlUnicodeLookup:
 * @tptr: pointer to the name table
 * @tname: name to be found
 *
 * binary table lookup for user-supplied name
 *
 * Returns pointer to range function if found, otherwise NULL
 */
static xmlIntFunc
*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
    int low, high, mid, cmp;
    xmlUnicodeRange *sptr;

    if ((tptr == NULL) || (tname == NULL)) return(NULL);

    low = 0;
    high = tptr->numentries - 1;
    sptr = tptr->table;
    while (low <= high) {
        mid = (low + high) / 2;
        if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
            return (sptr[mid].func);
        if (cmp < 0)
            high = mid - 1;
        else
            low = mid + 1;
    }
    return (NULL);
}

"""

# By-name dispatcher over all generated block functions.
_IS_BLOCK_FUNCTION = """/**
 * xmlUCSIsBlock:
 * @code: UCS code point
 * @block: UCS block name
 *
 * Check whether the character is part of the UCS Block
 *
 * Returns 1 if true, 0 if false and -1 on unknown block
 */
int
xmlUCSIsBlock(int code, const char *block) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
    if (func == NULL)
        return (-1);
    return (func(code));
}

"""

# By-name dispatcher over all generated category functions, followed by
# the closing lines of the generated C file.
_IS_CAT_FUNCTION_AND_EPILOGUE = """/**
 * xmlUCSIsCat:
 * @code: UCS code point
 * @cat: UCS Category name
 *
 * Check whether the character is part of the UCS Category
 *
 * Returns 1 if true, 0 if false and -1 on unknown category
 */
int
xmlUCSIsCat(int code, const char *cat) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
    if (func == NULL)
        return (-1);
    return (func(code));
}

#define bottom_xmlunicode
#include "elfgcchack.h"
#endif /* LIBXML_UNICODE_ENABLED */
"""

# Closing lines of the generated header.
_HEADER_EPILOGUE = """
#ifdef __cplusplus
}
#endif

#endif /* LIBXML_UNICODE_ENABLED */

#endif /* __XML_UNICODE_H__ */
"""


def _open_or_exit(path, mode, message):
    """Open *path* with *mode*; on IOError print *message* and exit(1)."""
    try:
        return open(path, mode)
    except IOError:
        print(message)
        sys.exit(1)


def _read_blocks(path):
    """Parse the "blocks" file into a dictionary.

    Lines starting with '#' and blank lines are skipped.  Every other line
    is expected to look like "START..END; Block Name".  The result maps
    the block name with its spaces removed to a list of (start, end)
    tuples, where both bounds are the file's hex digits prefixed by "0x".
    Lines that do not have this shape are reported on stdout and skipped.
    """
    block_names = {}
    with _open_or_exit(path, "r", "Missing %s, aborting ..." % path) as blocks:
        for line in blocks:
            if line.startswith('#'):
                continue
            line = line.strip()
            if line == '':
                continue
            try:
                fields = line.split(';')
                (start, end) = fields[0].strip().split("..")
                name = fields[1].strip().replace(' ', '')
            except (IndexError, ValueError):
                # no ';' separator, or the range is not "START..END"
                print("Failed to process line: %s" % (line))
                continue
            block_names.setdefault(name, []).append(("0x" + start, "0x" + end))
    return block_names


def _apply_block_aliases(block_names, aliases):
    """Add the old block names described by *aliases* to *block_names*.

    Each alias is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"; the
    old name receives the ranges of every new name that is present.  New
    names missing from *block_names* are reported on stdout.
    """
    for entry in aliases:
        parts = entry.split(':')
        old_name = parts[0]
        for comp in parts[1].split(','):
            if comp in block_names:
                block_names.setdefault(old_name, []).extend(block_names[comp])
            else:
                print("Alias %s: %s not in Blocks" % (old_name, comp))


def _read_categories(path):
    """Parse the categories file, inverting it into per-category lists.

    The file is in code point order: field 0 of each ';'-separated line is
    the hex code point and field 2 the category name.  Each code point is
    recorded under the full category name and under the first character
    of that name (the "general category").

    Returns (number of characters parsed, {category: [code points]}).
    Lines with too few fields, a code point that is not valid hex, or an
    empty category are reported on stdout and skipped.
    """
    nbchar = 0
    categories = {}
    with _open_or_exit(path, "r", "Missing %s, aborting ..." % path) as data:
        for line in data:
            if line.startswith('#'):
                continue
            line = line.strip()
            if line == '':
                continue
            try:
                fields = line.split(';')
                value = int(fields[0].strip(), 16)
                name = fields[2]
            except (IndexError, ValueError):
                print("Failed to process line: %s" % (line))
                continue
            if name == '':
                # An empty name has no general category and would generate
                # a C function called xmlUCSIsCat, clashing with the
                # by-name dispatcher of the same name.
                print("Failed to process line: %s" % (line))
                continue
            nbchar += 1
            # update entry for "full name"
            categories.setdefault(name, []).append(value)
            # update "general category" name
            categories.setdefault(name[0], []).append(value)
    return (nbchar, categories)


def _collapse_ranges(values):
    """Reduce a list of code points to a list of inclusive (low, high) runs.

    A run is extended only while each value is exactly one more than the
    previous one, so *values* is expected in ascending order.
    """
    ranges = []
    start = None
    prev = None
    for val in values:
        if start is None:
            start = prev = val
        elif val == prev + 1:
            prev = val
        else:
            ranges.append((start, prev))
            start = prev = val
    if start is not None:
        ranges.append((start, prev))
    return ranges


def _write_name_tables(output, bkeys, ckeys):
    """Write the entries of the block name table and the category table.

    The opening line of the block table is part of the C file prologue, so
    only its entries and closing brace are written here.
    """
    entries = [' {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
               for block in bkeys]
    output.write(',\n'.join(entries))
    output.write('};\n\n')

    output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
    entries = [' {"%s", xmlUCSIsCat%s}' % (name, name) for name in ckeys]
    output.write(',\n'.join(entries))
    output.write('};\n\n')


def _write_range_tables(output, categories, ckeys):
    """Write a range table for every category above minTableSize ranges.

    Ranges whose upper bound is below 0x10000 go to an xmlChSRange array,
    all others to an xmlChLRange array; a xmlChRangeGroup then references
    both (or NULL for an array that was not needed).  The ranges must be
    in ascending order so that all short ranges precede the long ones.
    """
    for name in ckeys:
        ranges = categories[name]
        if len(ranges) <= minTableSize:
            continue
        numshort = 0
        numlong = 0
        sptr = "NULL"
        lptr = "NULL"
        pline = ""
        for (low, high) in ranges:
            if high < 0x10000:
                if numshort == 0:
                    pline = "static const xmlChSRange xml%sS[] = {" % name
                    sptr = "xml%sS" % name
                else:
                    pline += ", "
                numshort += 1
            else:
                if numlong == 0:
                    if numshort > 0:
                        # close the short array before starting the long one
                        output.write(pline + " };\n")
                    pline = "static const xmlChLRange xml%sL[] = {" % name
                    lptr = "xml%sL" % name
                else:
                    pline += ", "
                numlong += 1
            if len(pline) > 60:
                # wrap the generated initializer onto a new line
                output.write(pline + "\n")
                pline = " "
            pline += "{%s, %s}" % (hex(low), hex(high))
        output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
                     % (name, numshort, numlong, sptr, lptr))


def _write_block_functions(header, output, block_names, bkeys):
    """Write one xmlUCSIs<Block> function per block, then xmlUCSIsBlock.

    Prototypes go to *header*, definitions to *output*.  '-' is removed
    from the block name to form the C identifier.
    """
    for block in bkeys:
        name = block.replace('-', '')
        header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
        output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
        output.write(" *\n * Check whether the character is part of %s UCS Block\n"% (block))
        output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
        output.write("int\nxmlUCSIs%s(int code) {\n return(" % name)
        tests = ["((code >= %s) && (code <= %s))" % (start, end)
                 for (start, end) in block_names[block]]
        output.write(" ||\n ".join(tests))
        output.write(");\n}\n\n")

    header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
    output.write(_IS_BLOCK_FUNCTION)


def _write_category_functions(header, output, categories, ckeys):
    """Write one xmlUCSIsCat<Name> function per category, then xmlUCSIsCat.

    Categories with more than minTableSize ranges test membership through
    xmlCharInRange on their range group; the others use inline
    comparisons.  The closing lines of the C file are written as well.
    """
    for name in ckeys:
        ranges = categories[name]
        header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
        output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
        output.write(" *\n * Check whether the character is part of %s UCS Category\n"% (name))
        output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
        output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
        if len(ranges) > minTableSize:
            output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)" % name)
        else:
            tests = []
            for (begin, end) in ranges:
                if begin == end:
                    tests.append("(code == %s)" % (hex(begin)))
                else:
                    tests.append("((code >= %s) && (code <= %s))" % (
                        hex(begin), hex(end)))
            output.write(" return(" + " ||\n ".join(tests))
        output.write(");\n}\n\n")

    header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
    output.write(_IS_CAT_FUNCTION_AND_EPILOGUE)


def main():
    """Read the Unicode data files and generate xmlunicode.h / xmlunicode.c.

    Exits with status 1 (after printing a message) when an input file is
    missing or an output file cannot be opened.
    """
    #
    # Process the "blocks" file, then add the ABI-compatibility aliases
    #
    block_names = _read_blocks(blockfile)
    print("Parsed %d blocks descriptions" % (len(block_names)))
    _apply_block_aliases(block_names, blockAliases)

    #
    # Process the categories file and reduce each code point list to ranges
    #
    (nbchar, categories) = _read_categories(catfile)
    print("Parsed %d char generating %d categories" % (nbchar, len(categories)))
    for cat in categories:
        categories[cat] = _collapse_ranges(categories[cat])

    #
    # Assure all data is in alphabetic order, since the generated code
    # does binary searches on the tables.
    #
    bkeys = sorted(block_names)
    ckeys = sorted(categories)

    #
    # Generate the resulting files
    #
    header = _open_or_exit(_HEADER_PATH, "w",
                           "Failed to open include/libxml/xmlunicode.h")
    with header:
        output = _open_or_exit(_OUTPUT_PATH, "w", "Failed to open xmlunicode.c")
        with output:
            date = time.asctime(time.localtime(time.time()))
            header.write(_HEADER_PROLOGUE % (webpage, date, sources))
            output.write(_OUTPUT_PROLOGUE % (webpage, date, sources))
            _write_name_tables(output, bkeys, ckeys)
            _write_range_tables(output, categories, ckeys)
            output.write(_LOOKUP_TEMPLATE % (len(block_names), len(categories)))
            _write_block_functions(header, output, block_names, bkeys)
            _write_category_functions(header, output, categories, ckeys)
            header.write(_HEADER_EPILOGUE)


if __name__ == "__main__":
    main()