# (File-viewer header, not part of the program: plain text | 89 lines | 2.67 KB)

import json

import html5lib

def parse(path="html5ents.xml"):
    """Parse the entity definition document with html5lib.

    path -- file to read; defaults to "html5ents.xml", resolved against
            the current working directory.

    Returns the result of html5lib.parse() using the "lxml" treebuilder.
    Raises whatever open() raises if the file cannot be opened.
    """
    # Use a context manager so the handle is closed deterministically
    # rather than whenever the garbage collector gets to it.
    with open(path) as source:
        return html5lib.parse(source, treebuilder="lxml")

def entity_table(tree):
    """Build a dict mapping entity names to their replacement characters.

    tree -- object exposing an lxml-style xpath() method.

    Every ``tr`` directly under a ``tbody`` in the XHTML namespace yields
    one entry: the concatenated text of its first child is the name, and
    the ``.text`` of its second child is the code point list.  If a name
    occurs more than once the last row wins.
    """
    xhtml_ns = {"h": "http://www.w3.org/1999/xhtml"}
    table = {}
    for row in tree.xpath("//h:tbody/h:tr", namespaces=xhtml_ns):
        # Compute the key before the value, as a tuple expression would.
        key = entity_name("".join(row[0].xpath(".//text()")))
        table[key] = entity_characters(row[1].text)
    return table

def entity_name(inp):
    """Return *inp* with leading and trailing whitespace removed."""
    trimmed = inp.strip()
    return trimmed

def entity_characters(inp):
    """Turn a whitespace-separated list of code points into a string.

    inp -- text containing zero or more code point tokens.

    Each token is converted with codepoint_to_character() and the results
    are concatenated in their original order.  Input with no tokens gives
    an empty string.
    """
    pieces = []
    for token in inp.split():
        if not token:
            continue
        pieces.append(codepoint_to_character(token))
    return "".join(pieces)

def codepoint_to_character(inp):
    """Return the character for a code point token.

    inp -- token whose first two characters are skipped and whose
           remainder is a hexadecimal code point (presumably the
           ``U+XXXXX`` notation used by the source document -- confirm
           against the data file).

    Returns a one-code-point text string (on narrow Python 2 builds an
    astral code point comes back as a surrogate pair).
    Raises ValueError (including its UnicodeDecodeError subclass) when the
    digits are not hexadecimal or lie outside the Unicode range.
    """
    code = int(inp[2:], 16)
    # Always emit a full eight-digit \U escape: prefixing a fixed "\U000"
    # only works for exactly five hex digits, and writing "\U000" as a
    # literal is a syntax error wherever string literals are unicode.
    escape = "\\U%08x" % code
    # Going through the unicode-escape codec (rather than chr/unichr)
    # behaves the same on Python 2 and 3, including narrow builds.
    return escape.encode("ascii").decode("unicode-escape")

def make_tests_json(entities):
    """Return the JSON-serialisable test document for *entities*.

    The result is a dict with a single "tests" key holding one make_test()
    record per entry produced by make_test_list(), in that order.
    """
    cases = []
    for entry in make_test_list(entities):
        cases.append(make_test(*entry))
    return {"tests": cases}

def make_test(name, characters, good):
    """Assemble a single test record.

    name       -- entity name, placed after "&" to form the input.
    characters -- text expected in the Character token.
    good       -- truthy when the entity is a valid named entity.

    Returns a dict with "description", "input" and "output" keys.
    """
    record = {}
    record["description"] = test_description(name, good)
    record["input"] = "&%s" % name
    record["output"] = test_expected(name, characters, good)
    return record

def test_description(name, good):
    with_semicolon = name.endswith(";")
    semicolon_text = {True:"with a semi-colon",
                      False:"without a semi-colon"}[with_semicolon]
    if good:
        text = "Named entity: %s %s"%(name, semicolon_text)
    else:
        text = "Bad named entity: %s %s"%(name, semicolon_text)
    return text

def test_expected(name, characters, good):
    rv = []
    if not good or not name.endswith(";"):
        rv.append("ParseError")
    rv.append(["Character", characters])
    return rv

def make_test_list(entities):
    """Return the sorted list of (name, characters, good) test tuples.

    entities -- mapping of entity name to replacement characters.

    Every entity contributes a good test.  An entity ending in ";" for
    which subentity_exists() is false also contributes a bad test: the
    name without the semicolon, expected to come through as literal "&"
    plus that name.
    """
    cases = []
    for name, characters in entities.items():
        if name.endswith(";") and not subentity_exists(name, entities):
            bare = name[:-1]
            cases.append((bare, "&" + bare, False))
        cases.append((name, characters, True))
    cases.sort()
    return cases

def subentity_exists(entity_name, entities):
    """Report whether a proper, non-empty prefix of the name is an entity.

    entity_name -- name whose prefixes are checked.
    entities    -- container supporting ``in`` on entity names.

    Prefixes are tried from longest to shortest; the full name and the
    empty string are never tested.
    """
    prefixes = (entity_name[:-cut] for cut in range(1, len(entity_name)))
    return any(prefix in entities for prefix in prefixes)

def make_entities_code(entities):
    """Render *entities* as Python source defining an ``entities`` dict.

    entities -- mapping of entity name to replacement text.

    Returns source text with one ``"name": u"value",`` line per entity,
    sorted by name.  Values are written with the unicode-escape codec so
    the output is pure ASCII, and double quotes are backslash-escaped
    because that codec leaves them untouched.
    """
    lines = []
    for name in sorted(entities):
        # encode() yields bytes; decode back to text before replace() and
        # formatting so bytes and text are never mixed (a TypeError on
        # Python 3).  The escaped form is ASCII-only by construction.
        escaped = entities[name].encode("unicode-escape").decode("ascii")
        escaped = escaped.replace("\"", "\\\"")
        lines.append("    \"%s\": u\"%s\"," % (name, escaped))
    return """entities = {
%s
}""" % "\n".join(lines)

def main():
    """Generate the entity test file and the entity constants module.

    Reads the default entity document via parse(), then writes
    "namedEntities.test" (JSON, indent 4) and "entities_constants.py"
    into the current working directory.
    """
    entities = entity_table(parse())
    tests_json = make_tests_json(entities)
    # Context managers guarantee both outputs are flushed and closed even
    # if a later step raises.
    with open("namedEntities.test", "w") as test_file:
        json.dump(tests_json, test_file, indent=4)
    code = make_entities_code(entities)
    with open("entities_constants.py", "w") as code_file:
        code_file.write(code)

# Call main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()