Edit

kc3-lang/libxml2/tools/genHtmlEnt.py

Branch :

  • Show log

    Commit

  • Author : Nick Wellnhofer
    Date : 2024-09-03 15:52:44
    Hash : 59511792
    Message : html: Parse named character references according to HTML5

  • tools/genHtmlEnt.py
  • #!/usr/bin/env python3
    
    import json
    import sys
    from dataclasses import dataclass
    
    # The basic idea is to find named character references using binary
    # search. Since entity strings may not have a terminator, this doesn't
    # work if one entity string is a prefix of another. In this case,
    # we branch to a subtable after matching the prefix.
    #
    # We create separate initial tables based on the first character
    # of the entity name.
    #
    # The following tables are generated:
    #
    # htmlEntAlpha:   start and end of initial tables, indexing into
    #                 htmlEntValues
    # htmlEntValues:  concatenation of all table values, which index into
    #                 htmlEntStrings
    # htmlEntStrings: variable sized records containing entity name,
    #                 replacement and optionally the position of a
    #                 subtable
    
    try:
        with open('entities.json') as json_data:
            ents = json.load(json_data)
    except FileNotFoundError:
        print('entities.json not found, try curl -LJO',
              'https://html.spec.whatwg.org/entities.json')
        sys.exit(1)
    
    def to_cchars(s):
        r = []
    
        for c in s.encode():
            if c >= 0x20 and c <= 0x7E and c != ord("'") and c != ord('\\'):
                v = f"'{chr(c)}'"
            else:
                v = c
            r += [ v ]
    
        return r
    
    @dataclass
    class PrefixStackEntry:
        prefix: str
        table_id: int
    
    @dataclass
    class AlphaFixup:
        table_id: int
        char: int
    
    @dataclass
    class StringFixup:
        table_id: int
        string_index: int
        super_table_id: int
        super_offset: int
    
    # Remove entity strings without trailing semicolon
    keys = (key for key in ents.keys() if key.endswith(';'))
    
    # Sort entity strings
    keys = sorted(keys, key=lambda k: k[1:-1])
    
    strings = []
    tables = []
    prefix_stack = []
    alpha_fixups = []
    string_fixups = []
    for i in range(64):
        tables.append([])
    
    for i, key in enumerate(keys):
        name = key[1:-1]
    
        next_name = None
        if i + 1 < len(keys):
            next_name = keys[i+1][1:-1]
    
        while prefix_stack and not name.startswith(prefix_stack[-1].prefix):
            prefix_stack.pop()
    
        # First character is initial prefix
        if not prefix_stack:
            table_id = len(tables)
            tables.append([])
    
            prefix_stack.append(PrefixStackEntry(name[0], table_id))
            alpha_fixups.append(AlphaFixup(table_id, ord(name[0]) % 64))
    
        string_index = len(strings)
        table = tables[prefix_stack[-1].table_id]
        table_index = len(table)
        table.append(string_index)
    
        name_offset = len(prefix_stack[-1].prefix)
        name_chars = to_cchars(name[name_offset:])
        repl_chars = to_cchars(ents[key]['characters'])
        semicolon_flag = 0
        if key[:-1] in ents:
            semicolon_flag = 0x80
    
        if next_name and next_name.startswith(name):
            # Create subtable
    
            strings += [
                len(name_chars) | semicolon_flag | 0x40, *name_chars,
                0, 0, # subtable position, to be fixed up
                len(repl_chars), *repl_chars,
            ]
    
            table_id = len(tables)
            tables.append([])
    
            fixup_index = string_index + 1 + len(name_chars)
            string_fixups.append(StringFixup(
                table_id, fixup_index, prefix_stack[-1].table_id, table_index,
            ))
    
            prefix_stack.append(PrefixStackEntry(name, table_id))
        else:
            strings += [
                len(name_chars) | semicolon_flag, *name_chars,
                len(repl_chars), *repl_chars,
            ]
    
    # Concat tables and record ranges
    ranges = [ 0 ]
    values = []
    for table in tables:
        values += table
        ranges.append(len(values))
    
    # Create alpha table
    alpha = [ 0 ] * (59 * 3)
    for fixup in alpha_fixups:
        table_id, c = fixup.table_id, fixup.char
        start = ranges[table_id]
        end = ranges[table_id+1]
        alpha[c*3:c*3+3] = [ start & 0xFF, start >> 8, end - start ]
    
    # Fix up subtable positions
    for fixup in string_fixups:
        table_id, i = fixup.table_id, fixup.string_index
        start = ranges[table_id]
        end = ranges[table_id+1]
        super_index = ranges[fixup.super_table_id] + fixup.super_offset
        strings[i:i+2] = [ start - super_index, end - start ]
    
    # Print tables
    
    def gen_table(ctype, cname, values, fmt, elems_per_line):
        count = len(values)
        r = ''
    
        for i in range(count):
            if i != 0: r += ','
            if i % elems_per_line == 0: r += '\n    '
            else: r += ' '
            r += fmt % values[i]
    
        return f'static const {ctype} {cname}[{count}] = {{{r}\n}};\n'
    
    print(gen_table('unsigned char', 'htmlEntAlpha', alpha, '%3d', 15))
    print(gen_table('unsigned short', 'htmlEntValues', values, '%5d', 10))
    print(gen_table('unsigned char', 'htmlEntStrings', strings, '%3s', 15))