Edit

kc3-lang/md4c/scripts/build_folding_map.py

Branch :

  • Show log

    Commit

  • Author : Martin Mitas
    Date : 2020-05-20 16:44:07
    Hash : 72dad97e
    Message : scripts/build_folding_map.py: Handle properly "ranges" of length 2. Update the data structures in md_get_unicode_fold_info() to reflect the update in the script and handle the previously omitted characters. Fixes #113.

  • scripts/build_folding_map.py
  • #!/usr/bin/env python3
    
    import os
    import sys
    import textwrap
    
    
    # Directory containing this script; the Unicode data file is looked up
    # relative to it.
    self_path = os.path.dirname(os.path.realpath(__file__))

    # Folding statuses (2nd field of each data line) which are accepted; lines
    # with any other status are skipped.
    status_list = [ "C", "F" ]

    # folding_list[n-1] maps codepoint -> list of mapped codepoints, for all the
    # foldings whose mapping consists of exactly n codepoints (n = 1, 2, 3).
    folding_list = [ dict(), dict(), dict() ]

    # Filter the foldings for "full" folding.
    #
    # The file is read inside a `with` block so that it gets closed even if
    # a malformed line raises. The encoding is given explicitly so the result does
    # not depend on the locale (Unicode data files are distributed as UTF-8).
    with open(os.path.join(self_path, "unicode", "CaseFolding.txt"), "r",
              encoding="utf-8") as f:
        for line in f:
            # Strip the trailing comment (if any) and skip empty lines.
            comment_off = line.find("#")
            if comment_off >= 0:
                line = line[:comment_off]
            line = line.strip()
            if not line:
                continue

            # Data line format: "<codepoint>; <status>; <mapping>; <rest>"
            raw_codepoint, status, raw_mapping, ignored_tail = line.split(";", 3)
            if status.strip() not in status_list:
                continue
            codepoint = int(raw_codepoint.strip(), 16)
            mapping = [int(it, 16) for it in raw_mapping.strip().split(" ")]
            mapping_len = len(mapping)

            # Only mappings of 1 to 3 codepoints have a table. Raise explicitly
            # (instead of using assert, which is stripped under `python -O`) so
            # an unexpected mapping can never be silently dropped.
            if not 1 <= mapping_len <= 3:
                raise ValueError(
                    "Unsupported folding of {} codepoints for U+{:04X}".format(
                        mapping_len, codepoint))
            folding_list[mapping_len-1][codepoint] = mapping
    
    
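    # Assuming the codepoints at positions index0 ... index-1 already form
    # a range, check whether the codepoint at position index is compatible with
    # that range too (i.e. whether the range can be extended to include it).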
    #
    # We are able to handle ranges which:
    #
    # (1) either form consecutive sequence of codepoints and which map that range
    #     to other consecutive range of codepoints;
    #
    # (2) or form a sequence of codepoints with step 2 where each codepoint
    #     CP is mapped to the next codepoint CP+1
    #     (e.g. 0x1234 -> 0x1235; 0x1236 -> 0x1237; ...).
    #
    # (If the mappings have multiple codepoints, only the 1st mapped codepoint is
    # considered and all the other ones have to be the same for the whole range.)
    def is_range_compatible(folding, codepoint_list, index0, index):
        """Tell whether the entry at `index` fits the range starting at `index0`.

        `folding` maps a codepoint to its mapping (a list of codepoints) and
        `codepoint_list` is the sequence of codepoints being scanned. The range
        kind is derived from the first two entries (`index0` and `index0 + 1`),
        and the entry at `index` is then checked against it:

          * stride 1: codepoints are consecutive and the first mapped codepoint
            grows by one together with the codepoint;
          * stride 2: every other codepoint, each one mapped to codepoint + 1.

        In both cases the mapped codepoints after the first one must be the same
        as in the mapping of the first entry. Returns True or False.
        """
        span = index - index0
        first_cp = codepoint_list[index0]
        second_cp = codepoint_list[index0 + 1]
        last_cp = codepoint_list[index]
        first_map = folding[first_cp]
        second_map = folding[second_cp]
        last_map = folding[last_cp]

        def same_tail(mapping):
            # Everything after the 1st mapped codepoint has to match the 1st entry.
            return mapping[1:] == first_map[1:]

        stride = second_cp - first_cp

        # Consecutive codepoints mapped to consecutive codepoints.
        if stride == 1 and last_cp - first_cp == span:
            if (second_map[0] - first_map[0] == 1 and same_tail(second_map)
                    and last_map[0] - first_map[0] == span and same_tail(last_map)):
                return True

        # Every other codepoint, each mapped to its immediate successor.
        if stride == 2 and last_cp - first_cp == 2 * span:
            return (first_map[0] - first_cp == 1
                    and second_map[0] - second_cp == 1 and same_tail(second_map)
                    and last_map[0] - last_cp == 1 and same_tail(last_map))

        return False
    
    
    def mapping_str(list, mapping):
        """Format `mapping` as comma-separated hexadecimal literals.

        Every value is written as "0x" followed by at least four lowercase hex
        digits, e.g. [0x69, 0x307] gives "0x0069,0x0307". The first parameter is
        not used by the function.
        """
        hex_literals = ["0x" + format(value, "04x") for value in mapping]
        return ",".join(hex_literals)
    
    def _write_c_array(name, items):
        """Write `items` to stdout as the C array `static const unsigned <name>[]`.

        The items are joined with ", " and wrapped to lines of at most 110
        characters, each indented by four spaces.
        """
        sys.stdout.write("static const unsigned {}[] = {{\n".format(name))
        sys.stdout.write("\n".join(textwrap.wrap(", ".join(items), 110,
                            initial_indent="    ", subsequent_indent="    ")))
        sys.stdout.write("\n};\n")


    # For every mapping length (1, 2 and 3 codepoints) emit two C arrays:
    #   FOLD_MAP_<n>       ... the codepoints, as S(cp) and R(first,last) records
    #   FOLD_MAP_<n>_DATA  ... the corresponding mappings
    for mapping_len in range(1, 4):
        folding = folding_list[mapping_len-1]

        # The run detection below needs the codepoints in ascending order. Sort
        # explicitly rather than relying on the order of lines in the input file
        # and on the dict preserving the insertion order.
        codepoint_list = sorted(folding)

        index0 = 0
        count = len(codepoint_list)

        records = list()
        data_records = list()

        while index0 < count:
            # Extend the run [index0, index1) as long as the next codepoint is
            # compatible with it.
            index1 = index0 + 1
            while index1 < count and is_range_compatible(folding, codepoint_list, index0, index1):
                index1 += 1

            if index1 - index0 > 2:
                # Range of codepoints (at least 3 of them): one R() record plus
                # the mappings of the first and of the last codepoint of the range.
                records.append("R(0x{:04x},0x{:04x})".format(codepoint_list[index0], codepoint_list[index1-1]))
                data_records.append(mapping_str(data_records, folding[codepoint_list[index0]]))
                data_records.append(mapping_str(data_records, folding[codepoint_list[index1-1]]))
                index0 = index1
            else:
                # Single codepoint. A run of just 2 codepoints is not emitted as
                # a range: only its 1st codepoint is consumed here, so the 2nd
                # one may still become the start of another (longer) range.
                records.append("S(0x{:04x})".format(codepoint_list[index0]))
                data_records.append(mapping_str(data_records, folding[codepoint_list[index0]]))
                index0 += 1

        _write_c_array("FOLD_MAP_{}".format(mapping_len), records)
        _write_c_array("FOLD_MAP_{}_DATA".format(mapping_len), data_records)