Hash :
4f6a9e54
        
        Author :
  
        
        Date :
2019-05-19T10:46:26
        
      
Update Unicode support to 12.1. * scripts/build_*_map.py: Implement helper Python scripts used to generate some Unicode search maps and data for helper Unicode functions used in MD4C. This should simplify updating to future Unicode versions. * md_get_unicode_fold_info: Use data generated by the scripts. * md_is_unicode_whitespace__: Ditto. * md_is_unicode_punct__: Ditto.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
#!/usr/bin/env python3
import os
import sys
import textwrap
# Directory containing this script; the Unicode data file is looked up
# relative to it, in the "unicode" subdirectory.
self_path = os.path.dirname(os.path.realpath(__file__))

# Unicode general categories whose codepoints go into the generated map.
category_list = ["Zs"]


def parse_codepoints(lines, categories):
    """Return the sorted codepoints whose category is in *categories*.

    *lines* is an iterable of text lines in the format read by this script:
    ``<codepoint-or-range> ; <category>``, where a range is written as
    ``XXXX..YYYY`` (hexadecimal, inclusive) and anything after ``#`` is a
    comment. Blank and comment-only lines are skipped.

    Raises ValueError if a non-empty line does not consist of exactly two
    ``;``-separated fields or if a codepoint is not valid hexadecimal.
    """
    codepoints = []
    for line in lines:
        # Drop the trailing comment (if any) and surrounding whitespace.
        line = line.partition("#")[0].strip()
        if not line:
            continue
        char_range, category = line.split(";")
        if category.strip() not in categories:
            continue
        # A single codepoint is treated as a range of length one.
        first, delim, last = char_range.strip().partition("..")
        codepoint0 = int(first, 16)
        codepoint1 = int(last, 16) if delim else codepoint0
        codepoints.extend(range(codepoint0, codepoint1 + 1))
    codepoints.sort()
    return codepoints


def build_records(codepoints):
    """Collapse sorted *codepoints* into ``R(lo,hi)`` / ``S(cp)`` records.

    Each maximal run of consecutive codepoints longer than one becomes
    ``R(0xLLLL,0xHHHH)``; an isolated codepoint becomes ``S(0xXXXX)``.
    Returns the records as a list of strings, in ascending order.
    """
    records = []
    count = len(codepoints)
    index0 = 0
    while index0 < count:
        # Advance index1 to one past the end of the run starting at index0.
        index1 = index0 + 1
        while index1 < count and codepoints[index1] == codepoints[index1 - 1] + 1:
            index1 += 1
        if index1 - index0 > 1:
            # Range of codepoints
            records.append("R(0x{:04x},0x{:04x})".format(
                codepoints[index0], codepoints[index1 - 1]))
        else:
            # Single codepoint
            records.append("S(0x{:04x})".format(codepoints[index0]))
        index0 = index1
    return records


def format_map(records):
    """Return the C source text of the ``WHITESPACE_MAP`` array.

    The records are joined with ``", "`` and wrapped to 110 columns with a
    four-space indent on every line.
    """
    body = "\n".join(textwrap.wrap(", ".join(records), 110,
                                   initial_indent="    ",
                                   subsequent_indent="    "))
    return "static const unsigned WHITESPACE_MAP[] = {\n" + body + "\n};\n\n"


def main():
    """Read the Unicode data file and write the generated map to stdout."""
    path = os.path.join(self_path, "unicode", "DerivedGeneralCategory.txt")
    # The context manager closes the file even if parsing raises; the
    # encoding is given explicitly so the read does not depend on the locale.
    with open(path, "r", encoding="utf-8") as f:
        codepoint_list = parse_codepoints(f, category_list)
    sys.stdout.write(format_map(build_records(codepoint_list)))


if __name__ == "__main__":
    main()