Hash :
de06f29f
Author :
Date :
2025-09-14T09:21:39
keysyms: Allow using UCD files instead of ICU. This makes it possible to use a local Unicode Character Database instead of accessing ICU via Python. This is useful when ICU is not available or when it does not support the latest Unicode version. The data files are available at: https://www.unicode.org/Public/<UNICODE_VERSION>/ucd
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
#!/usr/bin/env python3
# # Copyright © 2025 Pierre Le Marre <dev@wismill.eu>
# SPDX-License-Identifier: MIT
"""
Utils to parse the Unicode database files
"""
from collections.abc import Callable, Iterator
from dataclasses import dataclass
from pathlib import Path
import sys
from typing import ClassVar, Self
def parse_code_point(raw: str) -> int | None:
return None if not raw else int(raw, 16)
@dataclass
class CodePointRange:
start: int
end: int
def __iter__(self) -> Iterator[int]:
yield from range(self.start, self.end + 1)
@classmethod
def parse(cls, raw: str) -> Self:
start, *end = raw.strip().split("..")
return cls(
start=int(start, 16), end=int(start, 16) if not end else int(end[0], 16)
)
@dataclass
class PropertyEntry:
    """
    Association of one code point with one property value.
    """

    code_point: int
    property: str

    @classmethod
    def parse_file(
        cls, path: Path, filter: Callable[[str], bool] | None = None
    ) -> Iterator[Self]:
        """
        Lazily yield one entry per code point listed in a property file.

        Everything after the first ``#`` on a line is ignored, as are lines
        that are blank once the comment is removed. The first ``;``-separated
        field is a code point or a ``..`` range, the second one is the
        property; further fields are ignored. Ranges are expanded into
        individual code points.

        When `filter` is provided, only the entries whose property it accepts
        are yielded.
        """
        with path.open("rt", encoding="utf-8") as fd:
            for raw_line in fd:
                # Keep only what precedes the comment marker
                content = raw_line.split("#")[0].strip()
                if not content:
                    continue
                raw_range, raw_property, *_ = content.split(";")
                # The range is parsed before filtering, so a malformed range
                # raises even when its property would have been rejected
                code_points = CodePointRange.parse(raw_range)
                name = raw_property.strip()
                if not filter or filter(name):
                    for code_point in code_points:
                        yield cls(code_point=code_point, property=name)
@dataclass
class UnicodeDataEntry:
    """
    Code point, general category and case mapping fields of one
    UnicodeData.txt record.
    """

    code_point: int
    general_category: str
    lower_case: int | None
    upper_case: int | None
    title_case: int | None

    @classmethod
    def parse_file(cls, path: Path) -> Iterator[Self]:
        """
        Lazily yield one entry per record of the given file.

        Blank lines and lines starting with ``#`` are skipped. Each record
        must have at least 15 ``;``-separated fields, otherwise the unpacking
        raises ``ValueError``.
        """
        with path.open("rt", encoding="utf-8") as fd:
            for raw_line in fd:
                record = raw_line.strip()
                if record == "" or record.startswith("#"):
                    continue
                # Fields used: 0 = code point, 2 = general category,
                # 12 = upper case, 13 = lower case, 14 = title case
                (
                    cp_hex, _, category, _, _, _, _, _, _, _, _, _,
                    upper_hex, lower_hex, title_hex, *_,
                ) = record.split(";")
                code_point = int(cp_hex, 16)
                lower = parse_code_point(lower_hex)
                upper = parse_code_point(upper_hex)
                title = parse_code_point(title_hex)
                yield cls(
                    code_point=code_point,
                    general_category=category,
                    lower_case=lower,
                    upper_case=upper,
                    title_case=title,
                )
@dataclass
class DB:
lower_case: set[int]
upper_case: set[int]
title_case: set[int]
lower_case_mappings: dict[int, int]
upper_case_mappings: dict[int, int]
title_case_mappings: dict[int, int]
case_properties: ClassVar[frozenset[str]] = frozenset(("Lowercase", "Uppercase"))
@classmethod
def filter_case_properties(cls, property: str) -> bool:
return property in cls.case_properties
@classmethod
def parse_ucd(cls, path: Path) -> Self:
lower_case: set[int] = set()
upper_case: set[int] = set()
title_case: set[int] = set()
lower_case_mappings: dict[int, int] = {}
upper_case_mappings: dict[int, int] = {}
title_case_mappings: dict[int, int] = {}
for entry in UnicodeDataEntry.parse_file(path / "UnicodeData.txt"):
if entry.general_category == "Lt":
title_case.add(entry.code_point)
if entry.lower_case is not None:
lower_case_mappings[entry.code_point] = entry.lower_case
if entry.upper_case is not None:
upper_case_mappings[entry.code_point] = entry.upper_case
if entry.title_case is not None:
title_case_mappings[entry.code_point] = entry.title_case
for entry in PropertyEntry.parse_file(
path / "DerivedCoreProperties.txt", filter=cls.filter_case_properties
):
match entry.property:
case "Lowercase":
lower_case.add(entry.code_point)
case "Uppercase":
upper_case.add(entry.code_point)
case _:
raise ValueError(entry)
return cls(
lower_case=lower_case,
upper_case=upper_case,
title_case=title_case,
lower_case_mappings=lower_case_mappings,
upper_case_mappings=upper_case_mappings,
title_case_mappings=title_case_mappings,
)
def isULowercase(self, cp: int) -> bool:
return cp in self.lower_case
def isUUppercase(self, cp: int) -> bool:
return cp in self.upper_case
def istitle(self, cp: int) -> bool:
return cp in self.title_case
def tolower(self, cp_or_char: int | str) -> int | str:
cp = cp_or_char if isinstance(cp_or_char, int) else ord(cp_or_char)
mapping = self.lower_case_mappings.get(cp, cp)
return mapping if isinstance(cp_or_char, int) else chr(mapping)
def toupper(self, cp_or_char: int | str) -> int | str:
cp = cp_or_char if isinstance(cp_or_char, int) else ord(cp_or_char)
mapping = self.upper_case_mappings.get(cp, cp)
return mapping if isinstance(cp_or_char, int) else chr(mapping)
if __name__ == "__main__":
    # Self-test: compare the local UCD against ICU for every code point.
    # Usage: pass the path of the directory containing the UCD files as the
    # first argument. Requires PyICU.
    # NOTE(review): presumably the UCD files must match the Unicode version
    # of the installed ICU for all checks to pass; confirm.
    import icu

    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} UCD_DIRECTORY")

    c = icu.Locale.createFromName("C")
    icu.Locale.setDefault(c)
    path = Path(sys.argv[1])
    db = DB.parse_ucd(path)
    for cp in range(0, 0x10FFFF + 1):
        # PyICU exposes the character property API on `icu.Char`
        assert db.isULowercase(cp) == icu.Char.isULowercase(cp), (
            cp,
            db.isULowercase(cp),
            icu.Char.isULowercase(cp),
        )
        assert db.isUUppercase(cp) == icu.Char.isUUppercase(cp), cp
        assert db.istitle(cp) == icu.Char.istitle(cp), cp
        assert db.tolower(cp) == icu.Char.tolower(cp), cp
        assert db.toupper(cp) == icu.Char.toupper(cp), cp