Branch :
/* c3
* Copyright 2022 kmx.io <contact@kmx.io>
*
* Permission is hereby granted to use this software granted
* the above copyright notice and this permission paragraph
* are included in all copies and substantial portions of this
* software.
*
* THIS SOFTWARE IS PROVIDED "AS-IS" WITHOUT ANY GUARANTEE OF
* PURPOSE AND PERFORMANCE. IN NO EVENT WHATSOEVER SHALL THE
* AUTHOR BE CONSIDERED LIABLE FOR THE USE AND PERFORMANCE OF
* THIS SOFTWARE.
*/
#include <assert.h>
#include <err.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "ucd.h"
#define BUFSZ 1024
/* Read an unsigned hexadecimal number starting at *src.
 *
 * Digits 0-9, A-F and a-f are accepted. Reading stops at the end of
 * the string or at the first ';'. A terminating ';' is consumed, so on
 * return *src points just past it (or at the terminating NUL).
 * Any other character is fatal: the program exits through errx(3).
 * No overflow check is performed on the accumulated value.
 */
unsigned long long read_hex (const char **src) {
  const char *cursor = *src;
  unsigned long long value = 0;
  char ch;
  for (ch = *cursor; ch != '\0' && ch != ';'; ch = *++cursor) {
    unsigned digit = 0;
    if (ch >= '0' && ch <= '9')
      digit = (unsigned) (ch - '0');
    else if (ch >= 'A' && ch <= 'F')
      digit = (unsigned) (ch - 'A' + 10);
    else if (ch >= 'a' && ch <= 'f')
      digit = (unsigned) (ch - 'a' + 10);
    else
      errx(1, "invalid character in index: %c", ch);
    value = value * 16 + digit;
  }
  /* Step over the field separator so the caller resumes on the next
     field. */
  if (ch == ';')
    cursor++;
  *src = cursor;
  return value;
}
/* Parse one semicolon-separated record and store it in the table.
 *
 * Expected layout of `line`: "<hex index>;<name>;<category>;...".
 * The hex index selects the entry ucd[index]. The name is copied into
 * freshly allocated memory, and the category field is decoded two
 * characters at a time (major letter + minor letter) into bits of
 * ucd[index].flags.
 *
 * ucd    - table of UCD_MAX entries to update.
 * line   - NUL-terminated record, without trailing newline. It is not
 *          modified.
 * lineno - line number, used only in diagnostics.
 *
 * The record is validated (index in range, both ';' separators
 * present) BEFORE the entry is touched, so a malformed or empty line
 * only produces a warning and never clobbers an existing entry.
 * Unknown category letters produce a warning and are skipped.
 * An invalid hex digit in the index is fatal inside read_hex, and a
 * failed allocation is fatal through err(3).
 * A name already stored in the entry is overwritten without being
 * freed.
 */
void ucd_parse (s_ucd ucd[UCD_MAX], char *line,
                unsigned long lineno) {
  unsigned long long code;
  unsigned long i;
  const char *cursor = line;
  char *name;
  char *name_end;
  char *p;
  char *sep;
  size_t size;
#define UCD_PARSE_UNKNOWN_CATEGORY \
  warnx("line %lu: unknown category : %c%c", lineno, p[0], p[1])
  /* Range-check the full-width value before narrowing it, so a huge
     index cannot wrap into the valid range. */
  code = read_hex(&cursor);
  if (code >= UCD_MAX) {
    warnx("%llu > UCD_MAX", code);
    goto error;
  }
  i = (unsigned long) code;
  p = line + (cursor - line);
  /* Locate the end of the name and of the category field up front. */
  if (! (name_end = strchr(p, ';')))
    goto error;
  if (! (sep = strchr(name_end + 1, ';')))
    goto error;
  size = (size_t) (name_end - p) + 1;
  if (! (name = malloc(size)))
    err(1, "malloc");
  memcpy(name, p, size - 1);
  name[size - 1] = '\0';
  ucd[i].name = name;
  ucd[i].flags = 0;
  p = name_end + 1;
  while (p < sep) {
    /* p < sep guarantees p[1] is readable: it is at worst the ';'
       that ends the category field. */
    switch (p[0]) {
    case 'L':
      ucd[i].flags |= UCD_LETTER;
      switch (p[1]) {
      case 'l': ucd[i].flags |= UCD_LETTER_LOWERCASE; break;
      case 'm': ucd[i].flags |= UCD_LETTER_MODIFIER;  break;
      case 'o': ucd[i].flags |= UCD_LETTER_OTHER;     break;
      case 't': ucd[i].flags |= UCD_LETTER_TITLECASE; break;
      case 'u': ucd[i].flags |= UCD_LETTER_UPPERCASE; break;
      default: UCD_PARSE_UNKNOWN_CATEGORY;
      }
      break;
    case 'M':
      ucd[i].flags |= UCD_MARK;
      switch (p[1]) {
      case 'c': ucd[i].flags |= UCD_MARK_COMBINING;  break;
      case 'e': ucd[i].flags |= UCD_MARK_ENCLOSING;  break;
      case 'n': ucd[i].flags |= UCD_MARK_NONSPACING; break;
      default: UCD_PARSE_UNKNOWN_CATEGORY;
      }
      break;
    case 'N':
      ucd[i].flags |= UCD_NUMBER;
      switch (p[1]) {
      case 'd': ucd[i].flags |= UCD_NUMBER_DECIMAL_DIGIT; break;
      case 'l': ucd[i].flags |= UCD_NUMBER_LETTER;        break;
      case 'o': ucd[i].flags |= UCD_NUMBER_OTHER;         break;
      default: UCD_PARSE_UNKNOWN_CATEGORY;
      }
      break;
    case 'P':
      ucd[i].flags |= UCD_PUNCTUATION;
      switch (p[1]) {
      case 'c': ucd[i].flags |= UCD_PUNCTUATION_CONNECTOR;     break;
      case 'd': ucd[i].flags |= UCD_PUNCTUATION_DASH;          break;
      case 's': ucd[i].flags |= UCD_PUNCTUATION_OPEN;          break;
      case 'e': ucd[i].flags |= UCD_PUNCTUATION_CLOSE;         break;
      case 'i': ucd[i].flags |= UCD_PUNCTUATION_INITIAL_QUOTE; break;
      case 'f': ucd[i].flags |= UCD_PUNCTUATION_FINAL_QUOTE;   break;
      case 'o': ucd[i].flags |= UCD_PUNCTUATION_OTHER;         break;
      default: UCD_PARSE_UNKNOWN_CATEGORY;
      }
      break;
    case 'S':
      ucd[i].flags |= UCD_SYMBOL;
      switch (p[1]) {
      case 'c': ucd[i].flags |= UCD_SYMBOL_CURRENCY; break;
      case 'k': ucd[i].flags |= UCD_SYMBOL_MODIFIER; break;
      case 'm': ucd[i].flags |= UCD_SYMBOL_MATH;     break;
      case 'o': ucd[i].flags |= UCD_SYMBOL_OTHER;    break;
      default: UCD_PARSE_UNKNOWN_CATEGORY;
      }
      break;
    case 'Z':
      ucd[i].flags |= UCD_SEPARATOR;
      switch (p[1]) {
      case 'l': ucd[i].flags |= UCD_SEPARATOR_LINE;      break;
      case 'p': ucd[i].flags |= UCD_SEPARATOR_PARAGRAPH; break;
      case 's': ucd[i].flags |= UCD_SEPARATOR_SPACE;     break;
      default: UCD_PARSE_UNKNOWN_CATEGORY;
      }
      break;
    case 'C':
      ucd[i].flags |= UCD_OTHER;
      switch (p[1]) {
      case 'c': ucd[i].flags |= UCD_OTHER_CONTROL;      break;
      case 'f': ucd[i].flags |= UCD_OTHER_FORMAT;       break;
      case 's': ucd[i].flags |= UCD_OTHER_SURROGATE;    break;
      case 'o': ucd[i].flags |= UCD_OTHER_PRIVATE_USE;  break;
      case 'n': ucd[i].flags |= UCD_OTHER_NOT_ASSIGNED; break;
      default: UCD_PARSE_UNKNOWN_CATEGORY;
      }
      break;
    default:
      warnx("line %lu: unknown category: '%c'", lineno, *p);
    }
    /* Advance by one two-letter code, but never step past the ';'
       that closes the field (odd-length fields end one char early). */
    p++;
    if (p < sep)
      p++;
  }
  return;
 error:
  warnx("invalid entry line %lu: %s", lineno, line);
}
/* Print the table on standard output as a C source file.
 *
 * The output is a license header, an #include of "ucd.h", and the
 * definition of `const s_ucd g_ucd[UCD_MAX]` with one initializer
 * `{0x<flags>, "<name>"}` per entry, separated by ",\n". Entries whose
 * name pointer is null get a literal 0 instead of a string.
 * Names are emitted verbatim between double quotes, without escaping.
 */
void ucd_write_c (s_ucd ucd[UCD_MAX])
{
  unsigned long cp;
  fputs("/* c3\n"
        " * Copyright 2022 kmx.io <contact@kmx.io>\n"
        " *\n"
        " * Permission is hereby granted to use this software granted\n"
        " * the above copyright notice and this permission paragraph\n"
        " * are included in all copies and substantial portions of this\n"
        " * software.\n"
        " *\n"
        " * THIS SOFTWARE IS PROVIDED \"AS-IS\" WITHOUT ANY GUARANTEE OF\n"
        " * PURPOSE AND PERFORMANCE. IN NO EVENT WHATSOEVER SHALL THE\n"
        " * AUTHOR BE CONSIDERED LIABLE FOR THE USE AND PERFORMANCE OF\n"
        " * THIS SOFTWARE.\n"
        " */\n", stdout);
  fputs("/* generated by ucd2c */\n", stdout);
  fputs("#include \"ucd.h\"\n", stdout);
  fputs("const s_ucd g_ucd[UCD_MAX] = {\n", stdout);
  for (cp = 0; cp < UCD_MAX; cp++) {
    /* The separator goes before every entry except the first, so the
       last initializer has no trailing comma. */
    if (cp != 0)
      fputs(",\n", stdout);
    printf(" {0x%llx, ", ucd[cp].flags);
    if (ucd[cp].name)
      printf("\"%s\"", ucd[cp].name);
    else
      fputs("0", stdout);
    fputs("}", stdout);
  }
  fputs("\n};\n", stdout);
}
/* Entry point: read records from standard input, one per line, feed
 * each to ucd_parse, then print the resulting table as C source with
 * ucd_write_c. Command-line arguments are ignored.
 *
 * Exits through err(3) if the table cannot be allocated or if reading
 * standard input fails. Returns 0 otherwise.
 */
int main (int argc, char **argv)
{
  char buf[BUFSZ];
  unsigned long lineno = 0;
  s_ucd *ucd;
  (void) argc;
  (void) argv;
  /* calloc zero-fills the table, so entries never parsed keep a null
     name and zero flags. */
  ucd = calloc(UCD_MAX, sizeof *ucd);
  if (! ucd)
    err(1, "calloc");
  while (fgets(buf, sizeof buf, stdin)) {
    size_t len = strlen(buf);
    if (len > 0 && buf[len - 1] == '\n')
      buf[len - 1] = '\0';
    else if (len == sizeof buf - 1) {
      /* The physical line did not fit in buf. Drop its tail so the
         leftover is not parsed as a separate (bogus) record. */
      int c;
      while ((c = getchar()) != EOF && c != '\n')
        ;
    }
    ucd_parse(ucd, buf, ++lineno);
  }
  /* fgets returns NULL both at end of file and on error; do not emit
     a table built from a partially read input. */
  if (ferror(stdin))
    err(1, "stdin");
  ucd_write_c(ucd);
  free(ucd);
  return 0;
}