Hash :
08d9a031
Author :
Date :
2025-04-08T06:31:33
Unicode: Make surrogate handling more explicit
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
/*
* Copyright © 2024 Pierre Le Marre <dev@wismill.eu>
* SPDX-License-Identifier: MIT
*/
#include "config.h"
#include "utils.h"
#include "utf8-decoding.h"
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence, giving the
 * total length in bytes of that sequence.
 *
 * An entry of zero means the byte cannot start a UTF-8 sequence: this is the
 * case for 0x80-0xBF, 0xC0, 0xC1 and 0xF5-0xFF.
 */
#define UTF8_ROW16(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
static const uint8_t utf8_sequence_length_by_leading_byte[256] = {
    /* 0x00-0x7F: single-byte sequences */
    UTF8_ROW16(1), UTF8_ROW16(1), UTF8_ROW16(1), UTF8_ROW16(1),
    UTF8_ROW16(1), UTF8_ROW16(1), UTF8_ROW16(1), UTF8_ROW16(1),
    /* 0x80-0xBF: never a leading byte */
    UTF8_ROW16(0), UTF8_ROW16(0), UTF8_ROW16(0), UTF8_ROW16(0),
    /* 0xC0-0xCF: 0xC0 and 0xC1 are rejected, the rest start 2-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0-0xDF: 2-byte sequences */
    UTF8_ROW16(2),
    /* 0xE0-0xEF: 3-byte sequences */
    UTF8_ROW16(3),
    /* 0xF0-0xFF: 0xF0-0xF4 start 4-byte sequences, 0xF5-0xFF are rejected */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
#undef UTF8_ROW16
/*
 * Length in bytes of the UTF-8 sequence starting at `s`, determined solely
 * from its first byte.
 *
 * Only s[0] is read, so `s` must point to at least one readable byte.
 * Returns 0 when that byte cannot begin a UTF-8 sequence.
 */
uint8_t
utf8_sequence_length(const char *s)
{
    /* Index as unsigned so that bytes >= 0x80 do not become negative */
    const unsigned char lead = (unsigned char) *s;

    return utf8_sequence_length_by_leading_byte[lead];
}
/*
 * Decode the UTF-8 sequence located at the start of a buffer.
 *
 * s:        buffer to decode from; it is not required to be NUL-terminated.
 * max_size: number of bytes available in `s`. No byte is read when it is 0.
 * size_out: receives the number of bytes consumed on success, 0 on failure.
 *
 * Returns the decoded code point, or INVALID_UTF8_CODE_POINT when:
 *  - the buffer is empty or shorter than the sequence announced by the
 *    leading byte;
 *  - the leading byte cannot start a sequence;
 *  - a trailing byte is not a continuation byte (10xxxxxx);
 *  - the sequence is overlong, i.e. the value would fit a shorter sequence;
 *  - the value exceeds U+10FFFF;
 *  - the value is rejected by is_surrogate().
 */
uint32_t
utf8_next_code_point(const char *s, size_t max_size, size_t *size_out)
{
    /* Smallest value that requires a sequence of the given length (index);
     * anything below it is an overlong encoding. */
    static const uint32_t min_cp_by_length[] = {
        0, 0, 0x80, 0x800, 0x10000
    };

    *size_out = 0;

    /* Check the bounds before touching the buffer */
    if (!max_size)
        return INVALID_UTF8_CODE_POINT;

    const uint8_t len = utf8_sequence_length(s);
    if (len > max_size)
        return INVALID_UTF8_CODE_POINT;

    /* Work on unsigned bytes: plain char may be signed */
    const unsigned char *bytes = (const unsigned char *) s;
    uint32_t cp;

    /* Handle leading byte */
    switch (len) {
    case 1:
        *size_out = 1;
        return bytes[0];
    case 2:
        cp = bytes[0] & 0x1fu;
        break;
    case 3:
        cp = bytes[0] & 0x0fu;
        break;
    case 4:
        cp = bytes[0] & 0x07u;
        break;
    default:
        /* Length 0: not a valid leading byte */
        return INVALID_UTF8_CODE_POINT;
    }

    /* Process remaining bytes of the UTF-8 sequence */
    for (size_t k = 1; k < len; k++) {
        if ((bytes[k] & 0xc0u) != 0x80u)
            return INVALID_UTF8_CODE_POINT;
        cp = (cp << 6) | (bytes[k] & 0x3fu);
    }

    /* Reject overlong encodings and values beyond the Unicode range */
    if (cp < min_cp_by_length[len] || cp > 0x10ffffu)
        return INVALID_UTF8_CODE_POINT;

    /* Check surrogates */
    if (is_surrogate(cp))
        return INVALID_UTF8_CODE_POINT;

    *size_out = len;
    return cp;
}