Edit

kc3-lang/libxkbcommon/src/utf8-decoding.c

Branch :

  • Show log

    Commit

  • Author : Pierre Le Marre
    Date : 2024-09-12 16:52:40
    Hash : 98dee225
    Message : Add UTF-8 to UTF-32 decoding Add internal functions to convert UTF-32 to UTF-8, with corresponding tests.

  • src/utf8-decoding.c
  • /*
     * Copyright © 2024 Pierre Le Marre <dev@wismill.eu>
     *
     * Permission is hereby granted, free of charge, to any person obtaining a
     * copy of this software and associated documentation files (the "Software"),
     * to deal in the Software without restriction, including without limitation
     * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     * and/or sell copies of the Software, and to permit persons to whom the
     * Software is furnished to do so, subject to the following conditions:
     *
     * The above copyright notice and this permission notice (including the next
     * paragraph) shall be included in all copies or substantial portions of the
     * Software.
     *
     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     * DEALINGS IN THE SOFTWARE.
     */
    
    #include "config.h"
    
    #include "utf8-decoding.h"
    
    /* Array mapping the leading byte to the length of a UTF-8 sequence.
     * A value of zero indicates that the byte can not begin a UTF-8 sequence. */
    static const uint8_t utf8_sequence_length_by_leading_byte[256] = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0x8F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90-0x9F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0-0xAF */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0-0xBF */
        0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0-0xCF */
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0-0xDF */
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0-0xEF */
        4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0-0xFF */
    };
    
    /* Length of next utf-8 sequence */
    uint8_t
    utf8_sequence_length(const char *s)
    {
        return utf8_sequence_length_by_leading_byte[(unsigned char)s[0]];
    }
    
    /* Reads the next UTF-8 sequence in a string */
    uint32_t
    utf8_next_code_point(const char *s, size_t max_size, size_t *size_out)
    {
        uint32_t cp = 0;
        uint8_t len = utf8_sequence_length(s);
        *size_out = 0;
    
        if (!max_size || len > max_size)
            return INVALID_UTF8_CODE_POINT;
    
        /* Handle leading byte */
        switch (len) {
        case 1:
            *size_out = 1;
            return (uint32_t)s[0];
        case 2:
            cp = (uint32_t)s[0] & 0x1f;
            break;
        case 3:
            cp = (uint32_t)s[0] & 0x0f;
            break;
        case 4:
            cp = (uint32_t)s[0] & 0x07;
            break;
        default:
            return INVALID_UTF8_CODE_POINT;
        }
    
        /* Process remaining bytes of the UTF-8 sequence */
        for (size_t k = 1; k < len; k++) {
            if (((uint32_t)s[k] & 0xc0) != 0x80)
                return INVALID_UTF8_CODE_POINT;
            cp <<= 6;
            cp |= (uint32_t)s[k] & 0x3f;
        }
    
        /* Check surrogates */
        if (cp >= 0xd800 && cp <= 0xdfff)
            return INVALID_UTF8_CODE_POINT;
    
        *size_out = len;
        return cp;
    }