Edit

kc3-lang/brotli/c/enc/utf8_util.c

Branch :

  • Show log

    Commit

  • Author : Eugene Kliuchnikov
    Date : 2021-09-08 09:18:45
    Hash : 62662f87
    Message : Strip "./" in includes (#925) Co-authored-by: Eugene Kliuchnikov <eustas@chromium.org>

  • c/enc/utf8_util.c
  • /* Copyright 2013 Google Inc. All Rights Reserved.
    
       Distributed under MIT license.
       See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
    */
    
    /* Heuristics for deciding about the UTF8-ness of strings. */
    
    #include "utf8_util.h"
    
    #include <brotli/types.h>
    
    #if defined(__cplusplus) || defined(c_plusplus)
    extern "C" {
    #endif
    
    static size_t BrotliParseAsUTF8(
        int* symbol, const uint8_t* input, size_t size) {
      /* ASCII */
      if ((input[0] & 0x80) == 0) {
        *symbol = input[0];
        if (*symbol > 0) {
          return 1;
        }
      }
      /* 2-byte UTF8 */
      if (size > 1u &&
          (input[0] & 0xE0) == 0xC0 &&
          (input[1] & 0xC0) == 0x80) {
        *symbol = (((input[0] & 0x1F) << 6) |
                   (input[1] & 0x3F));
        if (*symbol > 0x7F) {
          return 2;
        }
      }
      /* 3-byte UFT8 */
      if (size > 2u &&
          (input[0] & 0xF0) == 0xE0 &&
          (input[1] & 0xC0) == 0x80 &&
          (input[2] & 0xC0) == 0x80) {
        *symbol = (((input[0] & 0x0F) << 12) |
                   ((input[1] & 0x3F) << 6) |
                   (input[2] & 0x3F));
        if (*symbol > 0x7FF) {
          return 3;
        }
      }
      /* 4-byte UFT8 */
      if (size > 3u &&
          (input[0] & 0xF8) == 0xF0 &&
          (input[1] & 0xC0) == 0x80 &&
          (input[2] & 0xC0) == 0x80 &&
          (input[3] & 0xC0) == 0x80) {
        *symbol = (((input[0] & 0x07) << 18) |
                   ((input[1] & 0x3F) << 12) |
                   ((input[2] & 0x3F) << 6) |
                   (input[3] & 0x3F));
        if (*symbol > 0xFFFF && *symbol <= 0x10FFFF) {
          return 4;
        }
      }
      /* Not UTF8, emit a special symbol above the UTF8-code space */
      *symbol = 0x110000 | input[0];
      return 1;
    }
    
    /* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/
    BROTLI_BOOL BrotliIsMostlyUTF8(
        const uint8_t* data, const size_t pos, const size_t mask,
        const size_t length, const double min_fraction) {
      size_t size_utf8 = 0;
      size_t i = 0;
      while (i < length) {
        int symbol;
        size_t bytes_read =
            BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i);
        i += bytes_read;
        if (symbol < 0x110000) size_utf8 += bytes_read;
      }
      return TO_BROTLI_BOOL((double)size_utf8 > min_fraction * (double)length);
    }
    
    #if defined(__cplusplus) || defined(c_plusplus)
    }  /* extern "C" */
    #endif