Branch
Hash :
91f96be0
Author :
Date :
2021-06-06T11:51:12
Change the license of the library from LGPL 2.0 to LGPL 2.1.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
/*
* Copyright (C) 1999-2001, 2005, 2012, 2016 Free Software Foundation, Inc.
* This file is part of the GNU LIBICONV Library.
*
* The GNU LIBICONV Library is free software; you can redistribute it
* and/or modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* The GNU LIBICONV Library is distributed in the hope that it will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with the GNU LIBICONV Library; see the file COPYING.LIB.
* If not, see <https://www.gnu.org/licenses/>.
*/
/*
* ISO-IR-165
*/
/*
* ISO-IR-165 is an extension of GB 2312, consisting of:
* 1. GB 6345.1-86 corrections:
* Two corrections to GB 2312, at 0x2367 and 0x6F71.
* 2. GB 6345.1-86 additions:
* - 6 new full-width pinyin characters in row 0x28.
* - ISO646-CN in row 0x2A.
* - 32 half-width pinyin characters in row 0x2B.
* 3. GB 8565.2-88 additions:
* - 50 characters in row 0x2D.
* - 92 characters in row 0x2E.
* - 93 characters in row 0x2F.
* - 470 characters in rows 0x7A-0x7E.
* 4. ISO-IR-165 additions:
* - 22 characters in row 0x26.
* - 94 characters in row 0x2C.
* - 44 new characters in row 0x2D.
* - 1 new character in row 0x2F.
*
* The conversion table was created from the following sources:
* Ad 1. The 0x2367 correction is already integrated in the unicode.org
* GB2312.TXT table. The 0x6F71 mapping is the same in the unicode.org
* GB2312.TXT and UNIHAN.TXT table and in Koichi Yasuoka's Uni2GB table,
* so we assume it's correct.
* The unicode.org UNIHAN.TXT table about GB 8565 is not usable: it has
* extraneous code points at rows 0x28, 0x2C, 0x2D. Note also that it does
* not list the 69 non-hanzi in row 0x2F. Moreover, it has the characters
* 0x2F7A-0x2F7D shifted down by one to 0x2F79-0x2F7C.
* Therefore we take the GB8565 and ISO-IR-165 data from Koichi Yasuoka's
* Uni2GB table.
* Ad 1. Yasuoka maps 0x2367 to U+0261 (small script g) and 0x2840 to U+FF47
* (full-width small normal g). While coherent with ISO-IR's 165.pdf,
* this disagrees with Ken Lunde's book: He says that ISO-IR-165
* includes the GB6345 correction, i.e. maps 0x2367 to U+FF47 or U+0067
* and _not_ to U+0261 (small script g).
* To overcome the confusion, we just map both 0x2367 and 0x2840 to
* U+FF47.
* Ad 2. Row 0x28: Add a mapping from 0x283F to U+01F9.
* Row 0x2A: Mapping is well-known, also present in Koichi Yasuoka's
* table.
* Row 0x2B: Typed in by hand from appendix E in Ken Lunde's book.
* When converting from Unicode to ISO-IR-165, prefer the half-width
* range 0x2B{21..40} to the full-width range 0x28{21..40}.
* Ad 3. Rows 0x2D, 0x2E: Both Koichi Yasuoka's Uni2GB table and the UNIHAN.TXT
* data for GB 8565 agree here.
* Row 0x2F: Taken from Koichi Yasuoka's Uni2GB table.
* Rows 0x7A-0x7E: Koichi Yasuoka's Uni2GB table and the UNIHAN.TXT
* data for GB 8565 agree here mostly. Differences:
* 0x7C38 -> U+6F26 or U+527A ? We choose U+6F26.
* 0x7C5A -> U+7A40 or U+6996 ? We choose U+6996.
* Ad 4. Row 0x26: Mapping unknown.
* Rows 0x2C, 0x2D: Both Koichi Yasuoka's Uni2GB table and the UNIHAN.TXT
* data for GB 8565 (!) agree here.
* Row 0x2F: Taken from Koichi Yasuoka's Uni2GB table.
*/
#include "isoir165ext.h"
static int
isoir165_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
{
int ret;
/* Map full-width pinyin (row 0x28) like half-width pinyin (row 0x2B). */
if (s[0] == 0x28) {
if (n >= 2) {
unsigned char c2 = s[1];
if (c2 >= 0x21 && c2 <= 0x40) {
unsigned char buf[2];
buf[0] = 0x2b;
buf[1] = c2;
ret = isoir165ext_mbtowc(conv,pwc,buf,2);
if (ret != RET_ILSEQ)
return ret;
}
}
}
/* Try the GB2312 -> Unicode table. */
ret = gb2312_mbtowc(conv,pwc,s,n);
if (ret != RET_ILSEQ)
return ret;
/* Row 0x2A is GB_1988-80. */
if (s[0] == 0x2a) {
if (n >= 2) {
unsigned char c2 = s[1];
if (c2 >= 0x21 && c2 < 0x7f) {
ret = iso646_cn_mbtowc(conv,pwc,s+1,1);
if (ret != 1) abort();
return 2;
}
return RET_ILSEQ;
}
return RET_TOOFEW(0);
}
/* Try the ISO-IR-165 extensions -> Unicode table. */
ret = isoir165ext_mbtowc(conv,pwc,s,n);
return ret;
}
static int
isoir165_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
{
unsigned char buf[2];
int ret;
/* Try the Unicode -> GB2312 table. */
ret = gb2312_wctomb(conv,buf,wc,2);
if (ret != RET_ILUNI) {
if (ret != 2) abort();
if (!(buf[0] == 0x28 && buf[1] >= 0x21 && buf[1] <= 0x40)) {
if (n >= 2) {
r[0] = buf[0];
r[1] = buf[1];
return 2;
}
return RET_TOOSMALL;
}
}
/* Row 0x2A is GB_1988-80. */
ret = iso646_cn_wctomb(conv,buf,wc,1);
if (ret != RET_ILUNI) {
if (ret != 1) abort();
if (buf[0] >= 0x21 && buf[0] < 0x7f) {
if (n >= 2) {
r[0] = 0x2a;
r[1] = buf[0];
return 2;
}
return RET_TOOSMALL;
}
}
/* Try the Unicode -> ISO-IR-165 extensions table. */
ret = isoir165ext_wctomb(conv,r,wc,n);
return ret;
}