Hash :
a615528b
Author :
Date :
2000-11-23T19:54:07
Move src/ to lib/, and install the iconv program.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
/*
* GBK
*/
/*
* GBK, as described in Ken Lunde's book, is an extension of GB 2312-1980
* (shifted by adding 0x8080 to the range 0xA1A1..0xFEFE, as used in EUC-CN).
* It adds the following ranges:
*
* (part of GBK/1) 0xA2A1-0xA2AA Small Roman numerals
* GBK/3 0x{81-A0}{40-7E,80-FE} 6080 new characters, all in Unicode
* GBK/4 0x{AA-FE}{40-7E,80-A0} 8160 new characters, 8080 in Unicode
* GBK/5 0x{A8-A9}{40-7E,80-A0} 166 new characters, 153 in Unicode
*/
/*
* CP936 is nearly identical to GBK. It differs as follows:
*
* 1. Some characters in the GB2312 range are defined differently:
*
* code GB2312 CP936.TXT
* 0xA1A4 0x30FB # KATAKANA MIDDLE DOT 0x00B7 # MIDDLE DOT
* 0xA1AA 0x2015 # HORIZONTAL BAR 0x2014 # EM DASH
*
* 2. 19 characters added in the range 0xA6E0-0xA6F5.
*
* 3. 4 characters added in the range 0xA8BB-0xA8C0.
*/
/*
* Since all three tables I have looked at
* - the CP936 table by Microsoft, found on ftp.unicode.org,
* - the GBK table by Sun, investigated on a Solaris 2.7 machine,
* - the GBK tables by CWEX, found in the Big5+ package,
* all include these CP936 extensions (the CWEX tables have additional
* differences), I conclude that either Ken Lunde has overlooked some of
* the differences between GB2312 and GBK, or he is right but the major
* vendors don't care about it. In either case, CP936 is the de facto
* standard under the name "GBK", and we had better support it.
*
* So in what follows, when we write "GBK" we always mean "CP936".
*/
#include "gbkext1.h"
#include "gbkext2.h"
#include "gbkext_inv.h"
#include "cp936ext.h"
/*
 * Decode one GBK (CP936) character starting at s into *pwc.
 *
 * A lead byte outside 0x81..0xFE is rejected with RET_ILSEQ.  With a valid
 * lead byte but fewer than 2 input bytes available, RET_TOOFEW(0) is
 * returned.  On success the result is whatever the matching table lookup
 * returns, or 2 for the directly handled code points.
 */
static int
gbk_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
{
  const unsigned char lead = s[0];
  unsigned char trail;

  if (lead < 0x81 || lead > 0xfe)
    return RET_ILSEQ;
  if (n < 2)
    return RET_TOOFEW(0);
  trail = s[1];

  if (lead >= 0xa1 && lead <= 0xf7) {
    /* The two GB2312 code points that CP936 defines differently. */
    if (lead == 0xa1) {
      switch (trail) {
        case 0xa4:
          *pwc = 0x00b7;
          return 2;
        case 0xaa:
          *pwc = 0x2014;
          return 2;
        default:
          break;
      }
    }
    if (trail >= 0xa1 && trail <= 0xfe) {
      unsigned char pair[2];
      int rc;

      /* GB2312 lookup takes the bytes with 0x80 subtracted from each. */
      pair[0] = lead - 0x80;
      pair[1] = trail - 0x80;
      rc = gb2312_mbtowc(conv, pwc, pair, 2);
      if (rc != RET_ILSEQ)
        return rc;

      /* Not in GB2312: try the CP936 additions with the raw bytes. */
      pair[0] = lead;
      pair[1] = trail;
      rc = cp936ext_mbtowc(conv, pwc, pair, 2);
      if (rc != RET_ILSEQ)
        return rc;
    }
  }

  /* Lead byte is already known to be within 0x81..0xFE here. */
  if (lead <= 0xa0)
    return gbkext1_mbtowc(conv, pwc, s, 2);
  if (lead >= 0xa8)
    return gbkext2_mbtowc(conv, pwc, s, 2);

  /* 0xA2A1..0xA2AA map linearly onto U+2170..U+2179. */
  if (lead == 0xa2 && trail >= 0xa1 && trail <= 0xaa) {
    *pwc = 0x2170 + (trail - 0xa1);
    return 2;
  }
  return RET_ILSEQ;
}
/*
 * Encode the Unicode character wc as GBK (CP936) into r, which has room
 * for n bytes.
 *
 * Returns 2 when two bytes were written, RET_TOOSMALL when wc is encodable
 * but n < 2, and RET_ILSEQ when none of the tables or special cases below
 * has a mapping for wc.  Nothing is written to r unless 2 is returned.
 *
 * Each table helper is called with a 2-byte scratch buffer and must either
 * fail with RET_ILSEQ or produce exactly 2 bytes; any other result aborts.
 */
static int
gbk_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
{
  unsigned char buf[2];
  int ret;

  /* U+30FB and U+2015 are kept away from the GB2312 table: they would come
     out as 0xA1A4 / 0xA1AA, which CP936 assigns to U+00B7 / U+2014. */
  if (wc != 0x30fb && wc != 0x2015) {
    ret = gb2312_wctomb(conv,buf,wc,2);
    if (ret != RET_ILSEQ) {
      if (ret != 2) abort();
      if (n < 2)
        return RET_TOOSMALL;
      /* Add 0x80 to each GB2312 byte to obtain the GBK byte pair. */
      r[0] = buf[0]+0x80;
      r[1] = buf[1]+0x80;
      return 2;
    }
  }

  ret = gbkext_inv_wctomb(conv,buf,wc,2);
  if (ret != RET_ILSEQ) {
    if (ret != 2) abort();
    if (n < 2)
      return RET_TOOSMALL;
    r[0] = buf[0];
    r[1] = buf[1];
    return 2;
  }

  /* U+2170..U+2179 map linearly onto 0xA2A1..0xA2AA. */
  if (wc >= 0x2170 && wc <= 0x2179) {
    /* Check the output space here as on every other success path, so that
       r is never written past its end. */
    if (n < 2)
      return RET_TOOSMALL;
    r[0] = 0xa2;
    r[1] = 0xa1 + (wc-0x2170);
    return 2;
  }

  ret = cp936ext_wctomb(conv,buf,wc,2);
  if (ret != RET_ILSEQ) {
    if (ret != 2) abort();
    if (n < 2)
      return RET_TOOSMALL;
    r[0] = buf[0];
    r[1] = buf[1];
    return 2;
  }

  /* The two code points CP936 defines differently from GB2312. */
  if (wc == 0x00b7) {
    if (n < 2)
      return RET_TOOSMALL;
    r[0] = 0xa1;
    r[1] = 0xa4;
    return 2;
  }
  if (wc == 0x2014) {
    if (n < 2)
      return RET_TOOSMALL;
    r[0] = 0xa1;
    r[1] = 0xaa;
    return 2;
  }

  return RET_ILSEQ;
}