kmx git

/* Copyright (C) 1999 Free Software Foundation, Inc.
   This file is part of the GNU ICONV Library.

   The GNU ICONV Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   The GNU ICONV Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the GNU ICONV Library; see the file COPYING.LIB.  If not,
   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

#include <libiconv.h>

#include <stdlib.h>
#include <string.h>
#include "config.h"

/* Define our own notion of wchar_t, as UCS-4, according to ISO-10646-1. */
#undef wchar_t
#define wchar_t  unsigned int

/* State used by a conversion. 0 denotes the initial state. */
typedef unsigned int state_t;

/* iconv_t is an opaque type. This is the real iconv_t type. */
typedef struct conv_struct * conv_t;

/*
 * Data type for conversion multibyte -> unicode
 */
struct mbtowc_funcs {
  int (*xxx_mbtowc) (conv_t conv, wchar_t *pwc, unsigned char const *s, int n);
  /*
   * int xxx_mbtowc (conv_t conv, wchar_t *pwc, unsigned char const *s, int n)
   * converts the byte sequence starting at s to a wide character. Up to n bytes
   * are available at s. n is >= 1.
   * Result is number of bytes consumed (if a wide character was read),
   * or 0 if invalid, or -1 if n too small, or -1-(number of bytes consumed)
   * if only a shift sequence was read.
   */
};

/*
 * Data type for conversion unicode -> multibyte
 */
struct wctomb_funcs {
  int (*xxx_wctomb) (conv_t conv, unsigned char *r, wchar_t wc, int n);
  /*
   * int xxx_wctomb (conv_t conv, unsigned char *r, wchar_t wc, int n)
   * converts the wide character wc to the character set xxx, and stores the
   * result beginning at r. Up to n bytes may be written at r. n is >= 1.
   * Result is number of bytes written, or 0 if invalid, or -1 if n too small.
   */
  int (*xxx_reset) (conv_t conv, unsigned char *r, int n);
  /*
   * int xxx_reset (conv_t conv, unsigned char *r, int n)
   * stores a shift sequences returning to the initial state beginning at r.
   * Up to n bytes may be written at r. n is >= 0.
   * Result is number of bytes written, or -1 if n too small.
   */
};

/* Return code if invalid. (xxx_mbtowc, xxx_wctomb) */
#define RET_ILSEQ      0
/* Return code if only a shift sequence of n bytes was read. (xxx_mbtowc) */
#define RET_TOOFEW(n)  (-1-(n))
/* Return code if output buffer is too small. (xxx_wctomb, xxx_reset) */
#define RET_TOOSMALL   -1

/*
 * Contents of a conversion descriptor.
 */
struct conv_struct {
  /* Input (conversion multibyte -> unicode) */
  struct mbtowc_funcs ifuncs;
  state_t istate;
  /* Output (conversion unicode -> multibyte) */
  struct wctomb_funcs ofuncs;
  state_t ostate;
  /* Operation flags */
  int transliterate;
};

/*
 * Include all the converters.
 */

#include "ascii.h"

/* General multi-byte encodings */
#include "utf8.h"
#include "ucs2.h"
#include "ucs4.h"
#include "utf16.h"
#include "utf7.h"
#include "ucs2internal.h"
#include "ucs2swapped.h"
#include "ucs4internal.h"
#include "ucs4swapped.h"
#include "java.h"

/* 8-bit encodings */
#include "iso8859_1.h"
#include "iso8859_2.h"
#include "iso8859_3.h"
#include "iso8859_4.h"
#include "iso8859_5.h"
#include "iso8859_6.h"
#include "iso8859_7.h"
#include "iso8859_8.h"
#include "iso8859_9.h"
#include "iso8859_10.h"
#include "iso8859_13.h"
#include "iso8859_14.h"
#include "iso8859_15.h"
#include "koi8_r.h"
#include "koi8_u.h"
#include "koi8_ru.h"
#include "cp1250.h"
#include "cp1251.h"
#include "cp1252.h"
#include "cp1253.h"
#include "cp1254.h"
#include "cp1255.h"
#include "cp1256.h"
#include "cp1257.h"
#include "cp1258.h"
#include "cp850.h"
#include "cp866.h"
#include "mac_roman.h"
#include "mac_centraleurope.h"
#include "mac_iceland.h"
#include "mac_croatian.h"
#include "mac_romania.h"
#include "mac_cyrillic.h"
#include "mac_ukraine.h"
#include "mac_greek.h"
#include "mac_turkish.h"
#include "mac_hebrew.h"
#include "mac_arabic.h"
#include "mac_thai.h"
#include "hp_roman8.h"
#include "nextstep.h"
#include "armscii_8.h"
#include "georgian_academy.h"
#include "georgian_ps.h"
#include "mulelao.h"
#include "cp1133.h"
#include "tis620.h"
#include "cp874.h"
#include "viscii.h"
#include "tcvn.h"

/* CJK character sets [CCS = coded character set] [CJKV.INF chapter 3] */

typedef struct {
  unsigned short indx; /* index into big table */
  unsigned short used; /* bitmask of used entries */
} Summary16;

#include "jisx0201.h"
#include "jisx0208.h"
#include "jisx0212.h"

#include "gb2312.h"
/*#include "gb12345.h"*/
#include "gbk.h"
#include "cns11643.h"
#include "big5.h"

#include "ksc5601.h"
#include "johab_hangul.h"

/* CJK encodings [CES = character encoding scheme] [CJKV.INF chapter 4] */

#include "euc_jp.h"
#include "sjis.h"
#include "cp932.h"
#include "iso2022_jp.h"
#include "iso2022_jp1.h"
#include "iso2022_jp2.h"

#include "euc_cn.h"
#include "ces_gbk.h"
#include "iso2022_cn.h"
#include "iso2022_cnext.h"
#include "hz.h"
#include "euc_tw.h"
#include "ces_big5.h"
#include "cp950.h"

#include "euc_kr.h"
#include "johab.h"
#include "iso2022_kr.h"

/*
 * Table of all supported encodings.
 */
struct encoding {
  struct mbtowc_funcs ifuncs; /* conversion multibyte -> unicode */
  struct wctomb_funcs ofuncs; /* conversion unicode -> multibyte */
};
enum {
#define DEFENCODING(xxx_names,xxx,xxx_ifuncs,xxx_ofuncs1,xxx_ofuncs2) \
  ei_##xxx ,
#include "encodings.def"
#undef DEFENCODING
ei_for_broken_compilers_that_dont_like_trailing_commas
};
static struct encoding const all_encodings[] = {
#define DEFENCODING(xxx_names,xxx,xxx_ifuncs,xxx_ofuncs1,xxx_ofuncs2) \
  { xxx_ifuncs, xxx_ofuncs1,xxx_ofuncs2 },
#include "encodings.def"
#undef DEFENCODING
};

/*
 * Alias lookup function.
 * Defines
 *   struct alias { const char* name; unsigned int encoding_index; };
 *   const struct alias * aliases_lookup (const char *str, unsigned int len);
 *   #define MAX_WORD_LENGTH ...
 */
#include "aliases.h"

#if 0
/* Like !strcasecmp, except that the both strings can be assumed to be ASCII
   and the first string can be assumed to be in uppercase. */
static int strequal (const char* str1, const char* str2)
{
  unsigned char c1;
  unsigned char c2;
  for (;;) {
    c1 = * (unsigned char *) str1++;
    c2 = * (unsigned char *) str2++;
    if (c1 == 0)
      break;
    if (c2 >= 'a' && c2 <= 'z')
      c2 -= 'a'-'A';
    if (c1 != c2)
      break;
  }
  return (c1 == c2);
}
#endif

iconv_t iconv_open (const char* tocode, const char* fromcode)
{
  struct conv_struct * cd = (struct conv_struct *) malloc(sizeof(struct conv_struct));
  char buf[MAX_WORD_LENGTH+1];
  const char* cp;
  char* bp;
  const struct alias * ap;
  unsigned int count;

  if (cd == NULL) {
    errno = ENOMEM;
    return (iconv_t)(-1);
  }
  /* Before calling aliases_lookup, convert the input string to upper case,
   * and check whether it's entirely ASCII (we call gperf with option "-7"
   * to achieve a smaller table) and non-empty. If it's not entirely ASCII,
   * or if it's too long, it is not a valid encoding name.
   */
  /* Search tocode in the table. */
  for (cp = tocode, bp = buf, count = MAX_WORD_LENGTH+1; ; cp++, bp++) {
    unsigned char c = * (unsigned char *) cp;
    if (c >= 0x80)
      goto invalid;
    if (c >= 'a' && c <= 'z')
      c -= 'a'-'A';
    *bp = c;
    if (c == '\0')
      break;
    if (--count == 0)
      goto invalid;
  }
  ap = aliases_lookup(buf,bp-buf);
  if (ap == NULL)
    goto invalid;
  cd->ofuncs = all_encodings[ap->encoding_index].ofuncs;
  /* Search fromcode in the table. */
  for (cp = fromcode, bp = buf, count = MAX_WORD_LENGTH+1; ; cp++, bp++) {
    unsigned char c = * (unsigned char *) cp;
    if (c >= 0x80)
      goto invalid;
    if (c >= 'a' && c <= 'z')
      c -= 'a'-'A';
    *bp = c;
    if (c == '\0')
      break;
    if (--count == 0)
      goto invalid;
  }
  ap = aliases_lookup(buf,bp-buf);
  if (ap == NULL)
    goto invalid;
  cd->ifuncs = all_encodings[ap->encoding_index].ifuncs;
  /* Initialize the states. */
  memset(&cd->istate,'\0',sizeof(state_t));
  memset(&cd->ostate,'\0',sizeof(state_t));
  /* Initialize the operation flags. */
  cd->transliterate = 1;
  /* Done. */
  return (iconv_t)cd;
invalid:
  errno = EINVAL;
  return (iconv_t)(-1);
}

size_t iconv (iconv_t icd,
              ICONV_CONST char* * inbuf, size_t *inbytesleft,
              char* * outbuf, size_t *outbytesleft)
{
  conv_t cd = (conv_t) icd;
  if (inbuf == NULL || *inbuf == NULL) {
    if (outbuf == NULL || *outbuf == NULL) {
      /* Reset the states. */
      memset(&cd->istate,'\0',sizeof(state_t));
      memset(&cd->ostate,'\0',sizeof(state_t));
      return 0;
    } else {
      if (cd->ofuncs.xxx_reset) {
        int outcount = cd->ofuncs.xxx_reset(cd,*outbuf,*outbytesleft);
        if (outcount < 0) {
          errno = E2BIG;
          return -1;
        }
        *outbuf += outcount; *outbytesleft -= outcount;
      }
      memset(&cd->istate,'\0',sizeof(state_t));
      memset(&cd->ostate,'\0',sizeof(state_t));
      return 0;
    }
  } else {
    size_t result = 0;
    const unsigned char* inptr = (const unsigned char*) *inbuf;
    size_t inleft = *inbytesleft;
    unsigned char* outptr = (unsigned char*) *outbuf;
    size_t outleft = *outbytesleft;
    while (inleft > 0) {
      wchar_t wc;
      int incount;
      int outcount;
      incount = cd->ifuncs.xxx_mbtowc(cd,&wc,inptr,inleft);
      if (incount <= 0) {
        if (incount == 0) {
          /* Case 1: invalid input */
          errno = EILSEQ;
          result = -1;
          break;
        }
        if (incount == -1) {
          /* Case 2: not enough bytes available to detect anything */
          errno = EINVAL;
          result = -1;
          break;
        }
        /* Case 3: k bytes read, but only a shift sequence */
        incount = -1-incount;
      } else {
        /* Case 4: k bytes read, making up a wide character */
        outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
        if (outcount == 0) {
          /* This is not optimal. Transliteration would be better. */
          outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
          if (outcount == 0) {
            errno = EILSEQ;
            result = -1;
            break;
          }
        }
        if (outcount < 0) {
          errno = E2BIG;
          result = -1;
          break;
        }
        if (!(outcount <= outleft)) abort();
        outptr += outcount; outleft -= outcount;
      }
      if (!(incount <= inleft)) abort();
      inptr += incount; inleft -= incount;
    }
    *inbuf = (ICONV_CONST char*) inptr;
    *inbytesleft = inleft;
    *outbuf = (char*) outptr;
    *outbytesleft = outleft;
    return result;
  }
}

int iconv_close (iconv_t icd)
{
  conv_t cd = (conv_t) icd;
  free(cd);
  return 0;
}
thodg/libiconv/src/iconv.c

Commit

src/iconv.c