Edit

thodg/libiconv/lib/iconv_open1.h

Branch :

  • Show log

    Commit

  • Author : Bruno Haible
    Date : 2023-04-03 04:12:01
    Hash : 19b6af5e
    Message : Allow overriding the newline conversion for EBCDIC encodings. Reported by Mike Fulton <mikefultonpersonal@gmail.com> in <https://lists.gnu.org/archive/html/bug-gnu-libiconv/2023-04/msg00009.html>. * include/iconv.h.in (ICONV_SURFACE_NONE, ICONV_SURFACE_EBCDIC_ZOS_UNIX): New macros. (ICONV_GET_FROM_SURFACE, ICONV_SET_FROM_SURFACE, ICONV_GET_TO_SURFACE, ICONV_SET_TO_SURFACE): New macros. * lib/converters.h (struct conv_struct): Add the fields isurface, osurface. (swap_x15_x25): New macro. * lib/iconv.c (iconv_open, iconv_open_into): Add local variables from_surface, to_surface. (ALL_SURFACES): New macro. (iconvctl): Adjust ICONV_TRIVIALP implementation. Implement the ICONV_{GET,SET}_{FROM,TO}_SURFACE requests. * lib/iconv_open1.h: Parse a /ZOS_UNIX surface specifier. Set from_surface, to_surface. * lib/iconv_open2.h: Copy the values of from_surface, to_surface into the conversion descriptor. * lib/ebcdic*.h (*_mbtowc): Test the isurface. If requested, call swap_x15_x25 right after fetching an input byte. (*_wctomb): Test the osurface. If requested, call swap_x15_x25 right before storing an output byte. * man/iconvctl.3 (REQUEST VALUES): Document the ICONV_{GET,SET}_{FROM,TO}_SURFACE requests. * src/iconv.c (main): If ICONV_EBCDIC_ZOS_UNIX is set, set the from/to surfaces accordingly. * man/iconv.1 (ENVIRONMENT): New section. * tests/check-ebcdic: New file. * tests/Makefile.in (check): Invoke it. (SOURCE_FILES): Add it. * NEWS: Mention the new functionality.

  • lib/iconv_open1.h
  • /*
     * Copyright (C) 1999-2008, 2011, 2018, 2020, 2023 Free Software Foundation, Inc.
     * This file is part of the GNU LIBICONV Library.
     *
     * The GNU LIBICONV Library is free software; you can redistribute it
     * and/or modify it under the terms of the GNU Lesser General Public
     * License as published by the Free Software Foundation; either version 2.1
     * of the License, or (at your option) any later version.
     *
     * The GNU LIBICONV Library is distributed in the hope that it will be
     * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     * Lesser General Public License for more details.
     *
     * You should have received a copy of the GNU Lesser General Public
     * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
     * If not, see <https://www.gnu.org/licenses/>.
     */
    
    /* Part 1 of iconv_open.
       Input: const char* tocode, const char* fromcode.
       Output:
         unsigned int from_index;
         int from_wchar;
         unsigned int from_surface;
         unsigned int to_index;
         int to_wchar;
         unsigned int to_surface;
         int transliterate;
         int discard_ilseq;
       Jumps to 'invalid' in case of error.
     */
    {
      /* Upper-cased working copy of the encoding name being parsed.  It holds
         up to MAX_WORD_LENGTH+9+9 characters plus the terminating NUL; 9 is
         the length of each of "/TRANSLIT" and "/ZOS_UNIX". */
      char buf[MAX_WORD_LENGTH+9+9+1];
      /* Read cursor in tocode / fromcode. */
      const char* cp;
      /* Write cursor in buf during the copy; afterwards the end of the bare
         encoding name in buf. */
      char* bp;
      /* Result of the alias lookup. */
      const struct alias * ap;
      /* Number of bytes of buf still unused during the copy. */
      unsigned int count;
    
      /* Defaults for the outputs that are only changed when an option suffix
         is found in one of the two names. */
      from_surface = ICONV_SURFACE_NONE;
      to_surface = ICONV_SURFACE_NONE;
      transliterate = 0;
      discard_ilseq = 0;
    
      /* Before calling aliases_lookup, convert the input string to upper case,
       * and check whether it's entirely ASCII (we call gperf with option "-7"
       * to achieve a smaller table) and non-empty. If it's not entirely ASCII,
       * or if it's too long, it is not a valid encoding name.
       */
      /* Resolve tocode into to_index.  The loop body is re-entered through
         'continue' each time tocode is replaced by locale_charset(); to_wchar
         is reset to 0 only on first entry, so a value of 1 set below survives
         the re-parse. */
      for (to_wchar = 0;;) {
        /* Search tocode in the table. */
        /* Copy tocode into buf, mapping a-z to A-Z.  Bytes >= 0x80 and names
           that do not fit into buf together with their NUL are rejected.  On
           exit bp points at the terminating NUL in buf. */
        for (cp = tocode, bp = buf, count = MAX_WORD_LENGTH+9+9+1; ; cp++, bp++) {
          unsigned char c = (unsigned char) *cp;
          if (c >= 0x80)
            goto invalid;
          if (c >= 'a' && c <= 'z')
            c -= 'a'-'A';
          *bp = c;
          if (c == '\0')
            break;
          if (--count == 0)
            goto invalid;
        }
        /* Strip a trailing option group from the upper-cased name.  Although
           written as a loop, the body runs at most once: every path through
           it ends in 'break'.  Forms recognized at the end of the name:
             "/"          optionally followed by "/TRANSLIT" or "/IGNORE"
             "/ZOS_UNIX"  optionally followed by "/TRANSLIT" or "/IGNORE"
           A "/TRANSLIT" or "/IGNORE" that is not preceded by one of these
           two is left in buf and therefore takes part in the alias lookup.
           NOTE(review): since there is no second iteration, only one such
           group is removed per name -- confirm that this is intended. */
        for (;;) {
          /* End of the name after tentatively removing an option keyword. */
          char *sp = bp;
          int parsed_translit = 0;
          int parsed_ignore = 0;
          /* The strict '>' requires at least one character before the
             keyword, so that sp-1 below is inside buf. */
          if (sp-buf > 9 && memcmp(sp-9,"/TRANSLIT",9)==0) {
            sp = sp - 9;
            parsed_translit = 1;
          } else if (sp-buf > 7 && memcmp(sp-7,"/IGNORE",7)==0) {
            sp = sp - 7;
            parsed_ignore = 1;
          }
          if (sp > buf && memcmp(sp-1,"/",1) == 0) {
            /* A single "/" separator: drop it. */
            bp = sp - 1;
          } else if (sp-buf >= 9 && memcmp(sp-9,"/ZOS_UNIX",9)==0) {
            /* Surface specifier for the output side. */
            bp = sp - 9;
            to_surface = ICONV_SURFACE_EBCDIC_ZOS_UNIX;
          } else
            break;
          /* Truncate buf to the bare encoding name, and commit the option
             flags now that the whole group has matched. */
          *bp = '\0';
          if (parsed_translit)
            transliterate = 1;
          if (parsed_ignore)
            discard_ilseq = 1;
          break;
        }
        /* An empty name: substitute the result of locale_charset() and parse
           again. */
        if (buf[0] == '\0') {
          tocode = locale_charset();
          /* Avoid an endless loop that could occur when using an older version
             of localcharset.c. */
          if (tocode[0] == '\0')
            goto invalid;
          continue;
        }
        /* Look the name up, passing its length bp-buf; if that fails, try
           aliases2_lookup before giving up. */
        ap = aliases_lookup(buf,bp-buf);
        if (ap == NULL) {
          ap = aliases2_lookup(buf);
          if (ap == NULL)
            goto invalid;
        }
        /* The name maps to ei_local_char: substitute the result of
           locale_charset() and parse again. */
        if (ap->encoding_index == ei_local_char) {
          tocode = locale_charset();
          /* Avoid an endless loop that could occur when using an older version
             of localcharset.c. */
          if (tocode[0] == '\0')
            goto invalid;
          continue;
        }
        if (ap->encoding_index == ei_local_wchar_t) {
          /* On systems which define __STDC_ISO_10646__, wchar_t is Unicode.
             This is also the case on native Woe32 systems and Cygwin >= 1.7, where
             we know that it is UTF-16.  */
    #if (defined _WIN32 && !defined __CYGWIN__) || (defined __CYGWIN__ && CYGWIN_VERSION_DLL_MAJOR >= 1007)
          if (sizeof(wchar_t) == 4) {
            to_index = ei_ucs4internal;
            break;
          }
          if (sizeof(wchar_t) == 2) {
    # if WORDS_LITTLEENDIAN
            to_index = ei_utf16le;
    # else
            to_index = ei_utf16be;
    # endif
            break;
          }
    #elif __STDC_ISO_10646__
          if (sizeof(wchar_t) == 4) {
            to_index = ei_ucs4internal;
            break;
          }
          if (sizeof(wchar_t) == 2) {
            to_index = ei_ucs2internal;
            break;
          }
          if (sizeof(wchar_t) == 1) {
            to_index = ei_iso8859_1;
            break;
          }
    #endif
          /* No encoding was selected above.  With HAVE_MBRTOWC, set to_wchar
             and re-parse with the result of locale_charset() as tocode;
             otherwise the request fails. */
    #if HAVE_MBRTOWC
          to_wchar = 1;
          tocode = locale_charset();
          continue;
    #endif
          goto invalid;
        }
        /* Ordinary encoding: use the index found in the alias table. */
        to_index = ap->encoding_index;
        break;
      }
      /* Resolve fromcode into from_index, with the same steps as for tocode
         above.  from_wchar is reset to 0 only on first entry, so a value of 1
         set below survives the re-parse. */
      for (from_wchar = 0;;) {
        /* Search fromcode in the table. */
        /* Copy fromcode into buf, mapping a-z to A-Z.  Bytes >= 0x80 and names
           that do not fit into buf together with their NUL are rejected.  On
           exit bp points at the terminating NUL in buf. */
        for (cp = fromcode, bp = buf, count = MAX_WORD_LENGTH+9+9+1; ; cp++, bp++) {
          unsigned char c = (unsigned char) *cp;
          if (c >= 0x80)
            goto invalid;
          if (c >= 'a' && c <= 'z')
            c -= 'a'-'A';
          *bp = c;
          if (c == '\0')
            break;
          if (--count == 0)
            goto invalid;
        }
        /* Strip a trailing option group from the upper-cased name.  Although
           written as a loop, the body runs at most once: every path through
           it ends in 'break'.  Forms recognized at the end of the name:
             "/"          optionally followed by "/TRANSLIT" or "/IGNORE"
             "/ZOS_UNIX"  optionally followed by "/TRANSLIT" or "/IGNORE"
           A "/TRANSLIT" or "/IGNORE" that is not preceded by one of these
           two is left in buf and therefore takes part in the alias lookup.
           Note that the transliterate and discard_ilseq flags are set from
           suffixes of fromcode as well as of tocode. */
        for (;;) {
          /* End of the name after tentatively removing an option keyword. */
          char *sp = bp;
          int parsed_translit = 0;
          int parsed_ignore = 0;
          /* The strict '>' requires at least one character before the
             keyword, so that sp-1 below is inside buf. */
          if (sp-buf > 9 && memcmp(sp-9,"/TRANSLIT",9)==0) {
            sp = sp - 9;
            parsed_translit = 1;
          } else if (sp-buf > 7 && memcmp(sp-7,"/IGNORE",7)==0) {
            sp = sp - 7;
            parsed_ignore = 1;
          }
          if (sp > buf && memcmp(sp-1,"/",1) == 0) {
            /* A single "/" separator: drop it. */
            bp = sp - 1;
          } else if (sp-buf >= 9 && memcmp(sp-9,"/ZOS_UNIX",9)==0) {
            /* Surface specifier for the input side. */
            bp = sp - 9;
            from_surface = ICONV_SURFACE_EBCDIC_ZOS_UNIX;
          } else
            break;
          /* Truncate buf to the bare encoding name, and commit the option
             flags now that the whole group has matched. */
          *bp = '\0';
          if (parsed_translit)
            transliterate = 1;
          if (parsed_ignore)
            discard_ilseq = 1;
          break;
        }
        /* An empty name: substitute the result of locale_charset() and parse
           again. */
        if (buf[0] == '\0') {
          fromcode = locale_charset();
          /* Avoid an endless loop that could occur when using an older version
             of localcharset.c. */
          if (fromcode[0] == '\0')
            goto invalid;
          continue;
        }
        /* Look the name up, passing its length bp-buf; if that fails, try
           aliases2_lookup before giving up. */
        ap = aliases_lookup(buf,bp-buf);
        if (ap == NULL) {
          ap = aliases2_lookup(buf);
          if (ap == NULL)
            goto invalid;
        }
        /* The name maps to ei_local_char: substitute the result of
           locale_charset() and parse again. */
        if (ap->encoding_index == ei_local_char) {
          fromcode = locale_charset();
          /* Avoid an endless loop that could occur when using an older version
             of localcharset.c. */
          if (fromcode[0] == '\0')
            goto invalid;
          continue;
        }
        if (ap->encoding_index == ei_local_wchar_t) {
          /* On systems which define __STDC_ISO_10646__, wchar_t is Unicode.
             This is also the case on native Woe32 systems and Cygwin >= 1.7, where
             we know that it is UTF-16.  */
    #if (defined _WIN32 && !defined __CYGWIN__) || (defined __CYGWIN__ && CYGWIN_VERSION_DLL_MAJOR >= 1007)
          if (sizeof(wchar_t) == 4) {
            from_index = ei_ucs4internal;
            break;
          }
          if (sizeof(wchar_t) == 2) {
    # if WORDS_LITTLEENDIAN
            from_index = ei_utf16le;
    # else
            from_index = ei_utf16be;
    # endif
            break;
          }
    #elif __STDC_ISO_10646__
          if (sizeof(wchar_t) == 4) {
            from_index = ei_ucs4internal;
            break;
          }
          if (sizeof(wchar_t) == 2) {
            from_index = ei_ucs2internal;
            break;
          }
          if (sizeof(wchar_t) == 1) {
            from_index = ei_iso8859_1;
            break;
          }
    #endif
          /* No encoding was selected above.  With HAVE_WCRTOMB, set from_wchar
             and re-parse with the result of locale_charset() as fromcode;
             otherwise the request fails. */
    #if HAVE_WCRTOMB
          from_wchar = 1;
          fromcode = locale_charset();
          continue;
    #endif
          goto invalid;
        }
        /* Ordinary encoding: use the index found in the alias table. */
        from_index = ap->encoding_index;
        break;
      }
    }