Edit

kc3-lang/libiconv/tools/8bit_tab_to_h.c

Branch :

  • Show log

    Commit

  • Author : Bruno Haible
    Date : 2022-01-23 23:37:30
    Hash : 68ac8a9f
    Message : New EBCDIC encodings. Reported by Ulrich Schwab and Calvin Buckley via Jack Woehr. * NOTES: Mention how to enable EBCDIC encodings. * tests/IBM-*.TXT: New files. * tools/8bit_tab_to_h.c (main): Emit copyright header with year 2022. * tools/Makefile: Add rules for generating ebcdic*.h. * lib/ebcdic*.h: New files, automatically generated by tools/Makefile. * lib/ebcdic838.h: Tweak reverse mapping manually. * lib/ebcdic1160.h: Likewise. * lib/converters.h: Include all ebcdic*.h. * lib/encodings_zos.def: New file. * lib/genaliases2.c: Include encodings_zos.def. * lib/genflags.c: Likewise. * Makefile.devel (lib/aliases_zos.h lib/canonical_zos.h): New rule. (lib/flags.h, totally-clean): Update. * lib/aliases2.h: Include aliases_zos.h. * lib/iconv.c (USE_ZOS): New macro. Include encodings_zos.def, canonical_zos.h. * README, man/iconv_open.3: Document the IBM-* encodings. * tests/Makefile.in (check-extra-yes): Also test the EBCDIC encodings.

  • tools/8bit_tab_to_h.c
  • /* Copyright (C) 1999-2002, 2011-2012, 2016, 2018, 2022 Free Software Foundation, Inc.
       This file is part of the GNU LIBICONV Tools.
    
       This program is free software: you can redistribute it and/or modify
       it under the terms of the GNU General Public License as published by
       the Free Software Foundation; either version 3 of the License, or
       (at your option) any later version.
    
       This program is distributed in the hope that it will be useful,
       but WITHOUT ANY WARRANTY; without even the implied warranty of
       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
       GNU General Public License for more details.
    
       You should have received a copy of the GNU General Public License
       along with this program; if not, see <https://www.gnu.org/licenses/>.  */
    
    /*
     * Generates an 8-bit character set table from a .TXT table as found on
     * ftp.unicode.org or from a table containing the 256 Unicode values as
     * hexadecimal integers.
     * Examples:
     *
     *   ./8bit_tab_to_h ISO-8859-1 iso8859_1 < tab8859_1
     *   ./8bit_tab_to_h ISO-8859-2 iso8859_2 < tab8859_2
     *   ./8bit_tab_to_h ISO-8859-3 iso8859_3 < tab8859_3
     *   ./8bit_tab_to_h ISO-8859-4 iso8859_4 < tab8859_4
     *   ./8bit_tab_to_h ISO-8859-5 iso8859_5 < tab8859_5
     *   ./8bit_tab_to_h ISO-8859-6 iso8859_6 < tab8859_6
     *   ./8bit_tab_to_h ISO-8859-7 iso8859_7 < tab8859_7
     *   ./8bit_tab_to_h ISO-8859-8 iso8859_8 < tab8859_8
     *   ./8bit_tab_to_h ISO-8859-9 iso8859_9 < tab8859_9
     *   ./8bit_tab_to_h ISO-8859-10 iso8859_10 < tab8859_10
     *   ./8bit_tab_to_h ISO-8859-14 iso8859_14 < tab8859_14
     *   ./8bit_tab_to_h ISO-8859-15 iso8859_15 < tab8859_15
     *   ./8bit_tab_to_h JISX0201.1976-0 jisx0201 < jis0201
     *   ./8bit_tab_to_h TIS620.2533-1 tis620 < tabtis620
     *   ./8bit_tab_to_h KOI8-R koi8_r < tabkoi8_r
     *   ./8bit_tab_to_h KOI8-U koi8_u < tabkoi8_u
     *   ./8bit_tab_to_h ARMSCII-8 armscii_8 < tabarmscii_8
     *   ./8bit_tab_to_h CP1133 cp1133 < tabibm_cp1133
     *   ./8bit_tab_to_h MULELAO-1 mulelao < tabmulelao_1
     *   ./8bit_tab_to_h VISCII1.1-1 viscii1 < tabviscii
     *   ./8bit_tab_to_h TCVN-5712 tcvn < tabtcvn
     *   ./8bit_tab_to_h GEORGIAN-ACADEMY georgian_ac < tabgeorgian_academy
     *   ./8bit_tab_to_h GEORGIAN-PS georgian_ps < tabgeorgian_ps
     *
     *   ./8bit_tab_to_h ISO-8859-1 iso8859_1 < 8859-1.TXT
     *   ./8bit_tab_to_h ISO-8859-2 iso8859_2 < 8859-2.TXT
     *   ./8bit_tab_to_h ISO-8859-3 iso8859_3 < 8859-3.TXT
     *   ./8bit_tab_to_h ISO-8859-4 iso8859_4 < 8859-4.TXT
     *   ./8bit_tab_to_h ISO-8859-5 iso8859_5 < 8859-5.TXT
     *   ./8bit_tab_to_h ISO-8859-6 iso8859_6 < 8859-6.TXT
     *   ./8bit_tab_to_h ISO-8859-7 iso8859_7 < 8859-7.TXT
     *   ./8bit_tab_to_h ISO-8859-8 iso8859_8 < 8859-8.TXT
     *   ./8bit_tab_to_h ISO-8859-9 iso8859_9 < 8859-9.TXT
     *   ./8bit_tab_to_h ISO-8859-10 iso8859_10 < 8859-10.TXT
     *   ./8bit_tab_to_h ISO-8859-14 iso8859_14 < 8859-14.TXT
     *   ./8bit_tab_to_h ISO-8859-15 iso8859_15 < 8859-15.TXT
     *   ./8bit_tab_to_h JISX0201.1976-0 jisx0201 < JIS0201.TXT
     *   ./8bit_tab_to_h KOI8-R koi8_r < KOI8-R.TXT
     *
     *   ./8bit_tab_to_h 'CP50221 JISX0208 extensions' cp50221_0208_ext < CP50221-0208-EXT.TXT
     *   ./8bit_tab_to_h 'CP50221 JISX0212 extensions' cp50221_0212_ext < CP50221-0212-EXT.TXT
     */
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdbool.h>
    #include <string.h>
    
    /*
     * Usage:
     *   8bit_tab_to_h charsetname c_charsetname [filename [directory]] < table
     *
     *   argv[1]  charset name; only emitted in a comment of the generated file.
     *   argv[2]  identifier prefix of the generated tables and functions.
     *   argv[3]  output file name; defaults to c_charsetname followed by ".h".
     *   argv[4]  string prepended verbatim to the output file name; defaults
     *            to "" (so it needs a trailing '/' to act as a directory).
     *
     * Standard input is either a unicode.org style .TXT file (recognized by a
     * leading '#') or a list of exactly 256 hexadecimal Unicode values.
     *
     * The generated file contains the byte -> Unicode tables with the function
     * <c_charsetname>_mbtowc, and the Unicode -> byte tables with the function
     * <c_charsetname>_wctomb.
     *
     * Exits with status 0 on success and 1 on a usage, input, allocation or
     * I/O error.  Calls abort() if an internal consistency check fails.
     */
    int main (int argc, char *argv[])
    {
      const char* charsetname;
      const char* c_charsetname;
      const char* filename;
      const char* directory;
      /* charset2uni[b] is the Unicode value of byte b; 0xfffd means unmapped. */
      int charset2uni[0x100];
    
      if (argc != 3 && argc != 4 && argc != 5) {
        fprintf(stderr, "Usage: 8bit_tab_to_h charsetname c_charsetname [filename [directory]] < table\n");
        exit(1);
      }
      charsetname = argv[1];
      c_charsetname = argv[2];
      if (argc > 3) {
        filename = argv[3];
      } else {
        char* s = (char*) malloc(strlen(c_charsetname)+strlen(".h")+1);
        if (s == NULL)
          exit(1);
        strcpy(s,c_charsetname); strcat(s,".h");
        filename = s;
      }
      directory = (argc > 4 ? argv[4] : "");
    
      fprintf(stderr, "Creating %s%s\n", directory, filename);
    
      /* Read the input table. */
      {
        int i, c;
        /* Peek at the first character to find out the input format. */
        c = getc(stdin);
        ungetc(c,stdin);
        if (c == '#') {
          /* Read a unicode.org style .TXT file. */
          for (i = 0; i < 0x100; i++)
            charset2uni[i] = 0xfffd;
          for (;;) {
            c = getc(stdin);
            if (c == EOF)
              break;
            if (c == '\n' || c == ' ' || c == '\t')
              continue;
            if (c == '#') {
              /* Skip the comment up to the end of the line. */
              do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
              continue;
            }
            ungetc(c,stdin);
            /* First column: the byte value. */
            if (scanf("0x%x", &i) != 1 || !(i >= 0 && i < 0x100))
              exit(1);
            do { c = getc(stdin); } while (c == ' ' || c == '\t');
            if (c != EOF)
              ungetc(c,stdin);
            /* A line without a second column leaves the byte unmapped. */
            if (c == '\n' || c == '#')
              continue;
            /* Second column: the Unicode value. */
            if (scanf("0x%x", &charset2uni[i]) != 1)
              exit(1);
            /* The value is used below as an index into a 0x10000 element
               array and is emitted as an 'unsigned short'.  */
            if (charset2uni[i] < 0 || charset2uni[i] > 0xffff) {
              fprintf(stderr, "Unicode value 0x%x of byte 0x%02x is not in the range 0x0000..0xffff\n",
                      (unsigned int) charset2uni[i], (unsigned int) i);
              exit(1);
            }
          }
        } else {
          /* Read a table of hexadecimal Unicode values. */
          for (i = 0; i < 0x100; i++) {
            if (scanf("%x", &charset2uni[i]) != 1)
              exit(1);
            if (charset2uni[i] < 0 || charset2uni[i] == 0xffff)
              charset2uni[i] = 0xfffd;
            else if (charset2uni[i] > 0xffff) {
              /* Same reason as above: would overflow the reverse table. */
              fprintf(stderr, "Unicode value 0x%x of byte 0x%02x is not in the range 0x0000..0xffff\n",
                      (unsigned int) charset2uni[i], (unsigned int) i);
              exit(1);
            }
          }
          /* There must be nothing after the 256 values. */
          if (scanf("%x", &i) != EOF)
            exit(1);
        }
      }
    
      /* Write the output file. */
      {
        FILE* f;
    
        {
          char* fname = malloc(strlen(directory)+strlen(filename)+1);
          if (fname == NULL)
            exit(1);
          strcpy(fname,directory); strcat(fname,filename);
          f = fopen(fname,"w");
          if (f == NULL) {
            perror(fname);
            exit(1);
          }
          free(fname);
        }
    
        fprintf(f, "/*\n");
        fprintf(f, " * Copyright (C) 1999-2022 Free Software Foundation, Inc.\n");
        fprintf(f, " * This file is part of the GNU LIBICONV Library.\n");
        fprintf(f, " *\n");
        fprintf(f, " * The GNU LIBICONV Library is free software; you can redistribute it\n");
        fprintf(f, " * and/or modify it under the terms of the GNU Lesser General Public\n");
        fprintf(f, " * License as published by the Free Software Foundation; either version 2\n");
        fprintf(f, " * of the License, or (at your option) any later version.\n");
        fprintf(f, " *\n");
        fprintf(f, " * The GNU LIBICONV Library is distributed in the hope that it will be\n");
        fprintf(f, " * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
        fprintf(f, " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n");
        fprintf(f, " * Lesser General Public License for more details.\n");
        fprintf(f, " *\n");
        fprintf(f, " * You should have received a copy of the GNU Lesser General Public\n");
        fprintf(f, " * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n");
        fprintf(f, " * If not, see <https://www.gnu.org/licenses/>.\n");
        fprintf(f, " */\n");
        fprintf(f, "\n");
        fprintf(f, "/*\n");
        fprintf(f, " * %s\n", charsetname);
        fprintf(f, " */\n");
        fprintf(f, "\n");
    
        /* Part 1: the byte -> Unicode tables and the mbtowc function. */
        {
          int i, i1, i2, i3;
          /* The 256 bytes are handled as 16 rows of 16 bytes.  line[i1] is
             -2 if row i1 is entirely unmapped, -1 if every byte of the row
             maps to itself, otherwise the index of the table containing it.  */
          int line[16];
          int tableno;
          /* Each table covers the rows minline..maxline. */
          struct { int minline; int maxline; } tables[16];
          bool some_invalid;
          bool final_ret_reached;
    
          /* Classify the rows. */
          for (i1 = 0; i1 < 16; i1++) {
            bool all_invalid = true;
            bool all_identity = true;
            for (i2 = 0; i2 < 16; i2++) {
              i = 16*i1+i2;
              if (charset2uni[i] != 0xfffd)
                all_invalid = false;
              if (charset2uni[i] != i)
                all_identity = false;
            }
            if (all_invalid)
              line[i1] = -2;
            else if (all_identity)
              line[i1] = -1;
            else
              line[i1] = 0;
          }
          /* Group consecutive rows that need a table into one table. */
          tableno = 0;
          for (i1 = 0; i1 < 16; i1++) {
            if (line[i1] >= 0) {
              if (i1 > 0 && tableno > 0 && line[i1-1] == tableno-1) {
                line[i1] = tableno-1;
                tables[tableno-1].maxline = i1;
              } else {
                tableno++;
                line[i1] = tableno-1;
                tables[tableno-1].minline = tables[tableno-1].maxline = i1;
              }
            }
          }
          some_invalid = false;
          for (i = 0; i < 0x100; i++)
            if (charset2uni[i] == 0xfffd)
              some_invalid = true;
          /* Emit the tables; they get a numeric suffix only if there are
             several of them.  */
          if (tableno > 0) {
            int t;
            for (t = 0; t < tableno; t++) {
              fprintf(f, "static const unsigned short %s_2uni", c_charsetname);
              if (tableno > 1)
                fprintf(f, "_%d", t+1);
              fprintf(f, "[%d] = {\n", 16*(tables[t].maxline-tables[t].minline+1));
              for (i1 = tables[t].minline; i1 <= tables[t].maxline; i1++) {
                fprintf(f, "  /* 0x%02x */\n", 16*i1);
                for (i2 = 0; i2 < 2; i2++) {
                  fprintf(f, " ");
                  for (i3 = 0; i3 < 8; i3++) {
                    i = 16*i1+8*i2+i3;
                    fprintf(f, " 0x%04x,", charset2uni[i]);
                  }
                  fprintf(f, "\n");
                }
              }
              fprintf(f, "};\n");
            }
            fprintf(f, "\n");
          }
          /* Emit the mbtowc function: one branch per run of rows [i1,i2)
             that share the same line[] value.  */
          final_ret_reached = false;
          fprintf(f, "static int\n%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)\n", c_charsetname);
          fprintf(f, "{\n");
          fprintf(f, "  unsigned char c = *s;\n");
          if (some_invalid) {
            /* Some bytes are unmapped: every branch returns explicitly, and
               unmapped bytes fall through to the final RET_ILSEQ.  */
            for (i1 = 0; i1 < 16;) {
              int t = line[i1];
              const char* indent;
              bool single_range;
              /* Find the end i2 of the run starting at i1. */
              for (i2 = i1; i2 < 16 && line[i2] == t; i2++)
                ;
              /* A single run covering all rows needs no if/else at all. */
              single_range = (i1 == 0 && i2 == 16);
              indent = (single_range ? "  " : "    ");
              if (i1 == 0) {
                if (i2 != 16)
                  fprintf(f, "  if (c < 0x%02x) {\n", 16*i2);
              } else {
                if (i2 == 16) {
                  fprintf(f, "  else {\n");
                } else {
                  fprintf(f, "  else if (c < 0x%02x) {\n", 16*i2);
                }
              }
              if (t == -2) {
                /* Entirely unmapped: empty branch. */
                final_ret_reached = true;
              } else if (t == -1) {
                fprintf(f, "%s*pwc = (ucs4_t) c;\n", indent);
                fprintf(f, "%sreturn 1;\n", indent);
              } else {
                /* Does this particular run contain unmapped bytes? */
                bool range_has_invalid = false;
                fprintf(f, "%s", indent);
                for (i = 16*i1; i < 16*i2; i++)
                  if (charset2uni[i] == 0xfffd)
                    range_has_invalid = true;
                if (range_has_invalid)
                  fprintf(f, "unsigned short wc = ");
                else
                  fprintf(f, "*pwc = (ucs4_t) ");
                fprintf(f, "%s_2uni", c_charsetname);
                if (tableno > 1)
                  fprintf(f, "_%d", t+1);
                fprintf(f, "[c");
                if (tables[t].minline > 0)
                  fprintf(f, "-0x%02x", 16*tables[t].minline);
                fprintf(f, "];\n");
                if (range_has_invalid) {
                  fprintf(f, "%sif (wc != 0xfffd) {\n", indent);
                  fprintf(f, "%s  *pwc = (ucs4_t) wc;\n", indent);
                  fprintf(f, "%s  return 1;\n", indent);
                  fprintf(f, "%s}\n", indent);
                  final_ret_reached = true;
                } else {
                  fprintf(f, "%sreturn 1;\n", indent);
                }
              }
              if (!single_range)
                fprintf(f, "  }\n");
              i1 = i2;
            }
            if (final_ret_reached)
              fprintf(f, "  return RET_ILSEQ;\n");
          } else {
            /* Every byte is mapped: each branch is a single assignment and
               there is one common 'return 1'.  */
            for (i1 = 0; i1 < 16;) {
              int t = line[i1];
              for (i2 = i1; i2 < 16 && line[i2] == t; i2++)
                ;
              if (i1 == 0) {
                if (i2 == 16) {
                  fprintf(f, "  ");
                } else {
                  fprintf(f, "  if (c < 0x%02x)\n    ", 16*i2);
                }
              } else {
                if (i2 == 16) {
                  fprintf(f, "  else\n    ");
                } else {
                  fprintf(f, "  else if (c < 0x%02x)\n    ", 16*i2);
                }
              }
              if (t == -1)
                fprintf(f, "*pwc = (ucs4_t) c;\n");
              else {
                fprintf(f, "*pwc = (ucs4_t) %s_2uni", c_charsetname);
                if (tableno > 1)
                  fprintf(f, "_%d", t+1);
                fprintf(f, "[c");
                if (tables[t].minline > 0)
                  fprintf(f, "-0x%02x", 16*tables[t].minline);
                fprintf(f, "];\n");
              }
              i1 = i2;
            }
            fprintf(f, "  return 1;\n");
          }
          fprintf(f, "}\n");
    
        }
    
        fprintf(f, "\n");
    
        /* Part 2: the Unicode -> byte tables and the wctomb function. */
        {
          /* Upper bound for a page suffix: "%02x_%d" with a page number below
             0x100 and a counter that cannot exceed the number of tables.  */
          enum { SUFFIX_SIZE = 16 };
          /* uni2charset[u] is the byte for Unicode value u; 0 means unmapped
             (the ambiguity for u == 0 is resolved when emitting wctomb).  */
          int uni2charset[0x10000];
          /* The 0x10000 Unicode values are handled as 0x2000 lines of 8 values
             (32 lines per 256-value page).  line[j1] is -2 if line j1 has no
             mapping, -1 if it is an identity mapping, otherwise the index of
             the table containing it.  */
          int line[0x2000];
          int tableno;
          /* usecount is the number of nonzero entries in the table; suffix is
             the page name suffix, or NULL for tables with usecount <= 1.  */
          struct { int minline; int maxline; int usecount; const char* suffix; } tables[0x2000];
          bool need_c;
          bool fix_0000;
          int i, j, p, j1, j2, t;
    
          /* Build the reverse mapping.  If several bytes map to the same
             Unicode value, the highest byte wins.  */
          for (j = 0; j < 0x10000; j++)
            uni2charset[j] = 0;
          for (i = 0; i < 0x100; i++) {
            j = charset2uni[i];
            if (j != 0xfffd)
              uni2charset[j] = i;
          }
          /* Classify the lines. */
          for (j1 = 0; j1 < 0x2000; j1++) {
            bool all_invalid = true;
            bool all_identity = true;
            for (j2 = 0; j2 < 8; j2++) {
              j = 8*j1+j2;
              if (uni2charset[j] != 0)
                all_invalid = false;
              if (uni2charset[j] != j)
                all_identity = false;
            }
            if (all_invalid)
              line[j1] = -2;
            else if (all_identity)
              line[j1] = -1;
            else
              line[j1] = 0;
          }
          /* Group lines into tables.  A line joins the most recent table if
             the previous line belongs to it, or if it lies in the same
             256-value page and at most 8 lines after the table's end.  */
          tableno = 0;
          for (j1 = 0; j1 < 0x2000; j1++) {
            if (line[j1] >= 0) {
              if (tableno > 0
                  && ((j1 > 0 && line[j1-1] == tableno-1)
                      || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
                          && j1 - tables[tableno-1].maxline <= 8))) {
                line[j1] = tableno-1;
                tables[tableno-1].maxline = j1;
              } else {
                tableno++;
                line[j1] = tableno-1;
                tables[tableno-1].minline = tables[tableno-1].maxline = j1;
              }
            }
          }
          /* Count the mapped values of each table. */
          for (t = 0; t < tableno; t++) {
            tables[t].usecount = 0;
            j1 = 8*tables[t].minline;
            j2 = 8*(tables[t].maxline+1);
            for (j = j1; j < j2; j++)
              if (uni2charset[j] != 0)
                tables[t].usecount++;
          }
          /* Name the tables after the page they start in.  Further tables
             starting in the same page get an additional "_<counter>"; the
             counter is deliberately not reset between pages, to keep the
             generated names unchanged.  */
          for (t = 0, p = -1, i = 0; t < tableno; t++) {
            if (tables[t].usecount > 1) {
              char* s = (char*) malloc(SUFFIX_SIZE);
              if (s == NULL)
                exit(1);
              if (p == tables[t].minline >> 5) {
                snprintf(s, SUFFIX_SIZE, "%02x_%d", p, ++i);
              } else {
                p = tables[t].minline >> 5;
                snprintf(s, SUFFIX_SIZE, "%02x", p);
              }
              tables[t].suffix = s;
            } else
              tables[t].suffix = NULL;
          }
          /* Emit the tables that have more than one mapped value. */
          {
            p = -1;
            for (t = 0; t < tableno; t++)
              if (tables[t].usecount > 1) {
                p = 0;
                fprintf(f, "static const unsigned char %s_page%s[%d] = {\n", c_charsetname, tables[t].suffix, 8*(tables[t].maxline-tables[t].minline+1));
                for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
                  if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
                    fprintf(f, "  /* 0x%04x */\n", 8*j1);
                  fprintf(f, " ");
                  for (j2 = 0; j2 < 8; j2++) {
                    j = 8*j1+j2;
                    fprintf(f, " 0x%02x,", uni2charset[j]);
                  }
                  fprintf(f, " /* 0x%02x-0x%02x */\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
                }
                fprintf(f, "};\n");
              }
            /* A separating blank line, only if some table was emitted. */
            if (p >= 0)
              fprintf(f, "\n");
          }
          /* The local variable 'c' is needed in the generated function unless
             the only branch is the identity mapping starting at 0, which
             returns directly.  */
          need_c = false;
          for (j1 = 0; j1 < 0x2000;) {
            t = line[j1];
            for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++)
              ;
            if (t >= 0)
              j2 = tables[t].maxline+1;
            if (!(t == -2 || (t == -1 && j1 == 0)))
              need_c = true;
            j1 = j2;
          }
          /* Emit the wctomb function: one branch per run of lines [j1,j2). */
          fix_0000 = false;
          fprintf(f, "static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)\n", c_charsetname);
          fprintf(f, "{\n");
          if (need_c)
            fprintf(f, "  unsigned char c = 0;\n");
          for (j1 = 0; j1 < 0x2000;) {
            t = line[j1];
            for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++)
              ;
            if (t >= 0) {
              /* A table may contain unmapped gap lines; extend the run to
                 the end of the table.  */
              if (j1 != tables[t].minline) abort();
              if (j2 > tables[t].maxline+1) abort();
              j2 = tables[t].maxline+1;
            }
            if (t != -2) {
              if (j1 == 0)
                fprintf(f, "  ");
              else
                fprintf(f, "  else ");
              if (t >= 0 && tables[t].usecount == 0) abort();
              if (t >= 0 && tables[t].usecount == 1) {
                /* A single mapped value: compare directly, without table. */
                if (j2 != j1+1) abort();
                for (j = 8*j1; j < 8*j2; j++)
                  if (uni2charset[j] != 0) {
                    fprintf(f, "if (wc == 0x%04x)\n    c = 0x%02x;\n", j, uni2charset[j]);
                    break;
                  }
              } else {
                if (j1 == 0) {
                  fprintf(f, "if (wc < 0x%04x)", 8*j2);
                } else {
                  fprintf(f, "if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
                }
                if (t == -1) {
                  if (j1 == 0)
                    /* If wc == 0, the function must return 1, not -1. */
                    fprintf(f, " {\n    *r = wc;\n    return 1;\n  }\n");
                  else
                    fprintf(f, "\n    c = wc;\n");
                } else {
                  fprintf(f, "\n    c = %s_page%s[wc", c_charsetname, tables[t].suffix);
                  if (tables[t].minline > 0)
                    fprintf(f, "-0x%04x", 8*j1);
                  fprintf(f, "];\n");
                  if (j1 == 0 && uni2charset[0] == 0)
                    /* If wc == 0, the function must return 1, not -1. */
                    fix_0000 = true;
                }
              }
            }
            j1 = j2;
          }
          if (need_c) {
            if (fix_0000)
              fprintf(f, "  if (c != 0 || wc == 0) {\n");
            else
              fprintf(f, "  if (c != 0) {\n");
            fprintf(f, "    *r = c;\n");
            fprintf(f, "    return 1;\n");
            fprintf(f, "  }\n");
          }
          fprintf(f, "  return RET_ILUNI;\n");
          fprintf(f, "}\n");
    
        }
    
        /* Report write errors, including those detected only when flushing. */
        if (ferror(f) || fclose(f))
          exit(1);
      }
    
      exit(0);
    }