Edit

thodg/libiconv/tools/gen-translit-def-part.lisp

Branch :

  • Show log

    Commit

  • Author : Bruno Haible
    Date : 2023-04-01 01:27:36
    Hash : af6ae1c5
    Message : Correct indentation.

  • tools/gen-translit-def-part.lisp
  • ;;; Copyright (C) 2023 Free Software Foundation, Inc.
    ;;; This file is part of the GNU LIBICONV Tools.
    ;;;
    ;;; This program is free software: you can redistribute it and/or modify
    ;;; it under the terms of the GNU General Public License as published by
    ;;; the Free Software Foundation; either version 3 of the License, or
    ;;; (at your option) any later version.
    ;;;
    ;;; This program is distributed in the hope that it will be useful,
    ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
    ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    ;;; GNU General Public License for more details.
    ;;;
    ;;; You should have received a copy of the GNU General Public License
    ;;; along with this program; if not, see <https://www.gnu.org/licenses/>.
    
    ;;; Transform UnicodeData.txt to a part of translit.def.
    ;;; translit.def consists of lines of the form
    ;;;   Unicode <tab> utf-8 replacement <tab> # comment
    ;;; This program produces the lines for the code points >= U+3200
    ;;; and reasonable approximations for the code points < U+3200.
    
    (defun starts-with (s prefix)
      (locally (declare (compile)))
      (and (>= (length s) (length prefix))
           (equal (substring s 0 (length prefix)) prefix)))
    
    (defun generate-part (infilename outfilename)
      (locally (declare (compile)))
      (with-open-file (in infilename)
        (with-open-file (out outfilename :direction :output
                                         :external-format CHARSET:UTF-8)
          (loop
            (let ((line (read-line in nil nil)))
              (unless line (return))
              (let* ((semicolon1 (position #\; line))
                     (semicolon2 (position #\; line :start (+ semicolon1 1)))
                     (semicolon3 (position #\; line :start (+ semicolon2 1)))
                     (semicolon4 (position #\; line :start (+ semicolon3 1)))
                     (semicolon5 (position #\; line :start (+ semicolon4 1)))
                     (semicolon6 (position #\; line :start (+ semicolon5 1)))
                     (code (let ((*read-base* 16)) (read-from-string (substring line 0 semicolon1))))
                     (name (substring line (+ semicolon1 1) semicolon2))
                     (category (substring line (+ semicolon2 1) semicolon3))
                     (combining (substring line (+ semicolon3 1) semicolon4))
                     (bidi (substring line (+ semicolon4 1) semicolon5))
                     (decomposition (substring line (+ semicolon5 1) semicolon6)))
                (declare (ignore category combining bidi))
                (when (or (starts-with decomposition "<circle> ")
                          (starts-with decomposition "<compat> ")
                          (starts-with decomposition "<font> ")
                          (starts-with decomposition "<narrow> ")
                          (starts-with decomposition "<small> ")
                          (starts-with decomposition "<square> ")
                          (starts-with decomposition "<wide> ")
                          (and (starts-with name "CJK COMPATIBILITY IDEOGRAPH-")
                               (> (length decomposition) 0))
                      )
                  (let ((replacement '()))
                    (let ((i (+ (or (position #\> decomposition) -2) 2)))
                      (loop
                        (let* ((space (position #\Space decomposition :start i))
                               (hexcode (substring decomposition i (or space (length decomposition)))))
                          (push
                            (int-char (let ((*read-base* 16)) (read-from-string hexcode)))
                            replacement)
                          (unless space
                            (return))
                          (setq i (+ space 1)))))
                    (setq replacement (nreverse replacement))
                    (when (starts-with decomposition "<circle> ")
                      (setq replacement (concatenate 'list '(#\() replacement '(#\)))))
                    (setq replacement (mapcan (lambda (c)
                                                (cond ((eql c #\DIVISION_SLASH) (list #\/))
                                                      ((eql c #\SUPERSCRIPT_TWO) (list #\^ #\2))
                                                      ((eql c #\SUPERSCRIPT_THREE) (list #\^ #\3))
                                                      ((eql c #\SCRIPT_SMALL_L) (list #\l))
                                                      (t (list c))))
                                              replacement))
                    (format out "~4,'0X~A~{~A~}~A# ~A~%" code #\Tab replacement #\Tab name)
                ) )
    ) ) ) ) ) )
    
    ; (generate-part "/home/bruno/notes/UnicodeData-15.0.0.txt" "translit-part-15.0.0.def")