Edit

kc3-lang/ftgl/src/FTUnicode.h

Branch :

  • Show log

    Commit

  • Author : brlcad
    Date : 2010-05-12 16:24:45
    Hash : ca4a58f9
    Message : pull in change from bzflag downstream, r20589 by brlcad, make sure our chars are all unsigned so we don't end up coercing a template instantiation to char where we start spitting out negative values.

  • src/FTUnicode.h
  • /*
     * FTGL - OpenGL font library
     *
     * Copyright (c) 2008 Daniel Remenak <dtremenak@users.sourceforge.net>
     *
     * Portions derived from ConvertUTF.c Copyright (C) 2001-2004 Unicode, Inc
     *   Unicode, Inc. hereby grants the right to freely use the information
     *   supplied in this file in the creation of products supporting the
     *   Unicode Standard, and to make copies of this file in any form
     *   for internal or external distribution as long as this notice
     *   remains attached.
     *
     * Permission is hereby granted, free of charge, to any person obtaining
     * a copy of this software and associated documentation files (the
     * "Software"), to deal in the Software without restriction, including
     * without limitation the rights to use, copy, modify, merge, publish,
     * distribute, sublicense, and/or sell copies of the Software, and to
     * permit persons to whom the Software is furnished to do so, subject to
     * the following conditions:
     *
     * The above copyright notice and this permission notice shall be
     * included in all copies or substantial portions of the Software.
     *
     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
     * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
     * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     */
    
    #ifndef    __FTUnicode__
    #define    __FTUnicode__
    
    /**
     * Provides a way to easily walk multibyte unicode strings in the various
     * Unicode encodings (UTF-8, UTF-16, UTF-32, UCS-2, and UCS-4).  Encodings
     * with elements larger than one byte must already be in the correct endian
     * order for the current architecture.
     */
    template <typename T>
    class FTUnicodeStringItr
    {
    public:
        /**
         * Constructor.  Also reads the first character and stores it.
         *
         * @param string  The buffer to iterate.  No copy is made.
         */
        FTUnicodeStringItr(const T* string) : curPos(string), nextPos(string)
        {
            (*this)++;
        };
    
        /**
         * Pre-increment operator.  Reads the next unicode character and sets
         * the state appropriately.
         * Note - not protected against overruns.
         */
        FTUnicodeStringItr& operator++()
        {
            curPos = nextPos;
            // unicode handling
            switch (sizeof(T))
            {
                case 1: // UTF-8
                    // get this character
                    readUTF8(); break;
                case 2: // UTF-16
                    readUTF16(); break;
                case 4: // UTF-32
                    // fall through
                default: // error condition really, but give it a shot anyway
                    curChar = *nextPos++;
            }
            return *this;
        }
    
        /**
         * Post-increment operator.  Reads the next character and sets
         * the state appropriately.
         * Note - not protected against overruns.
         */
        FTUnicodeStringItr operator++(int)
        {
            FTUnicodeStringItr temp = *this;
            ++*this;
            return temp;
        }
    
        /**
         * Equality operator.  Two FTUnicodeStringItrs are considered equal
         * if they have the same current buffer and buffer position.
         */
        bool operator==(const FTUnicodeStringItr& right) const
        {
            if (curPos == right.getBufferFromHere())
                return true;
            return false;
        }
    
        /**
         * Dereference operator.
         *
         * @return  The unicode codepoint of the character currently pointed
         * to by the FTUnicodeStringItr.
         */
        unsigned int operator*() const
        {
            return curChar;
        }
    
        /**
         * Buffer-fetching getter.  You can use this to retreive the buffer
         * starting at the currently-iterated character for functions which
         * require a Unicode string as input.
         */
        const T* getBufferFromHere() const { return curPos; }
    
    private:
        /**
         * Helper function for reading a single UTF8 character from the string.
         * Updates internal state appropriately.
         */
        void readUTF8();
    
        /**
         * Helper function for reading a single UTF16 character from the string.
         * Updates internal state appropriately.
         */
        void readUTF16();
    
        /**
         * The buffer position of the first element in the current character.
         */
        const T* curPos;
    
        /**
         * The character stored at the current buffer position (prefetched on
         * increment, so there's no penalty for dereferencing more than once).
         */
        unsigned int curChar;
    
        /**
         * The buffer position of the first element in the next character.
         */
        const T* nextPos;
    
        // unicode magic numbers
        static const unsigned char utf8bytes[256];
        static const unsigned long offsetsFromUTF8[6];
        static const unsigned long highSurrogateStart;
        static const unsigned long highSurrogateEnd;
        static const unsigned long lowSurrogateStart;
        static const unsigned long lowSurrogateEnd;
        static const unsigned long highSurrogateShift;
        static const unsigned long lowSurrogateBase;
    };
    
    /* The first character in a UTF8 sequence indicates how many bytes
     * to read (among other things) */
    template <typename T>
    const unsigned char FTUnicodeStringItr<T>::utf8bytes[256] = {
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
      3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
    };
    
    /* Magic values subtracted from a buffer value during UTF8 conversion.
     * This table contains as many values as there might be trailing bytes
     * in a UTF-8 sequence. */
    template <typename T>
    const unsigned long FTUnicodeStringItr<T>::offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
      0x03C82080UL, 0xFA082080UL, 0x82082080UL };
    
    // get a UTF8 character; leave the tracking pointer at the start of the
    // next character
    // not protected against invalid UTF8
    template <typename T>
    inline void FTUnicodeStringItr<T>::readUTF8()
    {
        unsigned int ch = 0;
        unsigned int extraBytesToRead = utf8bytes[(unsigned char)(*nextPos)];
        // falls through
        switch (extraBytesToRead)
        {
              case 6: ch += *nextPos++; ch <<= 6; /* remember, illegal UTF-8 */
              case 5: ch += *nextPos++; ch <<= 6; /* remember, illegal UTF-8 */
              case 4: ch += *nextPos++; ch <<= 6;
              case 3: ch += *nextPos++; ch <<= 6;
              case 2: ch += *nextPos++; ch <<= 6;
              case 1: ch += *nextPos++;
        }
        ch -= offsetsFromUTF8[extraBytesToRead-1];
        curChar = ch;
    }
    
    // Magic numbers for UTF-16 conversions
    template <typename T>
    const unsigned long FTUnicodeStringItr<T>::highSurrogateStart = 0xD800;
    template <typename T>
    const unsigned long FTUnicodeStringItr<T>::highSurrogateEnd = 0xDBFF;
    template <typename T>
    const unsigned long FTUnicodeStringItr<T>::lowSurrogateStart = 0xDC00;
    template <typename T>
    const unsigned long FTUnicodeStringItr<T>::lowSurrogateEnd = 0xDFFF;
    template <typename T>
    const unsigned long FTUnicodeStringItr<T>::highSurrogateShift = 10;
    template <typename T>
    const unsigned long FTUnicodeStringItr<T>::lowSurrogateBase = 0x0010000UL;
    
    template <typename T>
    inline void FTUnicodeStringItr<T>::readUTF16()
    {
        unsigned int ch = *nextPos++;
        // if we have the first half of the surrogate pair
        if (ch >= highSurrogateStart && ch <= highSurrogateEnd)
        {
            unsigned int ch2 = *curPos;
            // complete the surrogate pair
            if (ch2 >= lowSurrogateStart && ch2 <= lowSurrogateEnd)
            {
                ch = ((ch - highSurrogateStart) << highSurrogateShift)
                    + (ch2 - lowSurrogateStart) + lowSurrogateBase;
                ++nextPos;
            }
        }
        curChar = ch;
    }
    
    #endif