/*
* FTGL - OpenGL font library
*
* Copyright (c) 2008 Daniel Remenak <dtremenak@users.sourceforge.net>
*
* Portions derived from ConvertUTF.c Copyright (C) 2001-2004 Unicode, Inc
* Unicode, Inc. hereby grants the right to freely use the information
* supplied in this file in the creation of products supporting the
* Unicode Standard, and to make copies of this file in any form
* for internal or external distribution as long as this notice
* remains attached.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef __FTUnicode__
#define __FTUnicode__
/**
 * Provides a way to easily walk multibyte unicode strings in the various
 * Unicode encodings (UTF-8, UTF-16, UTF-32, UCS-2, and UCS-4). Encodings
 * with elements larger than one byte must already be in the correct endian
 * order for the current architecture.
 *
 * The decoder is selected from sizeof(T): 1 is read as UTF-8, 2 is read as
 * UTF-16, and any other element size is read as one code point per element.
 */
template <typename T>
class FTUnicodeStringItr
{
public:
    /**
     * Constructor. Also reads the first character and stores it.
     *
     * @param string  The buffer to iterate. No copy is made; only the
     *                pointer is stored, so the buffer must stay valid
     *                while the iterator is in use.
     */
    FTUnicodeStringItr(const T* string)
        : curPos(string), curChar(0), nextPos(string)
    {
        // Pre-increment on purpose: the post-increment form would copy
        // the iterator before the first character has been decoded and
        // then throw that copy away.
        ++(*this);
    }

    /**
     * Pre-increment operator. Reads the next unicode character and sets
     * the state appropriately.
     * Note - not protected against overruns.
     *
     * @return A reference to this iterator after advancing.
     */
    FTUnicodeStringItr& operator++()
    {
        // The character about to be decoded starts where the previous
        // one ended.
        curPos = nextPos;

        // The element size decides which decoder is used.
        switch (sizeof(T))
        {
            case 1: // UTF-8
                readUTF8();
                break;

            case 2: // UTF-16
                readUTF16();
                break;

            case 4: // UTF-32
                // fall through
            default: // error condition really, but give it a shot anyway
                curChar = *nextPos++;
        }
        return *this;
    }

    /**
     * Post-increment operator. Reads the next character and sets
     * the state appropriately.
     * Note - not protected against overruns.
     *
     * @return A copy of the iterator as it was before advancing.
     */
    FTUnicodeStringItr operator++(int)
    {
        FTUnicodeStringItr previous = *this;
        ++(*this);
        return previous;
    }

    /**
     * Equality operator. Two FTUnicodeStringItrs are considered equal
     * if they currently point at the same buffer position.
     */
    bool operator==(const FTUnicodeStringItr& right) const
    {
        return curPos == right.getBufferFromHere();
    }

    /**
     * Inequality operator. The exact negation of operator==.
     */
    bool operator!=(const FTUnicodeStringItr& right) const
    {
        return !(*this == right);
    }

    /**
     * Dereference operator.
     *
     * @return The unicode codepoint of the character currently pointed
     *         to by the FTUnicodeStringItr.
     */
    unsigned int operator*() const
    {
        return curChar;
    }

    /**
     * Buffer-fetching getter. You can use this to retrieve the buffer
     * starting at the currently-iterated character for functions which
     * require a Unicode string as input.
     *
     * @return Pointer to the first element of the current character.
     */
    const T* getBufferFromHere() const { return curPos; }

private:
    /**
     * Helper function for reading a single UTF8 character from the string.
     * Updates internal state appropriately.
     */
    void readUTF8();

    /**
     * Helper function for reading a single UTF16 character from the string.
     * Updates internal state appropriately.
     */
    void readUTF16();

    /**
     * The buffer position of the first element in the current character.
     */
    const T* curPos;

    /**
     * The character stored at the current buffer position (prefetched on
     * increment, so there's no penalty for dereferencing more than once).
     */
    unsigned int curChar;

    /**
     * The buffer position of the first element in the next character.
     */
    const T* nextPos;

    // unicode magic numbers
    static const unsigned char utf8bytes[256];
    static const unsigned long offsetsFromUTF8[6];
    static const unsigned long highSurrogateStart;
    static const unsigned long highSurrogateEnd;
    static const unsigned long lowSurrogateStart;
    static const unsigned long lowSurrogateEnd;
    static const unsigned long highSurrogateShift;
    static const unsigned long lowSurrogateBase;
};
/* Lookup table indexed by the first byte of a UTF-8 sequence. Each entry
 * is the total number of bytes in the sequence that starts with that
 * byte, which is how many bytes readUTF8() consumes for it.
 * Bytes 0x80-0xBF map to 1, so they are consumed as one-byte characters. */
template <typename T>
const unsigned char FTUnicodeStringItr<T>::utf8bytes[256] = {
    /* 0x00-0x0F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x10-0x1F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x20-0x2F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x30-0x3F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x40-0x4F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x50-0x5F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x60-0x6F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x70-0x7F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80-0x8F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x90-0x9F */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xA0-0xAF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xB0-0xBF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xC0-0xCF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xD0-0xDF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xE0-0xEF */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xF0-0xFF */ 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
};
/* Correction constants subtracted from the accumulated value during UTF-8
 * conversion. readUTF8() adds up the raw bytes of a sequence (shifting
 * left by six bits between bytes) and then subtracts the entry at index
 * (sequence length - 1). Each entry equals the lead-byte prefix bits plus
 * the 0x80 marker of every trailing byte, at the positions they end up in
 * after shifting (modulo 2^32), leaving only the code point. */
template <typename T>
const unsigned long FTUnicodeStringItr<T>::offsetsFromUTF8[6] = {
    0x00000000UL,   // 1-byte sequence
    0x00003080UL,   // 2-byte sequence
    0x000E2080UL,   // 3-byte sequence
    0x03C82080UL,   // 4-byte sequence
    0xFA082080UL,   // 5-byte sequence
    0x82082080UL    // 6-byte sequence
};
// Decodes one UTF-8 sequence starting at nextPos, stores the resulting
// code point in curChar, and leaves nextPos at the start of the next
// character.
//
// Not protected against invalid UTF-8: the length is taken from the lead
// byte alone and the trailing bytes are consumed without being validated,
// so a truncated sequence can run past the end of the buffer.
template <typename T>
inline void FTUnicodeStringItr<T>::readUTF8()
{
    // The lead byte selects the total length of the sequence (1 to 6).
    const unsigned int sequenceLength =
        utf8bytes[static_cast<unsigned char>(*nextPos)];

    // Every byte is widened through unsigned char. When T is a signed
    // character type, bytes >= 0x80 are negative and would otherwise be
    // sign-extended when added to the accumulator, corrupting every
    // multi-byte code point.
    unsigned int ch = static_cast<unsigned char>(*nextPos++);
    for (unsigned int i = 1; i < sequenceLength; ++i)
    {
        // Make room for the six payload bits of the next trailing byte.
        ch <<= 6;
        ch += static_cast<unsigned char>(*nextPos++);
    }

    // Strip the lead-byte prefix and the 0x80 markers of the trailing
    // bytes in a single subtraction (unsigned arithmetic, modulo 2^32).
    ch -= static_cast<unsigned int>(offsetsFromUTF8[sequenceLength - 1]);
    curChar = ch;
}
// Magic numbers for UTF-16 conversions, used by readUTF16():
//  - [highSurrogateStart, highSurrogateEnd] is the range tested for the
//    first half of a surrogate pair,
//  - [lowSurrogateStart, lowSurrogateEnd] is the range tested for the
//    second half,
//  - highSurrogateShift is the left shift applied to the first half,
//  - lowSurrogateBase is the value added to the combined pair.
template <typename T> const unsigned long FTUnicodeStringItr<T>::highSurrogateStart = 0xD800UL;
template <typename T> const unsigned long FTUnicodeStringItr<T>::highSurrogateEnd   = 0xDBFFUL;
template <typename T> const unsigned long FTUnicodeStringItr<T>::lowSurrogateStart  = 0xDC00UL;
template <typename T> const unsigned long FTUnicodeStringItr<T>::lowSurrogateEnd    = 0xDFFFUL;
template <typename T> const unsigned long FTUnicodeStringItr<T>::highSurrogateShift = 10UL;
template <typename T> const unsigned long FTUnicodeStringItr<T>::lowSurrogateBase   = 0x00010000UL;
// Decodes one UTF-16 character starting at nextPos, stores the resulting
// code point in curChar, and leaves nextPos at the start of the next
// character. A high surrogate followed by a low surrogate is combined
// into a single code point; any other code unit, including an unpaired
// surrogate, is stored as-is and only that one unit is consumed.
template <typename T>
inline void FTUnicodeStringItr<T>::readUTF16()
{
    // Mask to 16 bits so that a signed 16-bit element type cannot
    // sign-extend code units >= 0x8000 (which includes every surrogate).
    unsigned int ch = static_cast<unsigned int>(*nextPos++) & 0xFFFFu;

    // if we have the first half of the surrogate pair
    if (ch >= highSurrogateStart && ch <= highSurrogateEnd)
    {
        // Peek at the FOLLOWING code unit without consuming it. nextPos
        // already points past the high surrogate, whereas curPos still
        // points at the high surrogate itself and can therefore never
        // hold the low half of the pair.
        // NOTE(review): assumes the buffer has at least one more readable
        // element here (e.g. a terminator) - confirm against callers.
        const unsigned int ch2 =
            static_cast<unsigned int>(*nextPos) & 0xFFFFu;

        // complete the surrogate pair
        if (ch2 >= lowSurrogateStart && ch2 <= lowSurrogateEnd)
        {
            ch = static_cast<unsigned int>(
                ((ch - highSurrogateStart) << highSurrogateShift)
                + (ch2 - lowSurrogateStart) + lowSurrogateBase);
            // The low surrogate belongs to this character; skip it.
            ++nextPos;
        }
    }
    curChar = ch;
}
#endif