/******************************************************************************
@File PVRTUnicode.cpp
@Title PVRTUnicode
@Version @Version
@Copyright Copyright (c) Imagination Technologies Limited.
@Platform All
@Description A small collection of functions used to decode Unicode formats to
individual code points.
******************************************************************************/
#include "PVRTUnicode.h"
#include <string.h>
/****************************************************************************
** Constants
**
** Numeric limits and bit masks shared by the UTF8/UTF16 decoding and
** validation routines in this file.
****************************************************************************/
// U+FFFD. Not referenced by any function in this file; going by its name it
// is the Unicode replacement character - NOTE(review): confirm intended use.
const PVRTuint32 c_u32ReplChar = 0xFFFD;
// Bytes strictly below this value are treated as single-byte (ASCII) code points.
#define VALID_ASCII 0x80
// Selects the six payload bits of a UTF8 tail byte. Shifted right by the tail
// length it also selects the payload bits of the lead byte.
#define TAIL_MASK 0x3F
// Number of payload BITS (not bytes, despite the name) carried by each tail
// byte; used as the shift amount when accumulating a code point.
#define BYTES_PER_TAIL 6
// First and last values of the UTF16 high (leading) surrogate range.
#define UTF16_SURG_H_MARK 0xD800
#define UTF16_SURG_H_END 0xDBFF
// First and last values of the UTF16 low (trailing) surrogate range.
#define UTF16_SURG_L_MARK 0xDC00
#define UTF16_SURG_L_END 0xDFFF
// Inclusive range of code points rejected as non-characters.
#define UNICODE_NONCHAR_MARK 0xFDD0
#define UNICODE_NONCHAR_END 0xFDEF
// Used as a bit mask by CheckGenericUnicode: any value whose low 16 bits are
// 0xFFFE or 0xFFFF is rejected.
#define UNICODE_RESERVED 0xFFFE
// Largest value accepted as a code point.
#define UNICODE_MAX 0x10FFFF
// Upper bound on the number of 16-bit units examined in a UTF16 string before
// the string is assumed not to be null terminated.
#define MAX_LEN 0x8FFF
/****************************************************************************
** A table, indexed by the first byte of a UTF8 sequence, which allows quick
** lookup of the number of tail (continuation) bytes that follow that byte.
** Note that this is the tail count, not the total sequence length.
** An entry of 0 means the byte cannot start a multi-byte sequence.
****************************************************************************/
const PVRTuint8 c_u8UTF8Lengths[256] =
{
// 0x00-0x7F: single-byte values. The decoders consume these on their ASCII
// fast path before consulting the table, so the entries are 0.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
// 0x80-0xBF: tail byte patterns (10xxxxxx); invalid as a first byte.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
// 0xC0-0xDF: one tail byte follows.
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// 0xE0-0xEF: two tail bytes follow.
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
// 0xF0-0xF7: three tail bytes follow. 0xF8-0xFF: invalid as a first byte.
3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,
};
/****************************************************************************
** A table, indexed by the number of tail bytes in a UTF8 sequence, giving
** the smallest code point that legitimately needs that many tail bytes.
** A decoded value below the entry for its tail count is 'overlong', i.e. it
** could have been encoded in fewer bytes.
****************************************************************************/
const PVRTuint32 c_u32MinVals[4] =
{
0x00000000, // 0 tail bytes
0x00000080, // 1 tail byte: values below 0x80 fit in a single byte
0x00000800, // 2 tail bytes: values below 0x800 fit in two bytes
0x00010000, // 3 tail bytes: values below 0x10000 fit in three bytes
};
/*!***************************************************************************
@Function CheckGenericUnicode
@Input c32 A decoded UTF32 value to be checked.
@Returns true if the value is acceptable, false otherwise.
@Description Rejects values that must not appear as decoded code points:
anything inside the UTF16 surrogate ranges, the non-character
range, any value whose low 16 bits are 0xFFFE or 0xFFFF, and
anything above the Unicode maximum.
*****************************************************************************/
static bool CheckGenericUnicode(PVRTuint32 c32)
{
	// The high and low surrogate ranges are contiguous, so a single span
	// from the start of the high range to the end of the low range covers both.
	const bool bIsSurrogate = (c32 >= UTF16_SURG_H_MARK) && (c32 <= UTF16_SURG_L_END);

	// Dedicated non-character range.
	const bool bIsNonChar = (c32 >= UNICODE_NONCHAR_MARK) && (c32 <= UNICODE_NONCHAR_END);

	// UNICODE_RESERVED acts as a mask here: this matches every value ending
	// in 0xFFFE or 0xFFFF, whatever the upper bits are.
	const bool bIsReserved = ((c32 & UNICODE_RESERVED) == UNICODE_RESERVED);

	// Beyond the last valid code point.
	const bool bIsTooLarge = (c32 > UNICODE_MAX);

	return !(bIsSurrogate || bIsNonChar || bIsReserved || bIsTooLarge);
}
/*!***************************************************************************
@Function PVRTUnicodeUTF8ToUTF32
@Input pUTF8 A UTF8 string, which is null terminated.
@Output aUTF32 An array of Unicode code points. Decoded code points are
appended as they are produced, so on failure the array may
already hold the code points decoded before the error.
@Returns PVR_SUCCESS if the whole string was decoded.
PVR_OVERFLOW if a byte cannot start a sequence or the sequence
would run past the end of the string.
PVR_FAIL if a tail byte is malformed, the encoding is overlong
or the decoded value is not an acceptable code point.
@Description Decodes a UTF8-encoded string in to Unicode code points
(UTF32). If pUTF8 is not null terminated, the results are
undefined.
*****************************************************************************/
EPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32)
{
	const unsigned int uiByteCount = (unsigned int) strlen((const char*)pUTF8);
	const PVRTuint8* const pEnd = pUTF8 + uiByteCount;
	const PVRTuint8* pCursor = pUTF8;

	for(;;)
	{
		// Fast path: single-byte values map straight to code points.
		for(; *pCursor != 0 && *pCursor < VALID_ASCII; ++pCursor)
		{
			aUTF32.Append(*pCursor);
		}

		// Reached the terminator.
		if(*pCursor == 0)
			break;

		// Start of a multi-byte sequence: look up how many tail bytes follow.
		PVRTuint32 u32Code = *pCursor++;
		const unsigned int uiTailCount = c_u8UTF8Lengths[u32Code];

		// Reject bytes that cannot start a sequence, and sequences that
		// would extend beyond the end of the string.
		if(uiTailCount == 0 || (pCursor + uiTailCount > pEnd))
			return PVR_OVERFLOW;

		// Keep only the payload bits of the first byte; the longer the
		// sequence, the fewer payload bits the first byte carries.
		u32Code &= (TAIL_MASK >> uiTailCount);

		// Fold in six payload bits from each tail byte.
		for(unsigned int uiTail = 0; uiTail < uiTailCount; ++uiTail)
		{
			const PVRTuint8 u8Tail = pCursor[uiTail];
			if((u8Tail & 0xC0) != 0x80)
				return PVR_FAIL; // Invalid tail byte!

			u32Code = (u32Code << BYTES_PER_TAIL) + (u8Tail & TAIL_MASK);
		}
		pCursor += uiTailCount;

		// The value must genuinely need this many bytes (no overlong forms).
		if(u32Code < c_u32MinVals[uiTailCount])
			return PVR_FAIL;

		if(!CheckGenericUnicode(u32Code))
			return PVR_FAIL;

		aUTF32.Append(u32Code);
	}

	return PVR_SUCCESS;
}
/*!***************************************************************************
@Function PVRTUnicodeUTF16ToUTF32
@Input pUTF16 A UTF16 string, which is null terminated.
@Output aUTF32 An array of Unicode code points. Decoded code points are
appended as they are produced, so on failure the array may
already hold the code points decoded before the error.
@Returns PVR_SUCCESS if the whole string was decoded.
PVR_OVERFLOW if no terminator is found within MAX_LEN units,
or a high surrogate is the last unit of the string.
PVR_FAIL if a high surrogate is not followed by a low
surrogate, or a decoded value is not an acceptable code point
(this includes unpaired low surrogates).
@Description Decodes a UTF16-encoded string in to Unicode code points
(UTF32). If pUTF16 is not null terminated, the results are
undefined.
*****************************************************************************/
EPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32)
{
	// Determine the number of 16-bit units before the terminator.
	// The first unit must be tested as well, otherwise an empty string would
	// have its terminator skipped and the scan would run off the buffer.
	// The bound is checked before each read so no more than MAX_LEN units
	// are ever examined.
	unsigned int uiBufferLen = 0;
	while(uiBufferLen < MAX_LEN && pUTF16[uiBufferLen])
		++uiBufferLen;

	if(uiBufferLen == MAX_LEN)
		return PVR_OVERFLOW;		// Probably not NULL terminated.

	const PVRTuint16* pC = pUTF16;
	PVRTuint32 c32;
	while(*pC)
	{
		// Straight copy. We'll check for surrogate pairs next...
		c32 = *pC++;

		// A high surrogate must be followed by a low surrogate.
		if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END)
		{
			// Make sure the second half of the pair is inside the string.
			if(pC + 1 > pUTF16 + uiBufferLen || *pC == 0)
				return PVR_OVERFLOW;

			// Check that the next value is in the low surrogate range
			if(*pC < UTF16_SURG_L_MARK || *pC > UTF16_SURG_L_END)
				return PVR_FAIL;

			// Combine: ten bits from each half, offset by 0x10000.
			c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000;
			pC++;
		}

		// Also rejects a low surrogate that appeared without a high surrogate.
		if(!CheckGenericUnicode(c32))
			return PVR_FAIL;

		// OK
		aUTF32.Append(c32);
	}

	return PVR_SUCCESS;
}
/*!***************************************************************************
@Function PVRTUnicodeUTF8Length
@Input pUTF8 A UTF8 string, which is null terminated.
@Returns The length of the string, in Unicode code points.
@Description Calculates the length of a UTF8 string. If pUTF8 is
not null terminated, the results are undefined.
*****************************************************************************/
unsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8)
{
const PVRTuint8* pC = pUTF8;
unsigned int charCount = 0;
unsigned int mask;
while(*pC)
{
// Quick optimisation for ASCII characters
const PVRTuint8* pStart = pC;
while(*pC && *pC < VALID_ASCII)
pC++;
charCount += (unsigned int) (pC - pStart);
// Done
if(!*pC)
break;
mask = *pC & 0xF0;
switch(mask)
{
case 0xF0: pC++;
case 0xE0: pC++;
case 0xC0: pC++;
break;
default:
_ASSERT(!"Invalid tail byte!");
return 0;
}
pC++;
charCount++;
}
return charCount;
}
/*!***************************************************************************
@Function PVRTUnicodeUTF16Length
@Input pUTF16 A UTF16 string, which is null terminated.
@Returns The length of the string, in Unicode code points.
@Description Calculates the length of a UTF16 string. A high surrogate
immediately followed by a low surrogate counts as a single
code point; every other unit, including an unpaired
surrogate, counts as one. Counting stops once MAX_LEN units
have been passed.
If pUTF16 is not null terminated, the results are
undefined.
*****************************************************************************/
unsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16)
{
	const PVRTuint16* pC = pUTF16;
	unsigned int charCount = 0;

	// Test the bound before dereferencing so the unit at the limit is not read.
	while((pC - pUTF16) < MAX_LEN && *pC)
	{
		const bool bHighSurrogate = (pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END);

		// pC[1] is only inspected when pC[0] is a (non-zero) high surrogate,
		// so at worst it is the terminator. Both bounds of the low surrogate
		// range must be tested against pC[1].
		if(bHighSurrogate && pC[1] >= UTF16_SURG_L_MARK && pC[1] <= UTF16_SURG_L_END)
		{
			pC += 2;	// Surrogate pair: two units, one code point.
		}
		else
		{
			pC += 1;
		}

		charCount++;
	}

	return charCount;
}
/*!***************************************************************************
@Function PVRTUnicodeValidUTF8
@Input pUTF8 A UTF8 string, which is null terminated.
@Returns true if every sequence in the string is well formed and
decodes to an acceptable code point, false otherwise.
@Description Checks whether the encoding of a UTF8 string is valid. A
string is rejected if it contains a byte that cannot start a
sequence, a sequence that runs past the end of the string, a
malformed tail byte, an overlong encoding, or a value refused
by CheckGenericUnicode.
If pUTF8 is not null terminated, the results are undefined.
*****************************************************************************/
bool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8)
{
	const unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
	const PVRTuint8* pC = pUTF8;

	while(*pC)
	{
		// Quick optimisation for ASCII characters
		while(*pC && *pC < VALID_ASCII)
			pC++;

		// Done?
		if(!*pC)
			break;

		PVRTuint32 c32 = *pC++;
		const unsigned int uiTailLen = c_u8UTF8Lengths[c32];

		// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
		// Also check to make sure the tail length is inside the provided buffer.
		if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
			return false;

		// Strip the length-prefix bits from the first byte so that only its
		// payload contributes to the code point. Without this the prefix bits
		// end up in the decoded value, which defeats the overlong and range
		// checks below.
		c32 &= (TAIL_MASK >> uiTailLen);

		// Get the data out of each tail byte
		for(unsigned int uiIndex = 0; uiIndex < uiTailLen; ++uiIndex)
		{
			if((pC[uiIndex] & 0xC0) != 0x80)
				return false;		// Invalid tail byte!

			c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
		}
		pC += uiTailLen;

		// Check overlong values.
		if(c32 < c_u32MinVals[uiTailLen])
			return false;

		if(!CheckGenericUnicode(c32))
			return false;
	}

	return true;
}
/*****************************************************************************
End of file (PVRTUnicode.cpp)
*****************************************************************************/