/****************************************************************************** @File PVRTUnicode.cpp @Title PVRTUnicode @Version @Version @Copyright Copyright (c) Imagination Technologies Limited. @Platform All @Description A small collection of functions used to decode Unicode formats to individual code points. ******************************************************************************/ #include "PVRTUnicode.h" #include <string.h> /**************************************************************************** ** Constants ****************************************************************************/ const PVRTuint32 c_u32ReplChar = 0xFFFD; #define VALID_ASCII 0x80 #define TAIL_MASK 0x3F #define BYTES_PER_TAIL 6 #define UTF16_SURG_H_MARK 0xD800 #define UTF16_SURG_H_END 0xDBFF #define UTF16_SURG_L_MARK 0xDC00 #define UTF16_SURG_L_END 0xDFFF #define UNICODE_NONCHAR_MARK 0xFDD0 #define UNICODE_NONCHAR_END 0xFDEF #define UNICODE_RESERVED 0xFFFE #define UNICODE_MAX 0x10FFFF #define MAX_LEN 0x8FFF /**************************************************************************** ** A table which allows quick lookup to determine the number of bytes of a ** UTF8 code point. 
****************************************************************************/ const PVRTuint8 c_u8UTF8Lengths[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0, }; /**************************************************************************** ** A table which allows quick lookup to determine whether a UTF8 sequence ** is 'overlong'. ****************************************************************************/ const PVRTuint32 c_u32MinVals[4] = { 0x00000000, // 0 tail bytes 0x00000080, // 1 tail bytes 0x00000800, // 2 tail bytes 0x00010000, // 3 tail bytes }; /*!*************************************************************************** @Function CheckGenericUnicode @Input c32 A UTF32 character/Unicode code point @Returns Success or failure. @Description Checks that the decoded code point is valid. *****************************************************************************/ static bool CheckGenericUnicode(PVRTuint32 c32) { // Check that this value isn't a UTF16 surrogate mask. if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_L_END) return false; // Check non-char values if(c32 >= UNICODE_NONCHAR_MARK && c32 <= UNICODE_NONCHAR_END) return false; // Check reserved values if((c32 & UNICODE_RESERVED) == UNICODE_RESERVED) return false; // Check max value. if(c32 > UNICODE_MAX) return false; return true; } /*!*************************************************************************** @Function PVRTUnicodeUTF8ToUTF32 @Input pUTF8 A UTF8 string, which is null terminated. 
@Output aUTF32 An array of Unicode code points. @Returns Success or failure. @Description Decodes a UTF8-encoded string in to Unicode code points (UTF32). If pUTF8 is not null terminated, the results are undefined. *****************************************************************************/ EPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32) { unsigned int uiTailLen, uiIndex; unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8); PVRTuint32 c32; const PVRTuint8* pC = pUTF8; while(*pC) { // Quick optimisation for ASCII characters while(*pC && *pC < VALID_ASCII) { aUTF32.Append(*pC++); } // Done if(!*pC) break; c32 = *pC++; uiTailLen = c_u8UTF8Lengths[c32]; // Check for invalid tail length. Maximum 4 bytes for each UTF8 character. // Also check to make sure the tail length is inside the provided buffer. if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes)) return PVR_OVERFLOW; c32 &= (TAIL_MASK >> uiTailLen); // Get the data out of the first byte. This depends on the length of the tail. // Get the data out of each tail byte uiIndex = 0; while(uiIndex < uiTailLen) { if((pC[uiIndex] & 0xC0) != 0x80) return PVR_FAIL; // Invalid tail byte! c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK); uiIndex++; } pC += uiIndex; // Check overlong values. if(c32 < c_u32MinVals[uiTailLen]) return PVR_FAIL; if(!CheckGenericUnicode(c32)) return PVR_FAIL; // OK aUTF32.Append(c32); } return PVR_SUCCESS; } /*!*************************************************************************** @Function PVRTUnicodeUTF16ToUTF32 @Input pUTF16 A UTF16 string, which is null terminated. @Output aUTF32 An array of Unicode code points. @Returns Success or failure. @Description Decodes a UTF16-encoded string in to Unicode code points (UTF32). If pUTF16 is not null terminated, the results are undefined. 
*****************************************************************************/ EPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32) { const PVRTuint16* pC = pUTF16; // Determine the number of shorts while(*++pC && (pC - pUTF16) < MAX_LEN); unsigned int uiBufferLen = (unsigned int) (pC - pUTF16); if(uiBufferLen == MAX_LEN) return PVR_OVERFLOW; // Probably not NULL terminated. // Reset to start. pC = pUTF16; PVRTuint32 c32; while(*pC) { // Straight copy. We'll check for surrogate pairs next... c32 = *pC++; // Check surrogate pair if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END) { // Make sure the next 2 bytes are in range... if(pC + 1 > pUTF16 + uiBufferLen || *pC == 0) return PVR_OVERFLOW; // Check that the next value is in the low surrogate range if(*pC < UTF16_SURG_L_MARK || *pC > UTF16_SURG_L_END) return PVR_FAIL; // Decode c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000; pC++; } if(!CheckGenericUnicode(c32)) return PVR_FAIL; // OK aUTF32.Append(c32); } return PVR_SUCCESS; } /*!*************************************************************************** @Function PVRTUnicodeUTF8Length @Input pUTF8 A UTF8 string, which is null terminated. @Returns The length of the string, in Unicode code points. @Description Calculates the length of a UTF8 string. If pUTF8 is not null terminated, the results are undefined. 
*****************************************************************************/ unsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8) { const PVRTuint8* pC = pUTF8; unsigned int charCount = 0; unsigned int mask; while(*pC) { // Quick optimisation for ASCII characters const PVRTuint8* pStart = pC; while(*pC && *pC < VALID_ASCII) pC++; charCount += (unsigned int) (pC - pStart); // Done if(!*pC) break; mask = *pC & 0xF0; switch(mask) { case 0xF0: pC++; case 0xE0: pC++; case 0xC0: pC++; break; default: _ASSERT(!"Invalid tail byte!"); return 0; } pC++; charCount++; } return charCount; } /*!*************************************************************************** @Function PVRTUnicodeUTF16Length @Input pUTF16 A UTF16 string, which is null terminated. @Returns The length of the string, in Unicode code points. @Description Calculates the length of a UTF16 string. If pUTF16 is not null terminated, the results are undefined. *****************************************************************************/ unsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16) { const PVRTuint16* pC = pUTF16; unsigned int charCount = 0; while(*pC && (pC - pUTF16) < MAX_LEN) { if( pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END && pC[1] >= UTF16_SURG_L_MARK && pC[0] <= UTF16_SURG_L_END) { pC += 2; } else { pC += 1; } charCount++; } return charCount; } /*!*************************************************************************** @Function PVRTUnicodeValidUTF8 @Input pUTF8 A UTF8 string, which is null terminated. @Returns true or false @Description Checks whether the encoding of a UTF8 string is valid. If pUTF8 is not null terminated, the results are undefined. 
*****************************************************************************/
// The string is rejected (false) when: a byte cannot start a sequence, a
// sequence would run past the end of the string, a tail byte is malformed,
// the encoding is overlong, or the decoded code point is rejected by
// CheckGenericUnicode. The decoding mirrors PVRTUnicodeUTF8ToUTF32 without
// storing the result.
bool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8)
{
	unsigned int uiTailLen, uiIndex;
	unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
	const PVRTuint8* pC = pUTF8;

	while(*pC)
	{
		// Quick optimisation for ASCII characters
		while(*pC && *pC < VALID_ASCII)
			pC++;

		// Done?
		if(!*pC)
			break;

		PVRTuint32 c32 = *pC++;
		uiTailLen = c_u8UTF8Lengths[c32];

		// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
		// Also check to make sure the tail length is inside the provided buffer.
		if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
			return false;

		// Get the data out of the first byte. This depends on the length of the tail.
		// Without this the length marker bits stay in the value, which makes
		// the overlong and range checks below test the wrong number.
		c32 &= (TAIL_MASK >> uiTailLen);

		// Get the data out of each tail byte
		uiIndex = 0;
		while(uiIndex < uiTailLen)
		{
			if((pC[uiIndex] & 0xC0) != 0x80)
				return false;		// Invalid tail byte!

			c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
			uiIndex++;
		}

		pC += uiIndex;

		// Check overlong values.
		if(c32 < c_u32MinVals[uiTailLen])
			return false;

		if(!CheckGenericUnicode(c32))
			return false;
	}

	return true;
}

/*****************************************************************************
 End of file (PVRTUnicode.cpp)
*****************************************************************************/