/*
**********************************************************************
*   Copyright (C) 2000-2007, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ucnvisci.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2001JUN26
*   created by: Ram Viswanadha
*
*   Date        Name        Description
*   24/7/2001   Ram         Added support for EXT character handling
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION

#include "cmemory.h"
#include "ucnv_bld.h"
#include "unicode/ucnv.h"
#include "ucnv_cnv.h"
#include "unicode/ucnv_cb.h"
#include "unicode/uset.h"
#include "cstring.h"

#define UCNV_OPTIONS_VERSION_MASK 0xf

/* Unicode code points that receive special treatment */
#define NUKTA               0x093c
#define HALANT              0x094d
#define ZWNJ                0x200c /* Zero Width Non Joiner */
#define ZWJ                 0x200d /* Zero width Joiner */
#define INVALID_CHAR        0xffff
#define ATR                 0xEF   /* Attribute code */
#define EXT                 0xF0   /* Extension code */
#define DANDA               0x0964
#define DOUBLE_DANDA        0x0965

/* ISCII byte values */
#define ISCII_NUKTA         0xE9
#define ISCII_HALANT        0xE8
#define ISCII_DANDA         0xEA
#define ISCII_INV           0xD9
#define ISCII_VOWEL_SIGN_E  0xE0

/* Unicode range covered by the converter: nine consecutive blocks of DELTA code points */
#define INDIC_BLOCK_BEGIN   0x0900
#define INDIC_BLOCK_END     0x0D7F
#define INDIC_RANGE         (INDIC_BLOCK_END - INDIC_BLOCK_BEGIN)
#define VOCALLIC_RR         0x0931
#define LF                  0x0A
#define ASCII_END           0xA0
#define NO_CHAR_MARKER      0xFFFE
/*
 * Offset of the Telugu block from INDIC_BLOCK_BEGIN.
 * The replacement list is parenthesized so that the macro stays a single
 * operand wherever it is used (e.g. x / TELUGU_DELTA, x - TELUGU_DELTA).
 */
#define TELUGU_DELTA        (DELTA * TELUGU)
#define DEV_ABBR_SIGN       0x0970
#define DEV_ANUDATTA        0x0952
#define EXT_RANGE_BEGIN     0xA1
#define EXT_RANGE_END       0xEE

/*
 * Index of each script block counted from INDIC_BLOCK_BEGIN;
 * DELTA is the size of one block.
 */
typedef enum {
    DEVANAGARI =0,
    BENGALI,
    GURMUKHI,
    GUJARATI,
    ORIYA,
    TAMIL,
    TELUGU,
    KANNADA,
    MALAYALAM,
    DELTA=0x80
}UniLang;

/**
 * Enumeration for switching code pages if <ATR>+<one of below values>
 * is encountered
 */
typedef enum { DEF = 0x40, RMN = 0x41, DEV = 0x42, BNG = 0x43, TML = 0x44, TLG = 0x45, ASM = 0x46, ORI = 0x47, KND = 0x48, MLM = 0x49, GJR = 0x4A, PNJ = 0x4B, ARB = 0x71, PES = 0x72, URD = 0x73, SND = 0x74, KSM = 0x75, PST = 
0x76 }ISCIILang; typedef enum{ DEV_MASK =0x80, PNJ_MASK =0x40, GJR_MASK =0x20, ORI_MASK =0x10, BNG_MASK =0x08, KND_MASK =0x04, MLM_MASK =0x02, TML_MASK =0x01, ZERO =0x00 }MaskEnum; #define ISCII_CNV_PREFIX "ISCII,version=" typedef struct{ UChar contextCharToUnicode; /* previous Unicode codepoint for contextual analysis */ UChar contextCharFromUnicode; /* previous Unicode codepoint for contextual analysis */ uint16_t defDeltaToUnicode; /* delta for switching to default state when DEF is encountered */ uint16_t currentDeltaFromUnicode;/* current delta in Indic block */ uint16_t currentDeltaToUnicode; /* current delta in Indic block */ MaskEnum currentMaskFromUnicode; /* mask for current state in toUnicode */ MaskEnum currentMaskToUnicode; /* mask for current state in toUnicode */ MaskEnum defMaskToUnicode; /* mask for default state in toUnicode */ UBool isFirstBuffer; /* boolean for fromUnicode to see if we need to announce the first script */ UBool resetToDefaultToUnicode; /* boolean for reseting to default delta and mask when a newline is encountered*/ char name[sizeof(ISCII_CNV_PREFIX) + 1]; }UConverterDataISCII; typedef struct LookupDataStruct { UniLang uniLang; MaskEnum maskEnum; ISCIILang isciiLang; } LookupDataStruct; static const LookupDataStruct lookupInitialData[]={ { DEVANAGARI, DEV_MASK, DEV }, { BENGALI, BNG_MASK, BNG }, { GURMUKHI, PNJ_MASK, PNJ }, { GUJARATI, GJR_MASK, GJR }, { ORIYA, ORI_MASK, ORI }, { TAMIL, TML_MASK, TML }, { TELUGU, KND_MASK, TLG }, { KANNADA, KND_MASK, KND }, { MALAYALAM, MLM_MASK, MLM } }; static void _ISCIIOpen(UConverter *cnv, const char *name,const char *locale,uint32_t options, UErrorCode *errorCode){ cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISCII)); if(cnv->extraInfo != NULL) { int32_t len=0; UConverterDataISCII *converterData=(UConverterDataISCII *) cnv->extraInfo; converterData->contextCharToUnicode=NO_CHAR_MARKER; cnv->toUnicodeStatus = missingCharMarker; converterData->contextCharFromUnicode=0x0000; 
converterData->resetToDefaultToUnicode=FALSE; /* check if the version requested is supported */ if((options & UCNV_OPTIONS_VERSION_MASK) < 9){ /* initialize state variables */ converterData->currentDeltaFromUnicode=converterData->currentDeltaToUnicode= converterData->defDeltaToUnicode= (uint16_t)(lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK].uniLang * DELTA); converterData->currentMaskFromUnicode = converterData->currentMaskToUnicode = converterData->defMaskToUnicode=lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK].maskEnum; converterData->isFirstBuffer=TRUE; (void)uprv_strcpy(converterData->name, ISCII_CNV_PREFIX); len = (int32_t)uprv_strlen(converterData->name); converterData->name[len]= (char)((options & UCNV_OPTIONS_VERSION_MASK) + '0'); converterData->name[len+1]=0; }else{ uprv_free(cnv->extraInfo); cnv->extraInfo = NULL; *errorCode = U_ILLEGAL_ARGUMENT_ERROR; } }else{ *errorCode =U_MEMORY_ALLOCATION_ERROR; } } static void _ISCIIClose(UConverter *cnv){ if(cnv->extraInfo!=NULL) { if(!cnv->isExtraLocal) { uprv_free(cnv->extraInfo); } cnv->extraInfo=NULL; } } static const char* _ISCIIgetName(const UConverter* cnv){ if(cnv->extraInfo){ UConverterDataISCII* myData= (UConverterDataISCII*)cnv->extraInfo; return myData->name; } return NULL; } static void _ISCIIReset(UConverter *cnv, UConverterResetChoice choice){ UConverterDataISCII* data =(UConverterDataISCII *) (cnv->extraInfo); if(choice<=UCNV_RESET_TO_UNICODE) { cnv->toUnicodeStatus = missingCharMarker; cnv->mode=0; data->currentDeltaToUnicode=data->defDeltaToUnicode; data->currentMaskToUnicode = data->defMaskToUnicode; data->contextCharToUnicode=NO_CHAR_MARKER; } if(choice!=UCNV_RESET_TO_UNICODE) { cnv->fromUChar32=0x0000; data->contextCharFromUnicode=0x00; data->currentMaskFromUnicode=data->defMaskToUnicode; data->currentDeltaFromUnicode=data->defDeltaToUnicode; data->isFirstBuffer=TRUE; data->resetToDefaultToUnicode=FALSE; } } /** * The values in validity table are indexed by the lower bits of 
Unicode * range 0x0900 - 0x09ff. The values have a structure like: * --------------------------------------------------------------- * | DEV | PNJ | GJR | ORI | BNG | TLG | MLM | TML | * | | | | | ASM | KND | | | * --------------------------------------------------------------- * If a code point is valid in a particular script * then that bit is turned on * * Unicode does not distinguish between Bengali and Assamese so we use 1 bit for * to represent these languages * * Telugu and Kannada have same codepoints except for Vocallic_RR which we special case * and combine and use 1 bit to represent these languages. * * TODO: It is probably easier to understand and maintain to change this * to use uint16_t and give each of the 9 Unicode/script blocks its own bit. */
/*
 * Usage notes:
 * - The converters index this table with the low byte of a code point that
 *   has been normalized to the Devanagari block and AND the entry with the
 *   MaskEnum value of the current script; a zero result is treated as
 *   "not valid in this script".
 * - Only 0x900..0x970 and one further entry are listed explicitly; the
 *   remaining entries of the 128-element array are zero-initialized.
 * - Each row comment reads ISCII byte : validity value : Unicode code point.
 *   The validity value in the comment is informational only and does not
 *   always equal the sum on the row (e.g. 0x92e is annotated 0xfe but the
 *   masks add up to 0xff); the mask expression is what the code uses.
 */
static const uint8_t validityTable[128] = { /* This state table is tool generated please do not edit unless you know exactly what you are doing */ /* Note: This table was edited to mirror the Windows XP implementation */ /*ISCII:Valid:Unicode */ /*0xa0 : 0x00: 0x900 */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xa1 : 0xb8: 0x901 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , /*0xa2 : 0xfe: 0x902 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xa3 : 0xbf: 0x903 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0x00 : 0x00: 0x904 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xa4 : 0xff: 0x905 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xa5 : 0xff: 0x906 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xa6 : 0xff: 0x907 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xa7 : 0xff: 0x908 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xa8 : 0xff: 0x909 */ DEV_MASK + PNJ_MASK + 
GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xa9 : 0xff: 0x90a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xaa : 0xfe: 0x90b */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0x00 : 0x00: 0x90c */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xae : 0x80: 0x90d */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xab : 0x87: 0x90e */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , /*0xac : 0xff: 0x90f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xad : 0xff: 0x910 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xb2 : 0x80: 0x911 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xaf : 0x87: 0x912 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , /*0xb0 : 0xff: 0x913 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xb1 : 0xff: 0x914 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xb3 : 0xff: 0x915 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xb4 : 0xfe: 0x916 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xb5 : 0xfe: 0x917 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xb6 : 0xfe: 0x918 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xb7 : 0xff: 0x919 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xb8 : 0xff: 0x91a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xb9 : 0xfe: 0x91b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xba : 0xff: 0x91c */ DEV_MASK 
+ PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xbb : 0xfe: 0x91d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xbc : 0xff: 0x91e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xbd : 0xff: 0x91f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xbe : 0xfe: 0x920 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xbf : 0xfe: 0x921 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xc0 : 0xfe: 0x922 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xc1 : 0xff: 0x923 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xc2 : 0xff: 0x924 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xc3 : 0xfe: 0x925 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xc4 : 0xfe: 0x926 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xc5 : 0xfe: 0x927 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xc6 : 0xff: 0x928 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xc7 : 0x81: 0x929 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + TML_MASK , /*0xc8 : 0xff: 0x92a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xc9 : 0xfe: 0x92b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xca : 0xfe: 0x92c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xcb : 0xfe: 0x92d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xcc : 0xfe: 0x92e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + 
KND_MASK + MLM_MASK + TML_MASK , /*0xcd : 0xff: 0x92f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xcf : 0xff: 0x930 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xd0 : 0x87: 0x931 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , /*0xd1 : 0xff: 0x932 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xd2 : 0xb7: 0x933 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK , /*0xd3 : 0x83: 0x934 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + MLM_MASK + TML_MASK , /*0xd4 : 0xff: 0x935 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK , /*0xd5 : 0xfe: 0x936 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0xd6 : 0xbf: 0x937 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xd7 : 0xff: 0x938 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xd8 : 0xff: 0x939 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0x00 : 0x00: 0x93A */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x93B */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xe9 : 0xda: 0x93c */ DEV_MASK + PNJ_MASK + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x93d */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xda : 0xff: 0x93e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xdb : 0xff: 0x93f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xdc : 0xff: 0x940 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xdd : 0xff: 0x941 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK 
, /*0xde : 0xff: 0x942 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xdf : 0xbe: 0x943 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0x00 : 0x00: 0x944 */ DEV_MASK + ZERO + GJR_MASK + ZERO + BNG_MASK + KND_MASK + ZERO + ZERO , /*0xe3 : 0x80: 0x945 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xe0 : 0x87: 0x946 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , /*0xe1 : 0xff: 0x947 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xe2 : 0xff: 0x948 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xe7 : 0x80: 0x949 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xe4 : 0x87: 0x94a */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK , /*0xe5 : 0xff: 0x94b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xe6 : 0xff: 0x94c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xe8 : 0xff: 0x94d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xec : 0x00: 0x94e */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xed : 0x00: 0x94f */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x950 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x951 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x952 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x953 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x954 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x955 */ ZERO + ZERO + ZERO + ZERO + ZERO + KND_MASK + ZERO + ZERO , /*0x00 : 0x00: 0x956 */ ZERO + ZERO + ZERO + ORI_MASK + ZERO + KND_MASK + ZERO + ZERO , 
/*0x00 : 0x00: 0x957 */ ZERO + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + MLM_MASK + ZERO , /*0x00 : 0x00: 0x958 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x959 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x95a */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x95b */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x95c */ DEV_MASK + PNJ_MASK + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x95d */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x95e */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xce : 0x98: 0x95f */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x960 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0x00 : 0x00: 0x961 */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO , /*0x00 : 0x00: 0x962 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO , /*0x00 : 0x00: 0x963 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO , /*0xea : 0xf8: 0x964 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xeaea : 0x00: 0x965*/ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /*0xf1 : 0xff: 0x966 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xf2 : 0xff: 0x967 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xf3 : 0xff: 0x968 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xf4 : 0xff: 0x969 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xf5 : 0xff: 0x96a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xf6 : 0xff: 0x96b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + 
KND_MASK + MLM_MASK + TML_MASK , /*0xf7 : 0xff: 0x96c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xf8 : 0xff: 0x96d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xf9 : 0xff: 0x96e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0xfa : 0xff: 0x96f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK , /*0x00 : 0x80: 0x970 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO , /* * The length of the array is 128 to provide values for 0x900..0x97f. * The last 15 entries for 0x971..0x97f of the validity table are all zero * because no Indic script uses such Unicode code points. */ /*0x00 : 0x00: 0x9yz */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO };
/*
 * fromUnicodeTable: ISCII byte sequence for each code point 0x0900..0x097f.
 * The fromUnicode converter indexes it with the low byte of the code point
 * after the code point has been normalized to the Devanagari block.
 * Values up to 0xFF are a single ISCII byte; larger values are two bytes
 * which WRITE_TO_TARGET_FROM_U emits high byte first (e.g. 0xb3e9 is 0xb3
 * followed by ISCII_NUKTA). 0xFFFF marks code points without an ISCII
 * mapping (presumably equal to missingCharMarker, which is defined outside
 * this file -- confirm).
 */
static const uint16_t fromUnicodeTable[128]={ 0x00a0 ,/* 0x0900 */ 0x00a1 ,/* 0x0901 */ 0x00a2 ,/* 0x0902 */ 0x00a3 ,/* 0x0903 */ 0xa4e0 ,/* 0x0904 */ 0x00a4 ,/* 0x0905 */ 0x00a5 ,/* 0x0906 */ 0x00a6 ,/* 0x0907 */ 0x00a7 ,/* 0x0908 */ 0x00a8 ,/* 0x0909 */ 0x00a9 ,/* 0x090a */ 0x00aa ,/* 0x090b */ 0xA6E9 ,/* 0x090c */ 0x00ae ,/* 0x090d */ 0x00ab ,/* 0x090e */ 0x00ac ,/* 0x090f */ 0x00ad ,/* 0x0910 */ 0x00b2 ,/* 0x0911 */ 0x00af ,/* 0x0912 */ 0x00b0 ,/* 0x0913 */ 0x00b1 ,/* 0x0914 */ 0x00b3 ,/* 0x0915 */ 0x00b4 ,/* 0x0916 */ 0x00b5 ,/* 0x0917 */ 0x00b6 ,/* 0x0918 */ 0x00b7 ,/* 0x0919 */ 0x00b8 ,/* 0x091a */ 0x00b9 ,/* 0x091b */ 0x00ba ,/* 0x091c */ 0x00bb ,/* 0x091d */ 0x00bc ,/* 0x091e */ 0x00bd ,/* 0x091f */ 0x00be ,/* 0x0920 */ 0x00bf ,/* 0x0921 */ 0x00c0 ,/* 0x0922 */ 0x00c1 ,/* 0x0923 */ 0x00c2 ,/* 0x0924 */ 0x00c3 ,/* 0x0925 */ 0x00c4 ,/* 0x0926 */ 0x00c5 ,/* 0x0927 */ 0x00c6 ,/* 0x0928 */ 0x00c7 ,/* 0x0929 */ 0x00c8 ,/* 0x092a */ 0x00c9 ,/* 0x092b */ 0x00ca ,/* 0x092c */ 0x00cb ,/* 0x092d */ 0x00cc ,/* 0x092e */ 0x00cd ,/* 0x092f */ 0x00cf ,/* 0x0930 */ 0x00d0 ,/* 0x0931 */ 0x00d1 ,/* 0x0932 */ 0x00d2 ,/* 0x0933 */ 
0x00d3 ,/* 0x0934 */ 0x00d4 ,/* 0x0935 */ 0x00d5 ,/* 0x0936 */ 0x00d6 ,/* 0x0937 */ 0x00d7 ,/* 0x0938 */ 0x00d8 ,/* 0x0939 */ 0xFFFF ,/* 0x093A */ 0xFFFF ,/* 0x093B */ 0x00e9 ,/* 0x093c */ 0xEAE9 ,/* 0x093d */ 0x00da ,/* 0x093e */ 0x00db ,/* 0x093f */ 0x00dc ,/* 0x0940 */ 0x00dd ,/* 0x0941 */ 0x00de ,/* 0x0942 */ 0x00df ,/* 0x0943 */ 0xDFE9 ,/* 0x0944 */ 0x00e3 ,/* 0x0945 */ 0x00e0 ,/* 0x0946 */ 0x00e1 ,/* 0x0947 */ 0x00e2 ,/* 0x0948 */ 0x00e7 ,/* 0x0949 */ 0x00e4 ,/* 0x094a */ 0x00e5 ,/* 0x094b */ 0x00e6 ,/* 0x094c */ 0x00e8 ,/* 0x094d */ 0x00ec ,/* 0x094e */ 0x00ed ,/* 0x094f */ 0xA1E9 ,/* 0x0950 */ /* OM Symbol */ 0xFFFF ,/* 0x0951 */ 0xF0B8 ,/* 0x0952 */ 0xFFFF ,/* 0x0953 */ 0xFFFF ,/* 0x0954 */ 0xFFFF ,/* 0x0955 */ 0xFFFF ,/* 0x0956 */ 0xFFFF ,/* 0x0957 */ 0xb3e9 ,/* 0x0958 */ 0xb4e9 ,/* 0x0959 */ 0xb5e9 ,/* 0x095a */ 0xbae9 ,/* 0x095b */ 0xbfe9 ,/* 0x095c */ 0xC0E9 ,/* 0x095d */ 0xc9e9 ,/* 0x095e */ 0x00ce ,/* 0x095f */ 0xAAe9 ,/* 0x0960 */ 0xA7E9 ,/* 0x0961 */ 0xDBE9 ,/* 0x0962 */ 0xDCE9 ,/* 0x0963 */ 0x00ea ,/* 0x0964 */ 0xeaea ,/* 0x0965 */ 0x00f1 ,/* 0x0966 */ 0x00f2 ,/* 0x0967 */ 0x00f3 ,/* 0x0968 */ 0x00f4 ,/* 0x0969 */ 0x00f5 ,/* 0x096a */ 0x00f6 ,/* 0x096b */ 0x00f7 ,/* 0x096c */ 0x00f8 ,/* 0x096d */ 0x00f9 ,/* 0x096e */ 0x00fa ,/* 0x096f */ 0xF0BF ,/* 0x0970 */ 0xFFFF ,/* 0x0971 */ 0xFFFF ,/* 0x0972 */ 0xFFFF ,/* 0x0973 */ 0xFFFF ,/* 0x0974 */ 0xFFFF ,/* 0x0975 */ 0xFFFF ,/* 0x0976 */ 0xFFFF ,/* 0x0977 */ 0xFFFF ,/* 0x0978 */ 0xFFFF ,/* 0x0979 */ 0xFFFF ,/* 0x097a */ 0xFFFF ,/* 0x097b */ 0xFFFF ,/* 0x097c */ 0xFFFF ,/* 0x097d */ 0xFFFF ,/* 0x097e */ 0xFFFF ,/* 0x097f */ };
/*
 * toUnicodeTable: Unicode value for each ISCII byte 0x00..0xff.
 * Bytes 0x00..0xa0 map to the same numeric value. Bytes above that are
 * given as Devanagari-block code points; WRITE_TO_TARGET_TO_U adds the
 * delta of the current script when the value is written. 0xd9 maps to
 * 0x200D. 0xFFFF marks bytes without a direct mapping, which includes
 * ATR (0xef) and EXT (0xf0).
 */
static const uint16_t toUnicodeTable[256]={ 0x0000,/* 0x00 */ 0x0001,/* 0x01 */ 0x0002,/* 0x02 */ 0x0003,/* 0x03 */ 0x0004,/* 0x04 */ 0x0005,/* 0x05 */ 0x0006,/* 0x06 */ 0x0007,/* 0x07 */ 0x0008,/* 0x08 */ 0x0009,/* 0x09 */ 0x000a,/* 0x0a */ 0x000b,/* 0x0b */ 0x000c,/* 0x0c */ 0x000d,/* 0x0d */ 0x000e,/* 0x0e */ 0x000f,/* 0x0f */ 0x0010,/* 0x10 */ 0x0011,/* 0x11 */ 0x0012,/* 0x12 
*/ 0x0013,/* 0x13 */ 0x0014,/* 0x14 */ 0x0015,/* 0x15 */ 0x0016,/* 0x16 */ 0x0017,/* 0x17 */ 0x0018,/* 0x18 */ 0x0019,/* 0x19 */ 0x001a,/* 0x1a */ 0x001b,/* 0x1b */ 0x001c,/* 0x1c */ 0x001d,/* 0x1d */ 0x001e,/* 0x1e */ 0x001f,/* 0x1f */ 0x0020,/* 0x20 */ 0x0021,/* 0x21 */ 0x0022,/* 0x22 */ 0x0023,/* 0x23 */ 0x0024,/* 0x24 */ 0x0025,/* 0x25 */ 0x0026,/* 0x26 */ 0x0027,/* 0x27 */ 0x0028,/* 0x28 */ 0x0029,/* 0x29 */ 0x002a,/* 0x2a */ 0x002b,/* 0x2b */ 0x002c,/* 0x2c */ 0x002d,/* 0x2d */ 0x002e,/* 0x2e */ 0x002f,/* 0x2f */ 0x0030,/* 0x30 */ 0x0031,/* 0x31 */ 0x0032,/* 0x32 */ 0x0033,/* 0x33 */ 0x0034,/* 0x34 */ 0x0035,/* 0x35 */ 0x0036,/* 0x36 */ 0x0037,/* 0x37 */ 0x0038,/* 0x38 */ 0x0039,/* 0x39 */ 0x003A,/* 0x3A */ 0x003B,/* 0x3B */ 0x003c,/* 0x3c */ 0x003d,/* 0x3d */ 0x003e,/* 0x3e */ 0x003f,/* 0x3f */ 0x0040,/* 0x40 */ 0x0041,/* 0x41 */ 0x0042,/* 0x42 */ 0x0043,/* 0x43 */ 0x0044,/* 0x44 */ 0x0045,/* 0x45 */ 0x0046,/* 0x46 */ 0x0047,/* 0x47 */ 0x0048,/* 0x48 */ 0x0049,/* 0x49 */ 0x004a,/* 0x4a */ 0x004b,/* 0x4b */ 0x004c,/* 0x4c */ 0x004d,/* 0x4d */ 0x004e,/* 0x4e */ 0x004f,/* 0x4f */ 0x0050,/* 0x50 */ 0x0051,/* 0x51 */ 0x0052,/* 0x52 */ 0x0053,/* 0x53 */ 0x0054,/* 0x54 */ 0x0055,/* 0x55 */ 0x0056,/* 0x56 */ 0x0057,/* 0x57 */ 0x0058,/* 0x58 */ 0x0059,/* 0x59 */ 0x005a,/* 0x5a */ 0x005b,/* 0x5b */ 0x005c,/* 0x5c */ 0x005d,/* 0x5d */ 0x005e,/* 0x5e */ 0x005f,/* 0x5f */ 0x0060,/* 0x60 */ 0x0061,/* 0x61 */ 0x0062,/* 0x62 */ 0x0063,/* 0x63 */ 0x0064,/* 0x64 */ 0x0065,/* 0x65 */ 0x0066,/* 0x66 */ 0x0067,/* 0x67 */ 0x0068,/* 0x68 */ 0x0069,/* 0x69 */ 0x006a,/* 0x6a */ 0x006b,/* 0x6b */ 0x006c,/* 0x6c */ 0x006d,/* 0x6d */ 0x006e,/* 0x6e */ 0x006f,/* 0x6f */ 0x0070,/* 0x70 */ 0x0071,/* 0x71 */ 0x0072,/* 0x72 */ 0x0073,/* 0x73 */ 0x0074,/* 0x74 */ 0x0075,/* 0x75 */ 0x0076,/* 0x76 */ 0x0077,/* 0x77 */ 0x0078,/* 0x78 */ 0x0079,/* 0x79 */ 0x007a,/* 0x7a */ 0x007b,/* 0x7b */ 0x007c,/* 0x7c */ 0x007d,/* 0x7d */ 0x007e,/* 0x7e */ 0x007f,/* 0x7f */ 0x0080,/* 0x80 */ 0x0081,/* 0x81 
*/ 0x0082,/* 0x82 */ 0x0083,/* 0x83 */ 0x0084,/* 0x84 */ 0x0085,/* 0x85 */ 0x0086,/* 0x86 */ 0x0087,/* 0x87 */ 0x0088,/* 0x88 */ 0x0089,/* 0x89 */ 0x008a,/* 0x8a */ 0x008b,/* 0x8b */ 0x008c,/* 0x8c */ 0x008d,/* 0x8d */ 0x008e,/* 0x8e */ 0x008f,/* 0x8f */ 0x0090,/* 0x90 */ 0x0091,/* 0x91 */ 0x0092,/* 0x92 */ 0x0093,/* 0x93 */ 0x0094,/* 0x94 */ 0x0095,/* 0x95 */ 0x0096,/* 0x96 */ 0x0097,/* 0x97 */ 0x0098,/* 0x98 */ 0x0099,/* 0x99 */ 0x009a,/* 0x9a */ 0x009b,/* 0x9b */ 0x009c,/* 0x9c */ 0x009d,/* 0x9d */ 0x009e,/* 0x9e */ 0x009f,/* 0x9f */ 0x00A0,/* 0xa0 */ 0x0901,/* 0xa1 */ 0x0902,/* 0xa2 */ 0x0903,/* 0xa3 */ 0x0905,/* 0xa4 */ 0x0906,/* 0xa5 */ 0x0907,/* 0xa6 */ 0x0908,/* 0xa7 */ 0x0909,/* 0xa8 */ 0x090a,/* 0xa9 */ 0x090b,/* 0xaa */ 0x090e,/* 0xab */ 0x090f,/* 0xac */ 0x0910,/* 0xad */ 0x090d,/* 0xae */ 0x0912,/* 0xaf */ 0x0913,/* 0xb0 */ 0x0914,/* 0xb1 */ 0x0911,/* 0xb2 */ 0x0915,/* 0xb3 */ 0x0916,/* 0xb4 */ 0x0917,/* 0xb5 */ 0x0918,/* 0xb6 */ 0x0919,/* 0xb7 */ 0x091a,/* 0xb8 */ 0x091b,/* 0xb9 */ 0x091c,/* 0xba */ 0x091d,/* 0xbb */ 0x091e,/* 0xbc */ 0x091f,/* 0xbd */ 0x0920,/* 0xbe */ 0x0921,/* 0xbf */ 0x0922,/* 0xc0 */ 0x0923,/* 0xc1 */ 0x0924,/* 0xc2 */ 0x0925,/* 0xc3 */ 0x0926,/* 0xc4 */ 0x0927,/* 0xc5 */ 0x0928,/* 0xc6 */ 0x0929,/* 0xc7 */ 0x092a,/* 0xc8 */ 0x092b,/* 0xc9 */ 0x092c,/* 0xca */ 0x092d,/* 0xcb */ 0x092e,/* 0xcc */ 0x092f,/* 0xcd */ 0x095f,/* 0xce */ 0x0930,/* 0xcf */ 0x0931,/* 0xd0 */ 0x0932,/* 0xd1 */ 0x0933,/* 0xd2 */ 0x0934,/* 0xd3 */ 0x0935,/* 0xd4 */ 0x0936,/* 0xd5 */ 0x0937,/* 0xd6 */ 0x0938,/* 0xd7 */ 0x0939,/* 0xd8 */ 0x200D,/* 0xd9 */ 0x093e,/* 0xda */ 0x093f,/* 0xdb */ 0x0940,/* 0xdc */ 0x0941,/* 0xdd */ 0x0942,/* 0xde */ 0x0943,/* 0xdf */ 0x0946,/* 0xe0 */ 0x0947,/* 0xe1 */ 0x0948,/* 0xe2 */ 0x0945,/* 0xe3 */ 0x094a,/* 0xe4 */ 0x094b,/* 0xe5 */ 0x094c,/* 0xe6 */ 0x0949,/* 0xe7 */ 0x094d,/* 0xe8 */ 0x093c,/* 0xe9 */ 0x0964,/* 0xea */ 0xFFFF,/* 0xeb */ 0xFFFF,/* 0xec */ 0xFFFF,/* 0xed */ 0xFFFF,/* 0xee */ 0xFFFF,/* 0xef */ 0xFFFF,/* 0xf0 
*/ 0x0966,/* 0xf1 */ 0x0967,/* 0xf2 */ 0x0968,/* 0xf3 */ 0x0969,/* 0xf4 */ 0x096a,/* 0xf5 */ 0x096b,/* 0xf6 */ 0x096c,/* 0xf7 */ 0x096d,/* 0xf8 */ 0x096e,/* 0xf9 */ 0x096f,/* 0xfa */ 0xFFFF,/* 0xfb */ 0xFFFF,/* 0xfc */ 0xFFFF,/* 0xfd */ 0xFFFF,/* 0xfe */ 0xFFFF /* 0xff */ }; static const uint16_t vowelSignESpecialCases[][2]={ { 2 /*length of array*/ , 0 }, { 0xA4 , 0x0904 }, }; static const uint16_t nuktaSpecialCases[][2]={ { 16 /*length of array*/ , 0 }, { 0xA6 , 0x090c }, { 0xEA , 0x093D }, { 0xDF , 0x0944 }, { 0xA1 , 0x0950 }, { 0xb3 , 0x0958 }, { 0xb4 , 0x0959 }, { 0xb5 , 0x095a }, { 0xba , 0x095b }, { 0xbf , 0x095c }, { 0xC0 , 0x095d }, { 0xc9 , 0x095e }, { 0xAA , 0x0960 }, { 0xA7 , 0x0961 }, { 0xDB , 0x0962 }, { 0xDC , 0x0963 }, }; #define WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err){ \ /* write the targetUniChar to target */ \ if(target <targetLimit){ \ if(targetByteUnit <= 0xFF){ \ *(target)++ = (uint8_t)(targetByteUnit); \ if(offsets){ \ *(offsets++) = (int32_t)(source - args->source-1); \ } \ }else{ \ *(target)++ = (uint8_t)(targetByteUnit>>8); \ if(offsets){ \ *(offsets++) = (int32_t)(source - args->source-1); \ } \ if(target < targetLimit){ \ *(target)++ = (uint8_t) targetByteUnit; \ if(offsets){ \ *(offsets++) = (int32_t)(source - args->source-1); \ } \ }else{ \ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \ (uint8_t) (targetByteUnit); \ *err = U_BUFFER_OVERFLOW_ERROR; \ } \ } \ }else{ \ if(targetByteUnit & 0xFF00){ \ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \ (uint8_t) (targetByteUnit >>8); \ } \ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \ (uint8_t) (targetByteUnit); \ *err = U_BUFFER_OVERFLOW_ERROR; \ } \ } /* Rules: * Explicit Halant : * <HALANT> + <ZWNJ> * Soft Halant : * <HALANT> + <ZWJ> */ static void UConverter_fromUnicode_ISCII_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, UErrorCode * err){ const UChar 
*source = args->source; const UChar *sourceLimit = args->sourceLimit; unsigned char *target = (unsigned char *) args->target; unsigned char *targetLimit = (unsigned char *) args->targetLimit; int32_t* offsets = args->offsets; uint32_t targetByteUnit = 0x0000; UChar32 sourceChar = 0x0000; UConverterDataISCII *converterData; uint16_t newDelta=0; uint16_t range = 0; UBool deltaChanged = FALSE; if ((args->converter == NULL) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)){ *err = U_ILLEGAL_ARGUMENT_ERROR; return; } /* initialize data */ converterData=(UConverterDataISCII*)args->converter->extraInfo; newDelta=converterData->currentDeltaFromUnicode; range = (uint16_t)(newDelta/DELTA); if((sourceChar = args->converter->fromUChar32)!=0) { goto getTrail; } /*writing the char to the output stream */ while(source < sourceLimit){ targetByteUnit = missingCharMarker; sourceChar = *source++; /*check if input is in ASCII and C0 control codes range*/ if (sourceChar <= ASCII_END) { WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,sourceChar,err); if(U_FAILURE(*err)){ break; } if(sourceChar == LF){ targetByteUnit = ATR<<8; targetByteUnit += (uint8_t) lookupInitialData[range].isciiLang; args->converter->fromUnicodeStatus=sourceChar; /* now append ATR and language code */ WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err); if(U_FAILURE(*err)){ break; } } continue; } switch(sourceChar){ case ZWNJ: /* contextChar has HALANT */ if(converterData->contextCharFromUnicode){ converterData->contextCharFromUnicode = 0x00; targetByteUnit = ISCII_HALANT; }else{ /* consume ZWNJ and continue */ converterData->contextCharFromUnicode = 0x00; continue; } break; case ZWJ: /* contextChar has HALANT */ if(converterData->contextCharFromUnicode){ targetByteUnit = ISCII_NUKTA; }else{ targetByteUnit =ISCII_INV; } converterData->contextCharFromUnicode = 0x00; break; default: /* is the sourceChar in the INDIC_RANGE? 
*/ if((uint16_t)(INDIC_BLOCK_END-sourceChar) <= INDIC_RANGE){ /* Danda and Double Danda are valid in Northern scripts.. since Unicode * does not include these codepoints in all Northern scrips we need to * filter them out */ if(sourceChar!= DANDA && sourceChar != DOUBLE_DANDA){ /* find out to which block the souceChar belongs*/ range =(uint16_t)((sourceChar-INDIC_BLOCK_BEGIN)/DELTA); newDelta =(uint16_t)(range*DELTA); /* Now are we in the same block as the previous? */ if(newDelta!= converterData->currentDeltaFromUnicode || converterData->isFirstBuffer){ converterData->currentDeltaFromUnicode = newDelta; converterData->currentMaskFromUnicode = lookupInitialData[range].maskEnum; deltaChanged =TRUE; converterData->isFirstBuffer=FALSE; } /* Normalize all Indic codepoints to Devanagari and map them to ISCII */ /* now subtract the new delta from sourceChar*/ sourceChar -= converterData->currentDeltaFromUnicode ; } /* get the target byte unit */ targetByteUnit=fromUnicodeTable[(uint8_t)sourceChar]; /* is the code point valid in current script? 
*/ if((validityTable[(uint8_t)sourceChar] & converterData->currentMaskFromUnicode)==0){ /* Vocallic RR is assigne in ISCII Telugu and Unicode */ if(converterData->currentDeltaFromUnicode!=(TELUGU_DELTA) && sourceChar!=VOCALLIC_RR){ targetByteUnit=missingCharMarker; } } if(deltaChanged){ /* we are in a script block which is different than * previous sourceChar's script block write ATR and language codes */ uint16_t temp=0; temp =(uint16_t)(ATR<<8); temp += (uint16_t)((uint8_t) lookupInitialData[range].isciiLang); /* reset */ deltaChanged=FALSE; /* now append ATR and language code */ WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,temp,err); if(U_FAILURE(*err)){ break; } } } /* reset context char */ converterData->contextCharFromUnicode = 0x00; break; } if(targetByteUnit != missingCharMarker){ if(targetByteUnit==ISCII_HALANT){ converterData->contextCharFromUnicode = (UChar)targetByteUnit; } /* write targetByteUnit to target*/ WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err); if(U_FAILURE(*err)){ break; } } else{ /* oops.. 
the code point is unassigned */ /*check if the char is a First surrogate*/ if(UTF_IS_SURROGATE(sourceChar)) { if(UTF_IS_SURROGATE_FIRST(sourceChar)) { getTrail: /*look ahead to find the trail surrogate*/ if(source < sourceLimit) { /* test the following code unit */ UChar trail= (*source); if(UTF_IS_SECOND_SURROGATE(trail)) { source++; sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); *err =U_INVALID_CHAR_FOUND; /* convert this surrogate code point */ /* exit this condition tree */ } else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */ *err=U_ILLEGAL_CHAR_FOUND; } } else { /* no more input */ *err = U_ZERO_ERROR; } } else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */ *err=U_ILLEGAL_CHAR_FOUND; } } else { /* callback(unassigned) for a BMP code point */ *err = U_INVALID_CHAR_FOUND; } args->converter->fromUChar32=sourceChar; break; } }/* end while(mySourceIndex<mySourceLength) */ /*save the state and return */ args->source = source; args->target = (char*)target; } static const int32_t lookupTable[][2]={ { ZERO, ZERO }, /*DEFALT*/ { ZERO, ZERO }, /*ROMAN*/ { DEVANAGARI, DEV_MASK }, { BENGALI, BNG_MASK }, { TAMIL, TML_MASK }, { TELUGU, KND_MASK }, { BENGALI, BNG_MASK }, { ORIYA, ORI_MASK }, { KANNADA, KND_MASK }, { MALAYALAM, MLM_MASK }, { GUJARATI, GJR_MASK }, { GURMUKHI, PNJ_MASK } }; #define WRITE_TO_TARGET_TO_U(args,source,target,offsets,offset,targetUniChar,delta, err){\ /* add offset to current Indic Block */ \ if(targetUniChar>ASCII_END && \ targetUniChar != ZWJ && \ targetUniChar != ZWNJ && \ targetUniChar != DANDA && \ targetUniChar != DOUBLE_DANDA){ \ \ targetUniChar+=(uint16_t)(delta); \ } \ /* now write the targetUniChar */ \ if(target<args->targetLimit){ \ *(target)++ = (UChar)targetUniChar; \ if(offsets){ \ *(offsets)++ = (int32_t)(offset); \ } \ }else{ \ args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++] = \ (UChar)targetUniChar; \ *err = 
U_BUFFER_OVERFLOW_ERROR; \ } \ } #define GET_MAPPING(sourceChar,targetUniChar,data){ \ targetUniChar = toUnicodeTable[(sourceChar)] ; \ /* is the code point valid in current script? */ \ if(sourceChar> ASCII_END && \ (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode)==0){ \ /* Vocallic RR is assigne in ISCII Telugu and Unicode */ \ if(data->currentDeltaToUnicode!=(TELUGU_DELTA) && \ targetUniChar!=VOCALLIC_RR){ \ targetUniChar=missingCharMarker; \ } \ } \ } /*********** * Rules for ISCII to Unicode converter * ISCII is stateful encoding. To convert ISCII bytes to Unicode, * which has both precomposed and decomposed forms characters * pre-context and post-context need to be considered. * * Post context * i) ATR : Attribute code is used to declare the font and script switching. * Currently we only switch scripts and font codes consumed without generating an error * ii) EXT : Extention code is used to declare switching to Sanskrit and for obscure, * obsolete characters * Pre context * i) Halant: if preceeded by a halant then it is a explicit halant * ii) Nukta : * a) if preceeded by a halant then it is a soft halant * b) if preceeded by specific consonants and the ligatures have pre-composed * characters in Unicode then convert to pre-composed characters * iii) Danda: If Danda is preceeded by a Danda then convert to Double Danda * */ static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, UErrorCode* err){ const char *source = ( char *) args->source; UChar *target = args->target; const char *sourceLimit = args->sourceLimit; const UChar* targetLimit = args->targetLimit; uint32_t targetUniChar = 0x0000; uint8_t sourceChar = 0x0000; UConverterDataISCII* data; UChar32* toUnicodeStatus=NULL; UChar* contextCharToUnicode = NULL; UBool found; int i; if ((args->converter == NULL) || (target < args->target) || (source < args->source)){ *err = U_ILLEGAL_ARGUMENT_ERROR; return; } data = 
(UConverterDataISCII*)(args->converter->extraInfo); contextCharToUnicode = &data->contextCharToUnicode; /* contains previous ISCII codepoint visited */ toUnicodeStatus = (UChar32*)&args->converter->toUnicodeStatus;/* contains the mapping to Unicode of the above codepoint*/ while(source<sourceLimit){ targetUniChar = missingCharMarker; if(target < targetLimit){ sourceChar = (unsigned char)*(source)++; /* look at the post-context preform special processing */ if(*contextCharToUnicode==ATR){ /* If we have ATR in *contextCharToUnicode then we need to change our * state to the Indic Script specified by sourceChar */ /* check if the sourceChar is supported script range*/ if((uint8_t)(PNJ-sourceChar)<=PNJ-DEV){ data->currentDeltaToUnicode = (uint16_t)(lookupTable[sourceChar & 0x0F][0] * DELTA); data->currentMaskToUnicode = (MaskEnum)lookupTable[sourceChar & 0x0F][1] ; } else if(sourceChar==DEF){ /* switch back to default */ data->currentDeltaToUnicode = data->defDeltaToUnicode; data->currentMaskToUnicode = data->defMaskToUnicode; }else{ if((sourceChar >= 0x21 && sourceChar <= 0x3F)){ /* these are display codes consume and continue */ }else{ *err =U_ILLEGAL_CHAR_FOUND; /* reset */ *contextCharToUnicode=NO_CHAR_MARKER; goto CALLBACK; } } /* reset */ *contextCharToUnicode=NO_CHAR_MARKER; continue; }else if(*contextCharToUnicode==EXT){ /* check if sourceChar is in 0xA1-0xEE range */ if((uint8_t) (EXT_RANGE_END - sourceChar) <= (EXT_RANGE_END - EXT_RANGE_BEGIN)){ /* We currently support only Anudatta and Devanagari abbreviation sign */ if(sourceChar==0xBF || sourceChar == 0xB8){ targetUniChar = (sourceChar==0xBF) ? 
DEV_ABBR_SIGN : DEV_ANUDATTA; /* find out if the mapping is valid in this state */ if(validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode){ *contextCharToUnicode= NO_CHAR_MARKER; /* write to target */ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2), targetUniChar,data->currentDeltaToUnicode,err); continue; } } /* byte unit is unassigned */ targetUniChar = missingCharMarker; *err= U_INVALID_CHAR_FOUND; }else{ /* only 0xA1 - 0xEE are legal after EXT char */ *contextCharToUnicode= NO_CHAR_MARKER; *err = U_ILLEGAL_CHAR_FOUND; } goto CALLBACK; }else if(*contextCharToUnicode==ISCII_INV){ if(sourceChar==ISCII_HALANT){ targetUniChar = 0x0020; /* replace with space accoding to Indic FAQ */ }else{ targetUniChar = ZWJ; } /* write to target */ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2), targetUniChar,data->currentDeltaToUnicode,err); /* reset */ *contextCharToUnicode=NO_CHAR_MARKER; } /* look at the pre-context and perform special processing */ switch(sourceChar){ case ISCII_INV: case EXT: /*falls through*/ case ATR: *contextCharToUnicode = (UChar)sourceChar; if(*toUnicodeStatus != missingCharMarker){ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2), *toUnicodeStatus,data->currentDeltaToUnicode,err); *toUnicodeStatus = missingCharMarker; } continue; case ISCII_DANDA: /* handle double danda*/ if(*contextCharToUnicode== ISCII_DANDA){ targetUniChar = DOUBLE_DANDA; /* clear the context */ *contextCharToUnicode = NO_CHAR_MARKER; *toUnicodeStatus = missingCharMarker; }else{ GET_MAPPING(sourceChar,targetUniChar,data); *contextCharToUnicode = sourceChar; } break; case ISCII_HALANT: /* handle explicit halant */ if(*contextCharToUnicode == ISCII_HALANT){ targetUniChar = ZWNJ; /* clear the context */ *contextCharToUnicode = NO_CHAR_MARKER; }else{ GET_MAPPING(sourceChar,targetUniChar,data); *contextCharToUnicode = sourceChar; } break; case 0x0A: /* fall through */ case 0x0D: 
data->resetToDefaultToUnicode = TRUE; GET_MAPPING(sourceChar,targetUniChar,data); *contextCharToUnicode = sourceChar; break; case ISCII_VOWEL_SIGN_E: i=1; found=FALSE; for( ;i<vowelSignESpecialCases[0][0];i++){ if(vowelSignESpecialCases[i][0]==(uint8_t)*contextCharToUnicode){ targetUniChar=vowelSignESpecialCases[i][1]; found=TRUE; break; } } if(found) { /* find out if the mapping is valid in this state */ if(validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode){ /*targetUniChar += data->currentDeltaToUnicode ;*/ *contextCharToUnicode= NO_CHAR_MARKER; *toUnicodeStatus = missingCharMarker; break; } } GET_MAPPING(sourceChar,targetUniChar,data); *contextCharToUnicode = sourceChar; break; case ISCII_NUKTA: /* handle soft halant */ if(*contextCharToUnicode == ISCII_HALANT){ targetUniChar = ZWJ; /* clear the context */ *contextCharToUnicode = NO_CHAR_MARKER; break; }else{ /* try to handle <CHAR> + ISCII_NUKTA special mappings */ i=1; found =FALSE; for( ;i<nuktaSpecialCases[0][0];i++){ if(nuktaSpecialCases[i][0]==(uint8_t)*contextCharToUnicode){ targetUniChar=nuktaSpecialCases[i][1]; found =TRUE; break; } } if(found){ /* find out if the mapping is valid in this state */ if(validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode){ /*targetUniChar += data->currentDeltaToUnicode ;*/ *contextCharToUnicode= NO_CHAR_MARKER; *toUnicodeStatus = missingCharMarker; break; } /* else fall through to default */ } /* else fall through to default */ } default: GET_MAPPING(sourceChar,targetUniChar,data); *contextCharToUnicode = sourceChar; break; } if(*toUnicodeStatus != missingCharMarker){ /* write the previously mapped codepoint */ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2), *toUnicodeStatus,data->currentDeltaToUnicode,err); *toUnicodeStatus = missingCharMarker; } if(targetUniChar != missingCharMarker ){ /* now save the targetUniChar for delayed write */ *toUnicodeStatus = (UChar) targetUniChar; 
if(data->resetToDefaultToUnicode==TRUE){ data->currentDeltaToUnicode = data->defDeltaToUnicode; data->currentMaskToUnicode = data->defMaskToUnicode; data->resetToDefaultToUnicode=FALSE; } }else{ /* we reach here only if targetUniChar == missingCharMarker * so assign codes to reason and err */ *err = U_INVALID_CHAR_FOUND; CALLBACK: args->converter->toUBytes[0] = (uint8_t) sourceChar; args->converter->toULength = 1; break; } } else{ *err =U_BUFFER_OVERFLOW_ERROR; break; } } if(U_SUCCESS(*err) && args->flush && source == sourceLimit) { /* end of the input stream */ UConverter *cnv = args->converter; if(*contextCharToUnicode==ATR || *contextCharToUnicode==EXT || *contextCharToUnicode==ISCII_INV){ /* set toUBytes[] */ cnv->toUBytes[0] = (uint8_t)*contextCharToUnicode; cnv->toULength = 1; /* avoid looping on truncated sequences */ *contextCharToUnicode = NO_CHAR_MARKER; }else{ cnv->toULength = 0; } if(*toUnicodeStatus != missingCharMarker) { /* output a remaining target character */ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source - args->source -1), *toUnicodeStatus,data->currentDeltaToUnicode,err); *toUnicodeStatus = missingCharMarker; } } args->target = target; args->source = source; } /* structure for SafeClone calculations */ struct cloneISCIIStruct { UConverter cnv; UConverterDataISCII mydata; }; static UConverter * _ISCII_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status) { struct cloneISCIIStruct * localClone; int32_t bufferSizeNeeded = sizeof(struct cloneISCIIStruct); if (U_FAILURE(*status)){ return 0; } if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */ *pBufferSize = bufferSizeNeeded; return 0; } localClone = (struct cloneISCIIStruct *)stackBuffer; /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataISCII)); localClone->cnv.extraInfo = &localClone->mydata; localClone->cnv.isExtraLocal = 
TRUE; return &localClone->cnv; } static void _ISCIIGetUnicodeSet(const UConverter *cnv, const USetAdder *sa, UConverterUnicodeSet which, UErrorCode *pErrorCode) { int32_t idx, script; uint8_t mask; /* Since all ISCII versions allow switching to other ISCII scripts, we add all roundtrippable characters to this set. */ sa->addRange(sa->set, 0, ASCII_END); for (script = DEVANAGARI; script <= MALAYALAM; script++) { mask = (uint8_t)(lookupInitialData[script].maskEnum); for (idx = 0; idx < DELTA; idx++) { if (validityTable[idx] & mask) { sa->add(sa->set, idx + (script * DELTA) + INDIC_BLOCK_BEGIN); } } } sa->add(sa->set, DANDA); sa->add(sa->set, DOUBLE_DANDA); sa->add(sa->set, ZWNJ); sa->add(sa->set, ZWJ); } static const UConverterImpl _ISCIIImpl={ UCNV_ISCII, NULL, NULL, _ISCIIOpen, _ISCIIClose, _ISCIIReset, UConverter_toUnicode_ISCII_OFFSETS_LOGIC, UConverter_toUnicode_ISCII_OFFSETS_LOGIC, UConverter_fromUnicode_ISCII_OFFSETS_LOGIC, UConverter_fromUnicode_ISCII_OFFSETS_LOGIC, NULL, NULL, _ISCIIgetName, NULL, _ISCII_SafeClone, _ISCIIGetUnicodeSet }; static const UConverterStaticData _ISCIIStaticData={ sizeof(UConverterStaticData), "ISCII", 0, UCNV_IBM, UCNV_ISCII, 1, 4, { 0x1a, 0, 0, 0 }, 0x1, FALSE, FALSE, 0x0, 0x0, { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */ }; const UConverterSharedData _ISCIIData={ sizeof(UConverterSharedData), ~((uint32_t) 0), NULL, NULL, &_ISCIIStaticData, FALSE, &_ISCIIImpl, 0 }; #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */