// UniUtil.cpp // // Example code for handling UTF-8 encoding // // Otfried Cheong #include class Uni { public: // convert UTF8 to Unicode (forward and backward) enum { KIllegal = 0xffff, KIncomplete = 0xf800, KHeadless = 0xf801 }; static TText16 WChar(const TDesC8 &aDes, TInt &aPos); static TText16 Previous(const TDesC8 &aDes, TInt &aPos); // convert Unicode to UTF8 static TBuf8<3> UTF8(TText16 wc); static void AppendUTF8(TDes8 &aUtf, TText16 aWChar); // Count characters in string static TInt Count(const TDesC8 &aDes); // Find Length of prefix in bytes for given # of chars static TInt Bytes(const TDesC8 &aDes, TInt aNoChars); }; //----------------------------------------------------------------- GLDEF_C const TUint8 bytesFromUTF8[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5}; LOCAL_C const TText16 firstByteMark[7] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0 }; // convert UTF8 to Unicode // special cases: end of string in UTF8 sequence --> KIncomplete // UTF8 sequence stops prematurely --> KIllegal // UTF8 sequence starts with trailer --> KHeadless TText16 Uni::WChar(const TDesC8 &aDes, TInt &aPos) { TUint wch = aDes[aPos++]; if ((wch & 0xc0) == 0x80) { // KHeadless while (aPos < aDes.Length() && (aDes[aPos] & 0xc0) == 0x80) aPos++; return KHeadless; } TInt extraBytes = bytesFromUTF8[wch & 0xff]; wch -= firstByteMark[extraBytes]; while (extraBytes--) { if (aPos >= aDes.Length()) return TText16(KIncomplete); if ((aDes[aPos] & 0xc0) != 0x80) // return TText16(aDes[aPos - 1]); return TText16(KIllegal); wch <<= 6; wch |= aDes[aPos++] & 0x3f; } return TText16(wch); } // same backwards TText16 Uni::Previous(const TDesC8 &aDes, TInt &aPos) { TInt i = aPos - 1; while (i >= 0 && aDes[i] >= 0x80 && aDes[i] < 0xc0) i--; aPos = i; return WChar(aDes, i); } // convert Unicode to UTF8 TBuf8<3> Uni::UTF8(TText16 wch) { TBuf8<3> aBuf; AppendUTF8(aBuf, wch); return aBuf; } // convert Unicode to UTF8 void Uni::AppendUTF8(TDes8 &aUtf, TText16 wch) { if (wch < 0x80) { aUtf.Append(TChar(wch)); } else if (wch < 0x800) { aUtf.Append(TChar(((wch & 0x7c0) >> 6) | 0xc0)); aUtf.Append(TChar((wch & 0x03f) | 0x80)); } else { // We never need to write UCS larger than 0x10000, // so now it's three bytes aUtf.Append(TChar(((wch & 0x0f000) >> 12) | 0xe0)); aUtf.Append(TChar(((wch & 0xfc0) >> 6) | 0x80)); aUtf.Append(TChar((wch & 0x03f) | 0x80)); } } // Count characters in string TInt Uni::Count(const TDesC8 &aDes) { TInt i, k = 0; TUint ch; for (i = 0; i < aDes.Length(); i++) { ch = aDes[i]; if (ch < 0x80 || ch >= 0xc0) k++; } // last character might not be complete return k; } // Find Length of prefix in bytes for given # chars TInt Uni::Bytes(const TDesC8 &aDes, TInt aNoChars) { TInt i = 0; while (aNoChars && i < aDes.Length()) { WChar(aDes, i); aNoChars--; } return i; } // --------------------------------------------------------------------