/* Copyright 2016 The Chromium OS Authors. All rights reserved. * Use of this source code is governed by a BSD-style license that can be * found in the LICENSE file. */ #include <stdlib.h> #include <stdint.h> #include <sys/types.h> #ifdef CRAS_DBUS #include <dbus/dbus.h> #endif #include "cras_utf8.h" #include "cras_util.h" static const uint8_t kUTF8ByteOrderMask[3] = { 0xef, 0xbb, 0xbf }; typedef struct u8range { uint8_t min; uint8_t max; } u8range_t; static const u8range_t kUTF8TwoByteSeq[] = { { 0xc2, 0xdf }, { 0x80, 0xbf }, { 0, 0 } }; static const u8range_t kUTF8ByteSeqE0[] = { { 0xe0, 0xe0 }, { 0xa0, 0xbf }, { 0x80, 0xbf }, { 0, 0 } }; static const u8range_t kUTF8ByteSeqE1EC[] = { { 0xe1, 0xec }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0, 0 } }; static const u8range_t kUTF8ByteSeqED[] = { { 0xed, 0xed }, { 0x80, 0x9f }, { 0x80, 0xbf }, { 0, 0 } }; static const u8range_t kUTF8ByteSeqEEEF[] = { { 0xee, 0xef }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0, 0 } }; static const u8range_t kUTF8ByteSeqF0[] = { { 0xf0, 0xf0 }, { 0x90, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0, 0 } }; static const u8range_t kUTF8ByteSeqF1F3[] = { { 0xf1, 0xf3 }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0, 0 } }; static const u8range_t kUTF8ByteSeqF4[] = { { 0xf4, 0xf4 }, { 0x80, 0x8f }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0, 0 } }; static const u8range_t kUTF8NullRange[] = { { 0, 0 } }; typedef struct utf8seq { const u8range_t *ranges; } utf8seq_t; static const utf8seq_t kUTF8Sequences[] = { { kUTF8TwoByteSeq }, { kUTF8ByteSeqE0 }, { kUTF8ByteSeqE1EC }, { kUTF8ByteSeqED }, { kUTF8ByteSeqEEEF }, { kUTF8ByteSeqF0 }, { kUTF8ByteSeqF1F3 }, { kUTF8ByteSeqF4 }, { kUTF8NullRange } }; int valid_utf8_string(const char *string, size_t *bad_pos) { int bom_chars = 0; uint8_t byte; const char *pos = string; int ret = 1; const utf8seq_t *seq = NULL; const u8range_t *range = NULL; if (!pos) { ret = 0; goto error; } while ((byte = (uint8_t)*(pos++))) { if (!range || range->min == 0) { if (byte < 128) { /* Ascii character. */ continue; } if (bom_chars < ARRAY_SIZE(kUTF8ByteOrderMask)) { if (byte == kUTF8ByteOrderMask[bom_chars]) { bom_chars++; continue; } else { /* Characters not matching BOM. * Rewind and assume that there is * no BOM. */ bom_chars = ARRAY_SIZE(kUTF8ByteOrderMask); pos = string; continue; } } /* Find the matching sequence of characters by * matching the first character in the sequence. */ seq = kUTF8Sequences; while (seq->ranges->min != 0) { if (byte >= seq->ranges->min && byte <= seq->ranges->max) { /* Matching sequence. */ break; } seq++; } if (seq->ranges->min == 0) { /* Could not find a matching sequence. */ ret = 0; goto error; } /* Found the appropriate sequence. */ range = seq->ranges + 1; continue; } if (byte >= range->min && byte <= range->max) { range++; continue; } /* This character doesn't belong in UTF8. */ ret = 0; goto error; } if (range && range->min != 0) { /* Stopped in the middle of a sequence. */ ret = 0; } error: if (bad_pos) *bad_pos = pos - string - 1; return ret; } #ifdef CRAS_DBUS /* Use the DBus implementation if available to ensure that the UTF-8 * sequences match those expected by the DBus implementation. */ int is_utf8_string(const char *string) { return !!dbus_validate_utf8(string, NULL); } #else int is_utf8_string (const char *string) { return valid_utf8_string(string, NULL); } #endif