// Copyright (c) 2011 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "base/utf_offset_string_conversions.h" #include <algorithm> #include "base/scoped_ptr.h" #include "base/string_piece.h" #include "base/utf_string_conversion_utils.h" using base::PrepareForUTF16Or32Output; using base::ReadUnicodeCharacter; using base::WriteUnicodeCharacter; // Generalized Unicode converter ----------------------------------------------- // Converts the given source Unicode character type to the given destination // Unicode character type as a STL string. The given input buffer and size // determine the source, and the given output STL string will be replaced by // the result. template<typename SRC_CHAR> bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, std::wstring* output, std::vector<size_t>* offsets_for_adjustment) { if (offsets_for_adjustment) { std::for_each(offsets_for_adjustment->begin(), offsets_for_adjustment->end(), LimitOffset<std::wstring>(src_len)); } // ICU requires 32-bit numbers. bool success = true; AdjustOffset::Adjustments adjustments; int32 src_len32 = static_cast<int32>(src_len); for (int32 i = 0; i < src_len32; i++) { uint32 code_point; size_t original_i = i; size_t chars_written = 0; if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { chars_written = WriteUnicodeCharacter(code_point, output); } else { chars_written = WriteUnicodeCharacter(0xFFFD, output); success = false; } if (offsets_for_adjustment) { // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last // character read, not after it (so that incrementing it in the loop // increment will place it at the right location), so we need to account // for that in determining the amount that was read. adjustments.push_back(AdjustOffset::Adjustment( original_i, i - original_i + 1, chars_written)); } } // Make offset adjustment. if (offsets_for_adjustment && !adjustments.empty()) { std::for_each(offsets_for_adjustment->begin(), offsets_for_adjustment->end(), AdjustOffset(adjustments)); } return success; } // UTF-8 <-> Wide -------------------------------------------------------------- bool UTF8ToWideAndAdjustOffset(const char* src, size_t src_len, std::wstring* output, size_t* offset_for_adjustment) { std::vector<size_t> offsets; if (offset_for_adjustment) offsets.push_back(*offset_for_adjustment); PrepareForUTF16Or32Output(src, src_len, output); bool ret = ConvertUnicode(src, src_len, output, &offsets); if (offset_for_adjustment) *offset_for_adjustment = offsets[0]; return ret; } bool UTF8ToWideAndAdjustOffsets(const char* src, size_t src_len, std::wstring* output, std::vector<size_t>* offsets_for_adjustment) { PrepareForUTF16Or32Output(src, src_len, output); return ConvertUnicode(src, src_len, output, offsets_for_adjustment); } std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, size_t* offset_for_adjustment) { std::vector<size_t> offsets; if (offset_for_adjustment) offsets.push_back(*offset_for_adjustment); std::wstring result; UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result, &offsets); if (offset_for_adjustment) *offset_for_adjustment = offsets[0]; return result; } std::wstring UTF8ToWideAndAdjustOffsets(const base::StringPiece& utf8, std::vector<size_t>* offsets_for_adjustment) { std::wstring result; UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result, offsets_for_adjustment); return result; } // UTF-16 <-> Wide ------------------------------------------------------------- #if defined(WCHAR_T_IS_UTF16) // When wide == UTF-16, then conversions are a NOP. bool UTF16ToWideAndAdjustOffset(const char16* src, size_t src_len, std::wstring* output, size_t* offset_for_adjustment) { output->assign(src, src_len); if (offset_for_adjustment && (*offset_for_adjustment >= src_len)) *offset_for_adjustment = std::wstring::npos; return true; } bool UTF16ToWideAndAdjustOffsets(const char16* src, size_t src_len, std::wstring* output, std::vector<size_t>* offsets_for_adjustment) { output->assign(src, src_len); if (offsets_for_adjustment) { std::for_each(offsets_for_adjustment->begin(), offsets_for_adjustment->end(), LimitOffset<std::wstring>(src_len)); } return true; } std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, size_t* offset_for_adjustment) { if (offset_for_adjustment && (*offset_for_adjustment >= utf16.length())) *offset_for_adjustment = std::wstring::npos; return utf16; } std::wstring UTF16ToWideAndAdjustOffsets( const string16& utf16, std::vector<size_t>* offsets_for_adjustment) { if (offsets_for_adjustment) { std::for_each(offsets_for_adjustment->begin(), offsets_for_adjustment->end(), LimitOffset<std::wstring>(utf16.length())); } return utf16; } #elif defined(WCHAR_T_IS_UTF32) bool UTF16ToWideAndAdjustOffset(const char16* src, size_t src_len, std::wstring* output, size_t* offset_for_adjustment) { std::vector<size_t> offsets; if (offset_for_adjustment) offsets.push_back(*offset_for_adjustment); output->clear(); // Assume that normally we won't have any non-BMP characters so the counts // will be the same. output->reserve(src_len); bool ret = ConvertUnicode(src, src_len, output, &offsets); if (offset_for_adjustment) *offset_for_adjustment = offsets[0]; return ret; } bool UTF16ToWideAndAdjustOffsets(const char16* src, size_t src_len, std::wstring* output, std::vector<size_t>* offsets_for_adjustment) { output->clear(); // Assume that normally we won't have any non-BMP characters so the counts // will be the same. output->reserve(src_len); return ConvertUnicode(src, src_len, output, offsets_for_adjustment); } std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, size_t* offset_for_adjustment) { std::vector<size_t> offsets; if (offset_for_adjustment) offsets.push_back(*offset_for_adjustment); std::wstring result; UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result, &offsets); if (offset_for_adjustment) *offset_for_adjustment = offsets[0]; return result; } std::wstring UTF16ToWideAndAdjustOffsets( const string16& utf16, std::vector<size_t>* offsets_for_adjustment) { std::wstring result; UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result, offsets_for_adjustment); return result; } #endif // defined(WCHAR_T_IS_UTF32) AdjustOffset::Adjustment::Adjustment(size_t location, size_t old_length, size_t new_length) : location(location), old_length(old_length), new_length(new_length) {} AdjustOffset::AdjustOffset(const Adjustments& adjustments) : adjustments_(adjustments) {} void AdjustOffset::operator()(size_t& offset) { if (offset == std::wstring::npos) return; size_t adjustment = 0; for (Adjustments::const_iterator i = adjustments_.begin(); i != adjustments_.end(); ++i) { size_t location = i->location; if (offset == location && i->new_length == 0) { offset = std::wstring::npos; return; } if (offset <= location) break; if (offset < (location + i->old_length)) { offset = std::wstring::npos; return; } adjustment += (i->old_length - i->new_length); } offset -= adjustment; }