|
|
|
@ -661,50 +661,132 @@ int strnatcmp(const char *s1, const char *s2, bool ignore_garbage_at_front)
|
|
|
|
|
class IcuStringIterator : public StringIterator
|
|
|
|
|
{
|
|
|
|
|
icu::BreakIterator *char_itr; ///< ICU iterator for characters.
|
|
|
|
|
icu::BreakIterator *word_itr; ///< ICU iterator for words.
|
|
|
|
|
const char *string; ///< Iteration string in UTF-8.
|
|
|
|
|
|
|
|
|
|
SmallVector<UChar, 32> utf16_str; ///< UTF-16 copy of the string.
|
|
|
|
|
SmallVector<size_t, 32> utf16_to_utf8; ///< Mapping from UTF-16 code point position to index in the UTF-8 source string.
|
|
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
IcuStringIterator() : char_itr(NULL)
|
|
|
|
|
IcuStringIterator() : char_itr(NULL), word_itr(NULL)
|
|
|
|
|
{
|
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
|
this->char_itr = icu::BreakIterator::createCharacterInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status);
|
|
|
|
|
this->word_itr = icu::BreakIterator::createWordInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status);
|
|
|
|
|
|
|
|
|
|
*this->utf16_str.Append() = '\0';
|
|
|
|
|
*this->utf16_to_utf8.Append() = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
virtual ~IcuStringIterator()
|
|
|
|
|
{
|
|
|
|
|
delete this->char_itr;
|
|
|
|
|
delete this->word_itr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
virtual void SetString(const char *s)
|
|
|
|
|
{
|
|
|
|
|
this->string = s;
|
|
|
|
|
|
|
|
|
|
/* Unfortunately current ICU versions only provide rudimentary support
|
|
|
|
|
* for word break iterators (especially for CJK languages) in combination
|
|
|
|
|
* with UTF-8 input. As a work around we have to convert the input to
|
|
|
|
|
* UTF-16 and create a mapping back to UTF-8 character indices. */
|
|
|
|
|
this->utf16_str.Clear();
|
|
|
|
|
this->utf16_to_utf8.Clear();
|
|
|
|
|
|
|
|
|
|
while (*s != '\0') {
|
|
|
|
|
size_t idx = s - this->string;
|
|
|
|
|
|
|
|
|
|
WChar c = Utf8Consume(&s);
|
|
|
|
|
if (c < 0x10000) {
|
|
|
|
|
*this->utf16_str.Append() = (UChar)c;
|
|
|
|
|
} else {
|
|
|
|
|
/* Make a surrogate pair. */
|
|
|
|
|
*this->utf16_str.Append() = (UChar)(0xD800 + ((c - 0x10000) >> 10));
|
|
|
|
|
*this->utf16_str.Append() = (UChar)(0xDC00 + ((c - 0x10000) & 0x3FF));
|
|
|
|
|
*this->utf16_to_utf8.Append() = idx;
|
|
|
|
|
}
|
|
|
|
|
*this->utf16_to_utf8.Append() = idx;
|
|
|
|
|
}
|
|
|
|
|
*this->utf16_str.Append() = '\0';
|
|
|
|
|
*this->utf16_to_utf8.Append() = s - this->string;
|
|
|
|
|
|
|
|
|
|
UText text = UTEXT_INITIALIZER;
|
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
|
utext_openUTF8(&text, s, -1, &status);
|
|
|
|
|
utext_openUChars(&text, this->utf16_str.Begin(), this->utf16_str.Length() - 1, &status);
|
|
|
|
|
this->char_itr->setText(&text, status);
|
|
|
|
|
this->word_itr->setText(&text, status);
|
|
|
|
|
this->char_itr->first();
|
|
|
|
|
this->word_itr->first();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
virtual size_t SetCurPosition(size_t pos)
|
|
|
|
|
{
|
|
|
|
|
/* Convert incoming position to an UTF-16 string index. */
|
|
|
|
|
uint utf16_pos = 0;
|
|
|
|
|
for (uint i = 0; i < this->utf16_to_utf8.Length(); i++) {
|
|
|
|
|
if (this->utf16_to_utf8[i] == pos) {
|
|
|
|
|
utf16_pos = i;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* isBoundary has the documented side-effect of setting the current
|
|
|
|
|
* position to the first valid boundary equal to or greater than
|
|
|
|
|
* the passed value. */
|
|
|
|
|
this->char_itr->isBoundary((int32_t)pos);
|
|
|
|
|
return this->char_itr->current();
|
|
|
|
|
this->char_itr->isBoundary(utf16_pos);
|
|
|
|
|
return this->utf16_to_utf8[this->char_itr->current()];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
virtual size_t Next()
|
|
|
|
|
virtual size_t Next(IterType what)
|
|
|
|
|
{
|
|
|
|
|
int32_t pos = this->char_itr->next();
|
|
|
|
|
return pos == icu::BreakIterator::DONE ? END : pos;
|
|
|
|
|
int32_t pos;
|
|
|
|
|
switch (what) {
|
|
|
|
|
case ITER_CHARACTER:
|
|
|
|
|
pos = this->char_itr->next();
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case ITER_WORD:
|
|
|
|
|
pos = this->word_itr->following(this->char_itr->current());
|
|
|
|
|
/* The ICU word iterator considers both the start and the end of a word a valid
|
|
|
|
|
* break point, but we only want word starts. Move to the next location in
|
|
|
|
|
* case the new position points to whitespace. */
|
|
|
|
|
while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->next();
|
|
|
|
|
|
|
|
|
|
this->char_itr->isBoundary(pos);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
NOT_REACHED();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
virtual size_t Prev()
|
|
|
|
|
virtual size_t Prev(IterType what)
|
|
|
|
|
{
|
|
|
|
|
int32_t pos = this->char_itr->previous();
|
|
|
|
|
return pos == icu::BreakIterator::DONE ? END : pos;
|
|
|
|
|
int32_t pos;
|
|
|
|
|
switch (what) {
|
|
|
|
|
case ITER_CHARACTER:
|
|
|
|
|
pos = this->char_itr->previous();
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case ITER_WORD:
|
|
|
|
|
pos = this->word_itr->preceding(this->char_itr->current());
|
|
|
|
|
/* The ICU word iterator considers both the start and the end of a word a valid
|
|
|
|
|
* break point, but we only want word starts. Move to the previous location in
|
|
|
|
|
* case the new position points to whitespace. */
|
|
|
|
|
while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->previous();
|
|
|
|
|
|
|
|
|
|
this->char_itr->isBoundary(pos);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
NOT_REACHED();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
@ -742,26 +824,79 @@ public:
|
|
|
|
|
return this->cur_pos = pos;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
virtual size_t Next()
|
|
|
|
|
virtual size_t Next(IterType what)
|
|
|
|
|
{
|
|
|
|
|
assert(this->string != NULL);
|
|
|
|
|
|
|
|
|
|
/* Already at the end? */
|
|
|
|
|
if (this->cur_pos >= this->len) return END;
|
|
|
|
|
|
|
|
|
|
WChar c;
|
|
|
|
|
this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
|
|
|
|
|
return this->cur_pos;
|
|
|
|
|
switch (what) {
|
|
|
|
|
case ITER_CHARACTER: {
|
|
|
|
|
WChar c;
|
|
|
|
|
this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
|
|
|
|
|
return this->cur_pos;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
case ITER_WORD: {
|
|
|
|
|
WChar c;
|
|
|
|
|
/* Consume current word. */
|
|
|
|
|
size_t offs = Utf8Decode(&c, this->string + this->cur_pos);
|
|
|
|
|
while (this->cur_pos < this->len && !IsWhitespace(c)) {
|
|
|
|
|
this->cur_pos += offs;
|
|
|
|
|
offs = Utf8Decode(&c, this->string + this->cur_pos);
|
|
|
|
|
}
|
|
|
|
|
/* Consume whitespace to the next word. */
|
|
|
|
|
while (this->cur_pos < this->len && IsWhitespace(c)) {
|
|
|
|
|
this->cur_pos += offs;
|
|
|
|
|
offs = Utf8Decode(&c, this->string + this->cur_pos);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return this->cur_pos;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
NOT_REACHED();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return END;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
virtual size_t Prev()
|
|
|
|
|
virtual size_t Prev(IterType what)
|
|
|
|
|
{
|
|
|
|
|
assert(this->string != NULL);
|
|
|
|
|
|
|
|
|
|
/* Already at the beginning? */
|
|
|
|
|
if (this->cur_pos == 0) return END;
|
|
|
|
|
|
|
|
|
|
return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
|
|
|
|
|
switch (what) {
|
|
|
|
|
case ITER_CHARACTER:
|
|
|
|
|
return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
|
|
|
|
|
|
|
|
|
|
case ITER_WORD: {
|
|
|
|
|
const char *s = this->string + this->cur_pos;
|
|
|
|
|
WChar c;
|
|
|
|
|
/* Consume preceding whitespace. */
|
|
|
|
|
do {
|
|
|
|
|
s = Utf8PrevChar(s);
|
|
|
|
|
Utf8Decode(&c, s);
|
|
|
|
|
} while (s > this->string && IsWhitespace(c));
|
|
|
|
|
/* Consume preceding word. */
|
|
|
|
|
while (s > this->string && !IsWhitespace(c)) {
|
|
|
|
|
s = Utf8PrevChar(s);
|
|
|
|
|
Utf8Decode(&c, s);
|
|
|
|
|
}
|
|
|
|
|
/* Move caret back to the beginning of the word. */
|
|
|
|
|
if (IsWhitespace(c)) Utf8Consume(&s);
|
|
|
|
|
|
|
|
|
|
return this->cur_pos = s - this->string;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
NOT_REACHED();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return END;
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|