@ -60,245 +60,248 @@
error .
*/
utf8_scan_result
is_utf8 ( const unsigned char * str ,
size_t len ,
const char * * message ,
int * faulty_bytes ,
nonstd : : optional < unsigned char > terminator )
is_utf8 ( string_fragment str , nonstd : : optional < unsigned char > terminator )
{
bool has_ansi = false ;
const auto * ustr = str . udata ( ) ;
utf8_scan_result retval ;
ssize_t i = 0 ;
* message = nullptr ;
* faulty_bytes = 0 ;
while ( i < len ) {
if ( str [ i ] = = ' \x1b ' ) {
has_ansi = true ;
while ( i < str . length ( ) ) {
if ( ustr [ i ] = = ' \x1b ' ) {
retval . usr_has_ansi = true ;
}
if ( terminator & & str[ i ] = = terminator . value ( ) ) {
* message = nullptr ;
return { i , has_ansi } ;
if ( terminator & & u str[ i ] = = terminator . value ( ) ) {
retval . usr_term = i ;
break ;
}
if ( str[ i ] < = 0x7F ) /* 00..7F */ {
if ( retval. usr_message ! = nullptr ) {
i + = 1 ;
} else if ( str [ i ] > = 0xC2 & & str [ i ] < = 0xDF ) /* C2..DF 80..BF */ {
if ( i + 1 < len ) /* Expect a 2nd byte */ {
if ( str [ i + 1 ] < 0x80 | | str [ i + 1 ] > 0xBF ) {
* message
continue ;
}
retval . usr_valid_end = i ;
if ( ustr [ i ] < = 0x7F ) /* 00..7F */ {
i + = 1 ;
} else if ( ustr [ i ] > = 0xC2 & & ustr [ i ] < = 0xDF ) /* C2..DF 80..BF */ {
if ( i + 1 < str . length ( ) ) /* Expect a 2nd byte */ {
if ( ustr [ i + 1 ] < 0x80 | | ustr [ i + 1 ] > 0xBF ) {
retval . usr_message
= " After a first byte between C2 and DF, expecting a "
" 2nd byte between 80 and BF " ;
* faulty_bytes = 2 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 2 ;
continue ;
}
} else {
* message
retval . usr_ message
= " After a first byte between C2 and DF, expecting a 2nd "
" byte. " ;
* faulty_bytes = 1 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 1 ;
continue ;
}
i + = 2 ;
} else if ( str[ i ] = = 0xE0 ) /* E0 A0..BF 80..BF */ {
if ( i + 2 < len) /* Expect a 2nd and 3rd byte */ {
if ( str[ i + 1 ] < 0xA0 | | str[ i + 1 ] > 0xBF ) {
* message
} else if ( u str[ i ] = = 0xE0 ) /* E0 A0..BF 80..BF */ {
if ( i + 2 < str. length( ) ) /* Expect a 2nd and 3rd byte */ {
if ( u str[ i + 1 ] < 0xA0 | | u str[ i + 1 ] > 0xBF ) {
retval . usr_ message
= " After a first byte of E0, expecting a 2nd byte "
" between A0 and BF. " ;
* faulty_bytes = 2 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 2 ;
continue ;
}
if ( str[ i + 2 ] < 0x80 | | str[ i + 2 ] > 0xBF ) {
* message
if ( u str[ i + 2 ] < 0x80 | | u str[ i + 2 ] > 0xBF ) {
retval . usr_ message
= " After a first byte of E0, expecting a 3nd byte "
" between 80 and BF. " ;
* faulty_bytes = 3 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 3 ;
continue ;
}
} else {
* message
retval . usr_ message
= " After a first byte of E0, expecting two following "
" bytes. " ;
* faulty_bytes = 1 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 1 ;
continue ;
}
i + = 3 ;
} else if ( str [ i ] > = 0xE1 & & str [ i ] < = 0xEC ) /* E1..EC 80..BF 80..BF */
} else if ( ustr [ i ] > = 0xE1
& & ustr [ i ] < = 0xEC ) /* E1..EC 80..BF 80..BF */
{
if ( i + 2 < len) /* Expect a 2nd and 3rd byte */ {
if ( str[ i + 1 ] < 0x80 | | str[ i + 1 ] > 0xBF ) {
* message
if ( i + 2 < str. length( ) ) /* Expect a 2nd and 3rd byte */ {
if ( u str[ i + 1 ] < 0x80 | | u str[ i + 1 ] > 0xBF ) {
retval . usr_ message
= " After a first byte between E1 and EC, expecting the "
" 2nd byte between 80 and BF. " ;
* faulty_bytes = 2 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 2 ;
continue ;
}
if ( str[ i + 2 ] < 0x80 | | str[ i + 2 ] > 0xBF ) {
* message
if ( u str[ i + 2 ] < 0x80 | | u str[ i + 2 ] > 0xBF ) {
retval . usr_ message
= " After a first byte between E1 and EC, expecting the "
" 3rd byte between 80 and BF. " ;
* faulty_bytes = 3 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 3 ;
continue ;
}
} else {
* message
retval . usr_ message
= " After a first byte between E1 and EC, expecting two "
" following bytes. " ;
* faulty_bytes = 1 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 1 ;
continue ;
}
i + = 3 ;
} else if ( str[ i ] = = 0xED ) /* ED 80..9F 80..BF */ {
if ( i + 2 < len) /* Expect a 2nd and 3rd byte */ {
if ( str[ i + 1 ] < 0x80 | | str[ i + 1 ] > 0x9F ) {
* message
} else if ( u str[ i ] = = 0xED ) /* ED 80..9F 80..BF */ {
if ( i + 2 < str. length( ) ) /* Expect a 2nd and 3rd byte */ {
if ( u str[ i + 1 ] < 0x80 | | u str[ i + 1 ] > 0x9F ) {
retval . usr_ message
= " After a first byte of ED, expecting 2nd byte "
" between 80 and 9F. " ;
* faulty_bytes = 2 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 2 ;
continue ;
}
if ( str[ i + 2 ] < 0x80 | | str[ i + 2 ] > 0xBF ) {
* message
if ( u str[ i + 2 ] < 0x80 | | u str[ i + 2 ] > 0xBF ) {
retval . usr_ message
= " After a first byte of ED, expecting 3rd byte "
" between 80 and BF. " ;
* faulty_bytes = 3 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 3 ;
continue ;
}
} else {
* message
retval . usr_ message
= " After a first byte of ED, expecting two following "
" bytes. " ;
* faulty_bytes = 1 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 1 ;
continue ;
}
i + = 3 ;
} else if ( str [ i ] > = 0xEE & & str [ i ] < = 0xEF ) /* EE..EF 80..BF 80..BF */
} else if ( ustr [ i ] > = 0xEE
& & ustr [ i ] < = 0xEF ) /* EE..EF 80..BF 80..BF */
{
if ( i + 2 < len) /* Expect a 2nd and 3rd byte */ {
if ( str[ i + 1 ] < 0x80 | | str[ i + 1 ] > 0xBF ) {
* message
if ( i + 2 < str. length( ) ) /* Expect a 2nd and 3rd byte */ {
if ( u str[ i + 1 ] < 0x80 | | u str[ i + 1 ] > 0xBF ) {
retval . usr_ message
= " After a first byte between EE and EF, expecting 2nd "
" byte between 80 and BF. " ;
* faulty_bytes = 2 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 2 ;
continue ;
}
if ( str[ i + 2 ] < 0x80 | | str[ i + 2 ] > 0xBF ) {
* message
if ( u str[ i + 2 ] < 0x80 | | u str[ i + 2 ] > 0xBF ) {
retval . usr_ message
= " After a first byte between EE and EF, expecting 3rd "
" byte between 80 and BF. " ;
* faulty_bytes = 3 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 3 ;
continue ;
}
} else {
* message
retval . usr_ message
= " After a first byte between EE and EF, two following "
" bytes. " ;
* faulty_bytes = 1 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 1 ;
continue ;
}
i + = 3 ;
} else if ( str[ i ] = = 0xF0 ) /* F0 90..BF 80..BF 80..BF */ {
if ( i + 3 < len) /* Expect a 2nd, 3rd 3th byte */ {
if ( str[ i + 1 ] < 0x90 | | str[ i + 1 ] > 0xBF ) {
* message
} else if ( u str[ i ] = = 0xF0 ) /* F0 90..BF 80..BF 80..BF */ {
if ( i + 3 < str. length( ) ) /* Expect a 2nd, 3rd 3th byte */ {
if ( u str[ i + 1 ] < 0x90 | | u str[ i + 1 ] > 0xBF ) {
retval . usr_ message
= " After a first byte of F0, expecting 2nd byte "
" between 90 and BF. " ;
* faulty_bytes = 2 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 2 ;
continue ;
}
if ( str[ i + 2 ] < 0x80 | | str[ i + 2 ] > 0xBF ) {
* message
if ( u str[ i + 2 ] < 0x80 | | u str[ i + 2 ] > 0xBF ) {
retval . usr_ message
= " After a first byte of F0, expecting 3rd byte "
" between 80 and BF. " ;
* faulty_bytes = 3 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 3 ;
continue ;
}
if ( str[ i + 3 ] < 0x80 | | str[ i + 3 ] > 0xBF ) {
* message
if ( u str[ i + 3 ] < 0x80 | | u str[ i + 3 ] > 0xBF ) {
retval . usr_ message
= " After a first byte of F0, expecting 4th byte "
" between 80 and BF. " ;
* faulty_bytes = 4 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 4 ;
continue ;
}
} else {
* message
retval . usr_ message
= " After a first byte of F0, expecting three following "
" bytes. " ;
* faulty_bytes = 1 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 1 ;
continue ;
}
i + = 4 ;
} else if ( str[ i ] > = 0xF1
& & str[ i ] < = 0xF3 ) /* F1..F3 80..BF 80..BF 80..BF */
} else if ( u str[ i ] > = 0xF1
& & u str[ i ] < = 0xF3 ) /* F1..F3 80..BF 80..BF 80..BF */
{
if ( i + 3 < len) /* Expect a 2nd, 3rd 3th byte */ {
if ( str[ i + 1 ] < 0x80 | | str[ i + 1 ] > 0xBF ) {
* message
if ( i + 3 < str. length( ) ) /* Expect a 2nd, 3rd 3th byte */ {
if ( u str[ i + 1 ] < 0x80 | | u str[ i + 1 ] > 0xBF ) {
retval . usr_ message
= " After a first byte of F1, F2, or F3, expecting a "
" 2nd byte between 80 and BF. " ;
* faulty_bytes = 2 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 2 ;
continue ;
}
if ( str[ i + 2 ] < 0x80 | | str[ i + 2 ] > 0xBF ) {
* message
if ( u str[ i + 2 ] < 0x80 | | u str[ i + 2 ] > 0xBF ) {
retval . usr_ message
= " After a first byte of F1, F2, or F3, expecting a "
" 3rd byte between 80 and BF. " ;
* faulty_bytes = 3 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 3 ;
continue ;
}
if ( str[ i + 3 ] < 0x80 | | str[ i + 3 ] > 0xBF ) {
* message
if ( u str[ i + 3 ] < 0x80 | | u str[ i + 3 ] > 0xBF ) {
retval . usr_ message
= " After a first byte of F1, F2, or F3, expecting a "
" 4th byte between 80 and BF. " ;
* faulty_bytes = 4 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 4 ;
continue ;
}
} else {
* message
retval . usr_ message
= " After a first byte of F1, F2, or F3, expecting three "
" following bytes. " ;
* faulty_bytes = 1 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 1 ;
continue ;
}
i + = 4 ;
} else if ( str[ i ] = = 0xF4 ) /* F4 80..8F 80..BF 80..BF */ {
if ( i + 3 < len) /* Expect a 2nd, 3rd 3th byte */ {
if ( str[ i + 1 ] < 0x80 | | str[ i + 1 ] > 0x8F ) {
* message
} else if ( u str[ i ] = = 0xF4 ) /* F4 80..8F 80..BF 80..BF */ {
if ( i + 3 < str. length( ) ) /* Expect a 2nd, 3rd 3th byte */ {
if ( u str[ i + 1 ] < 0x80 | | u str[ i + 1 ] > 0x8F ) {
retval . usr_ message
= " After a first byte of F4, expecting 2nd byte "
" between 80 and 8F. " ;
* faulty_bytes = 2 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 2 ;
continue ;
}
if ( str[ i + 2 ] < 0x80 | | str[ i + 2 ] > 0xBF ) {
* message
if ( u str[ i + 2 ] < 0x80 | | u str[ i + 2 ] > 0xBF ) {
retval . usr_ message
= " After a first byte of F4, expecting 3rd byte "
" between 80 and BF. " ;
* faulty_bytes = 3 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 3 ;
continue ;
}
if ( str[ i + 3 ] < 0x80 | | str[ i + 3 ] > 0xBF ) {
* message
if ( u str[ i + 3 ] < 0x80 | | u str[ i + 3 ] > 0xBF ) {
retval . usr_ message
= " After a first byte of F4, expecting 4th byte "
" between 80 and BF. " ;
* faulty_bytes = 4 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 4 ;
continue ;
}
} else {
* message
retval . usr_ message
= " After a first byte of F4, expecting three following "
" bytes. " ;
* faulty_bytes = 1 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 1 ;
continue ;
}
i + = 4 ;
} else {
* message
retval . usr_ message
= " Expecting bytes in the following ranges: 00..7F C2..F4. " ;
* faulty_bytes = 1 ;
return { i , has_ansi } ;
retval . usr_ faulty_bytes = 1 ;
continue ;
}
}
return { - 1 , has_ansi } ;
return retval ;
}