Edit File by line
/home/zeestwma/ceyloniy.../wp-inclu...
File: compat-utf8.php
<?php
[0] Fix | Delete
[1] Fix | Delete
/**
 * Finds spans of valid and invalid UTF-8 bytes in a given string.
 *
 * This is a low-level tool to power various UTF-8 functionality.
 * It scans through a string until it finds invalid byte spans.
 * When it does this, it does three things:
 *
 *  - Assigns `$at` to the position after the last successful code point.
 *  - Assigns `$invalid_length` to the length of the maximal subpart of
 *    the invalid bytes starting at `$at`.
 *  - Returns how many code points were successfully scanned.
 *
 * This information is enough to build a number of useful UTF-8 functions.
 *
 * Example:
 *
 *     // ñ is U+00F1, which in `ISO-8859-1`/`latin1`/`Windows-1252`/`cp1252` is 0xF1.
 *     "Pi\xF1a" === $pineapple = mb_convert_encoding( "Piña", 'Windows-1252', 'UTF-8' );
 *     $at = $invalid_length = 0;
 *
 *     // The first step finds the invalid 0xF1 byte.
 *     2 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
 *     $at === 2; $invalid_length === 1;
 *
 *     // Skip the invalid span before resuming, otherwise the same byte is found again.
 *     $at += $invalid_length;
 *
 *     // The second step continues to the end of the string.
 *     1 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
 *     $at === 4; $invalid_length === 0;
 *
 * Note! While passing an options array here might be convenient from a calling-code standpoint,
 *       this function is intended to serve as a very low-level foundation upon which to build
 *       higher level functionality. For the sake of keeping costs explicit all arguments are
 *       passed directly.
 *
 * @since 6.9.0
 * @access private
 *
 * @param string    $bytes             UTF-8 encoded string which might include invalid spans of bytes.
 * @param int       $at                Where to start scanning. Updated to the byte offset where scanning stopped.
 * @param int       $invalid_length    Will be set to how many bytes are to be ignored after `$at`.
 * @param int|null  $max_bytes         Stop scanning after this many bytes have been seen.
 * @param int|null  $max_code_points   Stop scanning after this many code points have been seen.
 * @param bool|null $has_noncharacters Set to indicate if scanned string contained noncharacters.
 *                                     Always a strict boolean once the function returns.
 * @return int How many code points were successfully scanned.
 */
function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
	$byte_length = strlen( $bytes );

	// Without a byte limit the scan runs to the end of the string; this avoids overflowing `$at + PHP_INT_MAX`.
	$end               = null === $max_bytes ? $byte_length : min( $byte_length, $at + $max_bytes );
	$invalid_length    = 0;
	$count             = 0;
	$max_count         = $max_code_points ?? PHP_INT_MAX;
	$has_noncharacters = false;

	for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
		/*
		 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
		 *
		 * This optimization step improves the speed from 10x to 100x
		 * depending on whether the JIT has optimized the function.
		 */
		$ascii_byte_count = strspn(
			$bytes,
			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
			$i,
			$end - $i
		);

		// The ASCII run alone satisfies the code point budget: stop exactly at the limit.
		if ( $count + $ascii_byte_count >= $max_count ) {
			$at    = $i + ( $max_count - $count );
			$count = $max_count;
			return $count;
		}

		$count += $ascii_byte_count;
		$i     += $ascii_byte_count;

		if ( $i >= $end ) {
			$at = $end;
			return $count;
		}

		/*
		 * The above fast-track handled all single-byte UTF-8 characters. What
		 * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.
		 *
		 * Therefore everything past here is checking those multibyte sequences.
		 *
		 * It may look like there’s a need to check against the max bytes here,
		 * but since each match of a single character returns, this function will
		 * bail already if crossing the max-bytes threshold. This function SHALL
		 * NOT return in the middle of a multi-byte character, so if a character
		 * falls on each side of the max bytes, the entire character will be scanned.
		 *
		 * Because it’s possible that there are truncated characters, the use of
		 * the null-coalescing operator with "\xC0" is a convenience for skipping
		 * length checks on every continuation byte. This works because 0xC0 is
		 * always invalid in a UTF-8 string, meaning that if the string has been
		 * truncated, it will find 0xC0 and reject as invalid UTF-8.
		 *
		 * > [The following table] lists all of the byte sequences that are well-formed
		 * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
		 * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
		 * > outside of the ranges listed is ill-formed.
		 *
		 * > Table 3-7. Well-Formed UTF-8 Byte Sequences
		 *  ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮
		 *  │ Code Points         │ First Byte │ Second Byte  │ Third Byte  │ Fourth Byte  │
		 *  ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤
		 *  │ U+0000..U+007F      │ 00..7F     │              │             │              │
		 *  │ U+0080..U+07FF      │ C2..DF     │ 80..BF       │             │              │
		 *  │ U+0800..U+0FFF      │ E0         │ A0..BF       │ 80..BF      │              │
		 *  │ U+1000..U+CFFF      │ E1..EC     │ 80..BF       │ 80..BF      │              │
		 *  │ U+D000..U+D7FF      │ ED         │ 80..9F       │ 80..BF      │              │
		 *  │ U+E000..U+FFFF      │ EE..EF     │ 80..BF       │ 80..BF      │              │
		 *  │ U+10000..U+3FFFF    │ F0         │ 90..BF       │ 80..BF      │ 80..BF       │
		 *  │ U+40000..U+FFFFF    │ F1..F3     │ 80..BF       │ 80..BF      │ 80..BF       │
		 *  │ U+100000..U+10FFFF  │ F4         │ 80..8F       │ 80..BF      │ 80..BF       │
		 *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
		 *
		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
		 */

		// Valid two-byte code points.
		$b1 = ord( $bytes[ $i ] );
		$b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );

		if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
			++$count;
			++$i;
			continue;
		}

		// Valid three-byte code points.
		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );

		if ( $b3 < 0x80 || $b3 > 0xBF ) {
			goto invalid_utf8;
		}

		if (
			( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
			( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
			( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
			( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
		) {
			++$count;
			$i += 2;

			// Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF.
			if ( 0xEF === $b1 ) {
				// Logical OR keeps the flag a strict boolean; bitwise `|=` would turn it into an int.
				$has_noncharacters = $has_noncharacters || (
					( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) ||
					( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) )
				);
			}

			continue;
		}

		// Valid four-byte code points.
		$b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );

		if ( $b4 < 0x80 || $b4 > 0xBF ) {
			goto invalid_utf8;
		}

		if (
			( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
			( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
			( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
		) {
			++$count;
			$i += 3;

			// Covers U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF.
			$has_noncharacters = $has_noncharacters || (
				( 0x0F === ( $b2 & 0x0F ) ) &&
				0xBF === $b3 &&
				( 0xBE === $b4 || 0xBF === $b4 )
			);

			continue;
		}

		/*
		 * When encountering invalid byte sequences, Unicode suggests finding the
		 * maximal subpart of a text and replacing that subpart with a single
		 * replacement character.
		 *
		 * > This practice is more secure because it does not result in the
		 * > conversion consuming parts of valid sequences as though they were
		 * > invalid. It also guarantees at least one replacement character will
		 * > occur for each instance of an invalid sequence in the original text.
		 * > Furthermore, this practice can be defined consistently for better
		 * > interoperability between different implementations of conversion.
		 *
		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630
		 */
		invalid_utf8:
		$at             = $i;
		$invalid_length = 1;

		// Single-byte and two-byte characters.
		if ( ( 0x00 === ( $b1 & 0x80 ) ) || ( 0xC0 === ( $b1 & 0xE0 ) ) ) {
			return $count;
		}

		$b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );

		// Find the maximal subpart and skip past it.
		if ( 0xE0 === ( $b1 & 0xF0 ) ) {
			// Three-byte characters.
			$b2_valid = (
				( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
				( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
				( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
				( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
			);

			$invalid_length = min( $end - $i, $b2_valid ? 2 : 1 );
			return $count;
		} elseif ( 0xF0 === ( $b1 & 0xF8 ) ) {
			// Four-byte characters.
			$b2_valid = (
				( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
				( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
				( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
			);

			$b3_valid = $b3 >= 0x80 && $b3 <= 0xBF;

			$invalid_length = min( $end - $i, $b2_valid ? ( $b3_valid ? 3 : 2 ) : 1 );
			return $count;
		}

		return $count;
	}

	$at = $i;
	return $count;
}
[244] Fix | Delete
[245] Fix | Delete
/**
 * Fallback mechanism for safely validating UTF-8 bytes.
 *
 * The string is valid when a single scan consumes every byte
 * without reporting an invalid span.
 *
 * @since 6.9.0
 * @access private
 *
 * @see wp_is_valid_utf8()
 *
 * @param string $bytes String which might contain text encoded as UTF-8.
 * @return bool Whether the provided bytes can decode as valid UTF-8.
 */
function _wp_is_valid_utf8_fallback( string $bytes ): bool {
	$total = strlen( $bytes );

	// An empty string is trivially valid.
	if ( 0 === $total ) {
		return true;
	}

	$scanned_to = 0;
	$bad_length = 0;
	_wp_scan_utf8( $bytes, $scanned_to, $bad_length );

	// Any reported invalid span means the input is not well-formed.
	if ( 0 !== $bad_length ) {
		return false;
	}

	return $total === $scanned_to;
}
[269] Fix | Delete
[270] Fix | Delete
/**
 * Fallback mechanism for replacing invalid spans of UTF-8 bytes.
 *
 * Each maximal invalid span reported by the scanner is swapped for a
 * single U+FFFD replacement character; valid spans are copied verbatim.
 *
 * Example:
 *
 *     'Pi�a' === _wp_scrub_utf8_fallback( "Pi\xF1a" ); // “ñ” is 0xF1 in Windows-1252.
 *
 * @since 6.9.0
 * @access private
 *
 * @see wp_scrub_utf8()
 *
 * @param string $bytes UTF-8 encoded string which might contain spans of invalid bytes.
 * @return string Input string with spans of invalid bytes swapped with the replacement character.
 */
function _wp_scrub_utf8_fallback( string $bytes ): string {
	$total      = strlen( $bytes );
	$cursor     = 0;
	$copied_to  = 0;
	$bad_length = 0;
	$output     = '';

	while ( $cursor <= $total ) {
		_wp_scan_utf8( $bytes, $cursor, $bad_length );

		if ( $cursor >= $total ) {
			// Nothing was ever replaced, so the input can be returned untouched.
			return 0 === $copied_to
				? $bytes
				: $output . substr( $bytes, $copied_to, $cursor - $copied_to - $bad_length );
		}

		// Copy the valid span, then stand in one replacement character for the invalid one.
		$output .= substr( $bytes, $copied_to, $cursor - $copied_to ) . "\u{FFFD}";

		$cursor   += $bad_length;
		$copied_to = $cursor;
	}

	return $output;
}
[312] Fix | Delete
[313] Fix | Delete
/**
 * Returns how many code points are found in the given UTF-8 string.
 *
 * Invalid spans of bytes count as a single code point according
 * to the maximal subpart rule. This function is a fallback method
 * for calling `mb_strlen( $text, 'UTF-8' )`.
 *
 * When negative values are provided for the byte offsets or length,
 * this will always report zero code points. Passing `null` for either
 * optional argument selects its default value.
 *
 * Example:
 *
 *     4 === _wp_utf8_codepoint_count( 'text' );
 *
 *     // Groups are 'test', "\x90" as '�', 'wp', "\xE2\x80" as '�', "\xC0" as '�', and 'test'.
 *     13 === _wp_utf8_codepoint_count( "test\x90wp\xE2\x80\xC0test" );
 *
 * @since 6.9.0
 * @access private
 *
 * @param string   $text            Count code points in this string.
 * @param int|null $byte_offset     Optional. Start counting after this many bytes in `$text`.
 *                                  Must be positive. Default 0; `null` is treated as 0.
 * @param int|null $max_byte_length Optional. Stop counting after having scanned past this many bytes.
 *                                  Default is to scan until the end of the string; `null` is
 *                                  treated the same way. Must be positive.
 * @return int How many code points were found.
 */
function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
	/*
	 * The signature accepts null, but the scanner takes its offset as a
	 * non-nullable `int` reference and `min()` would otherwise return null
	 * for the length, so normalize both to their documented defaults first.
	 */
	$byte_offset     = $byte_offset ?? 0;
	$max_byte_length = $max_byte_length ?? PHP_INT_MAX;

	if ( $byte_offset < 0 || $max_byte_length < 0 ) {
		return 0;
	}

	$count           = 0;
	$at              = $byte_offset;
	$end             = strlen( $text );
	$invalid_length  = 0;
	$max_byte_length = min( $end - $at, $max_byte_length );

	while ( $at < $end && ( $at - $byte_offset ) < $max_byte_length ) {
		$count += _wp_scan_utf8( $text, $at, $invalid_length, $max_byte_length - ( $at - $byte_offset ) );

		// An invalid span counts as exactly one code point, however many bytes it covers.
		$count += $invalid_length > 0 ? 1 : 0;
		$at    += $invalid_length;
	}

	return $count;
}
[358] Fix | Delete
[359] Fix | Delete
/**
 * Given a starting offset within a string and a maximum number of code points,
 * return how many bytes are occupied by the span of characters.
 *
 * Invalid spans of bytes count as a single code point according to the maximal
 * subpart rule. This function is a fallback method for calling
 * `strlen( mb_substr( substr( $text, $at ), 0, $max_code_points ) )`.
 *
 * @since 6.9.0
 * @access private
 *
 * @param string   $text              Count bytes of span in this text.
 * @param int      $byte_offset       Start counting at this byte offset.
 * @param int      $max_code_points   Stop counting after this many code points have been seen,
 *                                    or at the end of the string.
 * @param int|null $found_code_points Optional. Will be set to number of found code points in
 *                                    span, as this might be smaller than the maximum count if
 *                                    the string is not long enough.
 * @return int Number of bytes spanned by the code points.
 */
function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int {
	$start             = $byte_offset;
	$length            = strlen( $text );
	$skip              = 0;
	$found_code_points = 0;

	while ( $byte_offset < $length && $found_code_points < $max_code_points ) {
		// Only ask the scanner for as many code points as are still wanted.
		$remaining          = $max_code_points - $found_code_points;
		$found_code_points += _wp_scan_utf8( $text, $byte_offset, $skip, null, $remaining );

		if ( 0 === $skip || $found_code_points >= $max_code_points ) {
			continue;
		}

		// Invalid spans only convey one code point count regardless of how long they are.
		++$found_code_points;
		$byte_offset += $skip;
	}

	return $byte_offset - $start;
}
[400] Fix | Delete
[401] Fix | Delete
/**
 * Fallback support for determining if a string contains Unicode noncharacters.
 *
 * Scans the text span by span, stepping over invalid bytes, and stops
 * as soon as the scanner reports a noncharacter.
 *
 * @since 6.9.0
 * @access private
 *
 * @see \wp_has_noncharacters()
 *
 * @param string $text Are there noncharacters in this string?
 * @return bool Whether noncharacters were found in the string.
 */
function _wp_has_noncharacters_fallback( string $text ): bool {
	$length = strlen( $text );
	$offset = 0;
	$skip   = 0;
	$found  = false;

	while ( $offset < $length ) {
		_wp_scan_utf8( $text, $offset, $skip, null, null, $found );

		if ( $found ) {
			break;
		}

		// Step over the invalid span, if any, so the next scan makes progress.
		$offset += $skip;
	}

	return $found;
}
[425] Fix | Delete
[426] Fix | Delete
/**
 * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
 * with the deprecated function from the PHP standard library.
 *
 * US-ASCII bytes pass through unchanged; every byte from 0x80 to 0xFF
 * becomes the two-byte UTF-8 sequence for the code point of the same value.
 *
 * @since 6.9.0
 * @access private
 *
 * @see \utf8_encode()
 *
 * @param string $iso_8859_1_text Text treated as ISO-8859-1 (latin1) bytes.
 * @return string Text converted into UTF-8.
 */
function _wp_utf8_encode_fallback( $iso_8859_1_text ) {
	$iso_8859_1_text = (string) $iso_8859_1_text;
	$length          = strlen( $iso_8859_1_text );
	$cursor          = 0;
	$flushed_to      = 0;
	$converted       = '';

	// US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F.
	$ascii_bytes =
		"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
		"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
		" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f";

	while ( $cursor < $length ) {
		// Jump over the ASCII run (possibly empty) starting at the cursor.
		$cursor += strspn( $iso_8859_1_text, $ascii_bytes, $cursor );

		if ( $cursor >= $length ) {
			break;
		}

		// All other bytes transform into two-byte UTF-8 sequences.
		$byte_value = ord( $iso_8859_1_text[ $cursor ] );
		$lead_byte  = chr( 0xC0 | ( $byte_value >> 6 ) );
		$tail_byte  = chr( 0x80 | ( $byte_value & 0x3F ) );

		// Flush the pending unchanged bytes, then the converted pair.
		$converted .= substr( $iso_8859_1_text, $flushed_to, $cursor - $flushed_to );
		$converted .= "{$lead_byte}{$tail_byte}";

		$flushed_to = ++$cursor;
	}

	// No byte needed converting, so the input is already the answer.
	if ( 0 === $flushed_to ) {
		return $iso_8859_1_text;
	}

	return $converted . substr( $iso_8859_1_text, $flushed_to );
}
[479] Fix | Delete
[480] Fix | Delete
/**
[481] Fix | Delete
* Converts a string from UTF-8 to ISO-8859-1, maintaining backwards compatibility
[482] Fix | Delete
* with the deprecated function from the PHP standard library.
[483] Fix | Delete
*
[484] Fix | Delete
* @since 6.9.0
[485] Fix | Delete
* @access private
[486] Fix | Delete
*
[487] Fix | Delete
* @see \utf8_decode()
[488] Fix | Delete
*
[489] Fix | Delete
* @param string $utf8_text Text treated as UTF-8 bytes.
[490] Fix | Delete
* @return string Text converted into ISO-8859-1.
[491] Fix | Delete
*/
[492] Fix | Delete
function _wp_utf8_decode_fallback( $utf8_text ) {
[493] Fix | Delete
$utf8_text = (string) $utf8_text;
[494] Fix | Delete
$at = 0;
[495] Fix | Delete
$was_at = 0;
[496] Fix | Delete
$end = strlen( $utf8_text );
[497] Fix | Delete
$iso_8859_1_text = '';
[498] Fix | Delete
[499] Fix | Delete
12
It is recommended that you Edit text format, this type of Fix handles quite a lot in one request
Function