1<?php
2
/**
 * Finds spans of valid and invalid UTF-8 bytes in a given string.
 *
 * This is a low-level tool to power various UTF-8 functionality.
 * It scans forward from `$at` until it finds an invalid span of bytes,
 * reaches the end of the scannable region, or has seen the requested
 * number of code points. When it stops, it does four things:
 *
 *  - Assigns `$at` to the position after the last successful code point.
 *  - Assigns `$invalid_length` to the length of the maximal subpart of
 *    the invalid bytes starting at `$at`, or to zero when the scan did
 *    not stop because of invalid bytes.
 *  - Assigns `$has_noncharacters` to a boolean indicating whether any of
 *    the scanned code points were Unicode noncharacters.
 *  - Returns how many code points were successfully scanned.
 *
 * This information is enough to build a number of useful UTF-8 functions.
 * Note that this function never skips invalid bytes on its own: callers
 * must advance `$at` by `$invalid_length` before resuming a scan.
 *
 * Example:
 *
 *     // ñ is U+F1, which in `ISO-8859-1`/`latin1`/`Windows-1252`/`cp1252` is 0xF1.
 *     "Pi\xF1a" === $pineapple = mb_convert_encoding( "Piña", 'Windows-1252', 'UTF-8' );
 *     $at = $invalid_length = 0;
 *
 *     // The first step finds the invalid 0xF1 byte.
 *     2 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
 *     $at === 2; $invalid_length === 1;
 *
 *     // Skip over the invalid span before scanning again.
 *     $at += $invalid_length;
 *
 *     // The second step continues to the end of the string.
 *     1 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
 *     $at === 4; $invalid_length === 0;
 *
 * Note! While passing an options array here might be convenient from a calling-code standpoint,
 *       this function is intended to serve as a very low-level foundation upon which to build
 *       higher level functionality. For the sake of keeping costs explicit all arguments are
 *       passed directly.
 *
 * @since 6.9.0
 * @access private
 *
 * @param string    $bytes             UTF-8 encoded string which might include invalid spans of bytes.
 * @param int       $at                Where to start scanning. Updated to where scanning stopped.
 * @param int       $invalid_length    Will be set to how many bytes are to be ignored after `$at`.
 * @param int|null  $max_bytes         Stop scanning after this many bytes (counted from `$at`) have
 *                                     been seen. A character straddling the limit is scanned whole.
 * @param int|null  $max_code_points   Stop scanning after this many code points have been seen.
 * @param bool|null $has_noncharacters Set to indicate if scanned string contained noncharacters.
 *                                     Reset on every call and always a boolean on return.
 * @return int How many code points were successfully scanned.
 */
function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
	$byte_length       = strlen( $bytes );
	$end               = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
	$invalid_length    = 0;
	$count             = 0;
	$max_count         = $max_code_points ?? PHP_INT_MAX;
	$has_noncharacters = false;

	for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
		/*
		 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
		 *
		 * This optimization step improves the speed from 10x to 100x
		 * depending on whether the JIT has optimized the function.
		 */
		$ascii_byte_count = strspn(
			$bytes,
			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
			$i,
			$end - $i
		);

		// The ASCII run reaches (or passes) the code point limit: stop exactly at the limit.
		if ( $count + $ascii_byte_count >= $max_count ) {
			$at    = $i + ( $max_count - $count );
			$count = $max_count;
			return $count;
		}

		$count += $ascii_byte_count;
		$i     += $ascii_byte_count;

		if ( $i >= $end ) {
			$at = $end;
			return $count;
		}

		/**
		 * The above fast-track handled all single-byte UTF-8 characters. What
		 * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.
		 *
		 * Therefore everything past here is checking those multibyte sequences.
		 *
		 * It may look like there’s a need to check against the max bytes here,
		 * but since each match of a single character returns, this function will
		 * bail already if crossing the max-bytes threshold. This function SHALL
		 * NOT return in the middle of a multi-byte character, so if a character
		 * falls on each side of the max bytes, the entire character will be scanned.
		 *
		 * Because it’s possible that there are truncated characters, the use of
		 * the null-coalescing operator with "\xC0" is a convenience for skipping
		 * length checks on every continuation byte. This works because 0xC0 is
		 * always invalid in a UTF-8 string, meaning that if the string has been
		 * truncated, it will find 0xC0 and reject as invalid UTF-8.
		 *
		 * > [The following table] lists all of the byte sequences that are well-formed
		 * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
		 * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
		 * > outside of the ranges listed is ill-formed.
		 *
		 * > Table 3-7. Well-Formed UTF-8 Byte Sequences
		 *  ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮
		 *  │ Code Points         │ First Byte │ Second Byte  │ Third Byte  │ Fourth Byte  │
		 *  ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤
		 *  │ U+0000..U+007F      │ 00..7F     │              │             │              │
		 *  │ U+0080..U+07FF      │ C2..DF     │ 80..BF       │             │              │
		 *  │ U+0800..U+0FFF      │ E0         │ A0..BF       │ 80..BF      │              │
		 *  │ U+1000..U+CFFF      │ E1..EC     │ 80..BF       │ 80..BF      │              │
		 *  │ U+D000..U+D7FF      │ ED         │ 80..9F       │ 80..BF      │              │
		 *  │ U+E000..U+FFFF      │ EE..EF     │ 80..BF       │ 80..BF      │              │
		 *  │ U+10000..U+3FFFF    │ F0         │ 90..BF       │ 80..BF      │ 80..BF       │
		 *  │ U+40000..U+FFFFF    │ F1..F3     │ 80..BF       │ 80..BF      │ 80..BF       │
		 *  │ U+100000..U+10FFFF  │ F4         │ 80..8F       │ 80..BF      │ 80..BF       │
		 *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
		 *
		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
		 */

		// Valid two-byte code points.
		$b1 = ord( $bytes[ $i ] );
		$b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );

		if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
			++$count;
			++$i;
			continue;
		}

		// Valid three-byte code points.
		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );

		if ( $b3 < 0x80 || $b3 > 0xBF ) {
			goto invalid_utf8;
		}

		if (
			( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
			( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
			( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
			( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
		) {
			++$count;
			$i += 2;

			/*
			 * Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF.
			 *
			 * A logical OR is used instead of `|=` so the flag stays a boolean;
			 * the bitwise operator would turn it into an integer.
			 */
			if ( 0xEF === $b1 ) {
				$has_noncharacters = $has_noncharacters || (
					( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) ||
					( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) )
				);
			}

			continue;
		}

		// Valid four-byte code points.
		$b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );

		if ( $b4 < 0x80 || $b4 > 0xBF ) {
			goto invalid_utf8;
		}

		if (
			( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
			( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
			( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
		) {
			++$count;
			$i += 3;

			// Covers U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF.
			$has_noncharacters = $has_noncharacters || (
				( 0x0F === ( $b2 & 0x0F ) ) &&
				0xBF === $b3 &&
				( 0xBE === $b4 || 0xBF === $b4 )
			);

			continue;
		}

		/**
		 * When encountering invalid byte sequences, Unicode suggests finding the
		 * maximal subpart of a text and replacing that subpart with a single
		 * replacement character.
		 *
		 * > This practice is more secure because it does not result in the
		 * > conversion consuming parts of valid sequences as though they were
		 * > invalid. It also guarantees at least one replacement character will
		 * > occur for each instance of an invalid sequence in the original text.
		 * > Furthermore, this practice can be defined consistently for better
		 * > interoperability between different implementations of conversion.
		 *
		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630
		 */
		invalid_utf8:
		$at             = $i;
		$invalid_length = 1;

		// Single-byte and two-byte characters.
		if ( ( 0x00 === ( $b1 & 0x80 ) ) || ( 0xC0 === ( $b1 & 0xE0 ) ) ) {
			return $count;
		}

		$b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );

		// Find the maximal subpart and skip past it.
		if ( 0xE0 === ( $b1 & 0xF0 ) ) {
			// Three-byte characters.
			$b2_valid = (
				( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
				( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
				( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
				( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
			);

			// Never report an invalid span reaching past the scannable region.
			$invalid_length = min( $end - $i, $b2_valid ? 2 : 1 );
			return $count;
		} elseif ( 0xF0 === ( $b1 & 0xF8 ) ) {
			// Four-byte characters.
			$b2_valid = (
				( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
				( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
				( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
			);

			$b3_valid = $b3 >= 0x80 && $b3 <= 0xBF;

			$invalid_length = min( $end - $i, $b2_valid ? ( $b3_valid ? 3 : 2 ) : 1 );
			return $count;
		}

		// Any other leading byte is an invalid span of exactly one byte.
		return $count;
	}

	$at = $i;
	return $count;
}
246
/**
 * Fallback mechanism for safely validating UTF-8 bytes.
 *
 * A string is valid when a single scan starting at its first byte runs
 * all the way to its end without reporting an invalid span. The empty
 * string is trivially valid.
 *
 * @since 6.9.0
 * @access private
 *
 * @see wp_is_valid_utf8()
 *
 * @param string $bytes String which might contain text encoded as UTF-8.
 * @return bool Whether the provided bytes can decode as valid UTF-8.
 */
function _wp_is_valid_utf8_fallback( string $bytes ): bool {
	$total = strlen( $bytes );

	// Nothing to scan means nothing can be invalid.
	if ( 0 === $total ) {
		return true;
	}

	$stopped_at = 0;
	$bad_span   = 0;
	_wp_scan_utf8( $bytes, $stopped_at, $bad_span );

	// Any invalid span at all disqualifies the string.
	if ( 0 !== $bad_span ) {
		return false;
	}

	return $total === $stopped_at;
}
271
/**
 * Fallback mechanism for replacing invalid spans of UTF-8 bytes.
 *
 * Each maximal invalid span reported by the scanner is swapped for a single
 * U+FFFD replacement character; valid spans are copied through untouched.
 * When nothing needed replacing the input string itself is returned.
 *
 * Example:
 *
 *     'Pi�a' === _wp_scrub_utf8_fallback( "Pi\xF1a" ); // “ñ” is 0xF1 in Windows-1252.
 *
 * @since 6.9.0
 * @access private
 *
 * @see wp_scrub_utf8()
 *
 * @param string $bytes UTF-8 encoded string which might contain spans of invalid bytes.
 * @return string Input string with spans of invalid bytes swapped with the replacement character.
 */
function _wp_scrub_utf8_fallback( string $bytes ): string {
	$total     = strlen( $bytes );
	$cursor    = 0;
	$copied_to = 0;
	$bad_span  = 0;
	$output    = '';

	while ( $cursor <= $total ) {
		_wp_scan_utf8( $bytes, $cursor, $bad_span );

		// Stopped before the end: flush the valid bytes and replace the invalid span.
		if ( $cursor < $total ) {
			$output   .= substr( $bytes, $copied_to, $cursor - $copied_to );
			$output   .= "\u{FFFD}";
			$cursor   += $bad_span;
			$copied_to = $cursor;
			continue;
		}

		// Reached the end without ever replacing anything: hand back the input as-is.
		if ( 0 === $copied_to ) {
			return $bytes;
		}

		$tail_length = $cursor - $copied_to - $bad_span;

		return $output . substr( $bytes, $copied_to, $tail_length );
	}

	return $output;
}
314
/**
 * Returns how many code points are found in the given UTF-8 string.
 *
 * Invalid spans of bytes count as a single code point according
 * to the maximal subpart rule. This function is a fallback method
 * for calling `mb_strlen( $text, 'UTF-8' )`.
 *
 * When negative values are provided for the byte offsets or length,
 * this will always report zero code points. Passing `null` for either
 * optional argument selects its default value.
 *
 * Example:
 *
 *     4 === _wp_utf8_codepoint_count( 'text' );
 *
 *     // Groups are 'test', "\x90" as '�', 'wp', "\xE2\x80" as '�', "\xC0" as '�', and 'test'.
 *     13 === _wp_utf8_codepoint_count( "test\x90wp\xE2\x80\xC0test" );
 *
 * @since 6.9.0
 * @access private
 *
 * @param string $text            Count code points in this string.
 * @param ?int   $byte_offset     Optional. Start counting after this many bytes in `$text`.
 *                                Must not be negative. Default 0; `null` is treated as 0.
 * @param ?int   $max_byte_length Optional. Stop counting after having scanned past this many bytes.
 *                                Default is to scan until the end of the string; `null` selects
 *                                the default. Must not be negative.
 * @return int How many code points were found.
 */
function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
	/*
	 * The signature accepts null, but null must not reach the arithmetic below:
	 * a null offset cannot be passed to the scanner's `int` reference parameter,
	 * and a null length compares as smaller than every positive integer, which
	 * would silently report zero code points.
	 */
	$byte_offset     = $byte_offset ?? 0;
	$max_byte_length = $max_byte_length ?? PHP_INT_MAX;

	if ( $byte_offset < 0 ) {
		return 0;
	}

	$count           = 0;
	$at              = $byte_offset;
	$end             = strlen( $text );
	$invalid_length  = 0;
	$max_byte_length = min( $end - $at, $max_byte_length );

	while ( $at < $end && ( $at - $byte_offset ) < $max_byte_length ) {
		// Scan only as many bytes as remain in the requested window.
		$count += _wp_scan_utf8( $text, $at, $invalid_length, $max_byte_length - ( $at - $byte_offset ) );

		// An invalid span counts as one code point and must be skipped by the caller.
		$count += $invalid_length > 0 ? 1 : 0;
		$at    += $invalid_length;
	}

	return $count;
}
360
/**
 * Measures, in bytes, the span covered by up to a given number of code points
 * starting from a byte offset within a string.
 *
 * Invalid spans of bytes count as a single code point according to the maximal
 * subpart rule. This function is a fallback method for calling
 * `strlen( mb_substr( substr( $text, $at ), 0, $max_code_points ) )`.
 *
 * @since 6.9.0
 * @access private
 *
 * @param string $text              Count bytes of span in this text.
 * @param int    $byte_offset       Start counting at this byte offset.
 * @param int    $max_code_points   Stop counting after this many code points have been seen,
 *                                  or at the end of the string.
 * @param ?int   $found_code_points Optional. Will be set to number of found code points in
 *                                  span, as this might be smaller than the maximum count if
 *                                  the string is not long enough.
 * @return int Number of bytes spanned by the code points.
 */
function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int {
	$start             = $byte_offset;
	$cursor            = $byte_offset;
	$length            = strlen( $text );
	$bad_span          = 0;
	$found_code_points = 0;

	while ( $cursor < $length && $found_code_points < $max_code_points ) {
		$remaining = $max_code_points - $found_code_points;
		$scanned   = _wp_scan_utf8( $text, $cursor, $bad_span, null, $remaining );

		$found_code_points += $scanned;

		// No invalid bytes were hit, or the quota is already full: leave the cursor where it is.
		if ( 0 === $bad_span || $found_code_points >= $max_code_points ) {
			continue;
		}

		// An invalid span is worth exactly one code point, however many bytes it covers.
		$cursor += $bad_span;
		++$found_code_points;
	}

	return $cursor - $start;
}
402
/**
 * Fallback support for determining if a string contains Unicode noncharacters.
 *
 * The string is scanned span by span, stepping over invalid bytes, until
 * either a noncharacter is reported or the end of the string is reached.
 *
 * @since 6.9.0
 * @access private
 *
 * @see \wp_has_noncharacters()
 *
 * @param string $text Are there noncharacters in this string?
 * @return bool Whether noncharacters were found in the string.
 */
function _wp_has_noncharacters_fallback( string $text ): bool {
	$length   = strlen( $text );
	$cursor   = 0;
	$bad_span = 0;
	$found    = false;

	while ( ! $found && $cursor < $length ) {
		_wp_scan_utf8( $text, $cursor, $bad_span, null, null, $found );

		// Step over any invalid bytes so the next scan can make progress.
		$cursor += $bad_span;
	}

	return $found;
}
427
/**
 * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
 * with the deprecated function from the PHP standard library.
 *
 * US-ASCII bytes are copied through unchanged and every other byte is expanded
 * into its two-byte UTF-8 sequence. A string made only of US-ASCII bytes is
 * returned without being rebuilt.
 *
 * @since 6.9.0
 * @access private
 *
 * @see \utf8_encode()
 *
 * @param string $iso_8859_1_text Text treated as ISO-8859-1 (latin1) bytes.
 * @return string Text converted into UTF-8.
 */
function _wp_utf8_encode_fallback( $iso_8859_1_text ) {
	$latin1 = (string) $iso_8859_1_text;
	$length = strlen( $latin1 );

	// US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F.
	$ascii_bytes =
		"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
		"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
		" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f";

	$output    = '';
	$copied_to = 0;
	$cursor    = 0;

	while ( $cursor < $length ) {
		// Jump over the run of US-ASCII bytes, if any, at the cursor.
		$cursor += strspn( $latin1, $ascii_bytes, $cursor );
		if ( $cursor >= $length ) {
			break;
		}

		// Flush the pending US-ASCII bytes ahead of the converted byte.
		$output .= substr( $latin1, $copied_to, $cursor - $copied_to );

		// All other bytes transform into two-byte UTF-8 sequences.
		$byte    = ord( $latin1[ $cursor ] );
		$output .= chr( 0xC0 | ( $byte >> 6 ) ) . chr( 0x80 | ( $byte & 0x3F ) );

		$copied_to = ++$cursor;
	}

	// Nothing was converted, so the input is already its own UTF-8 form.
	if ( 0 === $copied_to ) {
		return $latin1;
	}

	return $output . substr( $latin1, $copied_to );
}
481
/**
 * Converts a string from UTF-8 to ISO-8859-1, maintaining backwards compatibility
 * with the deprecated function from the PHP standard library.
 *
 * US-ASCII bytes are copied through unchanged. Two-byte sequences encoding a
 * code point no greater than U+FF become the matching single byte. Every other
 * code point, and every invalid span of bytes, becomes a single `?`.
 *
 * @since 6.9.0
 * @access private
 *
 * @see \utf8_decode()
 *
 * @param string $utf8_text Text treated as UTF-8 bytes.
 * @return string Text converted into ISO-8859-1.
 */
function _wp_utf8_decode_fallback( $utf8_text ) {
	$source = (string) $utf8_text;
	$length = strlen( $source );

	// US-ASCII bytes are identical in ISO-8859-1 and UTF-8. These are 0x00–0x7F.
	$ascii_bytes =
		"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
		"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
		" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f";

	$latin1    = '';
	$copied_to = 0;
	$cursor    = 0;

	while ( $cursor < $length ) {
		$ascii_run = strspn( $source, $ascii_bytes, $cursor );
		if ( $ascii_run > 0 ) {
			$cursor += $ascii_run;
			continue;
		}

		// Ask the scanner for exactly one code point starting at the cursor.
		$scan_end    = $cursor;
		$bad_span    = 0;
		$scanned     = _wp_scan_utf8( $source, $scan_end, $bad_span, null, 1 );
		$char_length = $scan_end - $cursor;

		if ( 1 !== $scanned ) {
			// Neither a code point nor an invalid span was found: stop converting.
			if ( 0 === $bad_span ) {
				break;
			}

			// No code point to emit; the invalid span is replaced further below.
			$replacement = '';
		} else {
			// Unless proven convertible, the code point turns into a question mark.
			$replacement = '?';
			$lead        = ord( $source[ $cursor ] );

			// All convertible code points are two-bytes long.
			if ( 0xC0 === ( $lead & 0xE0 ) ) {
				$trail      = ord( $source[ $cursor + 1 ] );
				$code_point = ( ( $lead & 0x1F ) << 6 ) | ( ( $trail & 0x3F ) );

				// All convertible code points are not greater than U+FF.
				if ( $code_point <= 0xFF ) {
					$replacement = chr( $code_point );
				}
			}
		}

		// Flush the pending US-ASCII bytes, then the stand-in for the scanned code point.
		$latin1   .= substr( $source, $copied_to, $cursor - $copied_to );
		$latin1   .= $replacement;
		$cursor   += $char_length;
		$copied_to = $cursor;

		// An invalid span collapses into one question mark.
		if ( $bad_span > 0 ) {
			$latin1   .= '?';
			$cursor   += $bad_span;
			$copied_to = $cursor;
		}
	}

	// Nothing was converted, so the input is already its own ISO-8859-1 form.
	if ( 0 === $copied_to ) {
		return $source;
	}

	$latin1 .= substr( $source, $copied_to );

	return $latin1;
}
566