1<?php
2/**
3 * Block Serialization Parser
4 *
5 * @package WordPress
6 */
7
8/**
9 * Class WP_Block_Parser
10 *
11 * Parses a document and constructs a list of parsed block objects
12 *
13 * @since 5.0.0
14 * @since 4.0.0 returns arrays not objects, all attributes are arrays
15 */
class WP_Block_Parser {
	/**
	 * Input document being parsed
	 *
	 * @example "Pre-text\n<!-- wp:paragraph -->This is inside a block!<!-- /wp:paragraph -->"
	 *
	 * @since 5.0.0
	 * @var string
	 */
	public $document;

	/**
	 * Tracks parsing progress through document
	 *
	 * Byte offset of the first byte in the document that has not yet been consumed.
	 *
	 * @since 5.0.0
	 * @var int
	 */
	public $offset;

	/**
	 * List of parsed blocks
	 *
	 * Each entry is a block object cast to an array.
	 *
	 * @since 5.0.0
	 * @var array[]
	 */
	public $output;

	/**
	 * Stack of partially-parsed structures in memory during parse
	 *
	 * One frame is pushed per block opener and popped when the block is closed
	 * (explicitly by a closer or implicitly at the end of the document).
	 *
	 * @since 5.0.0
	 * @var WP_Block_Parser_Frame[]
	 */
	public $stack;

	/**
	 * Parses a document and returns a list of block structures
	 *
	 * When encountering an invalid parse will return a best-effort
	 * parse. In contrast to the specification parser this does not
	 * return an error on invalid inputs.
	 *
	 * All parser state is reset on every call, so one instance can be reused.
	 *
	 * @since 5.0.0
	 *
	 * @param string $document Input document being parsed.
	 * @return array[]
	 */
	public function parse( $document ) {
		$this->document = $document;
		$this->offset   = 0;
		$this->output   = array();
		$this->stack    = array();

		// proceed() consumes one token per call and reports whether more remain.
		while ( $this->proceed() ) {
			continue;
		}

		return $this->output;
	}

	/**
	 * Processes the next token from the input document
	 * and returns whether to proceed eating more tokens
	 *
	 * This is the "next step" function that essentially
	 * takes a token as its input and decides what to do
	 * with that token before descending deeper into a
	 * nested block tree or continuing along the document
	 * or breaking out of a level of nesting.
	 *
	 * @internal
	 * @since 5.0.0
	 * @return bool True when parsing should continue, false when the document is finished.
	 */
	public function proceed() {
		$next_token = $this->next_token();
		list( $token_type, $block_name, $attrs, $start_offset, $token_length ) = $next_token;
		$stack_depth = count( $this->stack );

		// we may have some HTML soup before the next block.
		$leading_html_start = $start_offset > $this->offset ? $this->offset : null;

		switch ( $token_type ) {
			case 'no-more-tokens':
				// if not in a block then flush output.
				if ( 0 === $stack_depth ) {
					$this->add_freeform();
					return false;
				}

				/*
				 * Otherwise we have a problem
				 * This is an error
				 *
				 * we have options
				 * - treat it all as freeform text
				 * - assume an implicit closer (easiest when not nesting)
				 *
				 * We assume implicit closers: with a single open block this
				 * closes that block; with nested blocks multiple closers are
				 * missing and so we collapse the whole stack piecewise.
				 */
				while ( 0 < count( $this->stack ) ) {
					$this->add_block_from_stack();
				}
				return false;

			case 'void-block':
				/*
				 * easy case is if we stumbled upon a void block
				 * in the top-level of the document
				 */
				if ( 0 === $stack_depth ) {
					if ( isset( $leading_html_start ) ) {
						$this->output[] = (array) $this->freeform(
							substr(
								$this->document,
								$leading_html_start,
								$start_offset - $leading_html_start
							)
						);
					}

					$this->output[] = (array) new WP_Block_Parser_Block( $block_name, $attrs, array(), '', array() );
					$this->offset   = $start_offset + $token_length;
					return true;
				}

				// otherwise we found an inner block.
				$this->add_inner_block(
					new WP_Block_Parser_Block( $block_name, $attrs, array(), '', array() ),
					$start_offset,
					$token_length
				);
				$this->offset = $start_offset + $token_length;
				return true;

			case 'block-opener':
				// track all newly-opened blocks on the stack.
				array_push(
					$this->stack,
					new WP_Block_Parser_Frame(
						new WP_Block_Parser_Block( $block_name, $attrs, array(), '', array() ),
						$start_offset,
						$token_length,
						$start_offset + $token_length,
						$leading_html_start
					)
				);
				$this->offset = $start_offset + $token_length;
				return true;

			case 'block-closer':
				/*
				 * if we're missing an opener we're in trouble
				 * This is an error
				 */
				if ( 0 === $stack_depth ) {
					/*
					 * we have options
					 * - assume an implicit opener
					 * - assume _this_ is the opener
					 * - give up and close out the document
					 *
					 * We give up: the remainder of the document (stray closer
					 * included) is emitted as freeform content.
					 */
					$this->add_freeform();
					return false;
				}

				// if we're not nesting then this is easy - close the block.
				if ( 1 === $stack_depth ) {
					$this->add_block_from_stack( $start_offset );
					$this->offset = $start_offset + $token_length;
					return true;
				}

				/*
				 * otherwise we're nested and we have to close out the current
				 * block and add it as a new innerBlock to the parent
				 *
				 * Note that the trailing HTML chunk is recorded here even when
				 * it is an empty string.
				 */
				$stack_top                        = array_pop( $this->stack );
				$html                             = substr( $this->document, $stack_top->prev_offset, $start_offset - $stack_top->prev_offset );
				$stack_top->block->innerHTML     .= $html;
				$stack_top->block->innerContent[] = $html;
				$stack_top->prev_offset           = $start_offset + $token_length;

				$this->add_inner_block(
					$stack_top->block,
					$stack_top->token_start,
					$stack_top->token_length,
					$start_offset + $token_length
				);
				$this->offset = $start_offset + $token_length;
				return true;

			default:
				// This is an error.
				$this->add_freeform();
				return false;
		}
	}

	/**
	 * Scans the document from where we last left off
	 * and finds the next valid token to parse if it exists
	 *
	 * Returns the type of the find: kind of find, block information, attributes
	 *
	 * The returned array is a 5-tuple:
	 * `array( $token_type, $block_name, $attrs, $start_offset, $token_length )`
	 * where `$token_type` is one of 'no-more-tokens', 'void-block',
	 * 'block-opener' or 'block-closer'. For 'no-more-tokens' every other
	 * element is null; for 'block-closer' `$attrs` is null.
	 *
	 * @internal
	 * @since 5.0.0
	 * @since 4.6.1 fixed a bug in attribute parsing which caused catastrophic backtracking on invalid block comments
	 * @return array
	 */
	public function next_token() {
		$matches = null;

		/*
		 * aye the magic
		 * we're using a single RegExp to tokenize the block comment delimiters
		 * we're also using a trick here because the only difference between a
		 * block opener and a block closer is the leading `/` before `wp:` (and
		 * a closer has no attributes). we can trap them both and process the
		 * match back in PHP to see which one it was.
		 */
		$has_match = preg_match(
			'/<!--\s+(?P<closer>\/)?wp:(?P<namespace>[a-z][a-z0-9_-]*\/)?(?P<name>[a-z][a-z0-9_-]*)\s+(?P<attrs>{(?:(?:[^}]+|}+(?=})|(?!}\s+\/?-->).)*+)?}\s+)?(?P<void>\/)?-->/s',
			$this->document,
			$matches,
			PREG_OFFSET_CAPTURE,
			$this->offset
		);

		/*
		 * `false` means the PCRE engine failed (probably catastrophic
		 * backtracking or out-of-memory); `0` means we have no more tokens.
		 * Both end tokenization the same way.
		 */
		if ( false === $has_match || 0 === $has_match ) {
			return array( 'no-more-tokens', null, null, null, null );
		}

		list( $match, $started_at ) = $matches[0];

		// With PREG_OFFSET_CAPTURE a group that did not participate has offset -1.
		$length        = strlen( $match );
		$is_closer     = isset( $matches['closer'] ) && -1 !== $matches['closer'][1];
		$is_void       = isset( $matches['void'] ) && -1 !== $matches['void'][1];
		$has_namespace = isset( $matches['namespace'] ) && -1 !== $matches['namespace'][1];
		$has_attrs     = isset( $matches['attrs'] ) && -1 !== $matches['attrs'][1];

		// Blocks without an explicit namespace belong to `core/`.
		$namespace = $has_namespace ? $matches['namespace'][0] : 'core/';
		$name      = $namespace . $matches['name'][0];

		/*
		 * Fun fact! It's not trivial in PHP to create "an empty associative array" since all arrays
		 * are associative arrays. If we use `array()` we get a JSON `[]`
		 *
		 * json_decode() yields null for malformed JSON; that value is passed
		 * through unchanged as the block's attributes.
		 */
		$attrs = $has_attrs
			? json_decode( $matches['attrs'][0], /* as-associative */ true )
			: array();

		/*
		 * A closer that is also marked void or carries attributes isn't an
		 * allowed state and is technically an error, but we can ignore the
		 * extras since they don't hurt anything: void wins over closer below,
		 * and a closer's attributes are discarded.
		 */
		if ( $is_void ) {
			return array( 'void-block', $name, $attrs, $started_at, $length );
		}

		if ( $is_closer ) {
			return array( 'block-closer', $name, null, $started_at, $length );
		}

		return array( 'block-opener', $name, $attrs, $started_at, $length );
	}

	/**
	 * Returns a new block object for freeform HTML
	 *
	 * The block has a null name, no attributes and no inner blocks; the given
	 * HTML is used both as its innerHTML and as its single innerContent chunk.
	 *
	 * @internal
	 * @since 5.0.0
	 *
	 * @param string $inner_html HTML content of block.
	 * @return WP_Block_Parser_Block freeform block object.
	 */
	public function freeform( $inner_html ) {
		return new WP_Block_Parser_Block( null, array(), array(), $inner_html, array( $inner_html ) );
	}

	/**
	 * Pushes a length of text from the input document
	 * to the output list as a freeform block.
	 *
	 * The text starts at the current parsing offset. Nothing is pushed when
	 * there is no text left to output.
	 *
	 * @internal
	 * @since 5.0.0
	 * @param int|null $length how many bytes of document text to output.
	 *                         Any falsy value (the default) means "everything
	 *                         up to the end of the document".
	 */
	public function add_freeform( $length = null ) {
		$length = $length ? $length : strlen( $this->document ) - $this->offset;

		if ( 0 === $length ) {
			return;
		}

		$this->output[] = (array) $this->freeform( substr( $this->document, $this->offset, $length ) );
	}

	/**
	 * Given a block structure from memory pushes
	 * a new block to the output list.
	 *
	 * More precisely: appends the block to the innerBlocks of the frame on top
	 * of the stack, records any HTML found between the parent's previous
	 * offset and the start of this block, and adds a null placeholder to the
	 * parent's innerContent marking where the inner block sits.
	 *
	 * @internal
	 * @since 5.0.0
	 * @param WP_Block_Parser_Block $block        The block to add to the output.
	 * @param int                   $token_start  Byte offset into the document where the first token for the block starts.
	 * @param int                   $token_length Byte length of entire block from start of opening token to end of closing token.
	 * @param int|null              $last_offset  Last byte offset into document if continuing from earlier output.
	 */
	public function add_inner_block( WP_Block_Parser_Block $block, $token_start, $token_length, $last_offset = null ) {
		$parent                       = $this->stack[ count( $this->stack ) - 1 ];
		$parent->block->innerBlocks[] = (array) $block;
		$html                         = substr( $this->document, $parent->prev_offset, $token_start - $parent->prev_offset );

		/*
		 * Compare against the empty string explicitly: `empty()` also treats
		 * the string "0" as empty, which would silently drop real content.
		 * The is_string() guard covers substr() returning false on older PHP.
		 */
		if ( is_string( $html ) && '' !== $html ) {
			$parent->block->innerHTML     .= $html;
			$parent->block->innerContent[] = $html;
		}

		$parent->block->innerContent[] = null;
		$parent->prev_offset           = $last_offset ? $last_offset : $token_start + $token_length;
	}

	/**
	 * Pushes the top block from the parsing stack to the output list.
	 *
	 * Any HTML that preceded the block's opening token at the top level is
	 * emitted first as its own freeform block.
	 *
	 * @internal
	 * @since 5.0.0
	 * @param int|null $end_offset byte offset into document for where we should stop sending text output as HTML.
	 *                             When null, everything up to the end of the document is used.
	 */
	public function add_block_from_stack( $end_offset = null ) {
		$stack_top   = array_pop( $this->stack );
		$prev_offset = $stack_top->prev_offset;

		$html = isset( $end_offset )
			? substr( $this->document, $prev_offset, $end_offset - $prev_offset )
			: substr( $this->document, $prev_offset );

		/*
		 * Compare against the empty string explicitly: `empty()` also treats
		 * the string "0" as empty, which would silently drop real content.
		 * The is_string() guard covers substr() returning false on older PHP.
		 */
		if ( is_string( $html ) && '' !== $html ) {
			$stack_top->block->innerHTML     .= $html;
			$stack_top->block->innerContent[] = $html;
		}

		if ( isset( $stack_top->leading_html_start ) ) {
			$this->output[] = (array) $this->freeform(
				substr(
					$this->document,
					$stack_top->leading_html_start,
					$stack_top->token_start - $stack_top->leading_html_start
				)
			);
		}

		$this->output[] = (array) $stack_top->block;
	}
}
391
392/**
393 * WP_Block_Parser_Block class.
394 *
395 * Required for backward compatibility in WordPress Core.
396 */
397require_once __DIR__ . '/class-wp-block-parser-block.php';
398
399/**
400 * WP_Block_Parser_Frame class.
401 *
402 * Required for backward compatibility in WordPress Core.
403 */
404require_once __DIR__ . '/class-wp-block-parser-frame.php';
405