From 47a523cb08da0b89b06faad15a4be88d08bc0366 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 24 Jun 2026 09:01:04 +0000 Subject: [PATCH 1/3] HTML API: Extract IN BODY any other end tag handling Moves the IN BODY insertion mode's any other end tag processing to a private helper so other parsing algorithms can invoke it directly. --- .../html-api/class-wp-html-processor.php | 68 ++++++++++++------- 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 1828123ff879d..27fc0db8a6d46 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -3257,38 +3257,60 @@ private function step_in_body(): bool { /* * > Any other end tag */ + return $this->step_in_body_any_other_end_tag(); + } - /* - * Find the corresponding tag opener in the stack of open elements, if - * it exists before reaching a special element, which provides a kind - * of boundary in the stack. For example, a `` should not - * close anything beyond its containing `P` or `DIV` element. - */ - foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { - if ( 'html' === $node->namespace && $token_name === $node->node_name ) { - break; - } + $this->bail( 'Should not have been able to reach end of IN BODY processing. Check HTML API code.' ); + // This unnecessary return prevents tools from inaccurately reporting type errors. + return false; + } - if ( self::is_special( $node ) ) { - // This is a parse error, ignore the token. - return $this->step(); - } + /** + * Applies the "any other end tag" parsing instructions for the IN BODY insertion mode. + * + * @since 7.1.0 + * @ignore + * + * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * + * @see https://html.spec.whatwg.org/#parsing-main-inbody + * @see WP_HTML_Processor::step_in_body + * + * @return bool Whether an element was found. + */ + private function step_in_body_any_other_end_tag(): bool { + $token_name = $this->get_token_name(); + + /* + * Find the corresponding tag opener in the stack of open elements, if + * it exists before reaching a special element, which provides a kind + * of boundary in the stack. For example, a `` should not + * close anything beyond its containing `P` or `DIV` element. + */ + foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { + if ( 'html' === $node->namespace && $token_name === $node->node_name ) { + break; } - $this->generate_implied_end_tags( $token_name ); - if ( $node !== $this->state->stack_of_open_elements->current_node() ) { - // @todo Record parse error: this error doesn't impact parsing. + if ( self::is_special( $node ) ) { + // This is a parse error, ignore the token. + return $this->step(); } + } - foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { - $this->state->stack_of_open_elements->pop(); - if ( $node === $item ) { - return true; - } + $this->generate_implied_end_tags( $token_name ); + if ( $node !== $this->state->stack_of_open_elements->current_node() ) { + // @todo Record parse error: this error doesn't impact parsing. + } + + foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { + $this->state->stack_of_open_elements->pop(); + if ( $node === $item ) { + return true; } } - $this->bail( 'Should not have been able to reach end of IN BODY processing. Check HTML API code.' ); + $this->bail( 'Should not have been able to reach end of "any other end tag" IN BODY processing. Check HTML API code.' ); // This unnecessary return prevents tools from inaccurately reporting type errors. return false; } From 86c344f206f13b26354bfdfec4cae496d6dc8582 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 24 Jun 2026 09:04:44 +0000 Subject: [PATCH 2/3] HTML API: Reconstruct active formatting elements Builds on the extracted IN BODY any other end tag helper while combining active formatting element reconstruction with adoption agency improvements. Preserves attributes for virtual reconstructed formatting elements and updates focused HTML API tests for reconstructed formatting and unclosed A elements. --- ...ass-wp-html-active-formatting-elements.php | 39 +- .../html-api/class-wp-html-open-elements.php | 17 +- .../html-api/class-wp-html-processor.php | 398 +++++++++++++++--- .../html-api/class-wp-html-tag-processor.php | 29 +- .../tests/html-api/wpHtmlProcessor.php | 15 +- .../html-api/wpHtmlProcessorBreadcrumbs.php | 68 +-- .../html-api/wpHtmlProcessorHtml5lib.php | 5 + 7 files changed, 461 insertions(+), 110 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php index d73561843bcb2..453cb6f734b76 100644 --- a/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php +++ b/src/wp-includes/html-api/class-wp-html-active-formatting-elements.php @@ -44,6 +44,18 @@ class WP_HTML_Active_Formatting_Elements { */ private $stack = array(); + /** + * Returns the node at the given 1-offset index in the list of active formatting elements. + * + * @since 7.0.0 + * + * @param int $nth Number of nodes from the top node to return. + * @return WP_HTML_Token|null Node at the given index in the stack, if one exists, otherwise null. + */ + public function at( int $nth ): ?WP_HTML_Token { + return $this->stack[ $nth - 1 ] ?? null; + } + /** * Reports if a specific node is in the stack of active formatting elements. * @@ -111,8 +123,9 @@ public function insert_marker(): void { * @see https://html.spec.whatwg.org/#push-onto-the-list-of-active-formatting-elements * * @param WP_HTML_Token $token Push this node onto the stack. + * @return bool Whether a node was pushed onto the stack of active formatting elements. */ - public function push( WP_HTML_Token $token ) { + public function push( WP_HTML_Token $token ): bool { /* * > If there are already three elements in the list of active formatting elements after the last marker, * > if any, or anywhere in the list if there are no markers, that have the same tag name, namespace, and @@ -121,11 +134,31 @@ public function push( WP_HTML_Token $token ) { * > created by the parser; two elements have the same attributes if all their parsed attributes can be * > paired such that the two attributes in each pair have identical names, namespaces, and values * > (the order of the attributes does not matter). - * - * @todo Implement the "Noah's Ark clause" to only add up to three of any given kind of formatting elements to the stack. */ + if ( 'marker' !== $token->node_name ) { + $existing_count = 0; + foreach ( $this->walk_up() as $item ) { + if ( 'marker' === $item->node_name ) { + break; + } + + if ( + $item->node_name === $token->node_name && + $item->namespace === $token->namespace + // @todo Compare attributes. For now, bail if there are three matching tag names + namespaces. + ) { + ++$existing_count; + if ( $existing_count >= 3 ) { + // @todo Implement removing the earliest element and moving forward. + return false; + } + } + } + } + // > Add element to the list of active formatting elements. $this->stack[] = $token; + return true; } /** diff --git a/src/wp-includes/html-api/class-wp-html-open-elements.php b/src/wp-includes/html-api/class-wp-html-open-elements.php index aeee107250895..183e3e70567c4 100644 --- a/src/wp-includes/html-api/class-wp-html-open-elements.php +++ b/src/wp-includes/html-api/class-wp-html-open-elements.php @@ -614,12 +614,23 @@ public function remove_node( WP_HTML_Token $token ): bool { * see WP_HTML_Open_Elements::walk_up(). * * @since 6.4.0 + * @since 7.0.0 Accepts $below_this_node to start traversal below a given node, if it exists. + * + * @param WP_HTML_Token|null $below_this_node Start traversing below this node, if provided and if the node exists. */ - public function walk_down() { - $count = count( $this->stack ); + public function walk_down( ?WP_HTML_Token $below_this_node = null ) { + $has_found_node = null === $below_this_node; + $count = count( $this->stack ); for ( $i = 0; $i < $count; $i++ ) { - yield $this->stack[ $i ]; + $node = $this->stack[ $i ]; + + if ( ! $has_found_node ) { + $has_found_node = $node === $below_this_node; + continue; + } + + yield $node; } } diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 27fc0db8a6d46..7ff5da6b01c55 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -258,6 +258,18 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private $context_node = null; + /** + * If a formatting element has been reconstructed, this will hold + * the parsed attributes from the original format, once requested. + * + * These attributes are not modifiable. + * + * @since 7.0.0 + * + * @var array + */ + protected $actively_reconstructed_formatting_attributes = array(); + /* * Public Interface Functions */ @@ -2863,7 +2875,10 @@ private function step_in_body(): bool { $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); - $this->state->active_formatting_elements->push( $this->state->current_token ); + if ( false === $this->state->active_formatting_elements->push( $this->state->current_token ) ) { + $this->bail( 'Cannot track formatting elements when encountering a fourth identical token.' ); + } + $this->actively_reconstructed_formatting_attributes[ $this->state->current_token->bookmark_name ] = $this->attributes; return true; /* @@ -2884,7 +2899,10 @@ private function step_in_body(): bool { case '+U': $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); - $this->state->active_formatting_elements->push( $this->state->current_token ); + if ( false === $this->state->active_formatting_elements->push( $this->state->current_token ) ) { + $this->bail( 'Cannot track formatting elements when encountering a fourth identical token.' ); + } + $this->actively_reconstructed_formatting_attributes[ $this->state->current_token->bookmark_name ] = $this->attributes; return true; /* @@ -2900,7 +2918,10 @@ private function step_in_body(): bool { } $this->insert_html_element( $this->state->current_token ); - $this->state->active_formatting_elements->push( $this->state->current_token ); + if ( false === $this->state->active_formatting_elements->push( $this->state->current_token ) ) { + $this->bail( 'Cannot track formatting elements when encountering a fourth identical token.' ); + } + $this->actively_reconstructed_formatting_attributes[ $this->state->current_token->bookmark_name ] = $this->attributes; return true; /* @@ -2921,8 +2942,7 @@ private function step_in_body(): bool { case '-STRONG': case '-TT': case '-U': - $this->run_adoption_agency_algorithm(); - return true; + return $this->run_adoption_agency_algorithm(); /* * > A start tag whose tag name is one of: "applet", "marquee", "object" @@ -5420,7 +5440,46 @@ public function get_token_type(): ?string { * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`. */ public function get_attribute( $name ) { - return $this->is_virtual() ? null : parent::get_attribute( $name ); + if ( $this->is_virtual() ) { + $virtual_attributes = $this->actively_reconstructed_formatting_attributes[ $this->current_element->token->bookmark_name ?? '' ] ?? null; + if ( null === $virtual_attributes ) { + return null; + } + + $current_attributes = $this->attributes; + $current_updates = $this->lexical_updates; + $this->lexical_updates = array(); + $this->attributes = $virtual_attributes; + $parser_state = $this->parser_state; + $this->parser_state = WP_HTML_Tag_Processor::STATE_MATCHED_TAG; + $attribute_value = parent::get_attribute( $name ); + $this->attributes = $current_attributes; + $this->parser_state = $parser_state; + $this->lexical_updates = $current_updates; + + return $attribute_value; + } + + return parent::get_attribute( $name ); + } + + /** + * Returns the adjusted attribute name for a given attribute, taking into + * account the current parsing context, whether HTML, SVG, or MathML. + * + * @since 7.0.0 Subclassed for the HTML Processor. + * + * @param string $attribute_name Which attribute name to adjust. + * + * @return string|null The qualified attribute name or null if not on matched tag. + */ + public function get_qualified_attribute_name( $attribute_name ): ?string { + if ( $this->is_virtual() ) { + $namespace = $this->current_element->token->namespace; + return self::lookup_qualified_attribute_name( $namespace, $attribute_name ); + } + + return parent::get_qualified_attribute_name( $attribute_name ); } /** @@ -5498,7 +5557,27 @@ public function remove_attribute( $name ): bool { * @return array|null List of attribute names, or `null` when no tag opener is matched. */ public function get_attribute_names_with_prefix( $prefix ): ?array { - return $this->is_virtual() ? null : parent::get_attribute_names_with_prefix( $prefix ); + if ( $this->is_virtual() ) { + $virtual_attributes = $this->actively_reconstructed_formatting_attributes[ $this->current_element->token->bookmark_name ?? '' ] ?? null; + if ( null === $virtual_attributes ) { + return null; + } + + $current_attributes = $this->attributes; + $current_updates = $this->lexical_updates; + $this->lexical_updates = array(); + $this->attributes = $virtual_attributes; + $parser_state = $this->parser_state; + $this->parser_state = WP_HTML_Tag_Processor::STATE_MATCHED_TAG; + $attribute_names = parent::get_attribute_names_with_prefix( $prefix ); + $this->attributes = $current_attributes; + $this->parser_state = $parser_state; + $this->lexical_updates = $current_updates; + + return $attribute_names; + } + + return parent::get_attribute_names_with_prefix( $prefix ); } /** @@ -6005,6 +6084,7 @@ private function get_adjusted_current_node(): ?WP_HTML_Token { * > been explicitly closed. * * @since 6.4.0 + * @since 7.0.0 Added additional support. * @ignore * * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. @@ -6014,34 +6094,89 @@ private function get_adjusted_current_node(): ?WP_HTML_Token { * @return bool Whether any formatting elements needed to be reconstructed. */ private function reconstruct_active_formatting_elements(): bool { + $count = $this->state->active_formatting_elements->count(); /* - * > If there are no entries in the list of active formatting elements, then there is nothing - * > to reconstruct; stop this algorithm. + * > 1. If there are no entries in the list of active formatting elements, + * > then there is nothing to reconstruct; stop this algorithm. */ - if ( 0 === $this->state->active_formatting_elements->count() ) { + if ( 0 === $count ) { return false; } - $last_entry = $this->state->active_formatting_elements->current_node(); + $currently_at = $count; + $last_entry = $this->state->active_formatting_elements->at( $currently_at ); + /* + * > 2. If the last (most recently added) entry in the list of active formatting + * > elements is a marker, or if it is an element that is in the stack of open + * > elements, then there is nothing to reconstruct; stop this algorithm. + */ if ( - - /* - * > If the last (most recently added) entry in the list of active formatting elements is a marker; - * > stop this algorithm. - */ 'marker' === $last_entry->node_name || - - /* - * > If the last (most recently added) entry in the list of active formatting elements is an - * > element that is in the stack of open elements, then there is nothing to reconstruct; - * > stop this algorithm. - */ $this->state->stack_of_open_elements->contains_node( $last_entry ) ) { return false; } - $this->bail( 'Cannot reconstruct active formatting elements when advancing and rewinding is required.' ); + /* + * > 3. Let entry be the last (most recently added) element + * > in the list of active formatting elements. + */ + $entry = $last_entry; + + /* + * > 4. Rewind: If there are no entries before entry in the list of active + * > formatting elements, then jump to the step labeled create. + */ + rewind: + if ( 1 === $currently_at ) { + goto create; + } + + /* + * > 5. Let entry be the entry one earlier than entry + * > in the list of active formatting elements. + */ + $entry = $this->state->active_formatting_elements->at( --$currently_at ); + + /* + * > 6. If entry is neither a marker nor an element that is also in + * > the stack of open elements, go to the step labeled rewind. + */ + if ( + 'marker' !== $entry->node_name && + ! $this->state->stack_of_open_elements->contains_node( $entry ) + ) { + goto rewind; + } + + /* + * > 7. Advance: Let entry be the element one later than entry + * > in the list of active formatting elements. + */ + advance: + $entry = $this->state->active_formatting_elements->at( ++$currently_at ); + + /* + * > 8. Create: Insert an HTML element for the token for which the + * > element entry was created, to obtain new element. + */ + create: + $this->insert_html_element( $entry ); + + /* + * > 9. Replace the entry for _entry_ in the list with an entry for new element. + * > This doesn't need to happen here since no DOM is being created. + */ + + /* + * > 10. If the entry for new element in the list of active formatting elements + * > is not the last entry in the list, return to the step labeled advance. + */ + if ( $count !== $currently_at ) { + goto advance; + } + + return true; } /** @@ -6246,33 +6381,35 @@ private function reset_insertion_mode_appropriately(): void { * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. * * @see https://html.spec.whatwg.org/#adoption-agency-algorithm + * + * @return bool Whether the current token was handled without exhausting input. */ - private function run_adoption_agency_algorithm(): void { + private function run_adoption_agency_algorithm(): bool { $budget = 1000; $subject = $this->get_tag(); $current_node = $this->state->stack_of_open_elements->current_node(); + /* + * > 2. If the current node is an HTML element whose tag name is subject, + * > and the current node is not in the list of active formatting elements, + * > then pop the current node off the stack of open elements and return. + */ if ( - // > If the current node is an HTML element whose tag name is subject - $current_node && $subject === $current_node->node_name && - // > the current node is not in the list of active formatting elements + $current_node && + 'html' === $current_node->namespace && + $subject === $current_node->node_name && ! $this->state->active_formatting_elements->contains_node( $current_node ) ) { $this->state->stack_of_open_elements->pop(); - return; + return true; } - $outer_loop_counter = 0; - while ( $budget-- > 0 ) { - if ( $outer_loop_counter++ >= 8 ) { - return; - } - + for ( $outer_loop_counter = 0; $outer_loop_counter < 8; ++$outer_loop_counter ) { /* - * > Let formatting element be the last element in the list of active formatting elements that: - * > - is between the end of the list and the last marker in the list, - * > if any, or the start of the list otherwise, - * > - and has the tag name subject. + * > 3. Let formatting element be the last element in the list of active formatting elements that: + * > - is between the end of the list and the last marker in the list, + * > if any, or the start of the list otherwise, + * > - and has the tag name subject. */ $formatting_element = null; foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { @@ -6286,38 +6423,41 @@ private function run_adoption_agency_algorithm(): void { } } - // > If there is no such element, then return and instead act as described in the "any other end tag" entry above. + /* + * > If there is no such element, then act as described + * > in the "any other end tag" entry above and return. + */ if ( null === $formatting_element ) { - $this->bail( 'Cannot run adoption agency when "any other end tag" is required.' ); + return $this->step_in_body_any_other_end_tag(); } - // > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return. + /* + * > 4. If formatting element is not in the stack of open elements, then + * > this is a parse error; remove the element from the list, and return. + */ if ( ! $this->state->stack_of_open_elements->contains_node( $formatting_element ) ) { $this->state->active_formatting_elements->remove_node( $formatting_element ); - return; + return true; } - // > If formatting element is in the stack of open elements, but the element is not in scope, then this is a parse error; return. + /* + * > 5. If formatting element is in the stack of open elements, but the element + * > is not in scope, then this is a parse error; return. + */ if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $formatting_element->node_name ) ) { - return; + return true; } /* - * > Let furthest block be the topmost node in the stack of open elements that is lower in the stack - * > than formatting element, and is an element in the special category. There might not be one. + * > 6. If formatting element is not the current node, this is a parse error. (But do not return.) */ - $is_above_formatting_element = true; - $furthest_block = null; - foreach ( $this->state->stack_of_open_elements->walk_down() as $item ) { - if ( $is_above_formatting_element && $formatting_element->bookmark_name !== $item->bookmark_name ) { - continue; - } - - if ( $is_above_formatting_element ) { - $is_above_formatting_element = false; - continue; - } + /* + * > 7. Let furthest block be the topmost node in the stack of open elements that is lower in the stack + * > than formatting element, and is an element in the special category. There might not be one. + */ + $furthest_block = null; + foreach ( $this->state->stack_of_open_elements->walk_down( $formatting_element ) as $item ) { if ( self::is_special( $item ) ) { $furthest_block = $item; break; @@ -6325,25 +6465,155 @@ private function run_adoption_agency_algorithm(): void { } /* - * > If there is no furthest block, then the UA must first pop all the nodes from the bottom of the - * > stack of open elements, from the current node up to and including formatting element, then - * > remove formatting element from the list of active formatting elements, and finally return. + * > 8. If there is no furthest block, then the UA must first pop all the nodes from the bottom of + * > the stack of open elements, from the current node up to and including formatting element, + * > then remove formatting element from the list of active formatting elements, and finally return. */ if ( null === $furthest_block ) { foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { $this->state->stack_of_open_elements->pop(); - if ( $formatting_element->bookmark_name === $item->bookmark_name ) { + if ( $formatting_element === $item ) { $this->state->active_formatting_elements->remove_node( $formatting_element ); - return; + return true; } } } - $this->bail( 'Cannot extract common ancestor in adoption agency algorithm.' ); + /* + * > 9. Let common ancestor be the element immediately above + * > formatting element in the stack of open elements. + */ + $common_ancestor = null; + foreach ( $this->state->stack_of_open_elements->walk_up( $formatting_element ) as $item ) { + $common_ancestor = $item; + break; + } + + /* + * > 10. Let a bookmark note the position of formatting element in the list of active + * > formatting elements relative to the elements on either side of it in the list. + */ + $formatting_element_index = 0; + foreach ( $this->state->active_formatting_elements->walk_down() as $item ) { + if ( $formatting_element === $item ) { + break; + } + + ++$formatting_element_index; + } + + /* + * > 11. Let node and last node be furthest block. + */ + $node = $furthest_block; + $last_node = $furthest_block; + + $inner_loop_counter = 0; + while ( $budget-- > 0 ) { + /* + * > 1. Increment innerLoopCounter by 1. + */ + ++$inner_loop_counter; + + /* + * > 2. Let node be the element immediately above node in the stack of open elements, + * > or if node is no longer in the stack of open elements (e.g. because it got + * > removed by this algorithm), the element that was immediately above node in + * > the stack of open elements before node was removed. + */ + if ( $this->state->stack_of_open_elements->contains_node( $node ) ) { + foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) { + $node = $item; + break; + } + } else { + $this->bail( 'Cannot adjust node pointer above removed node.' ); + } + + /* + * > 3. If node is formattingElement, then break. + */ + if ( $formatting_element === $node ) { + break; + } + + /* + * > 4. If innerLoopCounter is greater than 3 and node is in the list of active formatting + * > elements, then remove node from the list of active formatting elements. + */ + if ( $inner_loop_counter > 3 && $this->state->active_formatting_elements->contains_node( $node ) ) { + $this->state->active_formatting_elements->remove_node( $node ); + } + + /* + * > 5. If node is not in the list of active formatting elements, then remove node from + * > the stack of open elements and continue. + */ + if ( ! $this->state->active_formatting_elements->contains_node( $node ) ) { + $this->state->stack_of_open_elements->remove_node( $node ); + continue; + } + + /* + * > 6. Create an element for the token for which the element node was created, + * > in the HTML namespace, with common ancestor as the intended parent; + * > replace the entry for node in the list of active formatting elements + * > with an entry for the new element, replace the entry for node in the + * > stack of open elements with an entry for the new element, and let node + * > be the new element. + */ + $this->bail( 'Cannot create and reference new element for which no token exists.' ); + + /* + * > 7. If last node is furthestBlock, then move the aforementioned bookmark to + * > be immediately after the new node in the list of active formatting elements. + */ + + /* + * > 8. Append lastNode to node. + */ + + /* + * > 9. Set lastNode to node. + */ + $last_node = $node; + } + + /* + * > 14. Insert whatever last node ended up being in the previous step at the appropriate + * > place for inserting a node, but using common ancestor as the override target. + */ + $this->bail( 'Cannot create and reference new element for which no token exists.' ); + + /* + * > 15. Create an element for the token for which formattingElement was created, + * > in the HTML namespace, with furthestBlock as the intended parent. + */ + + /* + * > 16. Take all of the child nodes of furthestBlock and append them to the element + * > created in the last step. + */ + + /* + * > 17. Append that new element to furthestBlock. + */ + + /* + * > 18. Remove formattingElement from the list of active formatting elements, + * > and insert the new element into the list of active formatting elements + * > at the position of the aforementioned bookmark. + */ + + /* + * > 19. Remove formattingElement from the stack of open elements, and insert the + * > new element into the stack of open elements immediately below the position + * > of furthestBlock in that stack. + */ } - $this->bail( 'Cannot run adoption agency when looping required.' ); + return true; } /** diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index e41e1120550b5..fbbe8d18740c8 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -708,7 +708,7 @@ class WP_HTML_Tag_Processor { * @since 6.2.0 * @var WP_HTML_Attribute_Token[] */ - private $attributes = array(); + protected $attributes = array(); /** * Tracks spans of duplicate attributes on a given tag, used for removing @@ -3076,23 +3076,38 @@ public function get_qualified_tag_name(): ?string { * * @since 6.7.0 * - * @param string $attribute_name Which attribute to adjust. + * @param string $attribute_name Which attribute name to adjust. * - * @return string|null + * @return string|null The qualified attribute name or null if not on matched tag. */ public function get_qualified_attribute_name( $attribute_name ): ?string { if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { return null; } - $namespace = $this->get_namespace(); + $namespace = $this->get_namespace(); + return self::lookup_qualified_attribute_name( $namespace, $attribute_name ); + } + + /** + * Returns the adjusted attribute name for a given attribute, taking into + * account the provided namespace. + * + * @since 7.0.0 + * + * @param string $ns The namespace to use: 'html', 'svg', or 'math'. + * @param string $attribute_name Which attribute to adjust. + * + * @return string The qualified attribute name. + */ + final protected static function lookup_qualified_attribute_name( string $ns, string $attribute_name ): string { $lower_name = strtolower( $attribute_name ); - if ( 'math' === $namespace && 'definitionurl' === $lower_name ) { + if ( 'math' === $ns && 'definitionurl' === $lower_name ) { return 'definitionURL'; } - if ( 'svg' === $this->get_namespace() ) { + if ( 'svg' === $ns ) { switch ( $lower_name ) { case 'attributename': return 'attributeName'; @@ -3270,7 +3285,7 @@ public function get_qualified_attribute_name( $attribute_name ): ?string { } } - if ( 'html' !== $namespace ) { + if ( 'html' !== $ns ) { switch ( $lower_name ) { case 'xlink:actuate': return 'xlink actuate'; diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index bb18629563493..fa492cde6939f 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -185,18 +185,23 @@ public function test_clear_to_navigate_after_seeking() { } /** - * Ensures that support is added for reconstructing active formatting elements - * before the HTML Processor handles situations with unclosed formats requiring it. + * Ensures that support is added for reconstructing active formatting elements. * * @ticket 58517 * * @covers WP_HTML_Processor::reconstruct_active_formatting_elements */ - public function test_fails_to_reconstruct_formatting_elements() { - $processor = WP_HTML_Processor::create_fragment( '

One

Two

Three

Four' ); + public function test_reconstructs_formatting_elements() { + $processor = WP_HTML_Processor::create_fragment( '

One

Two

Three

Four' ); $this->assertTrue( $processor->next_tag( 'EM' ), 'Could not find first EM.' ); - $this->assertFalse( $processor->next_tag( 'EM' ), 'Should have aborted before finding second EM as it required reconstructing the first EM.' ); + $this->assertSame( array( 'HTML', 'BODY', 'P', 'EM' ), $processor->get_breadcrumbs(), 'Found incorrect breadcrumbs for first EM.' ); + $this->assertTrue( $processor->next_tag( 'SPAN' ), 'Could not find test span.' ); + $this->assertSame( + array( 'HTML', 'BODY', 'P', 'EM', 'EM', 'SPAN' ), + $processor->get_breadcrumbs(), + 'Found incorrect breadcrumbs for test SPAN; should have created two EMs.' + ); } /** diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php index b54fc047ab040..cf609e36c60f5 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php @@ -165,46 +165,58 @@ public static function data_single_tag_of_supported_elements() { } /** - * @ticket 58517 - * - * @dataProvider data_unsupported_markup + * Ensures that formats inside unclosed A elements are reconstructed. * - * @param string $html HTML containing unsupported markup. + * @ticket 61576 */ - public function test_fails_when_encountering_unsupported_markup( $html, $description ) { - $processor = WP_HTML_Processor::create_fragment( $html ); - - while ( $processor->next_token() && null === $processor->get_attribute( 'supported' ) ) { - continue; - } + public function test_reconstructs_formatting_from_unclosed_a_elements() { + $processor = WP_HTML_Processor::create_fragment( 'Click Here' ); - $this->assertNull( - $processor->get_last_error(), - 'Bailed on unsupported input before finding supported checkpoint: check test code.' + $processor->next_tag( 'STRONG' ); + $this->assertSame( + array( 'HTML', 'BODY', 'A', 'STRONG' ), + $processor->get_breadcrumbs(), + 'Failed to construct starting breadcrumbs properly.' ); - $this->assertTrue( $processor->get_attribute( 'supported' ), 'Did not find required supported element.' ); - $processor->next_token(); - $this->assertNotNull( $processor->get_last_error(), "Didn't properly reject unsupported markup: {$description}" ); + $processor->next_tag( 'BIG' ); + $this->assertSame( + array( 'HTML', 'BODY', 'STRONG', 'A', 'BIG' ), + $processor->get_breadcrumbs(), + 'Failed to reconstruct the active formatting elements after an unclosed A element.' + ); } /** - * Data provider. + * Ensures that unclosed A elements are reconstructed. * - * @return array[] + * @ticket 61576 */ - public static function data_unsupported_markup() { - return array( - 'A with formatting following unclosed A' => array( - 'Click Here', - 'Unclosed formatting requires complicated reconstruction.', - ), + public function test_reconstructs_unclosed_a_elements() { + $processor = WP_HTML_Processor::create_fragment( '

Found me!' ); - 'A after unclosed A inside DIV' => array( - '
', - 'A is a formatting element, which requires more complicated reconstruction.', - ), + // First, there's an A tag inside the DIV. + $this->assertTrue( $processor->next_tag( 'A' ) ); + $this->assertSame( + array( 'HTML', 'BODY', 'DIV', 'A' ), + $processor->get_breadcrumbs() ); + + /* + * There's a second A tag containing the text outside the DIV. + * When the DIV closes, the unclosed A is reconstructed from inside the DIV + * to contain the following text. + */ + $this->assertTrue( $processor->next_tag( 'A' ) ); + $this->assertSame( + array( 'HTML', 'BODY', 'A' ), + $processor->get_breadcrumbs() + ); + + // Finally, the trailing text is inside the A. + $processor->next_token(); + $this->assertSame( '#text', $processor->get_token_type() ); + $this->assertSame( 'Found me!', $processor->get_modifiable_text() ); } /** diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index d87d784dbf2d4..bf6956eebbb57 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -152,6 +152,9 @@ private static function should_skip_test( ?string $test_context_element, string /** * Generates the tree-like structure represented in the Html5lib tests. * + * @throws WP_HTML_Unsupported_Exception Raises unsupported exceptions for test reporting. + * @throws Error For unexpected "impossible" cases. + * * @param string|null $fragment_context Context element in which to parse HTML, such as BODY or SVG. * @param string $html Given test HTML. * @return string|null Tree structure of parsed HTML, if supported, else null. @@ -160,6 +163,7 @@ private static function build_tree_representation( ?string $fragment_context, st $processor = $fragment_context ? WP_HTML_Processor::create_fragment( $html, "<{$fragment_context}>" ) : WP_HTML_Processor::create_full_parser( $html ); + if ( null === $processor ) { throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() ); } @@ -264,6 +268,7 @@ static function ( $a, $b ) { foreach ( $sorted_attributes as $attribute_name => $display_name ) { $val = $processor->get_attribute( $attribute_name ); + /* * Attributes with no value are `true` with the HTML API, * We map use the empty string value in the tree structure. From 1741b3fe31c0084971242484ec7d72c139e4d546 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 24 Jun 2026 09:30:05 +0000 Subject: [PATCH 3/3] HTML API: Reparse attributes for reconstructed formatting elements --- .../html-api/class-wp-html-processor.php | 118 +++++++++--------- .../html-api/class-wp-html-tag-processor.php | 25 +--- .../tests/html-api/wpHtmlProcessor.php | 76 +++++++++++ 3 files changed, 142 insertions(+), 77 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 7ff5da6b01c55..93975bcd7fc4e 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -258,18 +258,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private $context_node = null; - /** - * If a formatting element has been reconstructed, this will hold - * the parsed attributes from the original format, once requested. - * - * These attributes are not modifiable. - * - * @since 7.0.0 - * - * @var array - */ - protected $actively_reconstructed_formatting_attributes = array(); - /* * Public Interface Functions */ @@ -912,6 +900,49 @@ private function is_virtual(): bool { ); } + /** + * Returns a Tag Processor paused at the original source token for a virtual token. + * + * @since 7.0.0 + * + * @return WP_HTML_Tag_Processor|null Processor paused at the source token, or null when unavailable. + */ + private function get_virtual_token_processor(): ?WP_HTML_Tag_Processor { + if ( + ! $this->is_virtual() || + WP_HTML_Stack_Event::PUSH !== $this->current_element->operation + ) { + return null; + } + + $token = $this->current_element->token; + if ( ! isset( $token->bookmark_name, $this->bookmarks[ $token->bookmark_name ] ) ) { + return null; + } + + $source_span = $this->bookmarks[ $token->bookmark_name ]; + if ( 0 === $source_span->length ) { + return null; + } + + $processor = new WP_HTML_Tag_Processor( + substr( $this->html, $source_span->start, $source_span->length ) + ); + $processor->compat_mode = $this->compat_mode; + $processor->change_parsing_namespace( $token->namespace ); + + if ( + ! $processor->next_token() || + '#tag' !== $processor->get_token_type() || + $processor->is_tag_closer() || + $token->node_name !== $processor->get_tag() + ) { + return null; + } + + return $processor; + } + /** * Indicates if the currently-matched tag matches the given breadcrumbs. * @@ -2878,7 +2909,6 @@ private function step_in_body(): bool { if ( false === $this->state->active_formatting_elements->push( $this->state->current_token ) ) { $this->bail( 'Cannot track formatting elements when encountering a fourth identical token.' ); } - $this->actively_reconstructed_formatting_attributes[ $this->state->current_token->bookmark_name ] = $this->attributes; return true; /* @@ -2902,7 +2932,6 @@ private function step_in_body(): bool { if ( false === $this->state->active_formatting_elements->push( $this->state->current_token ) ) { $this->bail( 'Cannot track formatting elements when encountering a fourth identical token.' ); } - $this->actively_reconstructed_formatting_attributes[ $this->state->current_token->bookmark_name ] = $this->attributes; return true; /* @@ -2921,7 +2950,6 @@ private function step_in_body(): bool { if ( false === $this->state->active_formatting_elements->push( $this->state->current_token ) ) { $this->bail( 'Cannot track formatting elements when encountering a fourth identical token.' ); } - $this->actively_reconstructed_formatting_attributes[ $this->state->current_token->bookmark_name ] = $this->attributes; return true; /* @@ -5441,23 +5469,8 @@ public function get_token_type(): ?string { */ public function get_attribute( $name ) { if ( $this->is_virtual() ) { - $virtual_attributes = $this->actively_reconstructed_formatting_attributes[ $this->current_element->token->bookmark_name ?? '' ] ?? null; - if ( null === $virtual_attributes ) { - return null; - } - - $current_attributes = $this->attributes; - $current_updates = $this->lexical_updates; - $this->lexical_updates = array(); - $this->attributes = $virtual_attributes; - $parser_state = $this->parser_state; - $this->parser_state = WP_HTML_Tag_Processor::STATE_MATCHED_TAG; - $attribute_value = parent::get_attribute( $name ); - $this->attributes = $current_attributes; - $this->parser_state = $parser_state; - $this->lexical_updates = $current_updates; - - return $attribute_value; + $processor = $this->get_virtual_token_processor(); + return $processor ? $processor->get_attribute( $name ) : null; } return parent::get_attribute( $name ); @@ -5475,8 +5488,8 @@ public function get_attribute( $name ) { */ public function get_qualified_attribute_name( $attribute_name ): ?string { if ( $this->is_virtual() ) { - $namespace = $this->current_element->token->namespace; - return self::lookup_qualified_attribute_name( $namespace, $attribute_name ); + $processor = $this->get_virtual_token_processor(); + return $processor ? $processor->get_qualified_attribute_name( $attribute_name ) : null; } return parent::get_qualified_attribute_name( $attribute_name ); @@ -5558,23 +5571,8 @@ public function remove_attribute( $name ): bool { */ public function get_attribute_names_with_prefix( $prefix ): ?array { if ( $this->is_virtual() ) { - $virtual_attributes = $this->actively_reconstructed_formatting_attributes[ $this->current_element->token->bookmark_name ?? '' ] ?? null; - if ( null === $virtual_attributes ) { - return null; - } - - $current_attributes = $this->attributes; - $current_updates = $this->lexical_updates; - $this->lexical_updates = array(); - $this->attributes = $virtual_attributes; - $parser_state = $this->parser_state; - $this->parser_state = WP_HTML_Tag_Processor::STATE_MATCHED_TAG; - $attribute_names = parent::get_attribute_names_with_prefix( $prefix ); - $this->attributes = $current_attributes; - $this->parser_state = $parser_state; - $this->lexical_updates = $current_updates; - - return $attribute_names; + $processor = $this->get_virtual_token_processor(); + return $processor ? $processor->get_attribute_names_with_prefix( $prefix ) : null; } return parent::get_attribute_names_with_prefix( $prefix ); @@ -5609,15 +5607,16 @@ public function remove_class( $class_name ): bool { * * @since 6.6.0 Subclassed for the HTML Processor. * - * @todo When reconstructing active formatting elements with attributes, find a way - * to indicate if the virtually-reconstructed formatting elements contain the - * wanted class name. - * * @param string $wanted_class Look for this CSS class name, ASCII case-insensitive. * @return bool|null Whether the matched tag contains the given class name, or null if not matched. */ public function has_class( $wanted_class ): ?bool { - return $this->is_virtual() ? null : parent::has_class( $wanted_class ); + if ( $this->is_virtual() ) { + $processor = $this->get_virtual_token_processor(); + return $processor ? $processor->has_class( $wanted_class ) : null; + } + + return parent::has_class( $wanted_class ); } /** @@ -5637,7 +5636,12 @@ public function has_class( $wanted_class ): ?bool { * @since 6.6.0 Subclassed for the HTML Processor. */ public function class_list() { - return $this->is_virtual() ? null : parent::class_list(); + if ( $this->is_virtual() ) { + $processor = $this->get_virtual_token_processor(); + return $processor ? $processor->class_list() : null; + } + + return parent::class_list(); } /** diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index fbbe8d18740c8..1ac46d2996ee4 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -708,7 +708,7 @@ class WP_HTML_Tag_Processor { * @since 6.2.0 * @var WP_HTML_Attribute_Token[] */ - protected $attributes = array(); + private $attributes = array(); /** * Tracks spans of duplicate attributes on a given tag, used for removing @@ -3085,29 +3085,14 @@ public function get_qualified_attribute_name( $attribute_name ): ?string { return null; } - $namespace = $this->get_namespace(); - return self::lookup_qualified_attribute_name( $namespace, $attribute_name ); - } - - /** - * Returns the adjusted attribute name for a given attribute, taking into - * account the provided namespace. - * - * @since 7.0.0 - * - * @param string $ns The namespace to use: 'html', 'svg', or 'math'. - * @param string $attribute_name Which attribute to adjust. - * - * @return string The qualified attribute name. - */ - final protected static function lookup_qualified_attribute_name( string $ns, string $attribute_name ): string { + $namespace = $this->get_namespace(); $lower_name = strtolower( $attribute_name ); - if ( 'math' === $ns && 'definitionurl' === $lower_name ) { + if ( 'math' === $namespace && 'definitionurl' === $lower_name ) { return 'definitionURL'; } - if ( 'svg' === $ns ) { + if ( 'svg' === $namespace ) { switch ( $lower_name ) { case 'attributename': return 'attributeName'; @@ -3285,7 +3270,7 @@ final protected static function lookup_qualified_attribute_name( string $ns, str } } - if ( 'html' !== $ns ) { + if ( 'html' !== $namespace ) { switch ( $lower_name ) { case 'xlink:actuate': return 'xlink actuate'; diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index fa492cde6939f..09ba1961ce830 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -204,6 +204,82 @@ public function test_reconstructs_formatting_elements() { ); } + /** + * Ensures reconstructed active formatting elements expose their original attributes. + * + * @ticket 58517 + * + * @covers WP_HTML_Processor::reconstruct_active_formatting_elements + * @covers WP_HTML_Processor::get_attribute + * @covers WP_HTML_Processor::get_attribute_names_with_prefix + * @covers WP_HTML_Processor::get_qualified_attribute_name + * @covers WP_HTML_Processor::has_class + * @covers WP_HTML_Processor::class_list + */ + public function test_reconstructed_formatting_elements_expose_source_attributes() { + $processor = WP_HTML_Processor::create_fragment( + '

One

Two' + ); + + $this->assertTrue( $processor->next_tag( 'EM' ), 'Could not find original EM.' ); + $this->assertTrue( $processor->next_tag( 'EM' ), 'Could not find reconstructed EM.' ); + + $this->assertSame( 'one two', $processor->get_attribute( 'class' ) ); + $this->assertSame( '14', $processor->get_attribute( 'data-test-id' ) ); + $this->assertTrue( $processor->get_attribute( 'inert' ) ); + $this->assertSame( 'data-test-id', $processor->get_qualified_attribute_name( 'data-test-id' ) ); + $this->assertTrue( $processor->has_class( 'one' ) ); + $this->assertFalse( $processor->has_class( 'missing' ) ); + $this->assertSame( array( 'one', 'two' ), iterator_to_array( $processor->class_list() ) ); + + $attribute_names = $processor->get_attribute_names_with_prefix( '' ); + sort( $attribute_names ); + $this->assertSame( array( 'class', 'data-test-id', 'inert' ), $attribute_names ); + } + + /** + * Ensures reconstructed active formatting elements reparse updated source attributes. + * + * @ticket 58517 + * + * @covers WP_HTML_Processor::reconstruct_active_formatting_elements + * @covers WP_HTML_Processor::get_attribute + */ + public function test_reconstructed_formatting_elements_expose_updated_source_attributes() { + $processor = WP_HTML_Processor::create_fragment( '

One

Two' ); + + $this->assertTrue( $processor->next_tag( 'EM' ), 'Could not find original EM.' ); + $processor->set_attribute( 'class', 'after' ); + $processor->get_updated_html(); + + $this->assertTrue( $processor->next_tag( 'EM' ), 'Could not find reconstructed EM.' ); + $this->assertSame( 'after', $processor->get_attribute( 'class' ) ); + } + + /** + * Ensures synthetic virtual elements do not report source attributes. + * + * @ticket 58517 + * + * @covers WP_HTML_Processor::get_attribute + * @covers WP_HTML_Processor::get_attribute_names_with_prefix + * @covers WP_HTML_Processor::get_qualified_attribute_name + * @covers WP_HTML_Processor::has_class + * @covers WP_HTML_Processor::class_list + */ + public function test_synthetic_virtual_elements_do_not_expose_attributes() { + $processor = WP_HTML_Processor::create_fragment( '

' ); + + $this->assertTrue( $processor->next_tag(), 'Could not find implied P opener.' ); + $this->assertSame( 'P', $processor->get_tag() ); + $this->assertFalse( $processor->is_tag_closer() ); + $this->assertNull( $processor->get_attribute( 'class' ) ); + $this->assertNull( $processor->get_attribute_names_with_prefix( '' ) ); + $this->assertNull( $processor->get_qualified_attribute_name( 'class' ) ); + $this->assertNull( $processor->has_class( 'anything' ) ); + $this->assertNull( $processor->class_list() ); + } + /** * Ensure non-nesting tags do not nest. *