Skip to content

Commit

Permalink
HTML API: Add missing subclass methods to HTML Processor and add toke…
Browse files Browse the repository at this point in the history
…n provenance.

This patch introduces two related changes:

 - It adds missing subclass methods on the HTML Processor which needed
   to be implemented since it started visiting virtual nodes. These
   methods need to account for the fact that not all tokens truly exist.

 - It adds a new concept and internal method, `is_virtual()`, indicating
   if the currently-matched token comes from the raw text in the input
   HTML document or if it was the byproduct of semantic parsing rules.
   This internal method and new vocabulary around token provenance
   considerably simplifies the logic spread throughout the rest of the
   class and its subclass methods.

Developed in WordPress#6860
Discussed in https://core.trac.wordpress.org/ticket/61348

Follow-up to [58304].

Props dmsnell, jonsurrell, gziolo.
See #61348.


git-svn-id: https://develop.svn.wordpress.org/trunk@58558 602fd350-edb4-49c9-b593-d223f7449a82
  • Loading branch information
dmsnell committed Jun 25, 2024
1 parent c2e7ab3 commit 5655fb0
Show file tree
Hide file tree
Showing 2 changed files with 186 additions and 55 deletions.
218 changes: 168 additions & 50 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -349,13 +349,19 @@ public function __construct( $html, $use_the_static_create_methods_instead = nul

$this->state->stack_of_open_elements->set_push_handler(
function ( WP_HTML_Token $token ) {
$this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::PUSH );
$is_virtual = ! isset( $this->state->current_token ) || $this->is_tag_closer();
$same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name;
$provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real';
$this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::PUSH, $provenance );
}
);

$this->state->stack_of_open_elements->set_pop_handler(
function ( WP_HTML_Token $token ) {
$this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::POP );
$is_virtual = ! isset( $this->state->current_token ) || ! $this->is_tag_closer();
$same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name;
$provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real';
$this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::POP, $provenance );
}
);

Expand Down Expand Up @@ -569,11 +575,26 @@ public function next_token() {
* @return bool Whether the current tag is a tag closer.
*/
public function is_tag_closer() {
return isset( $this->current_element )
? ( WP_HTML_Stack_Event::POP === $this->current_element->operation )
return $this->is_virtual()
? ( WP_HTML_Stack_Event::POP === $this->current_element->operation && '#tag' === $this->get_token_type() )
: parent::is_tag_closer();
}

/**
* Indicates if the currently-matched token is virtual, created by a stack operation
* while processing HTML, rather than a token found in the HTML text itself.
*
* @since 6.6.0
*
* @return bool Whether the current token is virtual.
*/
private function is_virtual() {
return (
isset( $this->current_element->provenance ) &&
'virtual' === $this->current_element->provenance
);
}

/**
* Indicates if the currently-matched tag matches the given breadcrumbs.
*
Expand Down Expand Up @@ -1440,7 +1461,7 @@ public function get_tag() {
return null;
}

if ( isset( $this->current_element ) ) {
if ( $this->is_virtual() ) {
return $this->current_element->token->node_name;
}

Expand All @@ -1459,6 +1480,27 @@ public function get_tag() {
}
}

/**
* Indicates if the currently matched tag contains the self-closing flag.
*
* No HTML elements ought to have the self-closing flag and for those, the self-closing
* flag will be ignored. For void elements this is benign because they "self close"
* automatically. For non-void HTML elements though problems will appear if someone
* intends to use a self-closing element in place of that element with an empty body.
* For HTML foreign elements and custom elements the self-closing flag determines if
* they self-close or not.
*
* This function does not determine if a tag is self-closing,
* but only if the self-closing flag is present in the syntax.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @return bool Whether the currently matched tag contains the self-closing flag.
*/
public function has_self_closing_flag() {
return $this->is_virtual() ? false : parent::has_self_closing_flag();
}

/**
* Returns the node name represented by the token.
*
Expand All @@ -1480,11 +1522,9 @@ public function get_tag() {
* @return string|null Name of the matched token.
*/
public function get_token_name() {
if ( isset( $this->current_element ) ) {
return $this->current_element->token->node_name;
}

return parent::get_token_name();
return $this->is_virtual()
? $this->current_element->token->node_name
: parent::get_token_name();
}

/**
Expand All @@ -1510,9 +1550,16 @@ public function get_token_name() {
* @return string|null What kind of token is matched, or null.
*/
public function get_token_type() {
if ( isset( $this->current_element ) ) {
$node_name = $this->current_element->token->node_name;
if ( ctype_upper( $node_name[0] ) ) {
if ( $this->is_virtual() ) {
/*
* This logic comes from the Tag Processor.
*
* @todo It would be ideal not to repeat this here, but it's not clearly
* better to allow passing a token name to `get_token_type()`.
*/
$node_name = $this->current_element->token->node_name;
$starting_char = $node_name[0];
if ( 'A' <= $starting_char && 'Z' >= $starting_char ) {
return '#tag';
}

Expand Down Expand Up @@ -1546,25 +1593,38 @@ public function get_token_type() {
* @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`.
*/
public function get_attribute( $name ) {
if ( isset( $this->current_element ) ) {
// Closing tokens cannot contain attributes.
if ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
return null;
}

$node_name = $this->current_element->token->node_name;

// Only tags can contain attributes.
if ( 'A' > $node_name[0] || 'Z' < $node_name[0] ) {
return null;
}
return $this->is_virtual() ? null : parent::get_attribute( $name );
}

if ( $this->current_element->token->bookmark_name === (string) $this->bookmark_counter ) {
return parent::get_attribute( $name );
}
}
/**
* Updates or creates a new attribute on the currently matched tag with the passed value.
*
* For boolean attributes special handling is provided:
* - When `true` is passed as the value, then only the attribute name is added to the tag.
* - When `false` is passed, the attribute gets removed if it existed before.
*
* For string attributes, the value is escaped using the `esc_attr` function.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @param string $name The attribute name to target.
* @param string|bool $value The new attribute value.
* @return bool Whether an attribute value was set.
*/
public function set_attribute( $name, $value ) {
return $this->is_virtual() ? false : parent::set_attribute( $name, $value );
}

return null;
/**
* Remove an attribute from the currently-matched tag.
*
* @since 6.6.0 Subclassed for HTML Processor.
*
* @param string $name The attribute name to remove.
* @return bool Whether an attribute was removed.
*/
public function remove_attribute( $name ) {
return $this->is_virtual() ? false : parent::remove_attribute( $name );
}

/**
Expand Down Expand Up @@ -1594,18 +1654,63 @@ public function get_attribute( $name ) {
* @return array|null List of attribute names, or `null` when no tag opener is matched.
*/
public function get_attribute_names_with_prefix( $prefix ) {
if ( isset( $this->current_element ) ) {
if ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
return null;
}
return $this->is_virtual() ? null : parent::get_attribute_names_with_prefix( $prefix );
}

$mark = $this->bookmarks[ $this->current_element->token->bookmark_name ];
if ( 0 === $mark->length ) {
return null;
}
}
/**
* Adds a new class name to the currently matched tag.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @param string $class_name The class name to add.
* @return bool Whether the class was set to be added.
*/
public function add_class( $class_name ) {
return $this->is_virtual() ? false : parent::add_class( $class_name );
}

/**
* Removes a class name from the currently matched tag.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @param string $class_name The class name to remove.
* @return bool Whether the class was set to be removed.
*/
public function remove_class( $class_name ) {
return $this->is_virtual() ? false : parent::remove_class( $class_name );
}

/**
* Returns if a matched tag contains the given ASCII case-insensitive class name.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @param string $wanted_class Look for this CSS class name, ASCII case-insensitive.
* @return bool|null Whether the matched tag contains the given class name, or null if not matched.
*/
public function has_class( $wanted_class ) {
return $this->is_virtual() ? null : parent::has_class( $wanted_class );
}

return parent::get_attribute_names_with_prefix( $prefix );
/**
* Generator for a foreach loop to step through each class name for the matched tag.
*
* This generator function is designed to be used inside a "foreach" loop.
*
* Example:
*
* $p = WP_HTML_Processor::create_fragment( "<div class='free &lt;egg&lt;\tlang-en'>" );
* $p->next_tag();
* foreach ( $p->class_list() as $class_name ) {
* echo "{$class_name} ";
* }
* // Outputs: "free <egg> lang-en "
*
* @since 6.6.0 Subclassed for the HTML Processor.
*/
public function class_list() {
return $this->is_virtual() ? null : parent::class_list();
}

/**
Expand All @@ -1629,17 +1734,30 @@ public function get_attribute_names_with_prefix( $prefix ) {
* @return string
*/
public function get_modifiable_text() {
if ( isset( $this->current_element ) ) {
if ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
return '';
}
return $this->is_virtual() ? '' : parent::get_modifiable_text();
}

$mark = $this->bookmarks[ $this->current_element->token->bookmark_name ];
if ( 0 === $mark->length ) {
return '';
}
}
return parent::get_modifiable_text();
/**
* Indicates what kind of comment produced the comment node.
*
* Because there are different kinds of HTML syntax which produce
* comments, the Tag Processor tracks and exposes this as a type
* for the comment. Nominally only regular HTML comments exist as
* they are commonly known, but a number of unrelated syntax errors
* also produce comments.
*
* @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT
* @see self::COMMENT_AS_CDATA_LOOKALIKE
* @see self::COMMENT_AS_INVALID_HTML
* @see self::COMMENT_AS_HTML_COMMENT
* @see self::COMMENT_AS_PI_NODE_LOOKALIKE
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @return string|null
*/
public function get_comment_type() {
return $this->is_virtual() ? null : parent::get_comment_type();
}

/**
Expand Down
23 changes: 18 additions & 5 deletions src/wp-includes/html-api/class-wp-html-stack-event.php
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,27 @@ class WP_HTML_Stack_Event {
*/
public $operation;

/**
* Indicates if the stack element is a real or virtual node.
*
* @since 6.6.0
*
* @var string
*/
public $provenance;

/**
* Constructor function.
*
* @param WP_HTML_Token $token Token associated with stack event, always an opening token.
* @param string $operation One of self::PUSH or self::POP.
* @since 6.6.0
*
* @param WP_HTML_Token $token Token associated with stack event, always an opening token.
* @param string $operation One of self::PUSH or self::POP.
* @param string $provenance "virtual" or "real".
*/
public function __construct( $token, $operation ) {
$this->token = $token;
$this->operation = $operation;
public function __construct( $token, $operation, $provenance ) {
$this->token = $token;
$this->operation = $operation;
$this->provenance = $provenance;
}
}

0 comments on commit 5655fb0

Please sign in to comment.