Skip to content

Commit

Permalink
HTML API: Add support for BR, EMBED, & other tags.
Browse files Browse the repository at this point in the history
Adds support for the following HTML elements to the HTML Processor:

 - AREA, BR, EMBED, KEYGEN, WBR
 - Only the opening BR tag is supported, as the invalid closer `</br>`
   involves more complicated rules, to be implemented later.

Previously, these elements were not supported and the HTML Processor
would bail when encountering them. With this patch it will proceed to
parse an HTML document when encountering those tags as long as other
normal conditions don't cause it to bail (such as complicated format
reconstruction rules).

Props jonsurrell, dmsnell
Fixes #60283



git-svn-id: https://develop.svn.wordpress.org/trunk@57316 602fd350-edb4-49c9-b593-d223f7449a82
  • Loading branch information
dmsnell committed Jan 19, 2024
1 parent 5815624 commit 91e51f9
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 19 deletions.
31 changes: 22 additions & 9 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -102,17 +102,17 @@
* - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY.
* - Custom elements: All custom elements are supported. :)
* - Form elements: BUTTON, DATALIST, FIELDSET, LABEL, LEGEND, METER, PROGRESS, SEARCH.
* - Formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U.
* - Formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U, WBR.
* - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP.
* - Links: A.
* - Lists: DD, DL, DT, LI, OL, UL.
* - Media elements: AUDIO, CANVAS, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, VIDEO.
* - Paragraph: P.
* - Phrasing elements: ABBR, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR.
* - Media elements: AUDIO, CANVAS, EMBED, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, VIDEO.
* - Paragraph: BR, P.
* - Phrasing elements: AREA, ABBR, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR.
* - Sectioning elements: ARTICLE, ASIDE, HR, NAV, SECTION.
* - Templating elements: SLOT.
* - Text decoration: RUBY.
* - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, MULTICOL, NEXTID, SPACER.
* - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, KEYGEN, MULTICOL, NEXTID, SPACER.
*
* ### Supported markup
*
Expand Down Expand Up @@ -934,12 +934,28 @@ private function step_in_body() {
$this->run_adoption_agency_algorithm();
return true;

/*
* > An end tag whose tag name is "br"
* > Parse error. Drop the attributes from the token, and act as described in the next
* > entry; i.e. act as if this was a "br" start tag token with no attributes, rather
* > than the end tag token that it actually is.
*/
case '-BR':
$this->last_error = self::ERROR_UNSUPPORTED;
throw new WP_HTML_Unsupported_Exception( 'Closing BR tags require unimplemented special handling.' );

/*
* > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr"
*/
case '+AREA':
case '+BR':
case '+EMBED':
case '+IMG':
case '+KEYGEN':
case '+WBR':
$this->reconstruct_active_formatting_elements();
$this->insert_html_element( $this->state->current_token );
$this->state->frameset_ok = false;
return true;

/*
Expand Down Expand Up @@ -977,21 +993,18 @@ private function step_in_body() {
case 'BASEFONT':
case 'BGSOUND':
case 'BODY':
case 'BR':
case 'CAPTION':
case 'COL':
case 'COLGROUP':
case 'DD':
case 'DT':
case 'EMBED':
case 'FORM':
case 'FRAME':
case 'FRAMESET':
case 'HEAD':
case 'HTML':
case 'IFRAME':
case 'INPUT':
case 'KEYGEN':
case 'LI':
case 'LINK':
case 'LISTING':
Expand Down Expand Up @@ -1031,7 +1044,6 @@ private function step_in_body() {
case 'TR':
case 'TRACK':
case 'UL':
case 'WBR':
case 'XMP':
$this->last_error = self::ERROR_UNSUPPORTED;
throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
Expand Down Expand Up @@ -1692,6 +1704,7 @@ public static function is_void( $tag_name ) {
'IMG' === $tag_name ||
'INPUT' === $tag_name ||
'LINK' === $tag_name ||
'KEYGEN' === $tag_name || // Obsolete but still treated as void.
'META' === $tag_name ||
'SOURCE' === $tag_name ||
'TRACK' === $tag_name ||
Expand Down
85 changes: 80 additions & 5 deletions tests/phpunit/tests/html-api/wpHtmlProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,86 @@ public function test_fails_to_reconstruct_formatting_elements() {
$this->assertFalse( $p->next_tag( 'EM' ), 'Should have aborted before finding second EM as it required reconstructing the first EM.' );
}

/**
 * Verifies that a void element never becomes the parent of the element that follows it.
 *
 * @ticket 60283
 *
 * @covers WP_HTML_Processor::step_in_body
 * @covers WP_HTML_Processor::is_void
 *
 * @dataProvider data_void_tags
 *
 * @param string $tag_name Name of void tag under test.
 */
public function test_cannot_nest_void_tags( $tag_name ) {
	/*
	 * Taking `IMG` as the tag under test, the fragment `<img><div>`
	 * is expected to produce the same structure as this document:
	 *
	 *     <html>
	 *         <body>
	 *             <img>
	 *             <div></div>
	 *         </body>
	 *     </html>
	 */
	$fragment         = WP_HTML_Processor::create_fragment( "<{$tag_name}><div>" );
	$matched_void_tag = $fragment->next_tag();

	// Tags the processor reports as unsupported are skipped instead of failing the test.
	$last_error = $fragment->get_last_error();
	if ( WP_HTML_Processor::ERROR_UNSUPPORTED === $last_error ) {
		$this->markTestSkipped( "Tag {$tag_name} is not supported." );
	}

	$this->assertTrue( $matched_void_tag, "Could not find first {$tag_name}." );

	// The void tag itself must sit directly inside BODY.
	$void_tag_breadcrumbs = array( 'HTML', 'BODY', $tag_name );
	$this->assertSame(
		$void_tag_breadcrumbs,
		$fragment->get_breadcrumbs(),
		'Found incorrect nesting of first element.'
	);

	$matched_div = $fragment->next_tag();
	$this->assertTrue( $matched_div, 'Should have found the DIV as the second tag.' );

	// The DIV must also sit directly inside BODY; the void tag must not appear in its breadcrumbs.
	$div_breadcrumbs = array( 'HTML', 'BODY', 'DIV' );
	$this->assertSame(
		$div_breadcrumbs,
		$fragment->get_breadcrumbs(),
		"DIV should have been a sibling of the {$tag_name}."
	);
}

/**
 * Data provider supplying the void tag names used by test_cannot_nest_void_tags.
 *
 * @return array[] Datasets keyed by tag name; each dataset holds that tag name as its only argument.
 */
public function data_void_tags() {
	$void_tag_names = array(
		'AREA',
		'BASE',
		'BR',
		'COL',
		'EMBED',
		'HR',
		'IMG',
		'INPUT',
		'KEYGEN',
		'LINK',
		'META',
		'SOURCE',
		'TRACK',
		'WBR',
	);

	// Name each dataset after its tag so a failing case is easy to identify.
	$datasets = array();
	foreach ( $void_tag_names as $void_tag_name ) {
		$datasets[ $void_tag_name ] = array( $void_tag_name );
	}

	return $datasets;
}

/**
* Ensures that special handling of unsupported tags is cleaned up
* as handling is implemented. Otherwise there's risk of leaving special
Expand Down Expand Up @@ -159,24 +239,20 @@ public function test_step_in_body_fails_on_unsupported_tags( $tag_name ) {
public function data_unsupported_special_in_body_tags() {
return array(
'APPLET' => array( 'APPLET' ),
'AREA' => array( 'AREA' ),
'BASE' => array( 'BASE' ),
'BASEFONT' => array( 'BASEFONT' ),
'BGSOUND' => array( 'BGSOUND' ),
'BODY' => array( 'BODY' ),
'BR' => array( 'BR' ),
'CAPTION' => array( 'CAPTION' ),
'COL' => array( 'COL' ),
'COLGROUP' => array( 'COLGROUP' ),
'EMBED' => array( 'EMBED' ),
'FORM' => array( 'FORM' ),
'FRAME' => array( 'FRAME' ),
'FRAMESET' => array( 'FRAMESET' ),
'HEAD' => array( 'HEAD' ),
'HTML' => array( 'HTML' ),
'IFRAME' => array( 'IFRAME' ),
'INPUT' => array( 'INPUT' ),
'KEYGEN' => array( 'KEYGEN' ),
'LINK' => array( 'LINK' ),
'LISTING' => array( 'LISTING' ),
'MARQUEE' => array( 'MARQUEE' ),
Expand Down Expand Up @@ -213,7 +289,6 @@ public function data_unsupported_special_in_body_tags() {
'TITLE' => array( 'TITLE' ),
'TR' => array( 'TR' ),
'TRACK' => array( 'TRACK' ),
'WBR' => array( 'WBR' ),
'XMP' => array( 'XMP' ),
);
}
Expand Down
5 changes: 0 additions & 5 deletions tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php
Original file line number Diff line number Diff line change
Expand Up @@ -162,23 +162,19 @@ public function test_fails_when_encountering_unsupported_tag( $html ) {
public function data_unsupported_elements() {
$unsupported_elements = array(
'APPLET', // Deprecated.
'AREA',
'BASE',
'BGSOUND', // Deprecated; self-closing if self-closing flag provided, otherwise normal.
'BODY',
'BR',
'CAPTION',
'COL',
'COLGROUP',
'EMBED',
'FORM',
'FRAME',
'FRAMESET',
'HEAD',
'HTML',
'IFRAME',
'INPUT',
'KEYGEN', // Deprecated; void.
'LINK',
'LISTING', // Deprecated, use PRE instead.
'MARQUEE', // Deprecated.
Expand Down Expand Up @@ -213,7 +209,6 @@ public function data_unsupported_elements() {
'TITLE',
'TR',
'TRACK',
'WBR',
'XMP', // Deprecated, use PRE instead.
);

Expand Down
25 changes: 25 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php
Original file line number Diff line number Diff line change
Expand Up @@ -392,4 +392,29 @@ public function test_in_body_any_other_end_tag_with_unclosed_non_special_element
$this->assertSame( 'DIV', $p->get_tag(), "Expected to find DIV element, but found {$p->get_tag()} instead." );
$this->assertSame( array( 'HTML', 'BODY', 'DIV', 'DIV' ), $p->get_breadcrumbs(), 'Failed to produce expected DOM nesting: SPAN should be closed and DIV should be its sibling.' );
}

/**
 * Ensures that support isn't accidentally partially added for the closing BR tag `</br>`.
 *
 * This tag closer has special rules, and support shouldn't be added without
 * implementing those rules in full:
 *
 * > An end tag whose tag name is "br"
 * > Parse error. Drop the attributes from the token, and act as described in the next entry;
 * > i.e. act as if this was a "br" start tag token with no attributes, rather than the end
 * > tag token that it actually is.
 *
 * When this handling is implemented, this test should be removed. It's not incorporated
 * into the existing unsupported tag behavior test because the opening tag is supported;
 * only the closing tag isn't.
 *
 * @ticket 60283
 *
 * @covers WP_HTML_Processor::step_in_body
 */
public function test_br_end_tag_unsupported() {
	$p = WP_HTML_Processor::create_fragment( '</br>' );

	$this->assertFalse( $p->next_tag(), 'Found a BR tag that should not be handled.' );

	// Check the reason as well: the processor must report the closer as unsupported.
	$this->assertSame(
		WP_HTML_Processor::ERROR_UNSUPPORTED,
		$p->get_last_error(),
		'Should have reported an unsupported error for the BR closing tag.'
	);
}
}

0 comments on commit 91e51f9

Please sign in to comment.