class WP_Textdomain_Registry { /** * List of domains and all their language directory paths for each locale. * * @since 6.1.0 * * @var array */ protected $all = array(); /** * List of domains and their language directory path for the current (most recent) locale. * * @since 6.1.0 * * @var array */ protected $current = array(); /** * List of domains and their custom language directory paths. * * @see load_plugin_textdomain() * @see load_theme_textdomain() * * @since 6.1.0 * * @var array */ protected $custom_paths = array(); /** * Holds a cached list of available .mo files to improve performance. * * @since 6.1.0 * * @var array */ protected $cached_mo_files = array(); /** * Holds a cached list of domains with translations to improve performance. * * @since 6.2.0 * * @var string[] */ protected $domains_with_translations = array(); /** * Returns the languages directory path for a specific domain and locale. * * @since 6.1.0 * * @param string $domain Text domain. * @param string $locale Locale. * * @return string|false MO file path or false if there is none available. */ public function get( $domain, $locale ) { if ( isset( $this->all[ $domain ][ $locale ] ) ) { return $this->all[ $domain ][ $locale ]; } return $this->get_path_from_lang_dir( $domain, $locale ); } /** * Determines whether any MO file paths are available for the domain. * * This is the case if a path has been set for the current locale, * or if there is no information stored yet, in which case * _load_textdomain_just_in_time() will fetch the information first. * * @since 6.1.0 * * @param string $domain Text domain. * @return bool Whether any MO file paths are available for the domain. */ public function has( $domain ) { return ( isset( $this->current[ $domain ] ) || empty( $this->all[ $domain ] ) || in_array( $domain, $this->domains_with_translations, true ) ); } /** * Sets the language directory path for a specific domain and locale. * * Also sets the 'current' property for direct access * to the path for the current (most recent) locale. * * @since 6.1.0 * * @param string $domain Text domain. * @param string $locale Locale. * @param string|false $path Language directory path or false if there is none available. */ public function set( $domain, $locale, $path ) { $this->all[ $domain ][ $locale ] = $path ? rtrim( $path, '/' ) . '/' : false; $this->current[ $domain ] = $this->all[ $domain ][ $locale ]; } /** * Sets the custom path to the plugin's/theme's languages directory. * * Used by load_plugin_textdomain() and load_theme_textdomain(). * * @since 6.1.0 * * @param string $domain Text domain. * @param string $path Language directory path. */ public function set_custom_path( $domain, $path ) { $this->custom_paths[ $domain ] = rtrim( $path, '/' ); } /** * Returns possible language directory paths for a given text domain. * * @since 6.2.0 * * @param string $domain Text domain. * @return string[] Array of language directory paths. */ private function get_paths_for_domain( $domain ) { $locations = array( WP_LANG_DIR . '/plugins', WP_LANG_DIR . '/themes', ); if ( isset( $this->custom_paths[ $domain ] ) ) { $locations[] = $this->custom_paths[ $domain ]; } return $locations; } /** * Gets the path to the language directory for the current locale. * * Checks the plugins and themes language directories as well as any * custom directory set via load_plugin_textdomain() or load_theme_textdomain(). * * @since 6.1.0 * * @see _get_path_to_translation_from_lang_dir() * * @param string $domain Text domain. * @param string $locale Locale. * @return string|false Language directory path or false if there is none available. */ private function get_path_from_lang_dir( $domain, $locale ) { $locations = $this->get_paths_for_domain( $domain ); $found_location = false; foreach ( $locations as $location ) { if ( ! isset( $this->cached_mo_files[ $location ] ) ) { $this->set_cached_mo_files( $location ); } $path = "$location/$domain-$locale.mo"; foreach ( $this->cached_mo_files[ $location ] as $mo_path ) { if ( ! in_array( $domain, $this->domains_with_translations, true ) && str_starts_with( str_replace( "$location/", '', $mo_path ), "$domain-" ) ) { $this->domains_with_translations[] = $domain; } if ( $mo_path === $path ) { $found_location = rtrim( $location, '/' ) . '/'; } } } if ( $found_location ) { $this->set( $domain, $locale, $found_location ); return $found_location; } /* * If no path is found for the given locale and a custom path has been set * using load_plugin_textdomain/load_theme_textdomain, use that one. */ if ( 'en_US' !== $locale && isset( $this->custom_paths[ $domain ] ) ) { $fallback_location = rtrim( $this->custom_paths[ $domain ], '/' ) . '/'; $this->set( $domain, $locale, $fallback_location ); return $fallback_location; } $this->set( $domain, $locale, false ); return false; } /** * Reads and caches all available MO files from a given directory. * * @since 6.1.0 * * @param string $path Language directory path. */ private function set_cached_mo_files( $path ) { $this->cached_mo_files[ $path ] = array(); $mo_files = glob( $path . '/*.mo' ); if ( $mo_files ) { $this->cached_mo_files[ $path ] = $mo_files; } } } class WP_Theme_JSON_Data { /** * Container of the data to update. * * @since 6.1.0 * @var WP_Theme_JSON */ private $theme_json = null; /** * The origin of the data: default, theme, user, etc. * * @since 6.1.0 * @var string */ private $origin = ''; /** * Constructor. * * @since 6.1.0 * * @link https://developer.wordpress.org/block-editor/reference-guides/theme-json-reference/ * * @param array $data Array following the theme.json specification. * @param string $origin The origin of the data: default, theme, user. */ public function __construct( $data = array(), $origin = 'theme' ) { $this->origin = $origin; $this->theme_json = new WP_Theme_JSON( $data, $this->origin ); } /** * Updates the theme.json with the the given data. * * @since 6.1.0 * * @param array $new_data Array following the theme.json specification. * * @return WP_Theme_JSON_Data The own instance with access to the modified data. */ public function update_with( $new_data ) { $this->theme_json->merge( new WP_Theme_JSON( $new_data, $this->origin ) ); return $this; } /** * Returns an array containing the underlying data * following the theme.json specification. * * @since 6.1.0 * * @return array */ public function get_data() { return $this->theme_json->get_raw_data(); } } class WP_HTML_Attribute_Token { /** * Attribute name. * * @since 6.2.0 * @var string */ public $name; /** * Attribute value. * * @since 6.2.0 * @var int */ public $value_starts_at; /** * How many bytes the value occupies in the input HTML. * * @since 6.2.0 * @var int */ public $value_length; /** * The string offset where the attribute name starts. * * @since 6.2.0 * @var int */ public $start; /** * The string offset after the attribute value or its name. * * @since 6.2.0 * @var int */ public $end; /** * Whether the attribute is a boolean attribute with value `true`. * * @since 6.2.0 * @var bool */ public $is_true; /** * Constructor. * * @since 6.2.0 * * @param string $name Attribute name. * @param int $value_start Attribute value. * @param int $value_length Number of bytes attribute value spans. * @param int $start The string offset where the attribute name starts. * @param int $end The string offset after the attribute value or its name. * @param bool $is_true Whether the attribute is a boolean attribute with true value. */ public function __construct( $name, $value_start, $value_length, $start, $end, $is_true ) { $this->name = $name; $this->value_starts_at = $value_start; $this->value_length = $value_length; $this->start = $start; $this->end = $end; $this->is_true = $is_true; } } class WP_HTML_Span { /** * Byte offset into document where span begins. * * @since 6.2.0 * @var int */ public $start; /** * Byte offset into document where span ends. * * @since 6.2.0 * @var int */ public $end; /** * Constructor. * * @since 6.2.0 * * @param int $start Byte offset into document where replacement span begins. * @param int $end Byte offset into document where replacement span ends. */ public function __construct( $start, $end ) { $this->start = $start; $this->end = $end; } } class WP_HTML_Text_Replacement { /** * Byte offset into document where replacement span begins. * * @since 6.2.0 * @var int */ public $start; /** * Byte offset into document where replacement span ends. * * @since 6.2.0 * @var int */ public $end; /** * Span of text to insert in document to replace existing content from start to end. * * @since 6.2.0 * @var string */ public $text; /** * Constructor. * * @since 6.2.0 * * @param int $start Byte offset into document where replacement span begins. * @param int $end Byte offset into document where replacement span ends. * @param string $text Span of text to insert in document to replace existing content from start to end. */ public function __construct( $start, $end, $text ) { $this->start = $start; $this->end = $end; $this->text = $text; } } class WP_HTML_Tag_Processor { /** * The maximum number of bookmarks allowed to exist at * any given time. * * @since 6.2.0 * @var int * * @see WP_HTML_Tag_Processor::set_bookmark() */ const MAX_BOOKMARKS = 10; /** * Maximum number of times seek() can be called. * Prevents accidental infinite loops. * * @since 6.2.0 * @var int * * @see WP_HTML_Tag_Processor::seek() */ const MAX_SEEK_OPS = 1000; /** * The HTML document to parse. * * @since 6.2.0 * @var string */ protected $html; /** * The last query passed to next_tag(). * * @since 6.2.0 * @var array|null */ private $last_query; /** * The tag name this processor currently scans for. * * @since 6.2.0 * @var string|null */ private $sought_tag_name; /** * The CSS class name this processor currently scans for. * * @since 6.2.0 * @var string|null */ private $sought_class_name; /** * The match offset this processor currently scans for. * * @since 6.2.0 * @var int|null */ private $sought_match_offset; /** * Whether to visit tag closers, e.g. , when walking an input document. * * @since 6.2.0 * @var bool */ private $stop_on_tag_closers; /** * How many bytes from the original HTML document have been read and parsed. * * This value points to the latest byte offset in the input document which * has been already parsed. It is the internal cursor for the Tag Processor * and updates while scanning through the HTML tokens. * * @since 6.2.0 * @var int */ private $bytes_already_parsed = 0; /** * Byte offset in input document where current tag name starts. * * Example: * *
... * 01234 * - tag name starts at 1 * * @since 6.2.0 * @var int|null */ private $tag_name_starts_at; /** * Byte length of current tag name. * * Example: * *
... * 01234 * --- tag name length is 3 * * @since 6.2.0 * @var int|null */ private $tag_name_length; /** * Byte offset in input document where current tag token ends. * * Example: * *
... * 0 1 | * 01234567890123456 * --- tag name ends at 14 * * @since 6.2.0 * @var int|null */ private $tag_ends_at; /** * Whether the current tag is an opening tag, e.g.
, or a closing tag, e.g.
. * * @var bool */ private $is_closing_tag; /** * Lazily-built index of attributes found within an HTML tag, keyed by the attribute name. * * Example: * * // Supposing the parser is working through this content * // and stops after recognizing the `id` attribute. * //
* // ^ parsing will continue from this point. * $this->attributes = array( * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ) * ); * * // When picking up parsing again, or when asking to find the * // `class` attribute we will continue and add to this array. * $this->attributes = array( * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ), * 'class' => new WP_HTML_Attribute_Match( 'class', 'outline', 18, 32 ) * ); * * // Note that only the `class` attribute value is stored in the index. * // That's because it is the only value used by this class at the moment. * * @since 6.2.0 * @var WP_HTML_Attribute_Token[] */ private $attributes = array(); /** * Tracks spans of duplicate attributes on a given tag, used for removing * all copies of an attribute when calling `remove_attribute()`. * * @since 6.3.2 * * @var (WP_HTML_Span[])[]|null */ private $duplicate_attributes = null; /** * Which class names to add or remove from a tag. * * These are tracked separately from attribute updates because they are * semantically distinct, whereas this interface exists for the common * case of adding and removing class names while other attributes are * generally modified as with DOM `setAttribute` calls. * * When modifying an HTML document these will eventually be collapsed * into a single `set_attribute( 'class', $changes )` call. * * Example: * * // Add the `wp-block-group` class, remove the `wp-group` class. * $classname_updates = array( * // Indexed by a comparable class name. * 'wp-block-group' => WP_HTML_Tag_Processor::ADD_CLASS, * 'wp-group' => WP_HTML_Tag_Processor::REMOVE_CLASS * ); * * @since 6.2.0 * @var bool[] */ private $classname_updates = array(); /** * Tracks a semantic location in the original HTML which * shifts with updates as they are applied to the document. * * @since 6.2.0 * @var WP_HTML_Span[] */ protected $bookmarks = array(); const ADD_CLASS = true; const REMOVE_CLASS = false; const SKIP_CLASS = null; /** * Lexical replacements to apply to input HTML document. * * "Lexical" in this class refers to the part of this class which * operates on pure text _as text_ and not as HTML. There's a line * between the public interface, with HTML-semantic methods like * `set_attribute` and `add_class`, and an internal state that tracks * text offsets in the input document. * * When higher-level HTML methods are called, those have to transform their * operations (such as setting an attribute's value) into text diffing * operations (such as replacing the sub-string from indices A to B with * some given new string). These text-diffing operations are the lexical * updates. * * As new higher-level methods are added they need to collapse their * operations into these lower-level lexical updates since that's the * Tag Processor's internal language of change. Any code which creates * these lexical updates must ensure that they do not cross HTML syntax * boundaries, however, so these should never be exposed outside of this * class or any classes which intentionally expand its functionality. * * These are enqueued while editing the document instead of being immediately * applied to avoid processing overhead, string allocations, and string * copies when applying many updates to a single document. * * Example: * * // Replace an attribute stored with a new value, indices * // sourced from the lazily-parsed HTML recognizer. * $start = $attributes['src']->start; * $end = $attributes['src']->end; * $modifications[] = new WP_HTML_Text_Replacement( $start, $end, $new_value ); * * // Correspondingly, something like this will appear in this array. * $lexical_updates = array( * WP_HTML_Text_Replacement( 14, 28, 'https://my-site.my-domain/wp-content/uploads/2014/08/kittens.jpg' ) * ); * * @since 6.2.0 * @var WP_HTML_Text_Replacement[] */ protected $lexical_updates = array(); /** * Tracks and limits `seek()` calls to prevent accidental infinite loops. * * @since 6.2.0 * @var int * * @see WP_HTML_Tag_Processor::seek() */ protected $seek_count = 0; /** * Constructor. * * @since 6.2.0 * * @param string $html HTML to process. */ public function __construct( $html ) { $this->html = $html; } /** * Finds the next tag matching the $query. * * @since 6.2.0 * * @param array|string|null $query { * Optional. Which tag name to find, having which class, etc. Default is to find any tag. * * @type string|null $tag_name Which tag to find, or `null` for "any tag." * @type int|null $match_offset Find the Nth tag matching all search criteria. * 1 for "first" tag, 3 for "third," etc. * Defaults to first tag. * @type string|null $class_name Tag must contain this whole class name to match. * @type string|null $tag_closers "visit" or "skip": whether to stop on tag closers, e.g.
. * } * @return bool Whether a tag was matched. */ public function next_tag( $query = null ) { $this->parse_query( $query ); $already_found = 0; do { if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { return false; } // Find the next tag if it exists. if ( false === $this->parse_next_tag() ) { $this->bytes_already_parsed = strlen( $this->html ); return false; } // Parse all of its attributes. while ( $this->parse_next_attribute() ) { continue; } // Ensure that the tag closes before the end of the document. if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { return false; } $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); if ( false === $tag_ends_at ) { return false; } $this->tag_ends_at = $tag_ends_at; $this->bytes_already_parsed = $tag_ends_at; // Finally, check if the parsed tag and its attributes match the search query. if ( $this->matches() ) { ++$already_found; } /* * For non-DATA sections which might contain text that looks like HTML tags but * isn't, scan with the appropriate alternative mode. Looking at the first letter * of the tag name as a pre-check avoids a string allocation when it's not needed. */ $t = $this->html[ $this->tag_name_starts_at ]; if ( ! $this->is_closing_tag && ( 'i' === $t || 'I' === $t || 'n' === $t || 'N' === $t || 's' === $t || 'S' === $t || 't' === $t || 'T' === $t ) ) { $tag_name = $this->get_tag(); if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) { $this->bytes_already_parsed = strlen( $this->html ); return false; } elseif ( ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) && ! $this->skip_rcdata( $tag_name ) ) { $this->bytes_already_parsed = strlen( $this->html ); return false; } elseif ( ( 'IFRAME' === $tag_name || 'NOEMBED' === $tag_name || 'NOFRAMES' === $tag_name || 'NOSCRIPT' === $tag_name || 'STYLE' === $tag_name ) && ! $this->skip_rawtext( $tag_name ) ) { /* * "XMP" should be here too but its rules are more complicated and require the * complexity of the HTML Processor (it needs to close out any open P element, * meaning it can't be skipped here or else the HTML Processor will lose its * place). For now, it can be ignored as it's a rare HTML tag in practice and * any normative HTML should be using PRE instead. */ $this->bytes_already_parsed = strlen( $this->html ); return false; } } } while ( $already_found < $this->sought_match_offset ); return true; } /** * Generator for a foreach loop to step through each class name for the matched tag. * * This generator function is designed to be used inside a "foreach" loop. * * Example: * * $p = new WP_HTML_Tag_Processor( "
" ); * $p->next_tag(); * foreach ( $p->class_list() as $class_name ) { * echo "{$class_name} "; * } * // Outputs: "free lang-en " * * @since 6.4.0 */ public function class_list() { /** @var string $class contains the string value of the class attribute, with character references decoded. */ $class = $this->get_attribute( 'class' ); if ( ! is_string( $class ) ) { return; } $seen = array(); $at = 0; while ( $at < strlen( $class ) ) { // Skip past any initial boundary characters. $at += strspn( $class, " \t\f\r\n", $at ); if ( $at >= strlen( $class ) ) { return; } // Find the byte length until the next boundary. $length = strcspn( $class, " \t\f\r\n", $at ); if ( 0 === $length ) { return; } /* * CSS class names are case-insensitive in the ASCII range. * * @see https://www.w3.org/TR/CSS2/syndata.html#x1 */ $name = strtolower( substr( $class, $at, $length ) ); $at += $length; /* * It's expected that the number of class names for a given tag is relatively small. * Given this, it is probably faster overall to scan an array for a value rather * than to use the class name as a key and check if it's a key of $seen. */ if ( in_array( $name, $seen, true ) ) { continue; } $seen[] = $name; yield $name; } } /** * Returns if a matched tag contains the given ASCII case-insensitive class name. * * @since 6.4.0 * * @param string $wanted_class Look for this CSS class name, ASCII case-insensitive. * @return bool|null Whether the matched tag contains the given class name, or null if not matched. */ public function has_class( $wanted_class ) { if ( ! $this->tag_name_starts_at ) { return null; } $wanted_class = strtolower( $wanted_class ); foreach ( $this->class_list() as $class_name ) { if ( $class_name === $wanted_class ) { return true; } } return false; } /** * Sets a bookmark in the HTML document. * * Bookmarks represent specific places or tokens in the HTML * document, such as a tag opener or closer. When applying * edits to a document, such as setting an attribute, the * text offsets of that token may shift; the bookmark is * kept updated with those shifts and remains stable unless * the entire span of text in which the token sits is removed. * * Release bookmarks when they are no longer needed. * * Example: * *

Surprising fact you may not know!

* ^ ^ * \-|-- this `H2` opener bookmark tracks the token * *

Surprising fact you may no… * ^ ^ * \-|-- it shifts with edits * * Bookmarks provide the ability to seek to a previously-scanned * place in the HTML document. This avoids the need to re-scan * the entire document. * * Example: * *
  • One
  • Two
  • Three
* ^^^^ * want to note this last item * * $p = new WP_HTML_Tag_Processor( $html ); * $in_list = false; * while ( $p->next_tag( array( 'tag_closers' => $in_list ? 'visit' : 'skip' ) ) ) { * if ( 'UL' === $p->get_tag() ) { * if ( $p->is_tag_closer() ) { * $in_list = false; * $p->set_bookmark( 'resume' ); * if ( $p->seek( 'last-li' ) ) { * $p->add_class( 'last-li' ); * } * $p->seek( 'resume' ); * $p->release_bookmark( 'last-li' ); * $p->release_bookmark( 'resume' ); * } else { * $in_list = true; * } * } * * if ( 'LI' === $p->get_tag() ) { * $p->set_bookmark( 'last-li' ); * } * } * * Bookmarks intentionally hide the internal string offsets * to which they refer. They are maintained internally as * updates are applied to the HTML document and therefore * retain their "position" - the location to which they * originally pointed. The inability to use bookmarks with * functions like `substr` is therefore intentional to guard * against accidentally breaking the HTML. * * Because bookmarks allocate memory and require processing * for every applied update, they are limited and require * a name. They should not be created with programmatically-made * names, such as "li_{$index}" with some loop. As a general * rule they should only be created with string-literal names * like "start-of-section" or "last-paragraph". * * Bookmarks are a powerful tool to enable complicated behavior. * Consider double-checking that you need this tool if you are * reaching for it, as inappropriate use could lead to broken * HTML structure or unwanted processing overhead. * * @since 6.2.0 * * @param string $name Identifies this particular bookmark. * @return bool Whether the bookmark was successfully created. */ public function set_bookmark( $name ) { if ( null === $this->tag_name_starts_at ) { return false; } if ( ! array_key_exists( $name, $this->bookmarks ) && count( $this->bookmarks ) >= static::MAX_BOOKMARKS ) { _doing_it_wrong( __METHOD__, __( 'Too many bookmarks: cannot create any more.' ), '6.2.0' ); return false; } $this->bookmarks[ $name ] = new WP_HTML_Span( $this->tag_name_starts_at - ( $this->is_closing_tag ? 2 : 1 ), $this->tag_ends_at ); return true; } /** * Removes a bookmark that is no longer needed. * * Releasing a bookmark frees up the small * performance overhead it requires. * * @param string $name Name of the bookmark to remove. * @return bool Whether the bookmark already existed before removal. */ public function release_bookmark( $name ) { if ( ! array_key_exists( $name, $this->bookmarks ) ) { return false; } unset( $this->bookmarks[ $name ] ); return true; } /** * Skips contents of generic rawtext elements. * * @since 6.3.2 * * @see https://html.spec.whatwg.org/#generic-raw-text-element-parsing-algorithm * * @param string $tag_name The uppercase tag name which will close the RAWTEXT region. * @return bool Whether an end to the RAWTEXT region was found before the end of the document. */ private function skip_rawtext( $tag_name ) { /* * These two functions distinguish themselves on whether character references are * decoded, and since functionality to read the inner markup isn't supported, it's * not necessary to implement these two functions separately. */ return $this->skip_rcdata( $tag_name ); } /** * Skips contents of RCDATA elements, namely title and textarea tags. * * @since 6.2.0 * * @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state * * @param string $tag_name The uppercase tag name which will close the RCDATA region. * @return bool Whether an end to the RCDATA region was found before the end of the document. */ private function skip_rcdata( $tag_name ) { $html = $this->html; $doc_length = strlen( $html ); $tag_length = strlen( $tag_name ); $at = $this->bytes_already_parsed; while ( false !== $at && $at < $doc_length ) { $at = strpos( $this->html, '= $doc_length ) { $this->bytes_already_parsed = $doc_length; return false; } $closer_potentially_starts_at = $at; $at += 2; /* * Find a case-insensitive match to the tag name. * * Because tag names are limited to US-ASCII there is no * need to perform any kind of Unicode normalization when * comparing; any character which could be impacted by such * normalization could not be part of a tag name. */ for ( $i = 0; $i < $tag_length; $i++ ) { $tag_char = $tag_name[ $i ]; $html_char = $html[ $at + $i ]; if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) { $at += $i; continue 2; } } $at += $tag_length; $this->bytes_already_parsed = $at; /* * Ensure that the tag name terminates to avoid matching on * substrings of a longer tag name. For example, the sequence * "' !== $c ) { continue; } while ( $this->parse_next_attribute() ) { continue; } $at = $this->bytes_already_parsed; if ( $at >= strlen( $this->html ) ) { return false; } if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) { $this->bytes_already_parsed = $closer_potentially_starts_at; return true; } } return false; } /** * Skips contents of script tags. * * @since 6.2.0 * * @return bool Whether the script tag was closed before the end of the document. */ private function skip_script_data() { $state = 'unescaped'; $html = $this->html; $doc_length = strlen( $html ); $at = $this->bytes_already_parsed; while ( false !== $at && $at < $doc_length ) { $at += strcspn( $html, '-<', $at ); /* * For all script states a "-->" transitions * back into the normal unescaped script mode, * even if that's the current state. */ if ( $at + 2 < $doc_length && '-' === $html[ $at ] && '-' === $html[ $at + 1 ] && '>' === $html[ $at + 2 ] ) { $at += 3; $state = 'unescaped'; continue; } // Everything of interest past here starts with "<". if ( $at + 1 >= $doc_length || '<' !== $html[ $at++ ] ) { continue; } /* * Unlike with "-->", the " * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state */ if ( strlen( $html ) > $at + 3 && '-' === $html[ $at + 2 ] && '-' === $html[ $at + 3 ] ) { $closer_at = $at + 4; // If it's not possible to close the comment then there is nothing more to scan. if ( strlen( $html ) <= $closer_at ) { return false; } // Abruptly-closed empty comments are a sequence of dashes followed by `>`. $span_of_dashes = strspn( $html, '-', $closer_at ); if ( '>' === $html[ $closer_at + $span_of_dashes ] ) { $at = $closer_at + $span_of_dashes + 1; continue; } /* * Comments may be closed by either a --> or an invalid --!>. * The first occurrence closes the comment. * * See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment */ --$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping. while ( ++$closer_at < strlen( $html ) ) { $closer_at = strpos( $html, '--', $closer_at ); if ( false === $closer_at ) { return false; } if ( $closer_at + 2 < strlen( $html ) && '>' === $html[ $closer_at + 2 ] ) { $at = $closer_at + 3; continue 2; } if ( $closer_at + 3 < strlen( $html ) && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) { $at = $closer_at + 4; continue 2; } } } /* * * The CDATA is case-sensitive. * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state */ if ( strlen( $html ) > $at + 8 && '[' === $html[ $at + 2 ] && 'C' === $html[ $at + 3 ] && 'D' === $html[ $at + 4 ] && 'A' === $html[ $at + 5 ] && 'T' === $html[ $at + 6 ] && 'A' === $html[ $at + 7 ] && '[' === $html[ $at + 8 ] ) { $closer_at = strpos( $html, ']]>', $at + 9 ); if ( false === $closer_at ) { return false; } $at = $closer_at + 3; continue; } /* * * These are ASCII-case-insensitive. * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state */ if ( strlen( $html ) > $at + 8 && ( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) && ( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) && ( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) && ( 'T' === $html[ $at + 5 ] || 't' === $html[ $at + 5 ] ) && ( 'Y' === $html[ $at + 6 ] || 'y' === $html[ $at + 6 ] ) && ( 'P' === $html[ $at + 7 ] || 'p' === $html[ $at + 7 ] ) && ( 'E' === $html[ $at + 8 ] || 'e' === $html[ $at + 8 ] ) ) { $closer_at = strpos( $html, '>', $at + 9 ); if ( false === $closer_at ) { return false; } $at = $closer_at + 1; continue; } /* * Anything else here is an incorrectly-opened comment and transitions * to the bogus comment state - skip to the nearest >. */ $at = strpos( $html, '>', $at + 1 ); continue; } /* * is a missing end tag name, which is ignored. * * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name */ if ( '>' === $html[ $at + 1 ] ) { ++$at; continue; } if ( '?' === $html[ $at + 1 ] ) { $closer_at = strpos( $html, '>', $at + 2 ); if ( false === $closer_at ) { return false; } $at = $closer_at + 1; continue; } /* * If a non-alpha starts the tag name in a tag closer it's a comment. * Find the first `>`, which closes the comment. * * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name */ if ( $this->is_closing_tag ) { $closer_at = strpos( $html, '>', $at + 3 ); if ( false === $closer_at ) { return false; } $at = $closer_at + 1; continue; } ++$at; } return false; } /** * Parses the next attribute. * * @since 6.2.0 * * @return bool Whether an attribute was found before the end of the document. */ private function parse_next_attribute() { // Skip whitespace and slashes. $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed ); if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { return false; } /* * Treat the equal sign as a part of the attribute * name if it is the first encountered byte. * * @see https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state */ $name_length = '=' === $this->html[ $this->bytes_already_parsed ] ? 1 + strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed + 1 ) : strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed ); // No attribute, just tag closer. if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= strlen( $this->html ) ) { return false; } $attribute_start = $this->bytes_already_parsed; $attribute_name = substr( $this->html, $attribute_start, $name_length ); $this->bytes_already_parsed += $name_length; if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { return false; } $this->skip_whitespace(); if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { return false; } $has_value = '=' === $this->html[ $this->bytes_already_parsed ]; if ( $has_value ) { ++$this->bytes_already_parsed; $this->skip_whitespace(); if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { return false; } switch ( $this->html[ $this->bytes_already_parsed ] ) { case "'": case '"': $quote = $this->html[ $this->bytes_already_parsed ]; $value_start = $this->bytes_already_parsed + 1; $value_length = strcspn( $this->html, $quote, $value_start ); $attribute_end = $value_start + $value_length + 1; $this->bytes_already_parsed = $attribute_end; break; default: $value_start = $this->bytes_already_parsed; $value_length = strcspn( $this->html, "> \t\f\r\n", $value_start ); $attribute_end = $value_start + $value_length; $this->bytes_already_parsed = $attribute_end; } } else { $value_start = $this->bytes_already_parsed; $value_length = 0; $attribute_end = $attribute_start + $name_length; } if ( $attribute_end >= strlen( $this->html ) ) { return false; } if ( $this->is_closing_tag ) { return true; } /* * > There must never be two or more attributes on * > the same start tag whose names are an ASCII * > case-insensitive match for each other. * - HTML 5 spec * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive */ $comparable_name = strtolower( $attribute_name ); // If an attribute is listed many times, only use the first declaration and ignore the rest. if ( ! array_key_exists( $comparable_name, $this->attributes ) ) { $this->attributes[ $comparable_name ] = new WP_HTML_Attribute_Token( $attribute_name, $value_start, $value_length, $attribute_start, $attribute_end, ! $has_value ); return true; } /* * Track the duplicate attributes so if we remove it, all disappear together. * * While `$this->duplicated_attributes` could always be stored as an `array()`, * which would simplify the logic here, storing a `null` and only allocating * an array when encountering duplicates avoids needless allocations in the * normative case of parsing tags with no duplicate attributes. */ $duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end ); if ( null === $this->duplicate_attributes ) { $this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) ); } elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) { $this->duplicate_attributes[ $comparable_name ] = array( $duplicate_span ); } else { $this->duplicate_attributes[ $comparable_name ][] = $duplicate_span; } return true; } /** * Move the internal cursor past any immediate successive whitespace. * * @since 6.2.0 */ private function skip_whitespace() { $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n", $this->bytes_already_parsed ); } /** * Applies attribute updates and cleans up once a tag is fully parsed. * * @since 6.2.0 */ private function after_tag() { $this->get_updated_html(); $this->tag_name_starts_at = null; $this->tag_name_length = null; $this->tag_ends_at = null; $this->is_closing_tag = null; $this->attributes = array(); $this->duplicate_attributes = null; } /** * Converts class name updates into tag attributes updates * (they are accumulated in different data formats for performance). * * @since 6.2.0 * * @see WP_HTML_Tag_Processor::$lexical_updates * @see WP_HTML_Tag_Processor::$classname_updates */ private function class_name_updates_to_attributes_updates() { if ( count( $this->classname_updates ) === 0 ) { return; } $existing_class = $this->get_enqueued_attribute_value( 'class' ); if ( null === $existing_class || true === $existing_class ) { $existing_class = ''; } if ( false === $existing_class && isset( $this->attributes['class'] ) ) { $existing_class = substr( $this->html, $this->attributes['class']->value_starts_at, $this->attributes['class']->value_length ); } if ( false === $existing_class ) { $existing_class = ''; } /** * Updated "class" attribute value. * * This is incrementally built while scanning through the existing class * attribute, skipping removed classes on the way, and then appending * added classes at the end. Only when finished processing will the * value contain the final new value. * @var string $class */ $class = ''; /** * Tracks the cursor position in the existing * class attribute value while parsing. * * @var int $at */ $at = 0; /** * Indicates if there's any need to modify the existing class attribute. * * If a call to `add_class()` and `remove_class()` wouldn't impact * the `class` attribute value then there's no need to rebuild it. * For example, when adding a class that's already present or * removing one that isn't. * * This flag enables a performance optimization when none of the enqueued * class updates would impact the `class` attribute; namely, that the * processor can continue without modifying the input document, as if * none of the `add_class()` or `remove_class()` calls had been made. * * This flag is set upon the first change that requires a string update. * * @var bool $modified */ $modified = false; // Remove unwanted classes by only copying the new ones. $existing_class_length = strlen( $existing_class ); while ( $at < $existing_class_length ) { // Skip to the first non-whitespace character. $ws_at = $at; $ws_length = strspn( $existing_class, " \t\f\r\n", $ws_at ); $at += $ws_length; // Capture the class name – it's everything until the next whitespace. $name_length = strcspn( $existing_class, " \t\f\r\n", $at ); if ( 0 === $name_length ) { // If no more class names are found then that's the end. break; } $name = substr( $existing_class, $at, $name_length ); $at += $name_length; // If this class is marked for removal, start processing the next one. $remove_class = ( isset( $this->classname_updates[ $name ] ) && self::REMOVE_CLASS === $this->classname_updates[ $name ] ); // If a class has already been seen then skip it; it should not be added twice. if ( ! $remove_class ) { $this->classname_updates[ $name ] = self::SKIP_CLASS; } if ( $remove_class ) { $modified = true; continue; } /* * Otherwise, append it to the new "class" attribute value. * * There are options for handling whitespace between tags. * Preserving the existing whitespace produces fewer changes * to the HTML content and should clarify the before/after * content when debugging the modified output. * * This approach contrasts normalizing the inter-class * whitespace to a single space, which might appear cleaner * in the output HTML but produce a noisier change. */ $class .= substr( $existing_class, $ws_at, $ws_length ); $class .= $name; } // Add new classes by appending those which haven't already been seen. foreach ( $this->classname_updates as $name => $operation ) { if ( self::ADD_CLASS === $operation ) { $modified = true; $class .= strlen( $class ) > 0 ? ' ' : ''; $class .= $name; } } $this->classname_updates = array(); if ( ! $modified ) { return; } if ( strlen( $class ) > 0 ) { $this->set_attribute( 'class', $class ); } else { $this->remove_attribute( 'class' ); } } /** * Applies attribute updates to HTML document. * * @since 6.2.0 * @since 6.2.1 Accumulates shift for internal cursor and passed pointer. * @since 6.3.0 Invalidate any bookmarks whose targets are overwritten. * * @param int $shift_this_point Accumulate and return shift for this position. * @return int How many bytes the given pointer moved in response to the updates. */ private function apply_attributes_updates( $shift_this_point = 0 ) { if ( ! count( $this->lexical_updates ) ) { return 0; } $accumulated_shift_for_given_point = 0; /* * Attribute updates can be enqueued in any order but updates * to the document must occur in lexical order; that is, each * replacement must be made before all others which follow it * at later string indices in the input document. * * Sorting avoid making out-of-order replacements which * can lead to mangled output, partially-duplicated * attributes, and overwritten attributes. */ usort( $this->lexical_updates, array( self::class, 'sort_start_ascending' ) ); $bytes_already_copied = 0; $output_buffer = ''; foreach ( $this->lexical_updates as $diff ) { $shift = strlen( $diff->text ) - ( $diff->end - $diff->start ); // Adjust the cursor position by however much an update affects it. if ( $diff->start <= $this->bytes_already_parsed ) { $this->bytes_already_parsed += $shift; } // Accumulate shift of the given pointer within this function call. if ( $diff->start <= $shift_this_point ) { $accumulated_shift_for_given_point += $shift; } $output_buffer .= substr( $this->html, $bytes_already_copied, $diff->start - $bytes_already_copied ); $output_buffer .= $diff->text; $bytes_already_copied = $diff->end; } $this->html = $output_buffer . substr( $this->html, $bytes_already_copied ); /* * Adjust bookmark locations to account for how the text * replacements adjust offsets in the input document. */ foreach ( $this->bookmarks as $bookmark_name => $bookmark ) { /* * Each lexical update which appears before the bookmark's endpoints * might shift the offsets for those endpoints. Loop through each change * and accumulate the total shift for each bookmark, then apply that * shift after tallying the full delta. */ $head_delta = 0; $tail_delta = 0; foreach ( $this->lexical_updates as $diff ) { if ( $bookmark->start < $diff->start && $bookmark->end < $diff->start ) { break; } if ( $bookmark->start >= $diff->start && $bookmark->end < $diff->end ) { $this->release_bookmark( $bookmark_name ); continue 2; } $delta = strlen( $diff->text ) - ( $diff->end - $diff->start ); if ( $bookmark->start >= $diff->start ) { $head_delta += $delta; } if ( $bookmark->end >= $diff->end ) { $tail_delta += $delta; } } $bookmark->start += $head_delta; $bookmark->end += $tail_delta; } $this->lexical_updates = array(); return $accumulated_shift_for_given_point; } /** * Checks whether a bookmark with the given name exists. * * @since 6.3.0 * * @param string $bookmark_name Name to identify a bookmark that potentially exists. * @return bool Whether that bookmark exists. */ public function has_bookmark( $bookmark_name ) { return array_key_exists( $bookmark_name, $this->bookmarks ); } /** * Move the internal cursor in the Tag Processor to a given bookmark's location. * * In order to prevent accidental infinite loops, there's a * maximum limit on the number of times seek() can be called. * * @since 6.2.0 * * @param string $bookmark_name Jump to the place in the document identified by this bookmark name. * @return bool Whether the internal cursor was successfully moved to the bookmark's location. */ public function seek( $bookmark_name ) { if ( ! array_key_exists( $bookmark_name, $this->bookmarks ) ) { _doing_it_wrong( __METHOD__, __( 'Unknown bookmark name.' ), '6.2.0' ); return false; } if ( ++$this->seek_count > static::MAX_SEEK_OPS ) { _doing_it_wrong( __METHOD__, __( 'Too many calls to seek() - this can lead to performance issues.' ), '6.2.0' ); return false; } // Flush out any pending updates to the document. $this->get_updated_html(); // Point this tag processor before the sought tag opener and consume it. $this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start; return $this->next_tag( array( 'tag_closers' => 'visit' ) ); } /** * Compare two WP_HTML_Text_Replacement objects. * * @since 6.2.0 * * @param WP_HTML_Text_Replacement $a First attribute update. * @param WP_HTML_Text_Replacement $b Second attribute update. * @return int Comparison value for string order. */ private static function sort_start_ascending( $a, $b ) { $by_start = $a->start - $b->start; if ( 0 !== $by_start ) { return $by_start; } $by_text = isset( $a->text, $b->text ) ? strcmp( $a->text, $b->text ) : 0; if ( 0 !== $by_text ) { return $by_text; } /* * This code should be unreachable, because it implies the two replacements * start at the same location and contain the same text. */ return $a->end - $b->end; } /** * Return the enqueued value for a given attribute, if one exists. * * Enqueued updates can take different data types: * - If an update is enqueued and is boolean, the return will be `true` * - If an update is otherwise enqueued, the return will be the string value of that update. * - If an attribute is enqueued to be removed, the return will be `null` to indicate that. * - If no updates are enqueued, the return will be `false` to differentiate from "removed." * * @since 6.2.0 * * @param string $comparable_name The attribute name in its comparable form. * @return string|boolean|null Value of enqueued update if present, otherwise false. */ private function get_enqueued_attribute_value( $comparable_name ) { if ( ! isset( $this->lexical_updates[ $comparable_name ] ) ) { return false; } $enqueued_text = $this->lexical_updates[ $comparable_name ]->text; // Removed attributes erase the entire span. if ( '' === $enqueued_text ) { return null; } /* * Boolean attribute updates are just the attribute name without a corresponding value. * * This value might differ from the given comparable name in that there could be leading * or trailing whitespace, and that the casing follows the name given in `set_attribute`. * * Example: * * $p->set_attribute( 'data-TEST-id', 'update' ); * 'update' === $p->get_enqueued_attribute_value( 'data-test-id' ); * * Detect this difference based on the absence of the `=`, which _must_ exist in any * attribute containing a value, e.g. ``. * ¹ ² * 1. Attribute with a string value. * 2. Boolean attribute whose value is `true`. */ $equals_at = strpos( $enqueued_text, '=' ); if ( false === $equals_at ) { return true; } /* * Finally, a normal update's value will appear after the `=` and * be double-quoted, as performed incidentally by `set_attribute`. * * e.g. `type="text"` * ¹² ³ * 1. Equals is here. * 2. Double-quoting starts one after the equals sign. * 3. Double-quoting ends at the last character in the update. */ $enqueued_value = substr( $enqueued_text, $equals_at + 2, -1 ); return html_entity_decode( $enqueued_value ); } /** * Returns the value of a requested attribute from a matched tag opener if that attribute exists. * * Example: * * $p = new WP_HTML_Tag_Processor( '
Test
' ); * $p->next_tag( array( 'class_name' => 'test' ) ) === true; * $p->get_attribute( 'data-test-id' ) === '14'; * $p->get_attribute( 'enabled' ) === true; * $p->get_attribute( 'aria-label' ) === null; * * $p->next_tag() === false; * $p->get_attribute( 'class' ) === null; * * @since 6.2.0 * * @param string $name Name of attribute whose value is requested. * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`. */ public function get_attribute( $name ) { if ( null === $this->tag_name_starts_at ) { return null; } $comparable = strtolower( $name ); /* * For every attribute other than `class` it's possible to perform a quick check if * there's an enqueued lexical update whose value takes priority over what's found in * the input document. * * The `class` attribute is special though because of the exposed helpers `add_class` * and `remove_class`. These form a builder for the `class` attribute, so an additional * check for enqueued class changes is required in addition to the check for any enqueued * attribute values. If any exist, those enqueued class changes must first be flushed out * into an attribute value update. */ if ( 'class' === $name ) { $this->class_name_updates_to_attributes_updates(); } // Return any enqueued attribute value updates if they exist. $enqueued_value = $this->get_enqueued_attribute_value( $comparable ); if ( false !== $enqueued_value ) { return $enqueued_value; } if ( ! isset( $this->attributes[ $comparable ] ) ) { return null; } $attribute = $this->attributes[ $comparable ]; /* * This flag distinguishes an attribute with no value * from an attribute with an empty string value. For * unquoted attributes this could look very similar. * It refers to whether an `=` follows the name. * * e.g.
* ¹ ² * 1. Attribute `boolean-attribute` is `true`. * 2. Attribute `empty-attribute` is `""`. */ if ( true === $attribute->is_true ) { return true; } $raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length ); return html_entity_decode( $raw_value ); } /** * Gets lowercase names of all attributes matching a given prefix in the current tag. * * Note that matching is case-insensitive. This is in accordance with the spec: * * > There must never be two or more attributes on * > the same start tag whose names are an ASCII * > case-insensitive match for each other. * - HTML 5 spec * * Example: * * $p = new WP_HTML_Tag_Processor( '
Test
' ); * $p->next_tag( array( 'class_name' => 'test' ) ) === true; * $p->get_attribute_names_with_prefix( 'data-' ) === array( 'data-enabled', 'data-test-id' ); * * $p->next_tag() === false; * $p->get_attribute_names_with_prefix( 'data-' ) === null; * * @since 6.2.0 * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive * * @param string $prefix Prefix of requested attribute names. * @return array|null List of attribute names, or `null` when no tag opener is matched. */ public function get_attribute_names_with_prefix( $prefix ) { if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) { return null; } $comparable = strtolower( $prefix ); $matches = array(); foreach ( array_keys( $this->attributes ) as $attr_name ) { if ( str_starts_with( $attr_name, $comparable ) ) { $matches[] = $attr_name; } } return $matches; } /** * Returns the uppercase name of the matched tag. * * Example: * * $p = new WP_HTML_Tag_Processor( '
Test
' ); * $p->next_tag() === true; * $p->get_tag() === 'DIV'; * * $p->next_tag() === false; * $p->get_tag() === null; * * @since 6.2.0 * * @return string|null Name of currently matched tag in input HTML, or `null` if none found. */ public function get_tag() { if ( null === $this->tag_name_starts_at ) { return null; } $tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); return strtoupper( $tag_name ); } /** * Indicates if the currently matched tag contains the self-closing flag. * * No HTML elements ought to have the self-closing flag and for those, the self-closing * flag will be ignored. For void elements this is benign because they "self close" * automatically. For non-void HTML elements though problems will appear if someone * intends to use a self-closing element in place of that element with an empty body. * For HTML foreign elements and custom elements the self-closing flag determines if * they self-close or not. * * This function does not determine if a tag is self-closing, * but only if the self-closing flag is present in the syntax. * * @since 6.3.0 * * @return bool Whether the currently matched tag contains the self-closing flag. */ public function has_self_closing_flag() { if ( ! $this->tag_name_starts_at ) { return false; } return '/' === $this->html[ $this->tag_ends_at - 1 ]; } /** * Indicates if the current tag token is a tag closer. * * Example: * * $p = new WP_HTML_Tag_Processor( '
' ); * $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) ); * $p->is_tag_closer() === false; * * $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) ); * $p->is_tag_closer() === true; * * @since 6.2.0 * * @return bool Whether the current tag is a tag closer. */ public function is_tag_closer() { return $this->is_closing_tag; } /** * Updates or creates a new attribute on the currently matched tag with the passed value. * * For boolean attributes special handling is provided: * - When `true` is passed as the value, then only the attribute name is added to the tag. * - When `false` is passed, the attribute gets removed if it existed before. * * For string attributes, the value is escaped using the `esc_attr` function. * * @since 6.2.0 * @since 6.2.1 Fix: Only create a single update for multiple calls with case-variant attribute names. * * @param string $name The attribute name to target. * @param string|bool $value The new attribute value. * @return bool Whether an attribute value was set. */ public function set_attribute( $name, $value ) { if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) { return false; } /* * WordPress rejects more characters than are strictly forbidden * in HTML5. This is to prevent additional security risks deeper * in the WordPress and plugin stack. Specifically the * less-than (<) greater-than (>) and ampersand (&) aren't allowed. * * The use of a PCRE match enables looking for specific Unicode * code points without writing a UTF-8 decoder. Whereas scanning * for one-byte characters is trivial (with `strcspn`), scanning * for the longer byte sequences would be more complicated. Given * that this shouldn't be in the hot path for execution, it's a * reasonable compromise in efficiency without introducing a * noticeable impact on the overall system. * * @see https://html.spec.whatwg.org/#attributes-2 * * @TODO as the only regex pattern maybe we should take it out? are * Unicode patterns available broadly in Core? */ if ( preg_match( '~[' . // Syntax-like characters. '"\'>& The values "true" and "false" are not allowed on boolean attributes. * > To represent a false value, the attribute has to be omitted altogether. * - HTML5 spec, https://html.spec.whatwg.org/#boolean-attributes */ if ( false === $value ) { return $this->remove_attribute( $name ); } if ( true === $value ) { $updated_attribute = $name; } else { $escaped_new_value = esc_attr( $value ); $updated_attribute = "{$name}=\"{$escaped_new_value}\""; } /* * > There must never be two or more attributes on * > the same start tag whose names are an ASCII * > case-insensitive match for each other. * - HTML 5 spec * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive */ $comparable_name = strtolower( $name ); if ( isset( $this->attributes[ $comparable_name ] ) ) { /* * Update an existing attribute. * * Example – set attribute id to "new" in
: * *
* ^-------------^ * start end * replacement: `id="new"` * * Result:
*/ $existing_attribute = $this->attributes[ $comparable_name ]; $this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( $existing_attribute->start, $existing_attribute->end, $updated_attribute ); } else { /* * Create a new attribute at the tag's name end. * * Example – add attribute id="new" to
: * *
* ^ * start and end * replacement: ` id="new"` * * Result:
*/ $this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( $this->tag_name_starts_at + $this->tag_name_length, $this->tag_name_starts_at + $this->tag_name_length, ' ' . $updated_attribute ); } /* * Any calls to update the `class` attribute directly should wipe out any * enqueued class changes from `add_class` and `remove_class`. */ if ( 'class' === $comparable_name && ! empty( $this->classname_updates ) ) { $this->classname_updates = array(); } return true; } /** * Remove an attribute from the currently-matched tag. * * @since 6.2.0 * * @param string $name The attribute name to remove. * @return bool Whether an attribute was removed. */ public function remove_attribute( $name ) { if ( $this->is_closing_tag ) { return false; } /* * > There must never be two or more attributes on * > the same start tag whose names are an ASCII * > case-insensitive match for each other. * - HTML 5 spec * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive */ $name = strtolower( $name ); /* * Any calls to update the `class` attribute directly should wipe out any * enqueued class changes from `add_class` and `remove_class`. */ if ( 'class' === $name && count( $this->classname_updates ) !== 0 ) { $this->classname_updates = array(); } /* * If updating an attribute that didn't exist in the input * document, then remove the enqueued update and move on. * * For example, this might occur when calling `remove_attribute()` * after calling `set_attribute()` for the same attribute * and when that attribute wasn't originally present. */ if ( ! isset( $this->attributes[ $name ] ) ) { if ( isset( $this->lexical_updates[ $name ] ) ) { unset( $this->lexical_updates[ $name ] ); } return false; } /* * Removes an existing tag attribute. * * Example – remove the attribute id from
: *
* ^-------------^ * start end * replacement: `` * * Result:
*/ $this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement( $this->attributes[ $name ]->start, $this->attributes[ $name ]->end, '' ); // Removes any duplicated attributes if they were also present. if ( null !== $this->duplicate_attributes && array_key_exists( $name, $this->duplicate_attributes ) ) { foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) { $this->lexical_updates[] = new WP_HTML_Text_Replacement( $attribute_token->start, $attribute_token->end, '' ); } } return true; } /** * Adds a new class name to the currently matched tag. * * @since 6.2.0 * * @param string $class_name The class name to add. * @return bool Whether the class was set to be added. */ public function add_class( $class_name ) { if ( $this->is_closing_tag ) { return false; } if ( null !== $this->tag_name_starts_at ) { $this->classname_updates[ $class_name ] = self::ADD_CLASS; } return true; } /** * Removes a class name from the currently matched tag. * * @since 6.2.0 * * @param string $class_name The class name to remove. * @return bool Whether the class was set to be removed. */ public function remove_class( $class_name ) { if ( $this->is_closing_tag ) { return false; } if ( null !== $this->tag_name_starts_at ) { $this->classname_updates[ $class_name ] = self::REMOVE_CLASS; } return true; } /** * Returns the string representation of the HTML Tag Processor. * * @since 6.2.0 * * @see WP_HTML_Tag_Processor::get_updated_html() * * @return string The processed HTML. */ public function __toString() { return $this->get_updated_html(); } /** * Returns the string representation of the HTML Tag Processor. * * @since 6.2.0 * @since 6.2.1 Shifts the internal cursor corresponding to the applied updates. * @since 6.4.0 No longer calls subclass method `next_tag()` after updating HTML. * * @return string The processed HTML. */ public function get_updated_html() { $requires_no_updating = 0 === count( $this->classname_updates ) && 0 === count( $this->lexical_updates ); /* * When there is nothing more to update and nothing has already been * updated, return the original document and avoid a string copy. */ if ( $requires_no_updating ) { return $this->html; } /* * Keep track of the position right before the current tag. This will * be necessary for reparsing the current tag after updating the HTML. */ $before_current_tag = $this->tag_name_starts_at - 1; /* * 1. Apply the enqueued edits and update all the pointers to reflect those changes. */ $this->class_name_updates_to_attributes_updates(); $before_current_tag += $this->apply_attributes_updates( $before_current_tag ); /* * 2. Rewind to before the current tag and reparse to get updated attributes. * * At this point the internal cursor points to the end of the tag name. * Rewind before the tag name starts so that it's as if the cursor didn't * move; a call to `next_tag()` will reparse the recently-updated attributes * and additional calls to modify the attributes will apply at this same * location, but in order to avoid issues with subclasses that might add * behaviors to `next_tag()`, the internal methods should be called here * instead. * * It's important to note that in this specific place there will be no change * because the processor was already at a tag when this was called and it's * rewinding only to the beginning of this very tag before reprocessing it * and its attributes. * *

Previous HTMLMore HTML

* ↑ │ back up by the length of the tag name plus the opening < * └←─┘ back up by strlen("em") + 1 ==> 3 */ $this->bytes_already_parsed = $before_current_tag; $this->parse_next_tag(); // Reparse the attributes. while ( $this->parse_next_attribute() ) { continue; } $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); $this->tag_ends_at = $tag_ends_at; $this->bytes_already_parsed = $tag_ends_at; return $this->html; } /** * Parses tag query input into internal search criteria. * * @since 6.2.0 * * @param array|string|null $query { * Optional. Which tag name to find, having which class, etc. Default is to find any tag. * * @type string|null $tag_name Which tag to find, or `null` for "any tag." * @type int|null $match_offset Find the Nth tag matching all search criteria. * 1 for "first" tag, 3 for "third," etc. * Defaults to first tag. * @type string|null $class_name Tag must contain this class name to match. * @type string $tag_closers "visit" or "skip": whether to stop on tag closers, e.g.
. * } */ private function parse_query( $query ) { if ( null !== $query && $query === $this->last_query ) { return; } $this->last_query = $query; $this->sought_tag_name = null; $this->sought_class_name = null; $this->sought_match_offset = 1; $this->stop_on_tag_closers = false; // A single string value means "find the tag of this name". if ( is_string( $query ) ) { $this->sought_tag_name = $query; return; } // An empty query parameter applies no restrictions on the search. if ( null === $query ) { return; } // If not using the string interface, an associative array is required. if ( ! is_array( $query ) ) { _doing_it_wrong( __METHOD__, __( 'The query argument must be an array or a tag name.' ), '6.2.0' ); return; } if ( isset( $query['tag_name'] ) && is_string( $query['tag_name'] ) ) { $this->sought_tag_name = $query['tag_name']; } if ( isset( $query['class_name'] ) && is_string( $query['class_name'] ) ) { $this->sought_class_name = $query['class_name']; } if ( isset( $query['match_offset'] ) && is_int( $query['match_offset'] ) && 0 < $query['match_offset'] ) { $this->sought_match_offset = $query['match_offset']; } if ( isset( $query['tag_closers'] ) ) { $this->stop_on_tag_closers = 'visit' === $query['tag_closers']; } } /** * Checks whether a given tag and its attributes match the search criteria. * * @since 6.2.0 * * @return bool Whether the given tag and its attribute match the search criteria. */ private function matches() { if ( $this->is_closing_tag && ! $this->stop_on_tag_closers ) { return false; } // Does the tag name match the requested tag name in a case-insensitive manner? if ( null !== $this->sought_tag_name ) { /* * String (byte) length lookup is fast. If they aren't the * same length then they can't be the same string values. */ if ( strlen( $this->sought_tag_name ) !== $this->tag_name_length ) { return false; } /* * Check each character to determine if they are the same. * Defer calls to `strtoupper()` to avoid them when possible. * Calling `strcasecmp()` here tested slowed than comparing each * character, so unless benchmarks show otherwise, it should * not be used. * * It's expected that most of the time that this runs, a * lower-case tag name will be supplied and the input will * contain lower-case tag names, thus normally bypassing * the case comparison code. */ for ( $i = 0; $i < $this->tag_name_length; $i++ ) { $html_char = $this->html[ $this->tag_name_starts_at + $i ]; $tag_char = $this->sought_tag_name[ $i ]; if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) { return false; } } } if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) { return false; } return true; } }