// SPDX-FileCopyrightText: 2004-2023 Ryan Parman, Sam Sneddon, Ryan McCue
// SPDX-License-Identifier: BSD-3-Clause
* This implements HTML5 as of revision 967 (2007-06-28)
* @deprecated Use DOMDocument instead!
class SimplePie_Decode_HTML_Entities
* Currently consumed bytes
* Position of the current byte being parsed
* Create an instance of the class with the input data
* @param string $data Input data
public function __construct(string $data)
* @return string Output data
// Search for the next '&' at or after the current offset. strpos() returns
// false once no further '&' exists, which terminates the loop.
while (($position = strpos($this->data, '&', $this->position)) !== false) {
// Move the cursor onto the '&' that was just located.
$this->position = $position;
* @return string|false The next byte, or false if there is no more data
// Read one byte from the input at the current position.
public function consume()
// isset() on a string offset is false once the position is past the last byte.
if (isset($this->data[$this->position])) {
// Append the byte to the buffer of consumed bytes.
$this->consumed .= $this->data[$this->position];
// Return that byte; the post-increment then advances the position by one.
return $this->data[$this->position++];
* Consume a range of characters
* @param string $chars Characters to consume
* @return string|false A series of characters that match the range, or false
// Consume the leading run of bytes, starting at the current position, that
// all belong to the set given in $chars.
public function consume_range(string $chars)
// strspn() gives the length of that run. A length of 0 is falsy, so this
// branch is skipped when the byte at the current position is not in $chars.
if ($len = strspn($this->data, $chars, $this->position)) {
// Copy the matching run out of the input.
$data = substr($this->data, $this->position, $len);
// Record the run in the buffer of consumed bytes.
$this->consumed .= $data;
// Step back by one byte in the buffer of consumed bytes.
// NOTE(review): only the buffer trim is visible here; confirm that the
// position is rewound by one as well.
public function unconsume()
// A negative length of -1 keeps everything except the final byte.
$this->consumed = substr($this->consumed, 0, -1);
// Branch on the next consumed byte (the case labels are not visible here).
switch ($this->consume()) {
// Branch again on the byte after that.
// NOTE(review): presumably this distinguishes hexadecimal from decimal
// references; confirm against the case labels.
switch ($this->consume()) {
// Set of bytes accepted as hexadecimal digits (both letter cases).
$range = '0123456789ABCDEFabcdef';
// Consume the run of digits that forms the code point.
// NOTE(review): consume_range() is documented as returning string|false and
// this is a truthiness test. A run consisting of the single digit "0" is
// falsy in PHP, so such a reference would skip this branch while "00" would
// not. Consider comparing against false explicitly.
if ($codepoint = $this->consume_range($range)) {
// Code points that are replaced by a fixed byte sequence instead of their
// own UTF-8 encoding: 0x0D maps to a line feed, and 0x80-0x9F map to
// multi-byte sequences. Several entries map to "\xEF\xBF\xBD".
// Declared static, so the array is initialised once and then reused.
static $windows_1252_specials = [0x0D => "\x0A", 0x80 => "\xE2\x82\xAC", 0x81 => "\xEF\xBF\xBD", 0x82 => "\xE2\x80\x9A", 0x83 => "\xC6\x92", 0x84 => "\xE2\x80\x9E", 0x85 => "\xE2\x80\xA6", 0x86 => "\xE2\x80\xA0", 0x87 => "\xE2\x80\xA1", 0x88 => "\xCB\x86", 0x89 => "\xE2\x80\xB0", 0x8A => "\xC5\xA0", 0x8B => "\xE2\x80\xB9", 0x8C => "\xC5\x92", 0x8D => "\xEF\xBF\xBD", 0x8E => "\xC5\xBD", 0x8F => "\xEF\xBF\xBD", 0x90 => "\xEF\xBF\xBD", 0x91 => "\xE2\x80\x98", 0x92 => "\xE2\x80\x99", 0x93 => "\xE2\x80\x9C", 0x94 => "\xE2\x80\x9D", 0x95 => "\xE2\x80\xA2", 0x96 => "\xE2\x80\x93", 0x97 => "\xE2\x80\x94", 0x98 => "\xCB\x9C", 0x99 => "\xE2\x84\xA2", 0x9A => "\xC5\xA1", 0x9B => "\xE2\x80\xBA", 0x9C => "\xC5\x93", 0x9D => "\xEF\xBF\xBD", 0x9E => "\xC5\xBE", 0x9F => "\xC5\xB8"];
// Cap to PHP_INT_MAX to ensure consistent behaviour if $codepoint is so large
// it cannot fit into int – just casting float to int might return junk (e.g. a negative number).
// If it is so large, `Misc::codepoint_to_utf8` will just return a replacement character.
// NOTE(review): this assignment and the cast below both write $codepoint;
// presumably they sit in separate hex/decimal branches that are not visible
// here. Confirm.
$codepoint = (int) min(hexdec($codepoint), \PHP_INT_MAX);
// Casting string to int caps at PHP_INT_MAX automatically.
$codepoint = (int) $codepoint;
// Prefer the fixed replacement when the table has an entry for this code point.
if (isset($windows_1252_specials[$codepoint])) {
$replacement = $windows_1252_specials[$codepoint];
// Otherwise encode the code point as UTF-8 via the shared helper.
// NOTE(review): presumably an else separates this from the assignment above;
// the branch structure is not visible here.
$replacement = SimplePie_Misc::codepoint_to_utf8($codepoint);
// Consume one more byte and test whether it is something other than the
// terminating ';' or the end of input (consume() returns false there).
// Strict comparison keeps false from matching other falsy bytes such as "0".
if (!in_array($this->consume(), [';', false], true)) {
// Byte length of everything consumed so far for this reference.
$consumed_length = strlen($this->consumed);
// Overwrite the consumed span, which ends at the current position, with the
// replacement bytes.
$this->data = substr_replace($this->data, $replacement, $this->position - $consumed_length, $consumed_length);
// Shift the position by the length difference so that it points just past
// the inserted replacement.
$this->position += strlen($replacement) - $consumed_length;
'alefsym;' => "\xE2\x84\xB5",
'and;' => "\xE2\x88\xA7",
'ang;' => "\xE2\x88\xA0",
'asymp;' => "\xE2\x89\x88",
'bdquo;' => "\xE2\x80\x9E",
'bull;' => "\xE2\x80\xA2",
'cap;' => "\xE2\x88\xA9",
'clubs;' => "\xE2\x99\xA3",
'cong;' => "\xE2\x89\x85",
'crarr;' => "\xE2\x86\xB5",
'cup;' => "\xE2\x88\xAA",
'Dagger;' => "\xE2\x80\xA1",
'dagger;' => "\xE2\x80\xA0",
'dArr;' => "\xE2\x87\x93",
'darr;' => "\xE2\x86\x93",
'diams;' => "\xE2\x99\xA6",
'empty;' => "\xE2\x88\x85",
'emsp;' => "\xE2\x80\x83",
'ensp;' => "\xE2\x80\x82",
'Epsilon;' => "\xCE\x95",
'epsilon;' => "\xCE\xB5",
'equiv;' => "\xE2\x89\xA1",
'euro;' => "\xE2\x82\xAC",
'exist;' => "\xE2\x88\x83",
'forall;' => "\xE2\x88\x80",
'frasl;' => "\xE2\x81\x84",
'hArr;' => "\xE2\x87\x94",
'harr;' => "\xE2\x86\x94",
'hearts;' => "\xE2\x99\xA5",
'hellip;' => "\xE2\x80\xA6",
'image;' => "\xE2\x84\x91",
'infin;' => "\xE2\x88\x9E",
'int;' => "\xE2\x88\xAB",
'isin;' => "\xE2\x88\x88",
'lang;' => "\xE3\x80\x88",
'lArr;' => "\xE2\x87\x90",
'larr;' => "\xE2\x86\x90",
'lceil;' => "\xE2\x8C\x88",
'ldquo;' => "\xE2\x80\x9C",
'lfloor;' => "\xE2\x8C\x8A",
'lowast;' => "\xE2\x88\x97",
'loz;' => "\xE2\x97\x8A",
'lrm;' => "\xE2\x80\x8E",
'lsaquo;' => "\xE2\x80\xB9",
'lsquo;' => "\xE2\x80\x98",
'mdash;' => "\xE2\x80\x94",
'minus;' => "\xE2\x88\x92",
'nabla;' => "\xE2\x88\x87",
'ndash;' => "\xE2\x80\x93",
'notin;' => "\xE2\x88\x89",
'nsub;' => "\xE2\x8A\x84",
'oline;' => "\xE2\x80\xBE",
'Omicron;' => "\xCE\x9F",
'omicron;' => "\xCE\xBF",
'oplus;' => "\xE2\x8A\x95",
'otimes;' => "\xE2\x8A\x97",
'part;' => "\xE2\x88\x82",
'permil;' => "\xE2\x80\xB0",
'perp;' => "\xE2\x8A\xA5",
'Prime;' => "\xE2\x80\xB3",
'prime;' => "\xE2\x80\xB2",
'prod;' => "\xE2\x88\x8F",
'prop;' => "\xE2\x88\x9D",
'radic;' => "\xE2\x88\x9A",
'rang;' => "\xE3\x80\x89",
'rArr;' => "\xE2\x87\x92",
'rarr;' => "\xE2\x86\x92",
'rceil;' => "\xE2\x8C\x89",
'rdquo;' => "\xE2\x80\x9D",
'real;' => "\xE2\x84\x9C",
'rfloor;' => "\xE2\x8C\x8B",
'rlm;' => "\xE2\x80\x8F",
'rsaquo;' => "\xE2\x80\xBA",
'rsquo;' => "\xE2\x80\x99",
'sbquo;' => "\xE2\x80\x9A",
'sdot;' => "\xE2\x8B\x85",
'sim;' => "\xE2\x88\xBC",
'spades;' => "\xE2\x99\xA0",
'sub;' => "\xE2\x8A\x82",
'sube;' => "\xE2\x8A\x86",
'sum;' => "\xE2\x88\x91",
'sup;' => "\xE2\x8A\x83",
'supe;' => "\xE2\x8A\x87",
'there4;' => "\xE2\x88\xB4",
'thetasym;' => "\xCF\x91",
'thinsp;' => "\xE2\x80\x89",