From 53737d12447634ccf5b100179c2dfb625577c850 Mon Sep 17 00:00:00 2001 From: Inverle Date: Sat, 29 Nov 2025 14:59:29 +0100 Subject: [PATCH 1/6] Add `disallow_uri_protocols()` function for disallowing `javascript:` URIs --- src/Sanitize.php | 61 +++++++++++++++++++++++++++++++++++++++++++++++ src/SimplePie.php | 15 ++++++++++++ 2 files changed, 76 insertions(+) diff --git a/src/Sanitize.php b/src/Sanitize.php index df1176b75..72310bf86 100644 --- a/src/Sanitize.php +++ b/src/Sanitize.php @@ -58,6 +58,8 @@ class Sanitize implements RegistryAware public $allow_data_attr = true; /** @var bool */ public $allow_aria_attr = true; + /** @var string[] */ + public $disallowed_uri_protocols = ['javascript']; /** @var array> */ public $add_attributes = ['audio' => ['preload' => 'none'], 'iframe' => ['sandbox' => 'allow-scripts allow-same-origin'], 'video' => ['preload' => 'none']]; /** @var bool */ @@ -280,6 +282,14 @@ public function allow_aria_attr(bool $allow = true): void $this->allow_aria_attr = $allow; } + /** + * @param string[] $protocols List of protocols to disallow + */ + public function disallow_uri_protocols(array $protocols = ['javascript']): void + { + $this->disallowed_uri_protocols = $protocols; + } + /** * @return void */ @@ -541,6 +551,12 @@ public function sanitize(string $data, int $type, string $base = '') $this->replace_urls($document, $element, $attributes); } + if ($this->disallowed_uri_protocols) { + foreach ($this->disallowed_uri_protocols as $protocol) { + $this->strip_uri_protocol($xpath, $protocol); + } + } + // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags. if ($this->image_handler !== '' && $this->enable_cache) { $images = $document->getElementsByTagName('img'); @@ -742,6 +758,51 @@ protected function enforce_allowed_html_nodes(\DOMNode $element, bool $allow_dat } } + private function extract_protocol(string $uri): string + { + if (!str_contains($uri, ':')) { + return ''; + } + $extracted_protocol = strtolower(preg_replace( + '/[\x01-\x20\s]/', + '', + rawurldecode(explode(':', $uri)[0]) + ) ?? ''); + return $extracted_protocol; + } + + /** + * Remove a disallowed URI protocol + */ + protected function strip_uri_protocol(\DOMXPath $xpath, string $protocol): void + { + $protocol = strtolower($protocol); + $elements = $xpath->query('.//a[@href]|.//iframe[@src]|.//math//*[@href]'); + + if ($elements === false) { + throw new \SimplePie\Exception(sprintf( + '%s(): Possibly malformed expression', + __METHOD__ + ), 1); + } + + foreach ($elements as $element) { + if (!($element instanceof \DOMElement)) { + continue; + } + + $href = $element->getAttribute('href'); + $src = $element->getAttribute('src'); + + if ($element->hasAttribute('href') && $this->extract_protocol($href) === $protocol) { + $element->setAttribute('href', 'unsafe:' . $href); + } + if ($element->hasAttribute('src') && $this->extract_protocol($src) === $protocol) { + $element->setAttribute('src', 'unsafe:' . $src); + } + } + } + /** * @param int-mask-of $type * @return void diff --git a/src/SimplePie.php b/src/SimplePie.php index 38c61e841..3d31bb129 100644 --- a/src/SimplePie.php +++ b/src/SimplePie.php @@ -692,6 +692,13 @@ class SimplePie */ public $allow_aria_attr = true; + /** + * @var string[] Stores array of disallowed URI protocols + * @see SimplePie::disallow_uri_protocols() + * @access private + */ + public $disallowed_uri_protocols = ['javascript']; + /** * @var bool Should we throw exceptions, or use the old-style error property? * @access private @@ -1589,6 +1596,14 @@ public function allow_aria_attr(bool $allow = true): void $this->sanitize->allow_aria_attr($allow); } + /** + * @param string[] $protocols List of protocols to disallow + */ + public function disallow_uri_protocols(array $protocols = ['javascript']): void + { + $this->sanitize->disallow_uri_protocols($protocols); + } + /** * @return void */ From dc32e351120484c653fc77b7e1da3de52575b99f Mon Sep 17 00:00:00 2001 From: Alexandre Alapetite Date: Thu, 11 Jun 2026 21:59:42 +0200 Subject: [PATCH 2/6] Refactor to be lighter --- src/Sanitize.php | 84 +++++++++++---------- src/SimplePie.php | 12 +-- tests/Unit/SanitizeTest.php | 147 ++++++++++++++++++++++++++++++++++++ 3 files changed, 198 insertions(+), 45 deletions(-) diff --git a/src/Sanitize.php b/src/Sanitize.php index ec6953b4f..18139e256 100644 --- a/src/Sanitize.php +++ b/src/Sanitize.php @@ -59,7 +59,7 @@ class Sanitize implements RegistryAware /** @var bool */ public $allow_aria_attr = true; /** @var string[] */ - public $disallowed_uri_protocols = ['javascript']; + public $disallowed_uri_schemes = ['javascript']; /** @var array> */ public $add_attributes = ['audio' => ['preload' => 'none'], 'iframe' => ['sandbox' => 'allow-scripts allow-same-origin'], 'video' => ['preload' => 'none']]; /** @var bool */ @@ -283,11 +283,11 @@ public function allow_aria_attr(bool $allow = true): void } /** - * @param string[] $protocols List of protocols to disallow + * @param string[] $schemes List of URI schemes (protocols) to disallow */ - public function disallow_uri_protocols(array $protocols = ['javascript']): void + public function disallow_uri_schemes(array $schemes = ['javascript']): void { - $this->disallowed_uri_protocols = $protocols; + $this->disallowed_uri_schemes = $schemes; } /** @@ -362,8 +362,8 @@ public function set_output_encoding(string $encoding = 'UTF-8') * containing URLs that need to be resolved relative to the feed * * Defaults to |a|@href, |area|@href, |audio|@src, |blockquote|@cite, - * |del|@cite, |form|@action, |img|@longdesc, |img|@src, |input|@src, - * |ins|@cite, |q|@cite, |source|@src, |video|@src + * |del|@cite, |form|@action, |iframe|@src, |img|@longdesc, |img|@src, + * |input|@src, |ins|@cite, |q|@cite, |source|@src, |video|@src * * @since 1.0 * @param array|null $element_attribute Element/attribute key/value pairs, null for default @@ -379,6 +379,7 @@ public function set_url_replacements(?array $element_attribute = null) 'blockquote' => 'cite', 'del' => 'cite', 'form' => 'action', + 'iframe' => 'src', 'img' => [ 'longdesc', 'src' @@ -545,16 +546,18 @@ public function sanitize(string $data, int $type, string $base = '') } } - // Replace relative URLs + // Replace relative URLs and blocks disallowed URI schemes (protocols) $this->base = $base; foreach ($this->replace_url_attributes as $element => $attributes) { $this->replace_urls($document, $element, $attributes); } - if ($this->disallowed_uri_protocols) { - foreach ($this->disallowed_uri_protocols as $protocol) { - $this->strip_uri_protocol($xpath, $protocol); - } + // MathML and SVG allow href on arbitrary descendants, + // so require a walk of the DOM + if ($this->disallowed_uri_schemes !== [] + && ($document->getElementsByTagName('math')->length > 0 + || $document->getElementsByTagName('svg')->length > 0)) { + $this->block_disallowed_schemes_in_descendants($xpath); } // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags. @@ -673,7 +676,10 @@ public function replace_urls(DOMDocument $document, string $tag, $attributes) if ($element->hasAttribute($attribute)) { $value = $this->registry->call(Misc::class, 'absolutize_url', [$element->getAttribute($attribute), $this->base]); if ($value !== false) { - $value = $this->https_url($value); + // Block disallowed URI protocols (e.g. javascript:), otherwise force HTTPS where applicable + $value = ($this->disallowed_uri_schemes !== [] && !$this->is_allowed_scheme($value)) + ? 'unsafe:' . $value + : $this->https_url($value); $element->setAttribute($attribute, $value); } } @@ -758,47 +764,47 @@ protected function enforce_allowed_html_nodes(\DOMNode $element, bool $allow_dat } } - private function extract_protocol(string $uri): string + private function is_allowed_scheme(string $uri): bool { - if (!str_contains($uri, ':')) { - return ''; + $pos = strpos($uri, ':'); + if ($pos === false) { + return true; } - $extracted_protocol = strtolower(preg_replace( - '/[\x01-\x20\s]/', - '', - rawurldecode(explode(':', $uri)[0]) - ) ?? ''); - return $extracted_protocol; + $scheme = strtolower(substr($uri, 0, $pos)); + if (!ctype_alnum($scheme)) { + return false; + } + return !in_array($scheme, $this->disallowed_uri_schemes, true); } /** - * Remove a disallowed URI protocol + * Block disallowed URI schemes (protocols) on elements carrying an href attribute. + * + * This handles MathML and SVG elements which permit href on arbitrary descendant elements + * For SVG, also checks xlink:href attribute. */ - protected function strip_uri_protocol(\DOMXPath $xpath, string $protocol): void + private function block_disallowed_schemes_in_descendants(\DOMXPath $xpath): void { - $protocol = strtolower($protocol); - $elements = $xpath->query('.//a[@href]|.//iframe[@src]|.//math//*[@href]'); - + // Note: content is parsed via loadHTML(), which does not namespace-process attributes + $elements = $xpath->query('.//*[self::math or self::svg]/descendant-or-self::*[@href or @*[name()="xlink:href"]]'); if ($elements === false) { - throw new \SimplePie\Exception(sprintf( - '%s(): Possibly malformed expression', - __METHOD__ - ), 1); + return; } - foreach ($elements as $element) { if (!($element instanceof \DOMElement)) { continue; } - - $href = $element->getAttribute('href'); - $src = $element->getAttribute('src'); - - if ($element->hasAttribute('href') && $this->extract_protocol($href) === $protocol) { - $element->setAttribute('href', 'unsafe:' . $href); + if ($element->hasAttribute('href')) { + $href = $element->getAttribute('href'); + if (!$this->is_allowed_scheme($href)) { + $element->setAttribute('href', 'unsafe:' . $href); + } } - if ($element->hasAttribute('src') && $this->extract_protocol($src) === $protocol) { - $element->setAttribute('src', 'unsafe:' . $src); + if ($element->hasAttribute('xlink:href')) { + $href = $element->getAttribute('xlink:href'); + if (!$this->is_allowed_scheme($href)) { + $element->setAttribute('xlink:href', 'unsafe:' . $href); + } } } } diff --git a/src/SimplePie.php b/src/SimplePie.php index 69d04e35f..9905f2880 100644 --- a/src/SimplePie.php +++ b/src/SimplePie.php @@ -693,11 +693,11 @@ class SimplePie public $allow_aria_attr = true; /** - * @var string[] Stores array of disallowed URI protocols - * @see SimplePie::disallow_uri_protocols() + * @var string[] Stores array of disallowed URI schemes (protocols) + * @see SimplePie::disallow_uri_schemes() * @access private */ - public $disallowed_uri_protocols = ['javascript']; + public $disallowed_uri_schemes = ['javascript']; /** * @var bool Should we throw exceptions, or use the old-style error property? @@ -1597,11 +1597,11 @@ public function allow_aria_attr(bool $allow = true): void } /** - * @param string[] $protocols List of protocols to disallow + * @param string[] $schemes List of schemes (protocols) to disallow */ - public function disallow_uri_protocols(array $protocols = ['javascript']): void + public function disallow_uri_schemes(array $schemes = ['javascript']): void { - $this->sanitize->disallow_uri_protocols($protocols); + $this->sanitize->disallow_uri_schemes($schemes); } /** diff --git a/tests/Unit/SanitizeTest.php b/tests/Unit/SanitizeTest.php index c22c1e772..82dd2e143 100644 --- a/tests/Unit/SanitizeTest.php +++ b/tests/Unit/SanitizeTest.php @@ -103,4 +103,151 @@ public function testSanitizeURLResolution(string $given, string $expected): void self::assertSame($expected, $sanitize->sanitize($given, SIMPLEPIE_CONSTRUCT_HTML, $base)); } + + /** + * @param string[] $disallowedSchemes List of schemes (protocols) to disallow + * @dataProvider disallowedUriSchemesProvider + */ + public function testDisallowedUriSchemes( + string $input, + string $expected, + array $disallowedSchemes = ['javascript'], + bool $stripIframes = true + ): void { + $sanitize = new Sanitize(); + $sanitize->disallow_uri_schemes($disallowedSchemes); + + if (!$stripIframes) { + $sanitize->strip_htmltags = []; + } + + $sanitize->set_registry(new Registry()); + $base = 'http://example.com/'; + self::assertSame($expected, $sanitize->sanitize($input, \SimplePie\SimplePie::CONSTRUCT_HTML, $base)); + } + + /** + * @return iterable,bool}> + */ + public static function disallowedUriSchemesProvider(): iterable + { + yield 'javascript scheme in href' => [ + 'Click me', + 'Click me', + ['javascript'], + true, + ]; + + yield 'javascript scheme with space in href' => [ + 'Click me', + 'Click me', + ['javascript'], + true, + ]; + + yield 'javascript scheme in iframe src' => [ + '', + '', + ['javascript'], + false, + ]; + + yield 'javascript scheme case insensitive' => [ + 'Click me', + 'Click me', + ['javascript'], + true, + ]; + + yield 'vbscript scheme blocked' => [ + 'Click me', + 'Click me', + ['javascript', 'vbscript', 'data'], + true, + ]; + + yield 'data scheme blocked' => [ + 'Click me', + 'Click me', + ['javascript', 'vbscript', 'data'], + true, + ]; + + yield 'safe http scheme unaffected' => [ + 'HTTP link', + 'HTTP link', + ['javascript'], + true, + ]; + + yield 'safe http scheme with blanks unaffected' => [ + 'HTTP link', + 'HTTP link', + ['javascript'], + true, + ]; + + yield 'safe https scheme unaffected' => [ + 'HTTPS link', + 'HTTPS link', + ['javascript'], + true, + ]; + + yield 'safe mailto scheme unaffected' => [ + 'Email', + 'Email', + ['javascript'], + true, + ]; + + yield 'javascript scheme in form action' => [ + '
', + '
', + ['javascript'], + false, + ]; + + yield 'javascript scheme in blockquote cite' => [ + '
Quote
', + '
Quote
', + ['javascript'], + true, + ]; + + yield 'javascript scheme on mathml descendant href' => [ + 'x', + 'x', + ['javascript'], + true, + ]; + + yield 'javascript scheme on mathml root href' => [ + 'x', + 'x', + ['javascript'], + true, + ]; + + yield 'javascript scheme on svg descendant href' => [ + 'x', + 'x', + ['javascript'], + true, + ]; + + yield 'javascript scheme on svg descendant xlink:href' => [ + 'x', + 'x', + ['javascript'], + true, + ]; + + yield 'safe scheme on svg descendant xlink:href unaffected' => [ + 'x', + 'x', + ['javascript'], + true, + ]; + } } From 122afae195acf9c62331264b27d888e46144bb40 Mon Sep 17 00:00:00 2001 From: Inverle Date: Sat, 13 Jun 2026 16:25:38 +0200 Subject: [PATCH 3/6] Add some more tests --- tests/Unit/SanitizeTest.php | 73 ++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/tests/Unit/SanitizeTest.php b/tests/Unit/SanitizeTest.php index 82dd2e143..9e82da1c6 100644 --- a/tests/Unit/SanitizeTest.php +++ b/tests/Unit/SanitizeTest.php @@ -138,13 +138,12 @@ public static function disallowedUriSchemesProvider(): iterable true, ]; - yield 'javascript scheme with space in href' => [ + yield 'javascript scheme with spaces in href' => [ 'Click me', 'Click me', ['javascript'], true, ]; - yield 'javascript scheme in iframe src' => [ '', '', @@ -159,6 +158,76 @@ public static function disallowedUriSchemesProvider(): iterable true, ]; + yield 'javascript scheme url encoded' => [ + 'Click me', + 'Click me', + ['javascript'], + true, + ]; + + yield 'javascript scheme with scheme colon url encoded' => [ + 'Click me', + 'Click me', + ['javascript'], + true, + ]; + + yield 'javascript scheme encoded with numeric HTML entities' => [ + 'Click me', + 'Click me', + ['javascript'], + true, + ]; + + yield 'javascript scheme encoded with hex entities' => [ + 'Click me', + 'Click me', + ['javascript'], + true, + ]; + + yield 'javascript scheme with scheme colon as numeric HTML entity' => [ + 'Click me', + 'Click me', + ['javascript'], + true, + ]; + + yield 'javascript scheme with scheme colon as hex HTML entity' => [ + 'Click me', + 'Click me', + ['javascript'], + true, + ]; + + yield 'javascript scheme with scheme colon as named HTML entity' => [ + 'Click me', + 'Click me', + ['javascript'], + true, + ]; + + yield 'javascript scheme double encoded with URL encoding inside numeric HTML entities' => [ + 'Click me', + 'Click me', + ['javascript'], + true, + ]; + + yield 'javascript scheme with scheme colon double encoded with URL encoding inside numeric HTML entities' => [ + 'Click me', + 'Click me', + ['javascript'], + true, + ]; + + yield 'javascript scheme with a double slash' => [ + 'Click me', + 'Click me', + ['javascript'], + true, + ]; + yield 'vbscript scheme blocked' => [ 'Click me', 'Click me', From e6399dce072cec04fe70b36d64b1b95a8cb529cc Mon Sep 17 00:00:00 2001 From: Inverle Date: Sat, 13 Jun 2026 16:29:32 +0200 Subject: [PATCH 4/6] Never strip any HTML tags during URI scheme sanitize test --- tests/Unit/SanitizeTest.php | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/tests/Unit/SanitizeTest.php b/tests/Unit/SanitizeTest.php index 9e82da1c6..261b9b154 100644 --- a/tests/Unit/SanitizeTest.php +++ b/tests/Unit/SanitizeTest.php @@ -112,14 +112,10 @@ public function testDisallowedUriSchemes( string $input, string $expected, array $disallowedSchemes = ['javascript'], - bool $stripIframes = true ): void { $sanitize = new Sanitize(); $sanitize->disallow_uri_schemes($disallowedSchemes); - - if (!$stripIframes) { - $sanitize->strip_htmltags = []; - } + $sanitize->strip_htmltags = []; $sanitize->set_registry(new Registry()); $base = 'http://example.com/'; @@ -135,20 +131,17 @@ public static function disallowedUriSchemesProvider(): iterable 'Click me', 'Click me', ['javascript'], - true, ]; yield 'javascript scheme with spaces in href' => [ 'Click me', 'Click me', ['javascript'], - true, ]; yield 'javascript scheme in iframe src' => [ '', '', ['javascript'], - false, ]; yield 'javascript scheme case insensitive' => [ @@ -162,161 +155,138 @@ public static function disallowedUriSchemesProvider(): iterable 'Click me', 'Click me', ['javascript'], - true, ]; yield 'javascript scheme with scheme colon url encoded' => [ 'Click me', 'Click me', ['javascript'], - true, ]; yield 'javascript scheme encoded with numeric HTML entities' => [ 'Click me', 'Click me', ['javascript'], - true, ]; yield 'javascript scheme encoded with hex entities' => [ 'Click me', 'Click me', ['javascript'], - true, ]; yield 'javascript scheme with scheme colon as numeric HTML entity' => [ 'Click me', 'Click me', ['javascript'], - true, ]; yield 'javascript scheme with scheme colon as hex HTML entity' => [ 'Click me', 'Click me', ['javascript'], - true, ]; yield 'javascript scheme with scheme colon as named HTML entity' => [ 'Click me', 'Click me', ['javascript'], - true, ]; yield 'javascript scheme double encoded with URL encoding inside numeric HTML entities' => [ 'Click me', 'Click me', ['javascript'], - true, ]; yield 'javascript scheme with scheme colon double encoded with URL encoding inside numeric HTML entities' => [ 'Click me', 'Click me', ['javascript'], - true, ]; yield 'javascript scheme with a double slash' => [ 'Click me', 'Click me', ['javascript'], - true, ]; yield 'vbscript scheme blocked' => [ 'Click me', 'Click me', ['javascript', 'vbscript', 'data'], - true, ]; yield 'data scheme blocked' => [ 'Click me', 'Click me', ['javascript', 'vbscript', 'data'], - true, ]; yield 'safe http scheme unaffected' => [ 'HTTP link', 'HTTP link', ['javascript'], - true, ]; yield 'safe http scheme with blanks unaffected' => [ 'HTTP link', 'HTTP link', ['javascript'], - true, ]; yield 'safe https scheme unaffected' => [ 'HTTPS link', 'HTTPS link', ['javascript'], - true, ]; yield 'safe mailto scheme unaffected' => [ 'Email', 'Email', ['javascript'], - true, ]; yield 'javascript scheme in form action' => [ '
', '
', ['javascript'], - false, ]; yield 'javascript scheme in blockquote cite' => [ '
Quote
', '
Quote
', ['javascript'], - true, ]; yield 'javascript scheme on mathml descendant href' => [ 'x', 'x', ['javascript'], - true, ]; yield 'javascript scheme on mathml root href' => [ 'x', 'x', ['javascript'], - true, ]; yield 'javascript scheme on svg descendant href' => [ 'x', 'x', ['javascript'], - true, ]; yield 'javascript scheme on svg descendant xlink:href' => [ 'x', 'x', ['javascript'], - true, ]; yield 'safe scheme on svg descendant xlink:href unaffected' => [ 'x', 'x', ['javascript'], - true, ]; } } From 8d418d1968a5dbc211fb2a71469d71e37a72fb9e Mon Sep 17 00:00:00 2001 From: Inverle Date: Sat, 13 Jun 2026 16:46:47 +0200 Subject: [PATCH 5/6] Fix PHPStan type --- tests/Unit/SanitizeTest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Unit/SanitizeTest.php b/tests/Unit/SanitizeTest.php index 261b9b154..017033acf 100644 --- a/tests/Unit/SanitizeTest.php +++ b/tests/Unit/SanitizeTest.php @@ -123,7 +123,7 @@ public function testDisallowedUriSchemes( } /** - * @return iterable,bool}> + * @return iterable}> */ public static function disallowedUriSchemesProvider(): iterable { From 524d0a513b53fce41d40792d8487c47a7fbf6b07 Mon Sep 17 00:00:00 2001 From: Inverle Date: Sat, 13 Jun 2026 16:48:59 +0200 Subject: [PATCH 6/6] Fix PHP 7 parsing errors --- tests/Unit/SanitizeTest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/Unit/SanitizeTest.php b/tests/Unit/SanitizeTest.php index 017033acf..7d94d167b 100644 --- a/tests/Unit/SanitizeTest.php +++ b/tests/Unit/SanitizeTest.php @@ -111,7 +111,7 @@ public function testSanitizeURLResolution(string $given, string $expected): void public function testDisallowedUriSchemes( string $input, string $expected, - array $disallowedSchemes = ['javascript'], + array $disallowedSchemes = ['javascript'] ): void { $sanitize = new Sanitize(); $sanitize->disallow_uri_schemes($disallowedSchemes);