diff --git a/src/Sanitize.php b/src/Sanitize.php index dbcc6310e..18139e256 100644 --- a/src/Sanitize.php +++ b/src/Sanitize.php @@ -58,6 +58,8 @@ class Sanitize implements RegistryAware public $allow_data_attr = true; /** @var bool */ public $allow_aria_attr = true; + /** @var string[] */ + public $disallowed_uri_schemes = ['javascript']; /** @var array> */ public $add_attributes = ['audio' => ['preload' => 'none'], 'iframe' => ['sandbox' => 'allow-scripts allow-same-origin'], 'video' => ['preload' => 'none']]; /** @var bool */ @@ -280,6 +282,14 @@ public function allow_aria_attr(bool $allow = true): void $this->allow_aria_attr = $allow; } + /** + * @param string[] $schemes List of URI schemes (protocols) to disallow + */ + public function disallow_uri_schemes(array $schemes = ['javascript']): void + { + $this->disallowed_uri_schemes = $schemes; + } + /** * @return void */ @@ -352,8 +362,8 @@ public function set_output_encoding(string $encoding = 'UTF-8') * containing URLs that need to be resolved relative to the feed * * Defaults to |a|@href, |area|@href, |audio|@src, |blockquote|@cite, - * |del|@cite, |form|@action, |img|@longdesc, |img|@src, |input|@src, - * |ins|@cite, |q|@cite, |source|@src, |video|@src + * |del|@cite, |form|@action, |iframe|@src, |img|@longdesc, |img|@src, + * |input|@src, |ins|@cite, |q|@cite, |source|@src, |video|@src * * @since 1.0 * @param array|null $element_attribute Element/attribute key/value pairs, null for default @@ -369,6 +379,7 @@ public function set_url_replacements(?array $element_attribute = null) 'blockquote' => 'cite', 'del' => 'cite', 'form' => 'action', + 'iframe' => 'src', 'img' => [ 'longdesc', 'src' @@ -535,12 +546,20 @@ public function sanitize(string $data, int $type, string $base = '') } } - // Replace relative URLs + // Replace relative URLs and blocks disallowed URI schemes (protocols) $this->base = $base; foreach ($this->replace_url_attributes as $element => $attributes) { $this->replace_urls($document, $element, $attributes); } + // MathML and SVG allow href on arbitrary descendants, + // so require a walk of the DOM + if ($this->disallowed_uri_schemes !== [] + && ($document->getElementsByTagName('math')->length > 0 + || $document->getElementsByTagName('svg')->length > 0)) { + $this->block_disallowed_schemes_in_descendants($xpath); + } + // If image handling (caching, etc.) is enabled, cache and rewrite all the image tags. if ($this->image_handler !== '' && $this->enable_cache) { $images = $document->getElementsByTagName('img'); @@ -657,7 +676,10 @@ public function replace_urls(DOMDocument $document, string $tag, $attributes) if ($element->hasAttribute($attribute)) { $value = $this->registry->call(Misc::class, 'absolutize_url', [$element->getAttribute($attribute), $this->base]); if ($value !== false) { - $value = $this->https_url($value); + // Block disallowed URI protocols (e.g. javascript:), otherwise force HTTPS where applicable + $value = ($this->disallowed_uri_schemes !== [] && !$this->is_allowed_scheme($value)) + ? 'unsafe:' . $value + : $this->https_url($value); $element->setAttribute($attribute, $value); } } @@ -742,6 +764,51 @@ protected function enforce_allowed_html_nodes(\DOMNode $element, bool $allow_dat } } + private function is_allowed_scheme(string $uri): bool + { + $pos = strpos($uri, ':'); + if ($pos === false) { + return true; + } + $scheme = strtolower(substr($uri, 0, $pos)); + if (!ctype_alnum($scheme)) { + return false; + } + return !in_array($scheme, $this->disallowed_uri_schemes, true); + } + + /** + * Block disallowed URI schemes (protocols) on elements carrying an href attribute. + * + * This handles MathML and SVG elements which permit href on arbitrary descendant elements + * For SVG, also checks xlink:href attribute. + */ + private function block_disallowed_schemes_in_descendants(\DOMXPath $xpath): void + { + // Note: content is parsed via loadHTML(), which does not namespace-process attributes + $elements = $xpath->query('.//*[self::math or self::svg]/descendant-or-self::*[@href or @*[name()="xlink:href"]]'); + if ($elements === false) { + return; + } + foreach ($elements as $element) { + if (!($element instanceof \DOMElement)) { + continue; + } + if ($element->hasAttribute('href')) { + $href = $element->getAttribute('href'); + if (!$this->is_allowed_scheme($href)) { + $element->setAttribute('href', 'unsafe:' . $href); + } + } + if ($element->hasAttribute('xlink:href')) { + $href = $element->getAttribute('xlink:href'); + if (!$this->is_allowed_scheme($href)) { + $element->setAttribute('xlink:href', 'unsafe:' . $href); + } + } + } + } + /** * @param int-mask-of $type * @return void diff --git a/src/SimplePie.php b/src/SimplePie.php index aa9d2782f..9905f2880 100644 --- a/src/SimplePie.php +++ b/src/SimplePie.php @@ -692,6 +692,13 @@ class SimplePie */ public $allow_aria_attr = true; + /** + * @var string[] Stores array of disallowed URI schemes (protocols) + * @see SimplePie::disallow_uri_schemes() + * @access private + */ + public $disallowed_uri_schemes = ['javascript']; + /** * @var bool Should we throw exceptions, or use the old-style error property? * @access private @@ -1589,6 +1596,14 @@ public function allow_aria_attr(bool $allow = true): void $this->sanitize->allow_aria_attr($allow); } + /** + * @param string[] $schemes List of schemes (protocols) to disallow + */ + public function disallow_uri_schemes(array $schemes = ['javascript']): void + { + $this->sanitize->disallow_uri_schemes($schemes); + } + /** * @return void */ diff --git a/tests/Unit/SanitizeTest.php b/tests/Unit/SanitizeTest.php index c22c1e772..7d94d167b 100644 --- a/tests/Unit/SanitizeTest.php +++ b/tests/Unit/SanitizeTest.php @@ -103,4 +103,190 @@ public function testSanitizeURLResolution(string $given, string $expected): void self::assertSame($expected, $sanitize->sanitize($given, SIMPLEPIE_CONSTRUCT_HTML, $base)); } + + /** + * @param string[] $disallowedSchemes List of schemes (protocols) to disallow + * @dataProvider disallowedUriSchemesProvider + */ + public function testDisallowedUriSchemes( + string $input, + string $expected, + array $disallowedSchemes = ['javascript'] + ): void { + $sanitize = new Sanitize(); + $sanitize->disallow_uri_schemes($disallowedSchemes); + $sanitize->strip_htmltags = []; + + $sanitize->set_registry(new Registry()); + $base = 'http://example.com/'; + self::assertSame($expected, $sanitize->sanitize($input, \SimplePie\SimplePie::CONSTRUCT_HTML, $base)); + } + + /** + * @return iterable}> + */ + public static function disallowedUriSchemesProvider(): iterable + { + yield 'javascript scheme in href' => [ + 'Click me', + 'Click me', + ['javascript'], + ]; + + yield 'javascript scheme with spaces in href' => [ + 'Click me', + 'Click me', + ['javascript'], + ]; + yield 'javascript scheme in iframe src' => [ + '', + '', + ['javascript'], + ]; + + yield 'javascript scheme case insensitive' => [ + 'Click me', + 'Click me', + ['javascript'], + true, + ]; + + yield 'javascript scheme url encoded' => [ + 'Click me', + 'Click me', + ['javascript'], + ]; + + yield 'javascript scheme with scheme colon url encoded' => [ + 'Click me', + 'Click me', + ['javascript'], + ]; + + yield 'javascript scheme encoded with numeric HTML entities' => [ + 'Click me', + 'Click me', + ['javascript'], + ]; + + yield 'javascript scheme encoded with hex entities' => [ + 'Click me', + 'Click me', + ['javascript'], + ]; + + yield 'javascript scheme with scheme colon as numeric HTML entity' => [ + 'Click me', + 'Click me', + ['javascript'], + ]; + + yield 'javascript scheme with scheme colon as hex HTML entity' => [ + 'Click me', + 'Click me', + ['javascript'], + ]; + + yield 'javascript scheme with scheme colon as named HTML entity' => [ + 'Click me', + 'Click me', + ['javascript'], + ]; + + yield 'javascript scheme double encoded with URL encoding inside numeric HTML entities' => [ + 'Click me', + 'Click me', + ['javascript'], + ]; + + yield 'javascript scheme with scheme colon double encoded with URL encoding inside numeric HTML entities' => [ + 'Click me', + 'Click me', + ['javascript'], + ]; + + yield 'javascript scheme with a double slash' => [ + 'Click me', + 'Click me', + ['javascript'], + ]; + + yield 'vbscript scheme blocked' => [ + 'Click me', + 'Click me', + ['javascript', 'vbscript', 'data'], + ]; + + yield 'data scheme blocked' => [ + 'Click me', + 'Click me', + ['javascript', 'vbscript', 'data'], + ]; + + yield 'safe http scheme unaffected' => [ + 'HTTP link', + 'HTTP link', + ['javascript'], + ]; + + yield 'safe http scheme with blanks unaffected' => [ + 'HTTP link', + 'HTTP link', + ['javascript'], + ]; + + yield 'safe https scheme unaffected' => [ + 'HTTPS link', + 'HTTPS link', + ['javascript'], + ]; + + yield 'safe mailto scheme unaffected' => [ + 'Email', + 'Email', + ['javascript'], + ]; + + yield 'javascript scheme in form action' => [ + '
', + '
', + ['javascript'], + ]; + + yield 'javascript scheme in blockquote cite' => [ + '
Quote
', + '
Quote
', + ['javascript'], + ]; + + yield 'javascript scheme on mathml descendant href' => [ + 'x', + 'x', + ['javascript'], + ]; + + yield 'javascript scheme on mathml root href' => [ + 'x', + 'x', + ['javascript'], + ]; + + yield 'javascript scheme on svg descendant href' => [ + 'x', + 'x', + ['javascript'], + ]; + + yield 'javascript scheme on svg descendant xlink:href' => [ + 'x', + 'x', + ['javascript'], + ]; + + yield 'safe scheme on svg descendant xlink:href unaffected' => [ + 'x', + 'x', + ['javascript'], + ]; + } }