Merge branch 'master' into AngleSharp_0_10

This commit is contained in:
Michael Ganss
2017-10-11 14:01:32 +02:00
6 changed files with 328 additions and 15 deletions

View File

@@ -7,6 +7,26 @@ using System.ComponentModel;
namespace Ganss.XSS
{
/// <summary>
/// Event data handed to subscribers of the <see cref="HtmlSanitizer.PostProcessDom"/> event.
/// </summary>
public class PostProcessDomEventArgs : EventArgs
{
    /// <summary>
    /// Creates a new <see cref="PostProcessDomEventArgs"/> with no document assigned yet.
    /// </summary>
    public PostProcessDomEventArgs()
    {
    }

    /// <summary>
    /// Gets or sets the document the event is raised for.
    /// </summary>
    /// <value>
    /// The document exposed to the event handlers.
    /// </value>
    public IHtmlDocument Document { get; set; }
}
/// <summary>
/// Provides data for the <see cref="HtmlSanitizer.PostProcessNode"/> event.
/// </summary>
@@ -162,4 +182,34 @@ namespace Ganss.XSS
/// </value>
public IComment Comment { get; set; }
}
/// <summary>
/// Event data handed to subscribers of the <see cref="HtmlSanitizer.RemovingCssClass"/> event.
/// Being a <see cref="CancelEventArgs"/>, it also carries the inherited <c>Cancel</c> flag.
/// </summary>
public class RemovingCssClassEventArgs : CancelEventArgs
{
    /// <summary>
    /// Gets or sets the CSS class that is about to be removed.
    /// </summary>
    /// <value>
    /// The name of the CSS class.
    /// </value>
    public string CssClass { get; set; }

    /// <summary>
    /// Gets or sets the tag whose class attribute contains <see cref="CssClass"/>.
    /// </summary>
    /// <value>
    /// The element carrying the CSS class.
    /// </value>
    public IElement Tag { get; set; }

    /// <summary>
    /// Gets or sets why the CSS class is going to be removed.
    /// </summary>
    /// <value>
    /// The removal reason.
    /// </value>
    public RemoveReason Reason { get; set; }
}
}

View File

@@ -53,12 +53,13 @@ namespace Ganss.XSS
/// Initializes a new instance of the <see cref="HtmlSanitizer"/> class.
/// </summary>
/// <param name="allowedTags">The allowed tag names such as "a" and "div". When <c>null</c>, uses <see cref="DefaultAllowedTags"/></param>
/// <param name="allowedSchemes">The allowed HTTP schemes such as "http" and "https". When <c>null</c>, uses <see cref="DefaultAllowedSchemes"/></param>
/// <param name="allowedSchemes">The allowed HTTP schemes such as "http" and "https". When <c>null</c>, uses <see cref="DefaultAllowedSchemes"/></param>
/// <param name="allowedAttributes">The allowed HTML attributes such as "href" and "alt". When <c>null</c>, uses <see cref="DefaultAllowedAttributes"/></param>
/// <param name="uriAttributes">the HTML attributes that can contain a URI such as "href". When <c>null</c>, uses <see cref="DefaultUriAttributes"/></param>
/// <param name="allowedCssProperties">the allowed CSS properties such as "font" and "margin". When <c>null</c>, uses <see cref="DefaultAllowedCssProperties"/></param>
/// <param name="uriAttributes">The HTML attributes that can contain a URI such as "href". When <c>null</c>, uses <see cref="DefaultUriAttributes"/></param>
/// <param name="allowedCssProperties">The allowed CSS properties such as "font" and "margin". When <c>null</c>, uses <see cref="DefaultAllowedCssProperties"/></param>
/// <param name="allowedCssClasses">CSS class names which are allowed in the value of a class attribute. When <c>null</c>, any class names are allowed.</param>
public HtmlSanitizer(IEnumerable<string> allowedTags = null, IEnumerable<string> allowedSchemes = null,
IEnumerable<string> allowedAttributes = null, IEnumerable<string> uriAttributes = null, IEnumerable<string> allowedCssProperties = null)
IEnumerable<string> allowedAttributes = null, IEnumerable<string> uriAttributes = null, IEnumerable<string> allowedCssProperties = null, IEnumerable<string> allowedCssClasses = null)
{
AllowedTags = new HashSet<string>(allowedTags ?? DefaultAllowedTags, StringComparer.OrdinalIgnoreCase);
AllowedSchemes = new HashSet<string>(allowedSchemes ?? DefaultAllowedSchemes, StringComparer.OrdinalIgnoreCase);
@@ -66,6 +67,7 @@ namespace Ganss.XSS
UriAttributes = new HashSet<string>(uriAttributes ?? DefaultUriAttributes, StringComparer.OrdinalIgnoreCase);
AllowedCssProperties = new HashSet<string>(allowedCssProperties ?? DefaultAllowedCssProperties, StringComparer.OrdinalIgnoreCase);
AllowedAtRules = new HashSet<CssRuleType>(DefaultAllowedAtRules);
AllowedCssClasses = allowedCssClasses != null ? new HashSet<string>(allowedCssClasses) : null;
}
/// <summary>
@@ -282,6 +284,18 @@ namespace Ganss.XSS
set { _disallowedCssPropertyValue = value; }
}
/// <summary>
/// Gets the allowed CSS classes. The property itself can only be assigned from within this class.
/// </summary>
/// <value>
/// The allowed CSS class names, or <c>null</c> when any class name is allowed.
/// The constructor builds this set without a case-insensitive comparer, so class names are matched case-sensitively.
/// </value>
public ISet<string> AllowedCssClasses { get; private set; }
/// <summary>
/// Occurs after sanitizing the document and post processing nodes.
/// </summary>
public event EventHandler<PostProcessDomEventArgs> PostProcessDom;
/// <summary>
/// Occurs for every node after sanitizing.
/// </summary>
@@ -306,6 +320,19 @@ namespace Ganss.XSS
/// Occurs before a comment is removed.
/// </summary>
public event EventHandler<RemovingCommentEventArgs> RemovingComment;
/// <summary>
/// Occurs before a CSS class is removed.
/// </summary>
public event EventHandler<RemovingCssClassEventArgs> RemovingCssClass;
/// <summary>
/// Notifies all <see cref="PostProcessDom"/> subscribers, passing this sanitizer as the sender.
/// Does nothing when no handler is attached.
/// </summary>
/// <param name="e">The <see cref="PostProcessDomEventArgs"/> describing the event.</param>
protected virtual void OnPostProcessDom(PostProcessDomEventArgs e) => PostProcessDom?.Invoke(this, e);
/// <summary>
/// Raises the <see cref="E:PostProcessNode" /> event.
@@ -366,6 +393,15 @@ namespace Ganss.XSS
/// </summary>
public static readonly Regex DefaultDisallowedCssPropertyValue = new Regex(@"[<>]", RegexOptions.Compiled);
/// <summary>
/// Raises the <see cref="E:RemovingCssClass" /> event.
/// </summary>
/// <param name="e">The <see cref="RemovingCssClassEventArgs"/> instance containing the event data.</param>
protected virtual void OnRemovingCssClass(RemovingCssClassEventArgs e)
{
    // Null-conditional invoke: nothing happens when no handler is subscribed.
    RemovingCssClass?.Invoke(this, e);
}
/// <summary>
/// Return all nested subnodes of a node.
/// </summary>
@@ -393,6 +429,20 @@ namespace Ganss.XSS
/// <param name="outputFormatter">The formatter used to render the DOM. Using the <see cref="OutputFormatter"/> if null.</param>
/// <returns>The sanitized HTML body fragment.</returns>
public string Sanitize(string html, string baseUrl = "", IMarkupFormatter outputFormatter = null)
{
    // Sanitize into a DOM, then serialize only the children of <body>,
    // using the instance-level formatter when the caller supplies none.
    return SanitizeDom(html, baseUrl).Body.ChildNodes.ToHtml(outputFormatter ?? OutputFormatter);
}
/// <summary>
/// Sanitizes the specified HTML body fragment. If a document is given, only the body part will be returned.
/// </summary>
/// <param name="html">The HTML body fragment to sanitize.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
/// <returns>The sanitized HTML Document.</returns>
public IHtmlDocument SanitizeDom(string html, string baseUrl = "")
{
var parser = HtmlParserFactory();
var dom = parser.ParseDocument("<html><body></body></html>");
@@ -400,11 +450,9 @@ namespace Ganss.XSS
DoSanitize(dom, dom.Body, baseUrl);
var output = dom.Body.ChildNodes.ToHtml(outputFormatter ?? OutputFormatter);
return output;
return dom;
}
/// <summary>
/// Sanitizes the specified HTML document. Even if only a fragment is given, a whole document will be returned.
/// </summary>
@@ -486,15 +534,35 @@ namespace Ganss.XSS
// sanitize the style attribute
SanitizeStyle(tag, baseUrl);
var checkClasses = AllowedCssClasses != null;
var allowedTags = AllowedCssClasses?.ToArray() ?? new string[0];
// sanitize the value of the attributes
foreach (var attribute in tag.Attributes.ToList())
{
// The '& Javascript include' is a possible method to execute Javascript and can lead to XSS.
// (see https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet#.26_JavaScript_includes)
if (attribute.Value.Contains("&{"))
{
RemoveAttribute(tag, attribute, RemoveReason.NotAllowedValue);
}
else
tag.SetAttribute(attribute.Name, attribute.Value);
{
if (checkClasses && attribute.Name == "class")
{
var removedClasses = tag.ClassList.Except(allowedTags).ToArray();
foreach(var removedClass in removedClasses)
RemoveCssClass(tag, removedClass, RemoveReason.NotAllowedCssClass);
if (!tag.ClassList.Any())
RemoveAttribute(tag, attribute, RemoveReason.ClassAttributeEmpty);
}
else
{
tag.SetAttribute(attribute.Name, attribute.Value);
}
}
}
}
@@ -584,6 +652,12 @@ namespace Ganss.XSS
}
}
}
if (PostProcessDom != null)
{
var e = new PostProcessDomEventArgs { Document = dom };
OnPostProcessDom(e);
}
}
/// <summary>
@@ -831,5 +905,18 @@ namespace Ganss.XSS
OnRemovingAtRule(e);
return !e.Cancel;
}
/// <summary>
/// Removes a CSS class from the class list of a tag, unless a
/// <see cref="RemovingCssClass"/> handler cancels the removal.
/// </summary>
/// <param name="tag">Tag whose class list contains the CSS class</param>
/// <param name="cssClass">CSS class to be removed</param>
/// <param name="reason">Reason why the CSS class is being removed</param>
private void RemoveCssClass(IElement tag, string cssClass, RemoveReason reason)
{
    var e = new RemovingCssClassEventArgs { Tag = tag, CssClass = cssClass, Reason = reason };
    OnRemovingCssClass(e);

    // Subscribers can veto the removal by setting Cancel on the event args.
    if (!e.Cancel)
    {
        tag.ClassList.Remove(cssClass);
    }
}
}
}

View File

@@ -19,9 +19,9 @@
<RepositoryUrl>git://github.com/mganss/HtmlSanitizer</RepositoryUrl>
<PackageTargetFallback Condition=" '$(TargetFramework)' == 'netstandard1.3' ">$(PackageTargetFallback);dotnet</PackageTargetFallback>
<GenerateAssemblyVersionAttribute>false</GenerateAssemblyVersionAttribute>
<AppConfig Condition="'$(TargetFramework)' == 'net40'">app.net40.config</AppConfig>
<AutoUnifyAssemblyReferences Condition="'$(TargetFramework)' == 'net40'">false</AutoUnifyAssemblyReferences>
<RootNamespace>Ganss.XSS</RootNamespace>
<AppConfig Condition="'$(TargetFramework)' == 'net40'">app.net40.config</AppConfig>
<AutoUnifyAssemblyReferences Condition="'$(TargetFramework)' == 'net40'">false</AutoUnifyAssemblyReferences>
<RootNamespace>Ganss.XSS</RootNamespace>
</PropertyGroup>
<ItemGroup>
@@ -53,8 +53,7 @@
</PropertyGroup>
<ItemGroup Condition=" '$(TargetFramework)' == 'netstandard1.3' ">
<PackageReference Include="System.ComponentModel" Version="4.3.0" />
<Reference Include="System.ComponentModel" />
<PackageReference Include="System.ComponentModel" Version="4.3.0" />
</ItemGroup>
</Project>

View File

@@ -1,4 +1,7 @@
using AngleSharp;
using AngleSharp.Dom.Css;
using AngleSharp.Dom.Html;
using AngleSharp.Parser.Html;
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
@@ -11,6 +14,29 @@ namespace Ganss.XSS
/// </summary>
public interface IHtmlSanitizer
{
/// <summary>
/// Gets or sets a value indicating whether to keep child nodes of elements that are removed. Default is <see cref="DefaultKeepChildNodes"/>.
/// </summary>
bool KeepChildNodes { get; set; }
/// <summary>
/// Gets or sets the <see cref="Func{HtmlParser}"/> object that creates the parser used for parsing the input.
/// </summary>
Func<HtmlParser> HtmlParserFactory { get; set; }
/// <summary>
/// Gets or sets the <see cref="IMarkupFormatter"/> object used for generating output. Default is <see cref="DefaultOutputFormatter"/>.
/// </summary>
IMarkupFormatter OutputFormatter { get; set; }
/// <summary>
/// Gets the allowed CSS at-rules such as "@media" and "@font-face".
/// </summary>
/// <value>
/// The allowed CSS at-rules.
/// </value>
ISet<CssRuleType> AllowedAtRules { get; }
/// <summary>
/// Gets or sets the allowed HTTP schemes such as "http" and "https".
/// </summary>
@@ -64,6 +90,18 @@ namespace Ganss.XSS
/// </value>
Regex DisallowCssPropertyValue { get; set; }
/// <summary>
/// Gets the allowed CSS classes.
/// </summary>
/// <value>
/// The allowed CSS classes.
/// </value>
ISet<string> AllowedCssClasses { get; }
/// <summary>
/// Occurs after sanitizing the document and post processing nodes.
/// </summary>
event EventHandler<PostProcessDomEventArgs> PostProcessDom;
/// <summary>
/// Occurs for every node after sanitizing.
/// </summary>
@@ -84,6 +122,21 @@ namespace Ganss.XSS
/// </summary>
event EventHandler<RemovingStyleEventArgs> RemovingStyle;
/// <summary>
/// Occurs before an at-rule is removed.
/// </summary>
event EventHandler<RemovingAtRuleEventArgs> RemovingAtRule;
/// <summary>
/// Occurs before a comment is removed.
/// </summary>
event EventHandler<RemovingCommentEventArgs> RemovingComment;
/// <summary>
/// Occurs before a CSS class is removed.
/// </summary>
event EventHandler<RemovingCssClassEventArgs> RemovingCssClass;
/// <summary>
/// Sanitizes the specified HTML.
/// </summary>
@@ -92,5 +145,22 @@ namespace Ganss.XSS
/// <param name="outputFormatter">The formatter used to render the DOM. Using the default formatter if null.</param>
/// <returns>The sanitized HTML.</returns>
string Sanitize(string html, string baseUrl = "", IMarkupFormatter outputFormatter = null);
/// <summary>
/// Sanitizes the specified HTML body fragment. If a document is given, only the body part will be returned.
/// </summary>
/// <param name="html">The HTML body fragment to sanitize.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
/// <returns>The sanitized HTML Document.</returns>
IHtmlDocument SanitizeDom(string html, string baseUrl = "");
/// <summary>
/// Sanitizes the specified HTML document. Even if only a fragment is given, a whole document will be returned.
/// </summary>
/// <param name="html">The HTML document to sanitize.</param>
/// <param name="baseUrl">The base URL relative URLs are resolved against. No resolution if empty.</param>
/// <param name="outputFormatter">The formatter used to render the DOM. Using the <see cref="OutputFormatter"/> if null.</param>
/// <returns>The sanitized HTML document.</returns>
string SanitizeDocument(string html, string baseUrl = "", IMarkupFormatter outputFormatter = null);
}
}

View File

@@ -25,5 +25,13 @@
/// Value is not allowed or harmful
/// </summary>
NotAllowedValue,
/// <summary>
/// CSS Class is not allowed
/// </summary>
NotAllowedCssClass,
/// <summary>
/// The class attribute is empty
/// </summary>
ClassAttributeEmpty
}
}

View File

@@ -2166,7 +2166,7 @@ rl(javascript:alert(""foo""))'>";
}
[Fact]
public void PostProcessTest()
public void PostProcessNodeTest()
{
var sanitizer = new HtmlSanitizer();
sanitizer.PostProcessNode += (s, e) =>
@@ -2184,6 +2184,22 @@ rl(javascript:alert(""foo""))'>";
Assert.Equal(@"<div class=""test"">Hallo<b>Test</b></div>", sanitized, ignoreCase: true);
}
[Fact]
public void PostProcessDomTest()
{
    // A PostProcessDom handler may still modify the document; its changes must show up in the output.
    var sanitizer = new HtmlSanitizer();
    sanitizer.PostProcessDom += (sender, args) =>
    {
        var paragraph = args.Document.CreateElement("p");
        paragraph.TextContent = "World";
        args.Document.Body.AppendChild(paragraph);
    };

    var actual = sanitizer.Sanitize(@"<div>Hallo</div>");

    Assert.Equal(@"<div>Hallo</div><p>World</p>", actual, ignoreCase: true);
}
[Fact]
public void AutoLinkTest()
{
@@ -2499,6 +2515,44 @@ rl(javascript:alert(""foo""))'>";
Assert.Equal(RemoveReason.NotAllowedTag, actual);
}
[Fact]
public void RemoveEventForNotAllowedCssClass()
{
    // Only "good" is allow-listed, so "bad" has to be reported through RemovingCssClass.
    var sanitizer = new HtmlSanitizer(allowedAttributes: new[] { "class" }, allowedCssClasses: new[] { "good" });
    string observedClass = null;
    RemoveReason? observedReason = null;
    sanitizer.RemovingCssClass += (sender, args) =>
    {
        observedClass = args.CssClass;
        observedReason = args.Reason;
    };

    sanitizer.Sanitize(@"<div class=""good bad"">Test</div>");

    Assert.Equal("bad", observedClass);
    Assert.Equal(RemoveReason.NotAllowedCssClass, observedReason);
}
[Fact]
public void RemoveEventForEmptyClassAttributeAfterClassRemoval()
{
    // Neither "good" nor "bad" is allow-listed, so the class attribute ends up empty
    // and its removal must be reported through RemovingAttribute.
    var sanitizer = new HtmlSanitizer(allowedAttributes: new[] { "class" }, allowedCssClasses: new[] { "other" });
    string observedAttribute = null;
    RemoveReason? observedReason = null;
    sanitizer.RemovingAttribute += (sender, args) =>
    {
        observedAttribute = args.Attribute.Name;
        observedReason = args.Reason;
    };

    sanitizer.Sanitize(@"<div class=""good bad"">Test</div>");

    Assert.Equal("class", observedAttribute);
    Assert.Equal(RemoveReason.ClassAttributeEmpty, observedReason);
}
[Fact]
public void DocumentTest()
{
@@ -2844,9 +2898,31 @@ zqy1QY1kkPOuMvKWvvmFIwClI2393jVVcp91eda4+J+fIYDbfJa7RY5YcNrZhTuV//9k="">
Assert.Equal(0, failures);
}
}
[Fact]
public void AllowAllClassesByDefaultTest()
{
    // Without an explicit class allow-list every class name must survive sanitizing.
    const string input = @"<div class=""good bad"">Test</div>";
    var sanitizer = new HtmlSanitizer(allowedAttributes: new[] { "class" });

    var sanitized = sanitizer.Sanitize(input);

    Assert.Equal(@"<div class=""good bad"">Test</div>", sanitized);
}
[Fact]
public void AllowClassesTest()
{
    // With "good" as the only allowed class, "bad" has to be stripped from the attribute value.
    const string input = @"<div class=""good bad"">Test</div>";
    var sanitizer = new HtmlSanitizer(allowedAttributes: new[] { "class" }, allowedCssClasses: new[] { "good" });

    var sanitized = sanitizer.Sanitize(input);

    Assert.Equal(@"<div class=""good"">Test</div>", sanitized);
}
[Fact]
public void AllowClassesUsingEventTest()
{
var sanitizer = new HtmlSanitizer();
sanitizer.RemovingAttribute += (s, e) =>
@@ -2864,6 +2940,29 @@ zqy1QY1kkPOuMvKWvvmFIwClI2393jVVcp91eda4+J+fIYDbfJa7RY5YcNrZhTuV//9k="">
Assert.Equal(@"<div class=""good"">Test</div>", actual);
}
[Fact]
public void RemoveClassAttributeIfNoAllowedClassesTest()
{
    // An empty allow-list (as opposed to null) means that no class is allowed at all,
    // so the whole class attribute is expected to disappear.
    const string input = @"<div class=""good bad"">Test</div>";
    var sanitizer = new HtmlSanitizer(allowedAttributes: new[] { "class" }, allowedCssClasses: new string[0]);

    var sanitized = sanitizer.Sanitize(input);

    Assert.Equal(@"<div>Test</div>", sanitized);
}
[Fact]
public void RemoveClassAttributeIfEmptyTest()
{
    // None of the classes present is allow-listed; once they are gone the empty attribute must go too.
    const string input = @"<div class=""good bad"">Test</div>";
    var sanitizer = new HtmlSanitizer(allowedAttributes: new[] { "class" }, allowedCssClasses: new[] { "other" });

    var sanitized = sanitizer.Sanitize(input);

    Assert.Equal(@"<div>Test</div>", sanitized);
}
[Fact]
public void TextTest()
{