Add SanitizeDocument overload that takes a Stream

Fixes #158
This commit is contained in:
Michael Ganss
2019-01-27 17:02:38 +01:00
parent dfd5e9d237
commit 11b271631e
3 changed files with 64 additions and 14 deletions

View File

@@ -9,6 +9,7 @@ using AngleSharp.Parser.Html;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
@@ -490,6 +491,27 @@ namespace Ganss.XSS
}
}
/// <summary>
/// Sanitizes an HTML document read from a stream. The result is always a complete document, even when the input contains only a fragment.
/// </summary>
/// <param name="html">The stream containing the HTML document to sanitize.</param>
/// <param name="baseUrl">The base URL that relative URLs are resolved against. No resolution takes place if empty.</param>
/// <param name="outputFormatter">The formatter used to render the DOM. Falls back to <see cref="OutputFormatter"/> if null.</param>
/// <returns>The sanitized HTML document.</returns>
public string SanitizeDocument(Stream html, string baseUrl = "", IMarkupFormatter outputFormatter = null)
{
    // The parsed document is disposed as soon as the markup has been rendered.
    using (var document = HtmlParserFactory().Parse(html))
    {
        // Sanitization starts at the root element of the parsed document.
        DoSanitize(document, document.DocumentElement, baseUrl);

        // The formatter is resolved only after sanitization has run.
        return document.ToHtml(outputFormatter ?? OutputFormatter);
    }
}
/// <summary>
/// Creates an instance of <see cref="HtmlParser"/>.
/// </summary>

View File

@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>netcoreapp2.1;netcoreapp2.0;net452</TargetFrameworks>
<TargetFrameworks>netcoreapp2.1;netcoreapp2.0;net46</TargetFrameworks>
<AssemblyName>HtmlSanitizer.Tests</AssemblyName>
<PackageId>HtmlSanitizer.Tests</PackageId>
<GenerateRuntimeConfigurationFiles>true</GenerateRuntimeConfigurationFiles>
@@ -23,19 +23,23 @@
</ItemGroup>
<ItemGroup>
<PackageReference Include="coverlet.msbuild" Version="2.1.1" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.8.0" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.0" />
<PackageReference Include="xunit.runner.console" Version="2.4.0" />
<PackageReference Include="xunit" Version="2.4.0" />
</ItemGroup>
<ItemGroup Condition=" '$(TargetFramework)' == 'netcoreapp2.0' ">
</ItemGroup>
<ItemGroup Condition=" '$(TargetFramework)' == 'net452' ">
<Reference Include="System" />
<Reference Include="Microsoft.CSharp" />
<PackageReference Include="coverlet.msbuild" Version="2.5.1">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
</PackageReference>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.9.0" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.1">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
</PackageReference>
<PackageReference Include="xunit.runner.console" Version="2.4.1">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
</PackageReference>
<PackageReference Include="xunit" Version="2.4.1" />
<PackageReference Include="System.Text.Encoding.CodePages">
<Version>4.5.1</Version>
</PackageReference>
</ItemGroup>
<ItemGroup>

View File

@@ -10,6 +10,8 @@ using AngleSharp;
using AngleSharp.Dom.Css;
using System.Threading;
using System.Reflection;
using System.IO;
using System.Text;
// Tests based on tests from http://roadkill.codeplex.com/
@@ -36,6 +38,7 @@ namespace Ganss.XSS.Tests
/// <summary>
/// Sets up a test class instance: registers the code pages encoding provider and takes the sanitizer from the given fixture.
/// </summary>
/// <param name="fixture">The fixture that supplies the sanitizer used by the tests.</param>
public HtmlSanitizerTests(HtmlSanitizerFixture fixture)
{
// NOTE(review): presumably needed so that code-page encodings can be resolved on .NET Core targets — confirm.
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
Sanitizer = fixture.Sanitizer;
}
@@ -3097,6 +3100,27 @@ zqy1QY1kkPOuMvKWvvmFIwClI2393jVVcp91eda4+J+fIYDbfJa7RY5YcNrZhTuV//9k="">
Assert.Equal(@"<img src=""https://www.example.com/test.png"">", actual);
}
/// <summary>
/// Feeds an ISO-8859-1 encoded document, which declares its charset in a meta tag, through the stream overload of SanitizeDocument and expects the markup to come back unchanged.
/// </summary>
[Fact]
public void EncodingTest()
{
    // https://github.com/mganss/HtmlSanitizer/issues/158

    // Arrange: allow the meta tag and its attributes so the charset declaration is kept.
    var sut = new HtmlSanitizer();
    sut.AllowedTags.Add("meta");
    foreach (var attribute in new[] { "http-equiv", "content" })
    {
        sut.AllowedAttributes.Add(attribute);
    }

    var expected = @"<html><head><meta http-equiv=""Content-Type"" content=""text/html; charset=iso-8859-1""></head><body>kopieën</body></html>";
    var encodedBytes = Encoding.GetEncoding("iso-8859-1").GetBytes(expected);

    using (var input = new MemoryStream(encodedBytes))
    {
        // Act
        var sanitized = sut.SanitizeDocument(input);

        // Assert: the non-ASCII character must round-trip intact.
        Assert.Equal(expected, sanitized);
    }
}
}
}