using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.ComponentModel;
namespace NearForums
{
/// <summary>
/// General purpose helpers.
/// </summary>
public static class Utils
{
    // Matches every HTML tag in the input, including an unterminated one at the very end.
    private static readonly Regex TagsRegex = new Regex("<[^>]*(>|$)",
        RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);

    // Pre-pass that drops attributes before the tag whitelist is applied:
    //  1) inside any tag, every name=value attribute except href, class, rel, title,
    //     width, height, alt and src (the ones the whitelists below know how to validate);
    //  2) the class="MsoNormal" attribute that Word adds to <p> elements.
    private static readonly Regex CleanupRegex = new Regex(
        "((?<=<\\w+[^>]*)(?!\\shref|\\sclass|\\srel|\\stitle|\\swidth|\\sheight|\\salt|\\ssrc)(\\s[\\w-]+)=[\"']?((?:.(?![\"']?\\s+(?:\\S+)=|[>\"']))+.)[\"']?)"
        + "|((?<=<p[^>]*)\\sclass=\"MsoNormal\")",
        RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Accepts <a href="..."> with an in-page (#name), http(s) or ftp target, optionally followed
    // by class, title and rel="nofollow" attributes, and the closing </a>.
    // Note: '#' must stay escaped outside character classes because of IgnorePatternWhitespace.
    private static readonly Regex WhitelistAnchorRegex = new Regex(@"
        ^<a\s
        href=""(\#\w+|(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+)""
        (
        (\sclass=""([\w-]+)"")|(\stitle=""[^""<>]+"")|
        (\srel=""nofollow""))*
        \s?>$|
        ^</a>$",
        RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace);

    // Accepts <img src="http(s)://..."> optionally followed by width, height, alt and title attributes.
    private static readonly Regex WhitelistImageRegex = new Regex(@"
        ^<img\s
        src=""https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+""
        ((\swidth=""\d{1,3}"")|
        (\sheight=""\d{1,3}"")|
        (\salt=""[^""<>]*"")|
        (\stitle=""[^""<>]*""))*
        \s?/?>$",
        RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace);

    /// <summary>
    /// Sanitizes any potentially dangerous tags from the provided raw HTML input using
    /// a whitelist based approach, leaving the "safe" HTML tags.
    /// CODESNIPPET:4100A61A-1711-4366-B0B0-144D1179A937 / http://refactormycode.com/codes/333-sanitize-html
    /// </summary>
    /// <param name="html">Html to sanitize.</param>
    /// <param name="whiteListTags">
    /// Regex alternation containing the allowed names of attribute-less html elements.
    /// For example: em|h(2|3|4)|strong|p. The value is inserted verbatim into a regular
    /// expression, so it must be a valid regex fragment.
    /// </param>
    /// <returns>
    /// The input with every non-whitelisted tag removed (the text between tags is kept).
    /// A null or empty input is returned unchanged.
    /// </returns>
    public static string SanitizeHtml(string html, string whiteListTags = "b(lockquote)?|code|d(d|t|l|el)|em|h(1|2|3|4)|i|kbd|li|ol|p(re)?|s(ub|up|trong|trike)?|ul|a|img")
    {
        if (String.IsNullOrEmpty(html))
        {
            return html;
        }

        // Opening/closing tags without attributes whose name is whitelisted, plus <br> and <hr>.
        // Both ends are anchored so the whole tag has to match, not just its tail.
        Regex whitelistRegex = new Regex("^</?(" + whiteListTags + ")>$|^<(b|h)r\\s?/?>$",
            RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.IgnorePatternWhitespace);

        // Strip attributes that are never allowed (for example the ones pasted from Word)
        // so that otherwise acceptable tags can still pass the whitelists.
        string cleaned = CleanupRegex.Replace(html, "");

        // Visit every tag and keep it only when one of the whitelists accepts it.
        return TagsRegex.Replace(cleaned, tag =>
        {
            // The whitelists are written in lower case, so compare against a lower-cased copy.
            string tagText = tag.Value.ToLowerInvariant();
            bool allowed = whitelistRegex.IsMatch(tagText)
                || WhitelistAnchorRegex.IsMatch(tagText)
                || WhitelistImageRegex.IsMatch(tagText);
            if (allowed)
            {
                return tag.Value;
            }

            System.Diagnostics.Debug.WriteLine("tag sanitized: " + tagText);
            return String.Empty;
        });
    }
}
}