sanitize.go
No OneTemporary
Actions

Size

14 KB

Subscribers

None

sanitize.go
View Options

	// Copyright (c) 2014, David Kitchen <david@buro9.com>
	//
	// All rights reserved.
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are met:
	//
	// * Redistributions of source code must retain the above copyright notice, this
	// list of conditions and the following disclaimer.
	//
	// * Redistributions in binary form must reproduce the above copyright notice,
	// this list of conditions and the following disclaimer in the documentation
	// and/or other materials provided with the distribution.
	//
	// * Neither the name of the organisation (Microcosm) nor the names of its
	// contributors may be used to endorse or promote products derived from
	// this software without specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
	// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
	// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	package bluemonday

	import (
	"bytes"
	"io"
	"net/url"
	"regexp"
	"strings"

	"golang.org/x/net/html"
	)

	// Patterns used by isDataAttribute to validate data-* attribute names.
	var (
	// dataAttribute matches a name that starts with "data-" followed by at
	// least one further character.
	dataAttribute = regexp.MustCompile("^data-.+")
	// dataAttributeXMLPrefix matches a string that starts with "xml" followed
	// by at least one further character; isDataAttribute applies it to the
	// part of the name after "data-".
	dataAttributeXMLPrefix = regexp.MustCompile("^xml.+")
	// dataAttributeInvalidChars matches any run of uppercase ASCII letters or
	// semi-colons.
	dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+")
	)

	// Sanitize applies the policy whitelist to s, which is expected to hold a
	// HTML fragment or document, and returns the sanitized HTML as a string.
	//
	// Input that is empty or consists only of whitespace is returned unchanged.
	// If sanitization fails (most likely because the input is extremely
	// malformed) the result is an empty string.
	func (p *Policy) Sanitize(s string) string {
		if len(strings.TrimSpace(s)) > 0 {
			sanitized := p.sanitize(strings.NewReader(s))
			return sanitized.String()
		}

		// Nothing but whitespace: hand the input back untouched.
		return s
	}

	// SanitizeBytes applies the policy whitelist to b, which is expected to hold
	// a HTML fragment or document, and returns the sanitized HTML as a []byte.
	//
	// Input that is empty or consists only of whitespace is returned unchanged.
	// If sanitization fails (most likely because the input is extremely
	// malformed) the result is an empty []byte.
	func (p *Policy) SanitizeBytes(b []byte) []byte {
		if trimmed := bytes.TrimSpace(b); len(trimmed) == 0 {
			// Nothing but whitespace: hand the input back untouched.
			return b
		}

		sanitized := p.sanitize(bytes.NewReader(b))
		return sanitized.Bytes()
	}

	// SanitizeReader takes an io.Reader that contains a HTML fragment or document
	// and applies the given policy whitelist.
	//
	// It returns a *bytes.Buffer containing the HTML that has been sanitized by
	// the policy. Errors during sanitization will merely return an empty result.
	func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
		return p.sanitize(r)
	}

	// sanitize performs the actual sanitization process.
	//
	// It tokenizes r and writes to the returned buffer only those tokens that the
	// policy permits:
	//   - doctype and comment tokens are always dropped;
	//   - start, end and self-closing tags are written only when the element is
	//     present in the policy, with attributes filtered by sanitizeAttrs;
	//   - text is written HTML-escaped, except directly inside an allowed
	//     "script" or "style" element where it is written verbatim.
	//
	// On a tokenizer error other than io.EOF, or on an unknown token type, an
	// empty buffer is returned.
	func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {

		// It is possible that the developer has created the policy via:
		//   p := bluemonday.Policy{}
		// rather than:
		//   p := bluemonday.NewPolicy()
		// If this is the case, and if they haven't yet triggered an action that
		// would initialize the maps, then we need to do that.
		p.init()

		var (
			buff bytes.Buffer

			// skipElementContent is true while we are inside at least one
			// disallowed element whose content must be dropped;
			// skippingElementsCount tracks how many such elements are open.
			skipElementContent    bool
			skippingElementsCount int64

			// closingTagToSkipStack remembers allowed elements whose start tag
			// was dropped (no attributes survived and the element may not
			// appear bare) so that the matching end tag is dropped as well.
			skipClosingTag        bool
			closingTagToSkipStack []string

			mostRecentlyStartedToken string
		)

		tokenizer := html.NewTokenizer(r)
		for {
			if tokenizer.Next() == html.ErrorToken {
				err := tokenizer.Err()
				if err == io.EOF {
					// End of input means end of processing
					return &buff
				}

				// Raw tokenizer error
				return &bytes.Buffer{}
			}

			token := tokenizer.Token()
			switch token.Type {
			case html.DoctypeToken:

				// DocType is not handled as there is no safe parsing mechanism
				// provided by golang.org/x/net/html for the content, and this can
				// be misused to insert HTML tags that are not then sanitized
				//
				// One might wish to recursively sanitize here using the same policy
				// but I will need to do some further testing before considering
				// this.

			case html.CommentToken:

				// Comments are ignored by default

			case html.StartTagToken:

				mostRecentlyStartedToken = token.Data

				aps, ok := p.elsAndAttrs[token.Data]
				if !ok {
					// Element is not allowed; additionally start dropping its
					// content if the policy says so.
					if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
						skipElementContent = true
						skippingElementsCount++
					}
					if p.addSpaces {
						buff.WriteString(" ")
					}
					break
				}

				if len(token.Attr) != 0 {
					token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
				}

				if len(token.Attr) == 0 {
					if !p.allowNoAttrs(token.Data) {
						// Drop this start tag and remember to drop its end tag.
						skipClosingTag = true
						closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
						if p.addSpaces {
							buff.WriteString(" ")
						}
						break
					}
				}

				if !skipElementContent {
					buff.WriteString(token.String())
				}

			case html.EndTagToken:

				if mostRecentlyStartedToken == token.Data {
					mostRecentlyStartedToken = ""
				}

				// skipClosingTag is only true while the stack is non-empty, so
				// the index below is always in range.
				if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
					closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
					if len(closingTagToSkipStack) == 0 {
						skipClosingTag = false
					}
					if p.addSpaces {
						buff.WriteString(" ")
					}
					break
				}

				if _, ok := p.elsAndAttrs[token.Data]; !ok {
					if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
						// Never let the counter go negative: a stray closing
						// tag with no matching start tag would otherwise leave
						// it off by one, and a later well-formed pair would
						// then keep skipElementContent set for the rest of
						// the input.
						if skippingElementsCount > 0 {
							skippingElementsCount--
						}
						if skippingElementsCount == 0 {
							skipElementContent = false
						}
					}
					if p.addSpaces {
						buff.WriteString(" ")
					}
					break
				}

				if !skipElementContent {
					buff.WriteString(token.String())
				}

			case html.SelfClosingTagToken:

				aps, ok := p.elsAndAttrs[token.Data]
				if !ok {
					if p.addSpaces {
						buff.WriteString(" ")
					}
					break
				}

				if len(token.Attr) != 0 {
					token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
				}

				if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
					if p.addSpaces {
						buff.WriteString(" ")
					}
					break
				}

				if !skipElementContent {
					buff.WriteString(token.String())
				}

			case html.TextToken:

				if !skipElementContent {
					switch mostRecentlyStartedToken {
					case "script":
						// not encouraged, but if a policy allows JavaScript we
						// should not HTML escape it as that would break the output
						buff.WriteString(token.Data)
					case "style":
						// not encouraged, but if a policy allows CSS styles we
						// should not HTML escape it as that would break the output
						buff.WriteString(token.Data)
					default:
						// HTML escape the text
						buff.WriteString(token.String())
					}
				}
			default:
				// A token that didn't exist in the html package when we wrote this
				return &bytes.Buffer{}
			}
		}
	}

	// sanitizeAttrs takes a set of element attribute policies and the global
	// attribute policies and applies them to the []html.Attribute returning a set
	// of html.Attributes that match the policies
	func (p *Policy) sanitizeAttrs(
	elementName string,
	attrs []html.Attribute,
	aps map[string]attrPolicy,
	) []html.Attribute {

	if len(attrs) == 0 {
	return attrs
	}

	// Builds a new attribute slice based on the whether the attribute has been
	// whitelisted explicitly or globally.
	cleanAttrs := []html.Attribute{}
	for _, htmlAttr := range attrs {
	if p.allowDataAttributes {
	// If we see a data attribute, let it through.
	if isDataAttribute(htmlAttr.Key) {
	cleanAttrs = append(cleanAttrs, htmlAttr)
	continue
	}
	}
	// Is there an element specific attribute policy that applies?
	if ap, ok := aps[htmlAttr.Key]; ok {
	if ap.regexp != nil {
	if ap.regexp.MatchString(htmlAttr.Val) {
	cleanAttrs = append(cleanAttrs, htmlAttr)
	continue
	}
	} else {
	cleanAttrs = append(cleanAttrs, htmlAttr)
	continue
	}
	}

	// Is there a global attribute policy that applies?
	if ap, ok := p.globalAttrs[htmlAttr.Key]; ok {

	if ap.regexp != nil {
	if ap.regexp.MatchString(htmlAttr.Val) {
	cleanAttrs = append(cleanAttrs, htmlAttr)
	}
	} else {
	cleanAttrs = append(cleanAttrs, htmlAttr)
	}
	}
	}

	if len(cleanAttrs) == 0 {
	// If nothing was allowed, let's get out of here
	return cleanAttrs
	}
	// cleanAttrs now contains the attributes that are permitted

	if linkable(elementName) {
	if p.requireParseableURLs {
	// Ensure URLs are parseable:
	// - a.href
	// - area.href
	// - link.href
	// - blockquote.cite
	// - q.cite
	// - img.src
	// - script.src
	tmpAttrs := []html.Attribute{}
	for _, htmlAttr := range cleanAttrs {
	switch elementName {
	case "a", "area", "link":
	if htmlAttr.Key == "href" {
	if u, ok := p.validURL(htmlAttr.Val); ok {
	htmlAttr.Val = u
	tmpAttrs = append(tmpAttrs, htmlAttr)
	}
	break
	}
	tmpAttrs = append(tmpAttrs, htmlAttr)
	case "blockquote", "q":
	if htmlAttr.Key == "cite" {
	if u, ok := p.validURL(htmlAttr.Val); ok {
	htmlAttr.Val = u
	tmpAttrs = append(tmpAttrs, htmlAttr)
	}
	break
	}
	tmpAttrs = append(tmpAttrs, htmlAttr)
	case "img", "script":
	if htmlAttr.Key == "src" {
	if u, ok := p.validURL(htmlAttr.Val); ok {
	htmlAttr.Val = u
	tmpAttrs = append(tmpAttrs, htmlAttr)
	}
	break
	}
	tmpAttrs = append(tmpAttrs, htmlAttr)
	default:
	tmpAttrs = append(tmpAttrs, htmlAttr)
	}
	}
	cleanAttrs = tmpAttrs
	}

	if (p.requireNoFollow \|\|
	p.requireNoFollowFullyQualifiedLinks \|\|
	p.addTargetBlankToFullyQualifiedLinks) &&
	len(cleanAttrs) > 0 {

	// Add rel="nofollow" if a "href" exists
	switch elementName {
	case "a", "area", "link":
	var hrefFound bool
	var externalLink bool
	for _, htmlAttr := range cleanAttrs {
	if htmlAttr.Key == "href" {
	hrefFound = true

	u, err := url.Parse(htmlAttr.Val)
	if err != nil {
	continue
	}
	if u.Host != "" {
	externalLink = true
	}

	continue
	}
	}

	if hrefFound {
	var (
	noFollowFound bool
	targetBlankFound bool
	)

	addNoFollow := (p.requireNoFollow \|\|
	externalLink && p.requireNoFollowFullyQualifiedLinks)

	addTargetBlank := (externalLink &&
	p.addTargetBlankToFullyQualifiedLinks)

	tmpAttrs := []html.Attribute{}
	for _, htmlAttr := range cleanAttrs {

	var appended bool
	if htmlAttr.Key == "rel" && addNoFollow {

	if strings.Contains(htmlAttr.Val, "nofollow") {
	noFollowFound = true
	tmpAttrs = append(tmpAttrs, htmlAttr)
	appended = true
	} else {
	htmlAttr.Val += " nofollow"
	noFollowFound = true
	tmpAttrs = append(tmpAttrs, htmlAttr)
	appended = true
	}
	}

	if elementName == "a" && htmlAttr.Key == "target" {
	if htmlAttr.Val == "_blank" {
	targetBlankFound = true
	}
	if addTargetBlank && !targetBlankFound {
	htmlAttr.Val = "_blank"
	targetBlankFound = true
	tmpAttrs = append(tmpAttrs, htmlAttr)
	appended = true
	}
	}

	if !appended {
	tmpAttrs = append(tmpAttrs, htmlAttr)
	}
	}
	if noFollowFound \|\| targetBlankFound {
	cleanAttrs = tmpAttrs
	}

	if addNoFollow && !noFollowFound {
	rel := html.Attribute{}
	rel.Key = "rel"
	rel.Val = "nofollow"
	cleanAttrs = append(cleanAttrs, rel)
	}

	if elementName == "a" && addTargetBlank && !targetBlankFound {
	rel := html.Attribute{}
	rel.Key = "target"
	rel.Val = "_blank"
	targetBlankFound = true
	cleanAttrs = append(cleanAttrs, rel)
	}

	if targetBlankFound {
	// target="_blank" has a security risk that allows the
	// opened window/tab to issue JavaScript calls against
	// window.opener, which in effect allow the destination
	// of the link to control the source:
	// https://dev.to/ben/the-targetblank-vulnerability-by-example
	//
	// To mitigate this risk, we need to add a specific rel
	// attribute if it is not already present.
	// rel="noopener"
	//
	// Unfortunately this is processing the rel twice (we
	// already looked at it earlier ^^) as we cannot be sure
	// of the ordering of the href and rel, and whether we
	// have fully satisfied that we need to do this. This
	// double processing only happens if target="_blank"
	// is true.
	var noOpenerAdded bool
	tmpAttrs := []html.Attribute{}
	for _, htmlAttr := range cleanAttrs {
	var appended bool
	if htmlAttr.Key == "rel" {
	if strings.Contains(htmlAttr.Val, "noopener") {
	noOpenerAdded = true
	tmpAttrs = append(tmpAttrs, htmlAttr)
	} else {
	htmlAttr.Val += " noopener"
	noOpenerAdded = true
	tmpAttrs = append(tmpAttrs, htmlAttr)
	}

	appended = true
	}
	if !appended {
	tmpAttrs = append(tmpAttrs, htmlAttr)
	}
	}
	if noOpenerAdded {
	cleanAttrs = tmpAttrs
	} else {
	// rel attr was not found, or else noopener would
	// have been added already
	rel := html.Attribute{}
	rel.Key = "rel"
	rel.Val = "noopener"
	cleanAttrs = append(cleanAttrs, rel)
	}

	}
	}
	default:
	}
	}
	}

	return cleanAttrs
	}

	// allowNoAttrs reports whether elementName is present in the policy's set of
	// elements that may be emitted without any attributes.
	func (p *Policy) allowNoAttrs(elementName string) bool {
		if _, allowed := p.setOfElementsAllowedWithoutAttrs[elementName]; allowed {
			return true
		}
		return false
	}

	func (p *Policy) validURL(rawurl string) (string, bool) {
	if p.requireParseableURLs {
	// URLs are valid if when space is trimmed the URL is valid
	rawurl = strings.TrimSpace(rawurl)

	// URLs cannot contain whitespace, unless it is a data-uri
	if (strings.Contains(rawurl, " ") \|\|
	strings.Contains(rawurl, "\t") \|\|
	strings.Contains(rawurl, "\n")) &&
	!strings.HasPrefix(rawurl, `data:`) {
	return "", false
	}

	// URLs are valid if they parse
	u, err := url.Parse(rawurl)
	if err != nil {
	return "", false
	}

	if u.Scheme != "" {

	urlPolicy, ok := p.allowURLSchemes[u.Scheme]
	if !ok {
	return "", false

	}

	if urlPolicy == nil \|\| urlPolicy(u) == true {
	return u.String(), true
	}

	return "", false
	}

	if p.allowRelativeURLs {
	if u.String() != "" {
	return u.String(), true
	}
	}

	return "", false
	}

	return rawurl, true
	}

	// linkable reports whether elementName is an element that can carry a
	// URL-valued attribute that must be validated:
	//   - a, area, link: href
	//   - blockquote, q: cite
	//   - img, script:   src
	//
	// "q" is included so that q.cite receives the same URL validation as
	// blockquote.cite.
	func linkable(elementName string) bool {
		switch elementName {
		case "a", "area", "blockquote", "img", "link", "q", "script":
			return true
		default:
			return false
		}
	}

	// isDataAttribute reports whether val is an acceptable data-* attribute name.
	//
	// The name must start with "data-" followed by at least one character, the
	// part after the prefix must not match the "xml" prefix pattern, and it must
	// not contain uppercase ASCII letters or semi-colons.
	func isDataAttribute(val string) bool {
		if !dataAttribute.MatchString(val) {
			return false
		}

		// Validate everything after the leading "data-". Splitting on every
		// "data-" occurrence would only inspect the first segment and let
		// invalid characters through in names such as "data-a-data-;x".
		rest := strings.TrimPrefix(val, "data-")

		// data-xml* is invalid.
		if dataAttributeXMLPrefix.MatchString(rest) {
			return false
		}
		// no uppercase or semi-colons allowed.
		if dataAttributeInvalidChars.MatchString(rest) {
			return false
		}
		return true
	}

File Metadata

Mime Type: text/plain
Expires: Sun, Nov 23, 7:33 AM (1 d, 10 h)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3500700

sanitize.goNo OneTemporaryActions

sanitize.goView Options

File Metadata

Event Timeline

sanitize.go
No OneTemporary
Actions

sanitize.go
View Options