diff --git a/go.mod b/go.mod index fe057de..0ea6e4d 100644 --- a/go.mod +++ b/go.mod @@ -1,17 +1,17 @@ module github.com/writeas/web-core go 1.10 require ( github.com/gofrs/uuid v3.3.0+incompatible - github.com/kr/pretty v0.1.0 // indirect github.com/kylemcc/twitter-text-go v0.0.0-20180726194232-7f582f6736ec github.com/microcosm-cc/bluemonday v1.0.2 github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect - github.com/writeas/impart v1.1.0 + github.com/writeas/go-strip-markdown v2.0.1+incompatible + github.com/writeas/impart v1.1.1 github.com/writeas/openssl-go v1.0.0 - github.com/writeas/saturday v1.6.0 - golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613 - gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect + github.com/writeas/saturday v1.7.1 + golang.org/x/crypto v0.0.0-20200109152110-61a87790db17 + gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0 // indirect ) diff --git a/go.sum b/go.sum index 4d2be2e..81be546 100644 --- a/go.sum +++ b/go.sum @@ -1,28 +1,35 @@ -github.com/gofrs/uuid v1.2.0 h1:coDhrjgyJaglxSjxuJdqQSSdUpG3w6p1OwN2od6frBU= github.com/gofrs/uuid v3.3.0+incompatible h1:8K4tyRfvU1CYPgJsveYFQMhpFd/wXNM7iK6rR7UHz84= github.com/gofrs/uuid v3.3.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM= -github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= -github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kylemcc/twitter-text-go v0.0.0-20180726194232-7f582f6736ec h1:ZXWuspqypleMuJy4bzYEqlMhJnGAYpLrWe5p7W3CdvI= github.com/kylemcc/twitter-text-go v0.0.0-20180726194232-7f582f6736ec/go.mod h1:voECJzdraJmolzPBgL9Z7ANwXf4oMXaTCsIkdiPpR/g= github.com/microcosm-cc/bluemonday v1.0.2 h1:5lPfLTTAvAbtS0VqT+94yOtFnGfUWYyx0+iToC3Os3s= github.com/microcosm-cc/bluemonday v1.0.2/go.mod h1:iVP4YcDBq+n/5fb23BhYFvIMq/leAFZyRl6bYmGDlGc= github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= -github.com/writeas/impart v1.1.0 h1:nPnoO211VscNkp/gnzir5UwCDEvdHThL5uELU60NFSE= -github.com/writeas/impart v1.1.0/go.mod h1:g0MpxdnTOHHrl+Ca/2oMXUHJ0PcRAEWtkCzYCJUXC9Y= +github.com/writeas/go-strip-markdown v2.0.1+incompatible h1:IIqxTM5Jr7RzhigcL6FkrCNfXkvbR+Nbu1ls48pXYcw= +github.com/writeas/go-strip-markdown v2.0.1+incompatible/go.mod h1:Rsyu10ZhbEK9pXdk8V6MVnZmTzRG0alMNLMwa0J01fE= +github.com/writeas/impart v1.1.1 h1:RyA9+CqbdbDuz53k+nXCWUY+NlEkdyw6+nWanxSBl5o= +github.com/writeas/impart v1.1.1/go.mod h1:g0MpxdnTOHHrl+Ca/2oMXUHJ0PcRAEWtkCzYCJUXC9Y= github.com/writeas/openssl-go v1.0.0 h1:YXM1tDXeYOlTyJjoMlYLQH1xOloUimSR1WMF8kjFc5o= github.com/writeas/openssl-go v1.0.0/go.mod h1:WsKeK5jYl0B5y8ggOmtVjbmb+3rEGqSD25TppjJnETA= -github.com/writeas/saturday v1.6.0 h1:HNUtX8TVJJnSdxE8vaAgtHiAJVt+Of5AJYlm62pmVlI= -github.com/writeas/saturday v1.6.0/go.mod h1:ETE1EK6ogxptJpAgUbcJD0prAtX48bSloie80+tvnzQ= -golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613 h1:MQ/ZZiDsUapFFiMS+vzwXkCTeEKaum+Do5rINYJDmxc= -golang.org/x/crypto v0.0.0-20190131182504-b8fe1690c613/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= +github.com/writeas/saturday v1.7.1 h1:lYo1EH6CYyrFObQoA9RNWHVlpZA5iYL5Opxo7PYAnZE= +github.com/writeas/saturday v1.7.1/go.mod h1:ETE1EK6ogxptJpAgUbcJD0prAtX48bSloie80+tvnzQ= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20200109152110-61a87790db17 h1:nVJ3guKA9qdkEQ3TUdXI9QSINo2CUPM/cySEvw2w8I0= +golang.org/x/crypto v0.0.0-20200109152110-61a87790db17/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/net v0.0.0-20181220203305-927f97764cc3 h1:eH6Eip3UpmR+yM/qI9Ijluzb1bNv/cAU/n+6l8tRSis= golang.org/x/net v0.0.0-20181220203305-927f97764cc3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3 h1:0GoQqolDA55aaLxZyTzK/Y2ePZzZTUrRacwib7cNsYQ= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0 h1:POO/ycCATvegFmVuPpQzZFJ+pGZeX22Ufu6fibxDVjU= gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0/go.mod h1:WDnlLJ4WF5VGsH/HVa3CI79GS0ol3YnhVnKP89i0kNg= diff --git a/posts/parse.go b/posts/parse.go index b082a05..168289f 100644 --- a/posts/parse.go +++ b/posts/parse.go @@ -1,21 +1,165 @@ package posts import ( + "fmt" + stripmd "github.com/writeas/go-strip-markdown" + "github.com/writeas/web-core/stringmanip" + "regexp" "strings" + "unicode" + "unicode/utf8" +) + +const ( + maxTitleLen = 80 + assumedTitleLen = 80 +) + +var ( + titleElementReg = regexp.MustCompile("?p>") + urlReg = regexp.MustCompile("https?://") + imgReg = regexp.MustCompile(`!\[([^]]+)\]\([^)]+\)`) ) // ExtractTitle takes the given raw post text and returns a title, if explicitly // provided, and a body. func ExtractTitle(content string) (title string, body string) { if hashIndex := strings.Index(content, "# "); hashIndex == 0 { eol := strings.IndexRune(content, '\n') // First line should start with # and end with \n if eol != -1 { body = strings.TrimLeft(content[eol:], " \t\n\r") title = content[len("# "):eol] return } } body = content return } + +func FriendlyPostTitle(content, friendlyId string) string { + content = StripHTMLWithoutEscaping(content) + + content = strings.TrimLeftFunc(stripmd.Strip(content), unicode.IsSpace) + eol := strings.IndexRune(content, '\n') + blankLine := strings.Index(content, "\n\n") + if blankLine != -1 && blankLine <= eol && blankLine <= assumedTitleLen { + return strings.TrimSpace(content[:blankLine]) + } else if eol == -1 && utf8.RuneCountInString(content) <= maxTitleLen { + return content + } + + title, truncd := TruncToWord(PostLede(content, true), maxTitleLen) + if truncd { + title += "..." + } + return title +} + +// PostDescription generates a description based on the given post content, +// title, and post ID. This doesn't consider a V2 post field, `title` when +// choosing what to generate. In case a post has a title, this function will +// fail, and logic should instead be implemented to skip this when there's no +// title, like so: +// var desc string +// if title == "" { +// desc = PostDescription(content, title, friendlyId) +// } else { +// desc = ShortPostDescription(content) +// } +func PostDescription(content, title, friendlyId string) string { + maxLen := 140 + + if content == "" { + content = "WriteFreely is a painless, simple, federated blogging platform." + } else { + fmtStr := "%s" + truncation := 0 + if utf8.RuneCountInString(content) > maxLen { + // Post is longer than the max description, so let's show a better description + fmtStr = "%s..." + truncation = 3 + } + + if title == friendlyId { + // No specific title was found; simply truncate the post, starting at the beginning + content = fmt.Sprintf(fmtStr, strings.Replace(stringmanip.Substring(content, 0, maxLen-truncation), "\n", " ", -1)) + } else { + // There was a title, so return a real description + blankLine := strings.Index(content, "\n\n") + if blankLine < 0 { + blankLine = 0 + } + truncd := stringmanip.Substring(content, blankLine, blankLine+maxLen-truncation) + contentNoNL := strings.Replace(truncd, "\n", " ", -1) + content = strings.TrimSpace(fmt.Sprintf(fmtStr, contentNoNL)) + } + } + + return content +} + +func ShortPostDescription(content string) string { + maxLen := 140 + fmtStr := "%s" + truncation := 0 + if utf8.RuneCountInString(content) > maxLen { + // Post is longer than the max description, so let's show a better description + fmtStr = "%s..." + truncation = 3 + } + return strings.TrimSpace(fmt.Sprintf(fmtStr, strings.Replace(stringmanip.Substring(content, 0, maxLen-truncation), "\n", " ", -1))) +} + +// TruncToWord truncates the given text to the provided limit. +func TruncToWord(s string, l int) (string, bool) { + truncated := false + c := []rune(s) + if len(c) > l { + truncated = true + s = string(c[:l]) + spaceIdx := strings.LastIndexByte(s, ' ') + if spaceIdx > -1 { + s = s[:spaceIdx] + } + } + return s, truncated +} + +// PostLede attempts to extract the first thought of the given post, generally +// contained within the first line or sentence of text. +func PostLede(t string, includePunc bool) string { + // Adjust where we truncate if we want to include punctuation + iAdj := 0 + if includePunc { + iAdj = 1 + } + + // Find lede within first line of text + nl := strings.IndexRune(t, '\n') + if nl > -1 { + t = t[:nl] + } + + // Strip certain HTML tags + t = titleElementReg.ReplaceAllString(t, "") + + // Strip URL protocols + t = urlReg.ReplaceAllString(t, "") + + // Strip image URL, leaving only alt text + t = imgReg.ReplaceAllString(t, " $1 ") + + // Find lede within first sentence + punc := strings.Index(t, ". ") + if punc > -1 { + t = t[:punc+iAdj] + } + punc = stringmanip.IndexRune(t, '。') + if punc > -1 { + c := []rune(t) + t = string(c[:punc+iAdj]) + } + + return t +} diff --git a/posts/render.go b/posts/render.go index 157bed2..d0885d5 100644 --- a/posts/render.go +++ b/posts/render.go @@ -1,63 +1,70 @@ package posts import ( "github.com/microcosm-cc/bluemonday" "github.com/writeas/saturday" + "html" "regexp" "strings" "unicode" ) var ( blockReg = regexp.MustCompile("<(ul|ol|blockquote)>\n") endBlockReg = regexp.MustCompile("([a-z]+)>\n(ul|ol|blockquote)>") markeddownReg = regexp.MustCompile("
(.+)
") ) func ApplyMarkdown(data []byte) string { mdExtensions := 0 | blackfriday.EXTENSION_TABLES | blackfriday.EXTENSION_FENCED_CODE | blackfriday.EXTENSION_AUTOLINK | blackfriday.EXTENSION_STRIKETHROUGH | blackfriday.EXTENSION_SPACE_HEADERS | blackfriday.EXTENSION_HEADER_IDS htmlFlags := 0 | blackfriday.HTML_USE_SMARTYPANTS | blackfriday.HTML_SMARTYPANTS_DASHES // Generate Markdown md := blackfriday.Markdown([]byte(data), blackfriday.HtmlRenderer(htmlFlags, "", ""), mdExtensions) // Strip out bad HTML policy := bluemonday.UGCPolicy() policy.AllowAttrs("class", "id").Globally() outHTML := string(policy.SanitizeBytes(md)) // Strip newlines on certain block elements that render with them outHTML = blockReg.ReplaceAllString(outHTML, "<$1>") outHTML = endBlockReg.ReplaceAllString(outHTML, "$1>$2>") return outHTML } func ApplyBasicMarkdown(data []byte) string { mdExtensions := 0 | blackfriday.EXTENSION_STRIKETHROUGH | blackfriday.EXTENSION_SPACE_HEADERS | blackfriday.EXTENSION_HEADER_IDS htmlFlags := 0 | blackfriday.HTML_SKIP_HTML | blackfriday.HTML_USE_SMARTYPANTS | blackfriday.HTML_SMARTYPANTS_DASHES // Generate Markdown md := blackfriday.Markdown([]byte(data), blackfriday.HtmlRenderer(htmlFlags, "", ""), mdExtensions) // Strip out bad HTML policy := bluemonday.UGCPolicy() policy.AllowAttrs("class", "id").Globally() outHTML := string(policy.SanitizeBytes(md)) outHTML = markeddownReg.ReplaceAllString(outHTML, "$1") outHTML = strings.TrimRightFunc(outHTML, unicode.IsSpace) return outHTML } + +// StripHTMLWithoutEscaping strips HTML tags with bluemonday's StrictPolicy, then unescapes the HTML +// entities added in by sanitizing the content. +func StripHTMLWithoutEscaping(content string) string { + return html.UnescapeString(bluemonday.StrictPolicy().Sanitize(content)) +}