html.go 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. package render
  2. import (
  3. "golang.org/x/net/html"
  4. "fmt"
  5. "mimicry/style"
  6. "errors"
  7. "strings"
  8. "regexp"
  9. "golang.org/x/net/html/atom"
  10. )
  11. // preprocessing:
  12. // substitute escape key for escape visualizer
  13. // newline strategy:
  14. // blocks have 2 newlines before and two after,
  15. // at the end collapse 4 newlines into 2
  16. // maybe instead collapse any amount greater
  17. // (regex: \n{2,}) than 2 down to 2
  18. // also at the end trim all newlines from
  19. // very beginning and very end
  20. // for block links probably use ‣
  21. // I think it may work to collapse all
  22. // text node whitespace down to space,
  23. // and then trim the contents of blocks
  24. // (including the implicit body element)
  25. // FIXME: instead, you should collapse all whitespace into
  26. // space (including newline, so format newlines don't appear), then do newline insertion for blocks
  27. // then collapse all single-newline-containing whitespace into
  28. // one newline and multi-newline-containing whitespace into two
  29. // newlines
  30. // I will have this issue: https://unix.stackexchange.com/questions/170551/force-lynx-or-elinks-to-interpret-spaces-and-line-breaks
  31. // i.e. 3 or more br's in a row become idempotent, but I don't care
  32. func renderHTML(markup string) (string, error) {
  33. /* Preprocessing
  34. To prevent input text from changing its color, style, etc
  35. via terminal escape codes, swap out U+001B (ESCAPE) for
  36. U+241B (SYMBOL FOR ESCAPE)
  37. TODO: move this to the complete beginning of render, not
  38. just the HTML section
  39. */
  40. markup = strings.ReplaceAll(markup, "\u001b", "␛")
  41. nodes, err := html.ParseFragment(strings.NewReader(markup), &html.Node{
  42. Type: html.ElementNode,
  43. Data: "body",
  44. DataAtom: atom.Body,
  45. })
  46. if err != nil {
  47. return "", err
  48. }
  49. serialized, err := SerializeList(nodes)
  50. if err != nil {
  51. return "", err
  52. }
  53. /*
  54. Postprocessing
  55. Block elements are separated from siblings by prepending
  56. and appending two newline characters. If two blocks are
  57. adjacent, this will result in too many newline characters.
  58. Furthermore, in text nodes, newline-containing whitespace
  59. is collapsed into a single newline, potentially resulting
  60. in even more newlines. So collapse sequences of over two
  61. newlines into two newlines. Also trim all newlines from
  62. the beginning and end of the output.
  63. */
  64. manyNewlines := regexp.MustCompile(`\n{2,}`)
  65. serialized = manyNewlines.ReplaceAllString(serialized, "\n\n")
  66. serialized = strings.Trim(serialized, "\n")
  67. return serialized, nil
  68. }
  69. func renderNode(node *html.Node, preserveWhitespace bool) (string, error) {
  70. if node.Type == html.TextNode {
  71. if !preserveWhitespace {
  72. whitespace := regexp.MustCompile(`[\t ]+`)
  73. newline := regexp.MustCompile(`[\n\t ]*\n[n\t ]*`)
  74. processed := newline.ReplaceAllString(node.Data, "\n")
  75. processed = whitespace.ReplaceAllString(processed, " ")
  76. return processed, nil
  77. }
  78. return node.Data, nil
  79. }
  80. if node.Type != html.ElementNode {
  81. return "", nil
  82. }
  83. // this may need to be moved down into the switch
  84. // so that pre and code can override the last parameter
  85. content := serializeChildren(node, preserveWhitespace)
  86. switch node.Data {
  87. case "a":
  88. return style.Linkify(content), nil
  89. case "s", "del":
  90. return style.Strikethrough(content), nil
  91. case "code":
  92. return style.Code(content), nil
  93. case "i", "em":
  94. return style.Italic(content), nil
  95. case "b", "strong":
  96. return style.Bold(content), nil
  97. case "u":
  98. return style.Underline(content), nil
  99. case "mark":
  100. return style.Highlight(content), nil
  101. case "span", "li":
  102. return content, nil
  103. case "br":
  104. return "\n", nil
  105. case "p", "div":
  106. return block(content), nil
  107. case "pre":
  108. return block(style.CodeBlock(content)), nil
  109. case "blockquote":
  110. // FIXME: change blockquote to style.QuoteBlock
  111. return block(blockquote(content)), nil
  112. case "ul":
  113. return block(bulletedList(node, preserveWhitespace)), nil
  114. // case "ul":
  115. // return numberedList(node), nil
  116. }
  117. return "", errors.New("Encountered unrecognized element " + node.Data)
  118. }
  119. func serializeChildren(node *html.Node, preserveWhitespace bool) (string) {
  120. output := ""
  121. for current := node.FirstChild; current != nil; current = current.NextSibling {
  122. result, _ := renderNode(current, preserveWhitespace)
  123. // if err != nil {
  124. // return "", err
  125. // }
  126. output += result
  127. }
  128. return output
  129. }
  130. func SerializeList(nodes []*html.Node) (string, error) {
  131. output := ""
  132. for _, current := range nodes {
  133. result, err := renderNode(current, false)
  134. if err != nil {
  135. return "", err
  136. }
  137. output += result
  138. }
  139. return output, nil
  140. }
  141. func block(text string) string {
  142. return fmt.Sprintf("\n\n%s\n\n", text)
  143. }
  144. func blockquote(text string) string {
  145. withBar := fmt.Sprintf("▌%s", strings.ReplaceAll(text, "\n", "\n▌"))
  146. withColor := style.Color(withBar)
  147. return withColor
  148. }
  149. func bulletedList(node *html.Node, preserveWhitespace bool) string {
  150. output := ""
  151. for current := node.FirstChild; current != nil; current = current.NextSibling {
  152. if current.Type != html.ElementNode {
  153. continue
  154. }
  155. if current.Data != "li" {
  156. continue
  157. }
  158. result, _ := renderNode(current, preserveWhitespace)
  159. output += fmt.Sprintf("• %s", strings.ReplaceAll(result, "\n", "\n "))
  160. }
  161. return output
  162. }
  163. // could count them and use that to determine
  164. // indentation, but that is way premature
  165. // func numberedList(node *html.Node) string {
  166. // output += ""
  167. // i uint := 1
  168. // for current := node.FirstChild; current != nil; current = current.NextSibling {
  169. // if node.Type != html.ElementNode {
  170. // continue
  171. // }
  172. // if node.Data != "li" {
  173. // continue
  174. // }
  175. // fmt.Sprintf("%d. ")
  176. // output += strings.ReplaceAll(renderNode(node), "\n", "\n ")
  177. // i += 1
  178. // }
  179. // return output
  180. // }