format.c 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
  1. /* snac - A simple, minimalistic ActivityPub instance */
  2. /* copyright (c) 2022 - 2023 grunfink et al. / MIT license */
  3. #include "xs.h"
  4. #include "xs_regex.h"
  5. #include "xs_mime.h"
  6. #include "snac.h"
  /* Emoticon table: a flat list of (text, replacement) string pairs,
     closed by a NULL pair. Keep a pattern that contains another one
     (">:-(" contains ":-(") ahead of the shorter one. */
  const char *smileys[] = {
      /* classic ASCII faces */
      ":-)",        "🙂",
      ":-D",        "😀",
      "X-D",        "😆",
      ";-)",        "😉",
      "B-)",        "😎",
      ">:-(",       "😡",
      ":-(",        "😞",
      ":-*",        "😘",
      ":-/",        "😕",
      "8-o",        "😲",
      "%-)",        "🤪",
      ":_(",        "😢",
      ":-|",        "😐",
      "<3",         "&#10084;&#65039;",

      /* named shortcodes */
      ":facepalm:", "&#129318;",
      ":shrug:",    "&#129335;",
      ":shrug2:",   "&#175;\\_(&#12484;)_/&#175;",
      ":eyeroll:",  "&#128580;",
      ":beer:",     "&#127866;",
      ":beers:",    "&#127867;",
      ":munch:",    "&#128561;",
      ":thumb:",    "&#128077;",

      /* end of table */
      NULL,         NULL
  };
  33. static xs_str *format_line(const char *line, xs_list **attach)
  34. /* formats a line */
  35. {
  36. xs_str *s = xs_str_new(NULL);
  37. char *p, *v;
  38. /* split by markup */
  39. xs *sm = xs_regex_split(line,
  40. "(`[^`]+`|\\*\\*?[^\\*]+\\*?\\*|https?:/" "/[^[:space:]]+)");
  41. int n = 0;
  42. p = sm;
  43. while (xs_list_iter(&p, &v)) {
  44. if ((n & 0x1)) {
  45. /* markup */
  46. if (xs_startswith(v, "`")) {
  47. xs *s1 = xs_crop_i(xs_dup(v), 1, -1);
  48. xs *e1 = encode_html(s1);
  49. xs *s2 = xs_fmt("<code>%s</code>", e1);
  50. s = xs_str_cat(s, s2);
  51. }
  52. else
  53. if (xs_startswith(v, "**")) {
  54. xs *s1 = xs_crop_i(xs_dup(v), 2, -2);
  55. xs *s2 = xs_fmt("<b>%s</b>", s1);
  56. s = xs_str_cat(s, s2);
  57. }
  58. else
  59. if (xs_startswith(v, "*")) {
  60. xs *s1 = xs_crop_i(xs_dup(v), 1, -1);
  61. xs *s2 = xs_fmt("<i>%s</i>", s1);
  62. s = xs_str_cat(s, s2);
  63. }
  64. else
  65. if (xs_startswith(v, "http")) {
  66. xs *u = xs_replace(v, "#", "&#35;");
  67. xs *v2 = xs_strip_chars_i(xs_dup(u), ".");
  68. const char *mime = xs_mime_by_ext(v2);
  69. if (attach != NULL && xs_startswith(mime, "image/")) {
  70. /* if it's a link to an image, insert it as an attachment */
  71. xs *d = xs_dict_new();
  72. d = xs_dict_append(d, "mediaType", mime);
  73. d = xs_dict_append(d, "url", v2);
  74. d = xs_dict_append(d, "name", "");
  75. d = xs_dict_append(d, "type", "Image");
  76. *attach = xs_list_append(*attach, d);
  77. }
  78. else {
  79. xs *s1 = xs_fmt("<a href=\"%s\" target=\"_blank\">%s</a>", v2, u);
  80. s = xs_str_cat(s, s1);
  81. }
  82. }
  83. else
  84. s = xs_str_cat(s, v);
  85. }
  86. else
  87. /* surrounded text, copy directly */
  88. s = xs_str_cat(s, v);
  89. n++;
  90. }
  91. return s;
  92. }
  93. xs_str *not_really_markdown(const char *content, xs_list **attach)
  94. /* formats a content using some Markdown rules */
  95. {
  96. xs_str *s = xs_str_new(NULL);
  97. int in_pre = 0;
  98. int in_blq = 0;
  99. xs *list;
  100. char *p, *v;
  101. /* work by lines */
  102. list = xs_split(content, "\n");
  103. p = list;
  104. while (xs_list_iter(&p, &v)) {
  105. xs *ss = NULL;
  106. if (strcmp(v, "```") == 0) {
  107. if (!in_pre)
  108. s = xs_str_cat(s, "<pre>");
  109. else
  110. s = xs_str_cat(s, "</pre>");
  111. in_pre = !in_pre;
  112. continue;
  113. }
  114. if (in_pre) {
  115. // Encode all HTML characters when we're in pre element until we are out.
  116. ss = encode_html(v);
  117. s = xs_str_cat(s, ss);
  118. s = xs_str_cat(s, "<br>");
  119. continue;
  120. }
  121. else
  122. ss = xs_strip_i(format_line(v, attach));
  123. if (xs_startswith(ss, "---")) {
  124. /* delete the --- */
  125. ss = xs_strip_i(xs_crop_i(ss, 3, 0));
  126. s = xs_str_cat(s, "<hr>");
  127. s = xs_str_cat(s, ss);
  128. continue;
  129. }
  130. if (xs_startswith(ss, ">")) {
  131. /* delete the > and subsequent spaces */
  132. ss = xs_strip_i(xs_crop_i(ss, 1, 0));
  133. if (!in_blq) {
  134. s = xs_str_cat(s, "<blockquote>");
  135. in_blq = 1;
  136. }
  137. s = xs_str_cat(s, ss);
  138. s = xs_str_cat(s, "<br>");
  139. continue;
  140. }
  141. if (in_blq) {
  142. s = xs_str_cat(s, "</blockquote>");
  143. in_blq = 0;
  144. }
  145. s = xs_str_cat(s, ss);
  146. s = xs_str_cat(s, "<br>");
  147. }
  148. if (in_blq)
  149. s = xs_str_cat(s, "</blockquote>");
  150. if (in_pre)
  151. s = xs_str_cat(s, "</pre>");
  152. /* some beauty fixes */
  153. s = xs_replace_i(s, "<br><br><blockquote>", "<br><blockquote>");
  154. s = xs_replace_i(s, "</blockquote><br>", "</blockquote>");
  155. s = xs_replace_i(s, "</pre><br>", "</pre>");
  156. {
  157. /* traditional emoticons */
  158. const char **emo = smileys;
  159. while (*emo) {
  160. s = xs_replace_i(s, emo[0], emo[1]);
  161. emo += 2;
  162. }
  163. }
  164. return s;
  165. }
  /* HTML tags that sanitize() lets through (lowercase, NULL-terminated);
     "br/" is listed so that a <br/> written without a space is accepted */
  const char *valid_tags[] = {
      /* links, paragraphs and line breaks */
      "a", "p", "br", "br/",
      /* block-level elements */
      "blockquote", "ul", "ol", "li",
      /* inline elements */
      "cite", "small", "span", "i", "b", "u", "s",
      "pre", "code", "em", "strong",
      /* the rest */
      "hr", "img", "del",
      NULL
  };
  170. xs_str *sanitize(const char *content)
  171. /* cleans dangerous HTML output */
  172. {
  173. xs_str *s = xs_str_new(NULL);
  174. xs *sl;
  175. int n = 0;
  176. char *p, *v;
  177. sl = xs_regex_split(content, "</?[^>]+>");
  178. p = sl;
  179. n = 0;
  180. while (xs_list_iter(&p, &v)) {
  181. if (n & 0x1) {
  182. xs *s1 = xs_strip_i(xs_crop_i(xs_dup(v), v[1] == '/' ? 2 : 1, -1));
  183. xs *l1 = xs_split_n(s1, " ", 1);
  184. xs *tag = xs_tolower_i(xs_dup(xs_list_get(l1, 0)));
  185. xs *s2 = NULL;
  186. int i;
  187. /* check if it's one of the valid tags */
  188. for (i = 0; valid_tags[i]; i++) {
  189. if (strcmp(tag, valid_tags[i]) == 0)
  190. break;
  191. }
  192. if (valid_tags[i]) {
  193. /* accepted tag: rebuild it with only the accepted elements */
  194. xs *el = xs_regex_select(v, "(src|href|rel|class|target)=\"[^\"]*\"");
  195. xs *s3 = xs_join(el, " ");
  196. s2 = xs_fmt("<%s%s%s%s>",
  197. v[1] == '/' ? "/" : "", tag, xs_list_len(el) ? " " : "", s3);
  198. s = xs_str_cat(s, s2);
  199. } else {
  200. /* else? just show it with encoded code.. that's it. */
  201. xs *el = encode_html(v);
  202. s = xs_str_cat(s, el);
  203. }
  204. }
  205. else {
  206. /* non-tag */
  207. s = xs_str_cat(s, v);
  208. }
  209. n++;
  210. }
  211. return s;
  212. }
  213. xs_str *encode_html_strict(const char *str)
  214. /* escapes html characters */
  215. {
  216. xs_str *encoded = xs_replace(str, "&", "&amp;");
  217. encoded = xs_replace_i(encoded, "<", "&lt;");
  218. encoded = xs_replace_i(encoded, ">", "&gt;");
  219. encoded = xs_replace_i(encoded, "\"", "&#34;");
  220. encoded = xs_replace_i(encoded, "'", "&#39;");
  221. return encoded;
  222. }
  223. xs_str *encode_html(const char *str)
  224. /* escapes html characters */
  225. {
  226. xs_str *encoded = encode_html_strict(str);
  227. /* Restore only <br>. Probably safe. Let's hope nothing goes wrong with this. */
  228. encoded = xs_replace_i(encoded, "&lt;br&gt;", "<br>");
  229. return encoded;
  230. }