format.c 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278
  1. /* snac - A simple, minimalistic ActivityPub instance */
  2. /* copyright (c) 2022 - 2023 grunfink et al. / MIT license */
  3. #include "xs.h"
  4. #include "xs_regex.h"
  5. #include "xs_mime.h"
  6. #include "snac.h"
  7. /* emoticons, people laughing and such */
  8. struct {
  9. const char *key;
  10. const char *value;
  11. } smileys[] = {
  12. { ":-)", "🙂" },
  13. { ":-D", "😀" },
  14. { "X-D", "😆" },
  15. { ";-)", "😉" },
  16. { "B-)", "😎" },
  17. { ">:-(", "😡" },
  18. { ":-(", "😞" },
  19. { ":-*", "😘" },
  20. { ":-/", "😕" },
  21. { "8-o", "😲" },
  22. { "%-)", "🤪" },
  23. { ":_(", "😢" },
  24. { ":-|", "😐" },
  25. { "<3", "&#128147;" },
  26. { ":facepalm:", "&#129318;" },
  27. { ":shrug:", "&#129335;" },
  28. { ":shrug2:", "&#175;\\_(&#12484;)_/&#175;" },
  29. { ":eyeroll:", "&#128580;" },
  30. { ":beer:", "&#127866;" },
  31. { ":beers:", "&#127867;" },
  32. { ":munch:", "&#128561;" },
  33. { ":thumb:", "&#128077;" },
  34. { NULL, NULL }
  35. };
  36. static xs_str *format_line(const char *line, xs_list **attach)
  37. /* formats a line */
  38. {
  39. xs_str *s = xs_str_new(NULL);
  40. char *p, *v;
  41. /* split by markup */
  42. xs *sm = xs_regex_split(line,
  43. "(`[^`]+`|\\*\\*?[^\\*]+\\*?\\*|https?:/" "/[^[:space:]]+)");
  44. int n = 0;
  45. p = sm;
  46. while (xs_list_iter(&p, &v)) {
  47. if ((n & 0x1)) {
  48. /* markup */
  49. if (xs_startswith(v, "`")) {
  50. xs *s1 = xs_crop_i(xs_dup(v), 1, -1);
  51. xs *e1 = encode_html(s1);
  52. xs *s2 = xs_fmt("<code>%s</code>", e1);
  53. s = xs_str_cat(s, s2);
  54. }
  55. else
  56. if (xs_startswith(v, "**")) {
  57. xs *s1 = xs_crop_i(xs_dup(v), 2, -2);
  58. xs *s2 = xs_fmt("<b>%s</b>", s1);
  59. s = xs_str_cat(s, s2);
  60. }
  61. else
  62. if (xs_startswith(v, "*")) {
  63. xs *s1 = xs_crop_i(xs_dup(v), 1, -1);
  64. xs *s2 = xs_fmt("<i>%s</i>", s1);
  65. s = xs_str_cat(s, s2);
  66. }
  67. else
  68. if (xs_startswith(v, "http")) {
  69. xs *u = xs_replace(v, "#", "&#35;");
  70. xs *v2 = xs_strip_chars_i(xs_dup(u), ".");
  71. const char *mime = xs_mime_by_ext(v2);
  72. if (attach != NULL && xs_startswith(mime, "image/")) {
  73. /* if it's a link to an image, insert it as an attachment */
  74. xs *d = xs_dict_new();
  75. d = xs_dict_append(d, "mediaType", mime);
  76. d = xs_dict_append(d, "url", v2);
  77. d = xs_dict_append(d, "name", "");
  78. d = xs_dict_append(d, "type", "Image");
  79. *attach = xs_list_append(*attach, d);
  80. }
  81. else {
  82. xs *s1 = xs_fmt("<a href=\"%s\" target=\"_blank\">%s</a>", v2, u);
  83. s = xs_str_cat(s, s1);
  84. }
  85. }
  86. else
  87. s = xs_str_cat(s, v);
  88. }
  89. else
  90. /* surrounded text, copy directly */
  91. s = xs_str_cat(s, v);
  92. n++;
  93. }
  94. return s;
  95. }
  96. xs_str *not_really_markdown(const char *content, xs_list **attach)
  97. /* formats a content using some Markdown rules */
  98. {
  99. xs_str *s = xs_str_new(NULL);
  100. int in_pre = 0;
  101. int in_blq = 0;
  102. xs *list;
  103. char *p, *v;
  104. /* work by lines */
  105. list = xs_split(content, "\n");
  106. p = list;
  107. while (xs_list_iter(&p, &v)) {
  108. xs *ss = NULL;
  109. if (strcmp(v, "```") == 0) {
  110. if (!in_pre)
  111. s = xs_str_cat(s, "<pre>");
  112. else
  113. s = xs_str_cat(s, "</pre>");
  114. in_pre = !in_pre;
  115. continue;
  116. }
  117. if (in_pre) {
  118. // Encode all HTML characters when we're in pre element until we are out.
  119. ss = encode_html(v);
  120. s = xs_str_cat(s, ss);
  121. s = xs_str_cat(s, "<br>");
  122. continue;
  123. }
  124. else
  125. ss = xs_strip_i(format_line(v, attach));
  126. if (xs_startswith(ss, "---")) {
  127. /* delete the --- */
  128. ss = xs_strip_i(xs_crop_i(ss, 3, 0));
  129. s = xs_str_cat(s, "<hr>");
  130. s = xs_str_cat(s, ss);
  131. continue;
  132. }
  133. if (xs_startswith(ss, ">")) {
  134. /* delete the > and subsequent spaces */
  135. ss = xs_strip_i(xs_crop_i(ss, 1, 0));
  136. if (!in_blq) {
  137. s = xs_str_cat(s, "<blockquote>");
  138. in_blq = 1;
  139. }
  140. s = xs_str_cat(s, ss);
  141. s = xs_str_cat(s, "<br>");
  142. continue;
  143. }
  144. if (in_blq) {
  145. s = xs_str_cat(s, "</blockquote>");
  146. in_blq = 0;
  147. }
  148. s = xs_str_cat(s, ss);
  149. s = xs_str_cat(s, "<br>");
  150. }
  151. if (in_blq)
  152. s = xs_str_cat(s, "</blockquote>");
  153. if (in_pre)
  154. s = xs_str_cat(s, "</pre>");
  155. /* some beauty fixes */
  156. s = xs_replace_i(s, "<br><br><blockquote>", "<br><blockquote>");
  157. s = xs_replace_i(s, "</blockquote><br>", "</blockquote>");
  158. s = xs_replace_i(s, "</pre><br>", "</pre>");
  159. {
  160. /* traditional emoticons */
  161. int n;
  162. for (n = 0; smileys[n].key; n++)
  163. s = xs_replace_i(s, smileys[n].key, smileys[n].value);
  164. }
  165. return s;
  166. }
  167. const char *valid_tags[] = {
  168. "a", "p", "br", "br/", "blockquote", "ul", "ol", "li", "cite", "small",
  169. "span", "i", "b", "u", "pre", "code", "em", "strong", "hr", "img", "del", NULL
  170. };
  171. xs_str *sanitize(const char *content)
  172. /* cleans dangerous HTML output */
  173. {
  174. xs_str *s = xs_str_new(NULL);
  175. xs *sl;
  176. int n = 0;
  177. char *p, *v;
  178. sl = xs_regex_split(content, "</?[^>]+>");
  179. p = sl;
  180. n = 0;
  181. while (xs_list_iter(&p, &v)) {
  182. if (n & 0x1) {
  183. xs *s1 = xs_strip_i(xs_crop_i(xs_dup(v), v[1] == '/' ? 2 : 1, -1));
  184. xs *l1 = xs_split_n(s1, " ", 1);
  185. xs *tag = xs_tolower_i(xs_dup(xs_list_get(l1, 0)));
  186. xs *s2 = NULL;
  187. int i;
  188. /* check if it's one of the valid tags */
  189. for (i = 0; valid_tags[i]; i++) {
  190. if (strcmp(tag, valid_tags[i]) == 0)
  191. break;
  192. }
  193. if (valid_tags[i]) {
  194. /* accepted tag: rebuild it with only the accepted elements */
  195. xs *el = xs_regex_match(v, "(src|href|rel|class|target)=\"[^\"]*\"");
  196. xs *s3 = xs_join(el, " ");
  197. s2 = xs_fmt("<%s%s%s%s>",
  198. v[1] == '/' ? "/" : "", tag, xs_list_len(el) ? " " : "", s3);
  199. s = xs_str_cat(s, s2);
  200. } else {
  201. /* else? just show it with encoded code.. that's it. */
  202. xs *el = encode_html(v);
  203. s = xs_str_cat(s, el);
  204. }
  205. }
  206. else {
  207. /* non-tag */
  208. s = xs_str_cat(s, v);
  209. }
  210. n++;
  211. }
  212. return s;
  213. }
  214. xs_str *encode_html(const char *str)
  215. /* escapes html characters */
  216. {
  217. xs_str *encoded = xs_replace(str, "&", "&amp;");
  218. encoded = xs_replace_i(encoded, "<", "&lt;");
  219. encoded = xs_replace_i(encoded, ">", "&gt;");
  220. encoded = xs_replace_i(encoded, "\"", "&#34;");
  221. encoded = xs_replace_i(encoded, "'", "&#39;");
  222. /* Restore only <br>. Probably safe. Let's hope nothing goes wrong with this. */
  223. encoded = xs_replace_i(encoded, "&lt;br&gt;", "<br>");
  224. return encoded;
  225. }