format.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
  1. /* snac - A simple, minimalistic ActivityPub instance */
  2. /* copyright (c) 2022 - 2024 grunfink et al. / MIT license */
  3. #include "xs.h"
  4. #include "xs_regex.h"
  5. #include "xs_mime.h"
  6. #include "xs_html.h"
  7. #include "xs_json.h"
  8. #include "xs_time.h"
  9. #include "snac.h"
  10. /* emoticons, people laughing and such */
  11. const char *smileys[] = {
  12. ":-)", "🙂",
  13. ":-D", "😀",
  14. "X-D", "😆",
  15. ";-)", "😉",
  16. "B-)", "😎",
  17. ">:-(", "😡",
  18. ":-(", "😞",
  19. ":-*", "😘",
  20. ":-/", "😕",
  21. "8-o", "😲",
  22. "%-)", "🤪",
  23. ":_(", "😢",
  24. ":-|", "😐",
  25. "<3", "&#10084;&#65039;",
  26. ":facepalm:", "&#129318;",
  27. ":shrug:", "&#129335;",
  28. ":shrug2:", "&#175;\\_(&#12484;)_/&#175;",
  29. ":eyeroll:", "&#128580;",
  30. ":beer:", "&#127866;",
  31. ":beers:", "&#127867;",
  32. ":munch:", "&#128561;",
  33. ":thumb:", "&#128077;",
  34. NULL, NULL
  35. };
  36. xs_dict *emojis(void)
  37. /* returns a dict with the emojis */
  38. {
  39. xs *fn = xs_fmt("%s/emojis.json", srv_basedir);
  40. FILE *f;
  41. if (mtime(fn) == 0) {
  42. /* file does not exist; create it with the defaults */
  43. xs *d = xs_dict_new();
  44. const char **emo = smileys;
  45. while (*emo) {
  46. d = xs_dict_append(d, emo[0], emo[1]);
  47. emo += 2;
  48. }
  49. if ((f = fopen(fn, "w")) != NULL) {
  50. xs_json_dump(d, 4, f);
  51. fclose(f);
  52. }
  53. else
  54. srv_log(xs_fmt("Error creating '%s'", fn));
  55. }
  56. xs_dict *d = NULL;
  57. if ((f = fopen(fn, "r")) != NULL) {
  58. d = xs_json_load(f);
  59. fclose(f);
  60. if (d == NULL)
  61. srv_log(xs_fmt("JSON parse error in '%s'", fn));
  62. }
  63. else
  64. srv_log(xs_fmt("Error opening '%s'", fn));
  65. return d;
  66. }
  67. static xs_str *format_line(const char *line, xs_list **attach)
  68. /* formats a line */
  69. {
  70. xs_str *s = xs_str_new(NULL);
  71. char *p;
  72. const char *v;
  73. /* split by markup */
  74. xs *sm = xs_regex_split(line,
  75. "("
  76. "`[^`]+`" "|"
  77. "~~[^~]+~~" "|"
  78. "\\*\\*?\\*?[^\\*]+\\*?\\*?\\*" "|"
  79. "!\\[[^]]+\\]\\([^\\)]+\\)" "|"
  80. "\\[[^]]+\\]\\([^\\)]+\\)" "|"
  81. "[a-z]+:/" "/[^[:space:]]+"
  82. ")");
  83. int n = 0;
  84. p = sm;
  85. while (xs_list_iter(&p, &v)) {
  86. if ((n & 0x1)) {
  87. /* markup */
  88. if (xs_startswith(v, "`")) {
  89. xs *s1 = xs_strip_chars_i(xs_dup(v), "`");
  90. xs *e1 = encode_html(s1);
  91. xs *s2 = xs_fmt("<code>%s</code>", e1);
  92. s = xs_str_cat(s, s2);
  93. }
  94. else
  95. if (xs_startswith(v, "***")) {
  96. xs *s1 = xs_strip_chars_i(xs_dup(v), "*");
  97. xs *s2 = xs_fmt("<b><i>%s</i></b>", s1);
  98. s = xs_str_cat(s, s2);
  99. }
  100. else
  101. if (xs_startswith(v, "**")) {
  102. xs *s1 = xs_strip_chars_i(xs_dup(v), "*");
  103. xs *s2 = xs_fmt("<b>%s</b>", s1);
  104. s = xs_str_cat(s, s2);
  105. }
  106. else
  107. if (xs_startswith(v, "*")) {
  108. xs *s1 = xs_strip_chars_i(xs_dup(v), "*");
  109. xs *s2 = xs_fmt("<i>%s</i>", s1);
  110. s = xs_str_cat(s, s2);
  111. }
  112. else
  113. if (xs_startswith(v, "~~")) {
  114. xs *s1 = xs_strip_chars_i(xs_dup(v), "~");
  115. xs *e1 = encode_html(s1);
  116. xs *s2 = xs_fmt("<s>%s</s>", e1);
  117. s = xs_str_cat(s, s2);
  118. }
  119. else
  120. if (*v == '[') {
  121. /* markdown-like links [label](url) */
  122. xs *w = xs_strip_chars_i(
  123. xs_replace_i(xs_replace(v, "#", "&#35;"), "@", "&#64;"),
  124. "![)");
  125. xs *l = xs_split_n(w, "](", 1);
  126. if (xs_list_len(l) == 2) {
  127. xs *link = xs_fmt("<a href=\"%s\">%s</a>",
  128. xs_list_get(l, 1), xs_list_get(l, 0));
  129. s = xs_str_cat(s, link);
  130. }
  131. else
  132. s = xs_str_cat(s, v);
  133. }
  134. else
  135. if (*v == '!') {
  136. /* markdown-like images ![alt text](url to image) */
  137. xs *w = xs_strip_chars_i(
  138. xs_replace_i(xs_replace(v, "#", "&#35;"), "@", "&#64;"),
  139. "![)");
  140. xs *l = xs_split_n(w, "](", 1);
  141. if (xs_list_len(l) == 2) {
  142. const char *alt_text = xs_list_get(l, 0);
  143. const char *img_url = xs_list_get(l, 1);
  144. const char *mime = xs_mime_by_ext(img_url);
  145. if (attach != NULL && xs_startswith(mime, "image/")) {
  146. xs *d = xs_dict_new();
  147. d = xs_dict_append(d, "mediaType", mime);
  148. d = xs_dict_append(d, "url", img_url);
  149. d = xs_dict_append(d, "name", alt_text);
  150. d = xs_dict_append(d, "type", "Image");
  151. *attach = xs_list_append(*attach, d);
  152. }
  153. else {
  154. xs *link = xs_fmt("<a href=\"%s\">%s</a>", img_url, alt_text);
  155. s = xs_str_cat(s, link);
  156. }
  157. }
  158. else
  159. s = xs_str_cat(s, v);
  160. }
  161. else
  162. if (xs_str_in(v, ":/" "/") != -1) {
  163. xs *u = xs_replace_i(xs_replace(v, "#", "&#35;"), "@", "&#64;");
  164. xs *v2 = xs_strip_chars_i(xs_dup(u), ".,)");
  165. const char *mime = xs_mime_by_ext(v2);
  166. if (attach != NULL && xs_startswith(mime, "image/")) {
  167. /* if it's a link to an image, insert it as an attachment */
  168. xs *d = xs_dict_new();
  169. d = xs_dict_append(d, "mediaType", mime);
  170. d = xs_dict_append(d, "url", v2);
  171. d = xs_dict_append(d, "name", "");
  172. d = xs_dict_append(d, "type", "Image");
  173. *attach = xs_list_append(*attach, d);
  174. }
  175. else {
  176. xs *s1 = xs_fmt("<a href=\"%s\" target=\"_blank\">%s</a>", v2, u);
  177. s = xs_str_cat(s, s1);
  178. }
  179. }
  180. else
  181. s = xs_str_cat(s, v);
  182. }
  183. else
  184. /* surrounded text, copy directly */
  185. s = xs_str_cat(s, v);
  186. n++;
  187. }
  188. return s;
  189. }
  190. xs_str *not_really_markdown(const char *content, xs_list **attach, xs_list **tag)
  191. /* formats a content using some Markdown rules */
  192. {
  193. xs_str *s = xs_str_new(NULL);
  194. int in_pre = 0;
  195. int in_blq = 0;
  196. xs *list;
  197. char *p;
  198. const char *v;
  199. /* work by lines */
  200. list = xs_split(content, "\n");
  201. p = list;
  202. while (xs_list_iter(&p, &v)) {
  203. xs *ss = NULL;
  204. if (strcmp(v, "```") == 0) {
  205. if (!in_pre)
  206. s = xs_str_cat(s, "<pre>");
  207. else
  208. s = xs_str_cat(s, "</pre>");
  209. in_pre = !in_pre;
  210. continue;
  211. }
  212. if (in_pre) {
  213. // Encode all HTML characters when we're in pre element until we are out.
  214. ss = encode_html(v);
  215. s = xs_str_cat(s, ss);
  216. s = xs_str_cat(s, "<br>");
  217. continue;
  218. }
  219. else
  220. ss = xs_strip_i(format_line(v, attach));
  221. if (xs_startswith(ss, "---")) {
  222. /* delete the --- */
  223. ss = xs_strip_i(xs_crop_i(ss, 3, 0));
  224. s = xs_str_cat(s, "<hr>");
  225. s = xs_str_cat(s, ss);
  226. continue;
  227. }
  228. if (xs_startswith(ss, ">")) {
  229. /* delete the > and subsequent spaces */
  230. ss = xs_strip_i(xs_crop_i(ss, 1, 0));
  231. if (!in_blq) {
  232. s = xs_str_cat(s, "<blockquote>");
  233. in_blq = 1;
  234. }
  235. s = xs_str_cat(s, ss);
  236. s = xs_str_cat(s, "<br>");
  237. continue;
  238. }
  239. if (in_blq) {
  240. s = xs_str_cat(s, "</blockquote>");
  241. in_blq = 0;
  242. }
  243. s = xs_str_cat(s, ss);
  244. s = xs_str_cat(s, "<br>");
  245. }
  246. if (in_blq)
  247. s = xs_str_cat(s, "</blockquote>");
  248. if (in_pre)
  249. s = xs_str_cat(s, "</pre>");
  250. /* some beauty fixes */
  251. s = xs_replace_i(s, "<br><br><blockquote>", "<br><blockquote>");
  252. s = xs_replace_i(s, "</blockquote><br>", "</blockquote>");
  253. s = xs_replace_i(s, "</pre><br>", "</pre>");
  254. {
  255. /* traditional emoticons */
  256. xs *d = emojis();
  257. int c = 0;
  258. const char *k, *v;
  259. while (xs_dict_next(d, &k, &v, &c)) {
  260. const char *t = NULL;
  261. /* is it an URL to an image? */
  262. if (xs_startswith(v, "https:/" "/") && xs_startswith((t = xs_mime_by_ext(v)), "image/")) {
  263. if (tag && xs_str_in(s, k) != -1) {
  264. /* add the emoji to the tag list */
  265. xs *e = xs_dict_new();
  266. xs *i = xs_dict_new();
  267. xs *u = xs_str_utctime(0, ISO_DATE_SPEC);
  268. e = xs_dict_append(e, "id", v);
  269. e = xs_dict_append(e, "type", "Emoji");
  270. e = xs_dict_append(e, "name", k);
  271. e = xs_dict_append(e, "updated", u);
  272. i = xs_dict_append(i, "type", "Image");
  273. i = xs_dict_append(i, "mediaType", t);
  274. i = xs_dict_append(i, "url", v);
  275. e = xs_dict_append(e, "icon", i);
  276. *tag = xs_list_append(*tag, e);
  277. }
  278. }
  279. else
  280. s = xs_replace_i(s, k, v);
  281. }
  282. }
  283. return s;
  284. }
  285. const char *valid_tags[] = {
  286. "a", "p", "br", "br/", "blockquote", "ul", "ol", "li", "cite", "small",
  287. "span", "i", "b", "u", "s", "pre", "code", "em", "strong", "hr", "img", "del", "bdi", NULL
  288. };
  289. xs_str *sanitize(const char *content)
  290. /* cleans dangerous HTML output */
  291. {
  292. xs_str *s = xs_str_new(NULL);
  293. xs *sl;
  294. int n = 0;
  295. char *p;
  296. const char *v;
  297. sl = xs_regex_split(content, "</?[^>]+>");
  298. p = sl;
  299. n = 0;
  300. while (xs_list_iter(&p, &v)) {
  301. if (n & 0x1) {
  302. xs *s1 = xs_strip_i(xs_crop_i(xs_dup(v), v[1] == '/' ? 2 : 1, -1));
  303. xs *l1 = xs_split_n(s1, " ", 1);
  304. xs *tag = xs_tolower_i(xs_dup(xs_list_get(l1, 0)));
  305. xs *s2 = NULL;
  306. int i;
  307. /* check if it's one of the valid tags */
  308. for (i = 0; valid_tags[i]; i++) {
  309. if (strcmp(tag, valid_tags[i]) == 0)
  310. break;
  311. }
  312. if (valid_tags[i]) {
  313. /* accepted tag: rebuild it with only the accepted elements */
  314. xs *el = xs_regex_select(v, "(src|href|rel|class|target)=\"[^\"]*\"");
  315. xs *s3 = xs_join(el, " ");
  316. s2 = xs_fmt("<%s%s%s%s>",
  317. v[1] == '/' ? "/" : "", tag, xs_list_len(el) ? " " : "", s3);
  318. s = xs_str_cat(s, s2);
  319. } else {
  320. /* treat end of divs as paragraph breaks */
  321. if (strcmp(v, "</div>"))
  322. s = xs_str_cat(s, "<p>");
  323. }
  324. }
  325. else {
  326. /* non-tag */
  327. s = xs_str_cat(s, v);
  328. }
  329. n++;
  330. }
  331. return s;
  332. }
  333. xs_str *encode_html(const char *str)
  334. /* escapes html characters */
  335. {
  336. xs_str *encoded = xs_html_encode((char *)str);
  337. /* Restore only <br>. Probably safe. Let's hope nothing goes wrong with this. */
  338. encoded = xs_replace_i(encoded, "&lt;br&gt;", "<br>");
  339. return encoded;
  340. }