format.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496
  1. /* snac - A simple, minimalistic ActivityPub instance */
  2. /* copyright (c) 2022 - 2025 grunfink et al. / MIT license */
  3. #include "xs.h"
  4. #include "xs_regex.h"
  5. #include "xs_mime.h"
  6. #include "xs_html.h"
  7. #include "xs_json.h"
  8. #include "xs_time.h"
  9. #include "xs_match.h"
  10. #include "snac.h"
  11. /* emoticons, people laughing and such */
  12. const char *smileys[] = {
  13. ":-)", "🙂",
  14. ":-D", "😀",
  15. "X-D", "😆",
  16. ";-)", "😉",
  17. "B-)", "😎",
  18. ">:-(", "😡",
  19. ":-(", "😞",
  20. ":-*", "😘",
  21. ":-/", "😕",
  22. "8-o", "😲",
  23. "%-)", "🤪",
  24. ":_(", "😢",
  25. ":-|", "😐",
  26. "<3", "&#10084;&#65039;",
  27. ":facepalm:", "&#129318;",
  28. ":shrug:", "&#129335;",
  29. ":shrug2:", "&#175;\\_(&#12484;)_/&#175;",
  30. ":eyeroll:", "&#128580;",
  31. ":beer:", "&#127866;",
  32. ":beers:", "&#127867;",
  33. ":munch:", "&#128561;",
  34. ":thumb:", "&#128077;",
  35. NULL, NULL
  36. };
  37. xs_dict *emojis(void)
  38. /* returns a dict with the emojis */
  39. {
  40. xs *fn = xs_fmt("%s/emojis.json", srv_basedir);
  41. FILE *f;
  42. if (mtime(fn) == 0) {
  43. /* file does not exist; create it with the defaults */
  44. xs *d = xs_dict_new();
  45. const char **emo = smileys;
  46. while (*emo) {
  47. d = xs_dict_append(d, emo[0], emo[1]);
  48. emo += 2;
  49. }
  50. if ((f = fopen(fn, "w")) != NULL) {
  51. xs_json_dump(d, 4, f);
  52. fclose(f);
  53. }
  54. else
  55. srv_log(xs_fmt("Error creating '%s'", fn));
  56. }
  57. xs_dict *d = NULL;
  58. if ((f = fopen(fn, "r")) != NULL) {
  59. d = xs_json_load(f);
  60. fclose(f);
  61. if (d == NULL)
  62. srv_log(xs_fmt("JSON parse error in '%s'", fn));
  63. }
  64. else
  65. srv_log(xs_fmt("Error opening '%s'", fn));
  66. return d;
  67. }
  68. /* Non-whitespace without trailing comma, period or closing paren */
  69. #define NOSPACE "([^[:space:],.)]+|[,.)]+[^[:space:],.)])+"
  70. static xs_str *format_line(const char *line, xs_list **attach)
  71. /* formats a line */
  72. {
  73. xs_str *s = xs_str_new(NULL);
  74. char *p;
  75. const char *v;
  76. /* split by markup */
  77. xs *sm = xs_regex_split(line,
  78. "("
  79. "`[^`]+`" "|"
  80. "~~[^~]+~~" "|"
  81. "\\*\\*?\\*?[^\\*]+\\*?\\*?\\*" "|"
  82. ":.+:" "|" //emotes
  83. "_[^_]+_" "|" //anzu
  84. "__[^_]+__" "|" //anzu
  85. "!\\[[^]]+\\]\\([^\\)]+\\)" "|"
  86. "\\[[^]]+\\]\\([^\\)]+\\)" "|"
  87. "[a-z]+:/" "/" NOSPACE "|"
  88. "(mailto|xmpp):[^@[:space:]]+@" NOSPACE
  89. ")");
  90. int n = 0;
  91. p = sm;
  92. while (xs_list_iter(&p, &v)) {
  93. if ((n & 0x1)) {
  94. /* markup */
  95. if (xs_startswith(v, "`")) {
  96. xs *s1 = xs_strip_chars_i(xs_dup(v), "`");
  97. xs *e1 = encode_html(s1);
  98. xs *s2 = xs_fmt("<code>%s</code>", e1);
  99. s = xs_str_cat(s, s2);
  100. }
  101. else
  102. if (xs_startswith(v, "***")) {
  103. xs *s1 = xs_strip_chars_i(xs_dup(v), "*");
  104. xs *s2 = xs_fmt("<b><i>%s</i></b>", s1);
  105. s = xs_str_cat(s, s2);
  106. }
  107. else
  108. if (xs_startswith(v, "**")) {
  109. xs *s1 = xs_strip_chars_i(xs_dup(v), "*");
  110. xs *s2 = xs_fmt("<b>%s</b>", s1);
  111. s = xs_str_cat(s, s2);
  112. }
  113. else
  114. if (xs_startswith(v, "*")) {
  115. xs *s1 = xs_strip_chars_i(xs_dup(v), "*");
  116. xs *s2 = xs_fmt("<i>%s</i>", s1);
  117. s = xs_str_cat(s, s2);
  118. }
  119. //anzu - begin
  120. else
  121. if (xs_startswith(v, "__")) {
  122. xs *s1 = xs_strip_chars_i(xs_dup(v), "_");
  123. xs *s2 = xs_fmt("<u>%s</u>", s1);
  124. s = xs_str_cat(s, s2);
  125. }
  126. else
  127. if (xs_startswith(v, "_")) {
  128. xs *s1 = xs_strip_chars_i(xs_dup(v), "_");
  129. xs *s2 = xs_fmt("<i>%s</i>", s1);
  130. s = xs_str_cat(s, s2);
  131. }
  132. //anzu - end
  133. else
  134. if (xs_startswith(v, "~~")) {
  135. xs *s1 = xs_strip_chars_i(xs_dup(v), "~");
  136. xs *e1 = encode_html(s1);
  137. xs *s2 = xs_fmt("<s>%s</s>", e1);
  138. s = xs_str_cat(s, s2);
  139. }
  140. else
  141. if (*v == '[') {
  142. /* markdown-like links [label](url) */
  143. xs *w = xs_strip_chars_i(
  144. xs_replace_i(xs_replace(v, "#", "&#35;"), "@", "&#64;"),
  145. "![)");
  146. xs *l = xs_split_n(w, "](", 1);
  147. if (xs_list_len(l) == 2) {
  148. xs *link = xs_fmt("<a href=\"%s\">%s</a>",
  149. xs_list_get(l, 1), xs_list_get(l, 0));
  150. s = xs_str_cat(s, link);
  151. }
  152. else
  153. s = xs_str_cat(s, v);
  154. }
  155. else
  156. if (*v == '!') {
  157. /* markdown-like images ![alt text](url to image) */
  158. xs *w = xs_strip_chars_i(
  159. xs_replace_i(xs_replace(v, "#", "&#35;"), "@", "&#64;"),
  160. "![)");
  161. xs *l = xs_split_n(w, "](", 1);
  162. if (xs_list_len(l) == 2) {
  163. const char *alt_text = xs_list_get(l, 0);
  164. const char *img_url = xs_list_get(l, 1);
  165. const char *mime = xs_mime_by_ext(img_url);
  166. if (attach != NULL && xs_startswith(mime, "image/")) {
  167. const xs_dict *ad;
  168. int add = 1;
  169. xs_list_foreach(*attach, ad) {
  170. if (strcmp(xs_dict_get_def(ad, "url", ""), img_url) == 0) {
  171. add = 0;
  172. break;
  173. }
  174. }
  175. if (add) {
  176. xs *d = xs_dict_new();
  177. d = xs_dict_append(d, "mediaType", mime);
  178. d = xs_dict_append(d, "url", img_url);
  179. d = xs_dict_append(d, "name", alt_text);
  180. d = xs_dict_append(d, "type", "Image");
  181. *attach = xs_list_append(*attach, d);
  182. }
  183. }
  184. else {
  185. xs *link = xs_fmt("<a href=\"%s\">%s</a>", img_url, alt_text);
  186. s = xs_str_cat(s, link);
  187. }
  188. }
  189. else
  190. s = xs_str_cat(s, v);
  191. }
  192. else
  193. if (xs_str_in(v, ":/" "/") != -1) {
  194. xs *u = xs_replace_i(xs_replace(v, "#", "&#35;"), "@", "&#64;");
  195. xs *v2 = xs_strip_chars_i(xs_dup(u), ".,)");
  196. const char *mime = xs_mime_by_ext(v2);
  197. if (attach != NULL && xs_startswith(mime, "image/")) {
  198. /* if it's a link to an image, insert it as an attachment */
  199. const xs_dict *ad;
  200. int add = 1;
  201. xs_list_foreach(*attach, ad) {
  202. if (strcmp(xs_dict_get_def(ad, "url", ""), v2) == 0) {
  203. add = 0;
  204. break;
  205. }
  206. }
  207. if (add) {
  208. xs *d = xs_dict_new();
  209. d = xs_dict_append(d, "mediaType", mime);
  210. d = xs_dict_append(d, "url", v2);
  211. d = xs_dict_append(d, "name", "");
  212. d = xs_dict_append(d, "type", "Image");
  213. *attach = xs_list_append(*attach, d);
  214. }
  215. }
  216. else {
  217. xs *s1 = xs_fmt("<a href=\"%s\" target=\"_blank\">%s</a>", v2, u);
  218. s = xs_str_cat(s, s1);
  219. }
  220. }
  221. else
  222. if (xs_match(v, "mailto*|xmpp*")) {
  223. xs *u = xs_replace_i(xs_replace(v, "#", "&#35;"), "@", "&#64;");
  224. xs *v2 = xs_strip_chars_i(xs_dup(u), ".,)");
  225. xs *s1 = xs_fmt("<a href=\"%s\" target=\"_blank\">%s</a>", v2, u);
  226. s = xs_str_cat(s, s1);
  227. }
  228. else
  229. s = xs_str_cat(s, v);
  230. }
  231. else
  232. /* surrounded text, copy directly */
  233. s = xs_str_cat(s, v);
  234. n++;
  235. }
  236. return s;
  237. }
  238. xs_str *not_really_markdown(const char *content, xs_list **attach, xs_list **tag)
  239. /* formats a content using some Markdown rules */
  240. {
  241. xs_str *s = xs_str_new(NULL);
  242. int in_pre = 0;
  243. int in_blq = 0;
  244. xs *list;
  245. char *p;
  246. const char *v;
  247. /* work by lines */
  248. list = xs_split(content, "\n");
  249. p = list;
  250. while (xs_list_iter(&p, &v)) {
  251. xs *ss = NULL;
  252. if (strcmp(v, "```") == 0) {
  253. if (!in_pre)
  254. s = xs_str_cat(s, "<pre>");
  255. else
  256. s = xs_str_cat(s, "</pre>");
  257. in_pre = !in_pre;
  258. continue;
  259. }
  260. if (in_pre) {
  261. // Encode all HTML characters when we're in pre element until we are out.
  262. ss = encode_html(v);
  263. s = xs_str_cat(s, ss);
  264. s = xs_str_cat(s, "<br>");
  265. continue;
  266. }
  267. else
  268. ss = xs_strip_i(format_line(v, attach));
  269. if (xs_startswith(ss, "---")) {
  270. /* delete the --- */
  271. ss = xs_strip_i(xs_crop_i(ss, 3, 0));
  272. s = xs_str_cat(s, "<hr>");
  273. s = xs_str_cat(s, ss);
  274. continue;
  275. }
  276. //anzu - begin
  277. // h1 reserved for snac?
  278. if (xs_startswith(ss, "# ")) {
  279. ss = xs_strip_i(xs_crop_i(ss, 2, 0));
  280. s = xs_str_cat(s, "<h2>");
  281. s = xs_str_cat(s, ss);
  282. s = xs_str_cat(s, "</h2>");
  283. continue;
  284. }
  285. if (xs_startswith(ss, "## ")) {
  286. ss = xs_strip_i(xs_crop_i(ss, 3, 0));
  287. s = xs_str_cat(s, "<h2>");
  288. s = xs_str_cat(s, ss);
  289. s = xs_str_cat(s, "</h2>");
  290. continue;
  291. }
  292. if (xs_startswith(ss, "### ")) {
  293. ss = xs_strip_i(xs_crop_i(ss, 4, 0));
  294. s = xs_str_cat(s, "<h3>");
  295. s = xs_str_cat(s, ss);
  296. s = xs_str_cat(s, "</h3>");
  297. continue;
  298. }
  299. //anzu - end
  300. if (xs_startswith(ss, ">")) {
  301. /* delete the > and subsequent spaces */
  302. ss = xs_strip_i(xs_crop_i(ss, 1, 0));
  303. if (!in_blq) {
  304. s = xs_str_cat(s, "<blockquote>");
  305. in_blq = 1;
  306. }
  307. s = xs_str_cat(s, ss);
  308. s = xs_str_cat(s, "<br>");
  309. continue;
  310. }
  311. if (in_blq) {
  312. s = xs_str_cat(s, "</blockquote>");
  313. in_blq = 0;
  314. }
  315. s = xs_str_cat(s, ss);
  316. s = xs_str_cat(s, "<br>");
  317. }
  318. if (in_blq)
  319. s = xs_str_cat(s, "</blockquote>");
  320. if (in_pre)
  321. s = xs_str_cat(s, "</pre>");
  322. /* some beauty fixes */
  323. s = xs_replace_i(s, "<br><br><blockquote>", "<br><blockquote>");
  324. s = xs_replace_i(s, "</blockquote><br>", "</blockquote>");
  325. s = xs_replace_i(s, "</pre><br>", "</pre>");
  326. s = xs_replace_i(s, "</h2><br>", "</h2>"); //anzu ???
  327. s = xs_replace_i(s, "</h3><br>", "</h3>"); //anzu ???
  328. {
  329. /* traditional emoticons */
  330. xs *d = emojis();
  331. int c = 0;
  332. const char *k, *v;
  333. while (xs_dict_next(d, &k, &v, &c)) {
  334. const char *t = NULL;
  335. /* is it an URL to an image? */
  336. if (xs_startswith(v, "https:/" "/") && xs_startswith((t = xs_mime_by_ext(v)), "image/")) {
  337. if (tag && xs_str_in(s, k) != -1) {
  338. /* add the emoji to the tag list */
  339. xs *e = xs_dict_new();
  340. xs *i = xs_dict_new();
  341. xs *u = xs_str_utctime(0, ISO_DATE_SPEC);
  342. e = xs_dict_append(e, "id", v);
  343. e = xs_dict_append(e, "type", "Emoji");
  344. e = xs_dict_append(e, "name", k);
  345. e = xs_dict_append(e, "updated", u);
  346. i = xs_dict_append(i, "type", "Image");
  347. i = xs_dict_append(i, "mediaType", t);
  348. i = xs_dict_append(i, "url", v);
  349. e = xs_dict_append(e, "icon", i);
  350. *tag = xs_list_append(*tag, e);
  351. }
  352. }
  353. else
  354. s = xs_replace_i(s, k, v);
  355. }
  356. }
  357. return s;
  358. }
  359. const char *valid_tags[] = {
  360. "a", "p", "br", "br/", "blockquote", "ul", "ol", "li", "cite", "small",
  361. "span", "i", "b", "u", "s", "pre", "code", "em", "strong", "hr", "img", "del", "bdi",
  362. "h2","h3", //anzu
  363. NULL
  364. };
  365. xs_str *sanitize(const char *content)
  366. /* cleans dangerous HTML output */
  367. {
  368. xs_str *s = xs_str_new(NULL);
  369. xs *sl;
  370. int n = 0;
  371. char *p;
  372. const char *v;
  373. sl = xs_regex_split(content, "</?[^>]+>");
  374. p = sl;
  375. n = 0;
  376. while (xs_list_iter(&p, &v)) {
  377. if (n & 0x1) {
  378. xs *s1 = xs_strip_i(xs_crop_i(xs_dup(v), v[1] == '/' ? 2 : 1, -1));
  379. xs *l1 = xs_split_n(s1, " ", 1);
  380. xs *tag = xs_tolower_i(xs_dup(xs_list_get(l1, 0)));
  381. xs *s2 = NULL;
  382. int i;
  383. /* check if it's one of the valid tags */
  384. for (i = 0; valid_tags[i]; i++) {
  385. if (strcmp(tag, valid_tags[i]) == 0)
  386. break;
  387. }
  388. if (valid_tags[i]) {
  389. /* accepted tag: rebuild it with only the accepted elements */
  390. xs *el = xs_regex_select(v, "(src|href|rel|class|target)=(\"[^\"]*\"|'[^']*')");
  391. xs *s3 = xs_join(el, " ");
  392. s2 = xs_fmt("<%s%s%s%s>",
  393. v[1] == '/' ? "/" : "", tag, xs_list_len(el) ? " " : "", s3);
  394. s = xs_str_cat(s, s2);
  395. } else {
  396. /* treat end of divs as paragraph breaks */
  397. if (strcmp(v, "</div>"))
  398. s = xs_str_cat(s, "<p>");
  399. }
  400. }
  401. else {
  402. /* non-tag */
  403. s = xs_str_cat(s, v);
  404. }
  405. n++;
  406. }
  407. return s;
  408. }
  409. xs_str *encode_html(const char *str)
  410. /* escapes html characters */
  411. {
  412. xs_str *encoded = xs_html_encode((char *)str);
  413. /* Restore only <br>. Probably safe. Let's hope nothing goes wrong with this. */
  414. encoded = xs_replace_i(encoded, "&lt;br&gt;", "<br>");
  415. return encoded;
  416. }