123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437 |
- /* snac - A simple, minimalistic ActivityPub instance */
- /* copyright (c) 2022 - 2024 grunfink et al. / MIT license */
- #include "xs.h"
- #include "xs_regex.h"
- #include "xs_mime.h"
- #include "xs_html.h"
- #include "xs_json.h"
- #include "xs_time.h"
- #include "snac.h"
/* default text emoticons and :shortcodes:, laid out as consecutive
   { key, replacement } pairs; a NULL, NULL pair closes the table */
const char *smileys[] = {
    ":-)",          "🙂",
    ":-D",          "😀",
    "X-D",          "😆",
    ";-)",          "😉",
    "B-)",          "😎",
    ">:-(",         "😡",
    ":-(",          "😞",
    ":-*",          "😘",
    ":-/",          "😕",
    "8-o",          "😲",
    "%-)",          "🤪",
    ":_(",          "😢",
    ":-|",          "😐",
    "<3",           "❤️",
    ":facepalm:",   "🤦",
    ":shrug:",      "🤷",
    ":shrug2:",     "¯\\_(ツ)_/¯",
    ":eyeroll:",    "🙄",
    ":beer:",       "🍺",
    ":beers:",      "🍻",
    ":munch:",      "😱",
    ":thumb:",      "👍",
    NULL,           NULL
};
- xs_dict *emojis(void)
- /* returns a dict with the emojis */
- {
- xs *fn = xs_fmt("%s/emojis.json", srv_basedir);
- FILE *f;
- if (mtime(fn) == 0) {
- /* file does not exist; create it with the defaults */
- xs *d = xs_dict_new();
- const char **emo = smileys;
- while (*emo) {
- d = xs_dict_append(d, emo[0], emo[1]);
- emo += 2;
- }
- if ((f = fopen(fn, "w")) != NULL) {
- xs_json_dump(d, 4, f);
- fclose(f);
- }
- else
- srv_log(xs_fmt("Error creating '%s'", fn));
- }
- xs_dict *d = NULL;
- if ((f = fopen(fn, "r")) != NULL) {
- d = xs_json_load(f);
- fclose(f);
- if (d == NULL)
- srv_log(xs_fmt("JSON parse error in '%s'", fn));
- }
- else
- srv_log(xs_fmt("Error opening '%s'", fn));
- return d;
- }
- static xs_str *format_line(const char *line, xs_list **attach)
- /* formats a line */
- {
- xs_str *s = xs_str_new(NULL);
- char *p;
- const char *v;
- /* split by markup */
- xs *sm = xs_regex_split(line,
- "("
- "`[^`]+`" "|"
- "~~[^~]+~~" "|"
- "\\*\\*?\\*?[^\\*]+\\*?\\*?\\*" "|"
- "!\\[[^]]+\\]\\([^\\)]+\\)" "|"
- "\\[[^]]+\\]\\([^\\)]+\\)" "|"
- "[a-z]+:/" "/[^[:space:]]+"
- ")");
- int n = 0;
- p = sm;
- while (xs_list_iter(&p, &v)) {
- if ((n & 0x1)) {
- /* markup */
- if (xs_startswith(v, "`")) {
- xs *s1 = xs_strip_chars_i(xs_dup(v), "`");
- xs *e1 = encode_html(s1);
- xs *s2 = xs_fmt("<code>%s</code>", e1);
- s = xs_str_cat(s, s2);
- }
- else
- if (xs_startswith(v, "***")) {
- xs *s1 = xs_strip_chars_i(xs_dup(v), "*");
- xs *s2 = xs_fmt("<b><i>%s</i></b>", s1);
- s = xs_str_cat(s, s2);
- }
- else
- if (xs_startswith(v, "**")) {
- xs *s1 = xs_strip_chars_i(xs_dup(v), "*");
- xs *s2 = xs_fmt("<b>%s</b>", s1);
- s = xs_str_cat(s, s2);
- }
- else
- if (xs_startswith(v, "*")) {
- xs *s1 = xs_strip_chars_i(xs_dup(v), "*");
- xs *s2 = xs_fmt("<i>%s</i>", s1);
- s = xs_str_cat(s, s2);
- }
- else
- if (xs_startswith(v, "~~")) {
- xs *s1 = xs_strip_chars_i(xs_dup(v), "~");
- xs *e1 = encode_html(s1);
- xs *s2 = xs_fmt("<s>%s</s>", e1);
- s = xs_str_cat(s, s2);
- }
- else
- if (*v == '[') {
- /* markdown-like links [label](url) */
- xs *w = xs_strip_chars_i(
- xs_replace_i(xs_replace(v, "#", "#"), "@", "@"),
- "![)");
- xs *l = xs_split_n(w, "](", 1);
- if (xs_list_len(l) == 2) {
- xs *link = xs_fmt("<a href=\"%s\">%s</a>",
- xs_list_get(l, 1), xs_list_get(l, 0));
- s = xs_str_cat(s, link);
- }
- else
- s = xs_str_cat(s, v);
- }
- else
- if (*v == '!') {
- /* markdown-like images ![alt text](url to image) */
- xs *w = xs_strip_chars_i(
- xs_replace_i(xs_replace(v, "#", "#"), "@", "@"),
- "![)");
- xs *l = xs_split_n(w, "](", 1);
- if (xs_list_len(l) == 2) {
- const char *alt_text = xs_list_get(l, 0);
- const char *img_url = xs_list_get(l, 1);
- const char *mime = xs_mime_by_ext(img_url);
- if (attach != NULL && xs_startswith(mime, "image/")) {
- const xs_dict *ad;
- int add = 1;
- xs_list_foreach(*attach, ad) {
- if (strcmp(xs_dict_get_def(ad, "url", ""), img_url) == 0) {
- add = 0;
- break;
- }
- }
- if (add) {
- xs *d = xs_dict_new();
- d = xs_dict_append(d, "mediaType", mime);
- d = xs_dict_append(d, "url", img_url);
- d = xs_dict_append(d, "name", alt_text);
- d = xs_dict_append(d, "type", "Image");
- *attach = xs_list_append(*attach, d);
- }
- }
- else {
- xs *link = xs_fmt("<a href=\"%s\">%s</a>", img_url, alt_text);
- s = xs_str_cat(s, link);
- }
- }
- else
- s = xs_str_cat(s, v);
- }
- else
- if (xs_str_in(v, ":/" "/") != -1) {
- xs *u = xs_replace_i(xs_replace(v, "#", "#"), "@", "@");
- xs *v2 = xs_strip_chars_i(xs_dup(u), ".,)");
- const char *mime = xs_mime_by_ext(v2);
- if (attach != NULL && xs_startswith(mime, "image/")) {
- /* if it's a link to an image, insert it as an attachment */
- const xs_dict *ad;
- int add = 1;
- xs_list_foreach(*attach, ad) {
- if (strcmp(xs_dict_get_def(ad, "url", ""), v2) == 0) {
- add = 0;
- break;
- }
- }
- if (add) {
- xs *d = xs_dict_new();
- d = xs_dict_append(d, "mediaType", mime);
- d = xs_dict_append(d, "url", v2);
- d = xs_dict_append(d, "name", "");
- d = xs_dict_append(d, "type", "Image");
- *attach = xs_list_append(*attach, d);
- }
- }
- else {
- xs *s1 = xs_fmt("<a href=\"%s\" target=\"_blank\">%s</a>", v2, u);
- s = xs_str_cat(s, s1);
- }
- }
- else
- s = xs_str_cat(s, v);
- }
- else
- /* surrounded text, copy directly */
- s = xs_str_cat(s, v);
- n++;
- }
- return s;
- }
- xs_str *not_really_markdown(const char *content, xs_list **attach, xs_list **tag)
- /* formats a content using some Markdown rules */
- {
- xs_str *s = xs_str_new(NULL);
- int in_pre = 0;
- int in_blq = 0;
- xs *list;
- char *p;
- const char *v;
- /* work by lines */
- list = xs_split(content, "\n");
- p = list;
- while (xs_list_iter(&p, &v)) {
- xs *ss = NULL;
- if (strcmp(v, "```") == 0) {
- if (!in_pre)
- s = xs_str_cat(s, "<pre>");
- else
- s = xs_str_cat(s, "</pre>");
- in_pre = !in_pre;
- continue;
- }
- if (in_pre) {
- // Encode all HTML characters when we're in pre element until we are out.
- ss = encode_html(v);
- s = xs_str_cat(s, ss);
- s = xs_str_cat(s, "<br>");
- continue;
- }
- else
- ss = xs_strip_i(format_line(v, attach));
- if (xs_startswith(ss, "---")) {
- /* delete the --- */
- ss = xs_strip_i(xs_crop_i(ss, 3, 0));
- s = xs_str_cat(s, "<hr>");
- s = xs_str_cat(s, ss);
- continue;
- }
- if (xs_startswith(ss, ">")) {
- /* delete the > and subsequent spaces */
- ss = xs_strip_i(xs_crop_i(ss, 1, 0));
- if (!in_blq) {
- s = xs_str_cat(s, "<blockquote>");
- in_blq = 1;
- }
- s = xs_str_cat(s, ss);
- s = xs_str_cat(s, "<br>");
- continue;
- }
- if (in_blq) {
- s = xs_str_cat(s, "</blockquote>");
- in_blq = 0;
- }
- s = xs_str_cat(s, ss);
- s = xs_str_cat(s, "<br>");
- }
- if (in_blq)
- s = xs_str_cat(s, "</blockquote>");
- if (in_pre)
- s = xs_str_cat(s, "</pre>");
- /* some beauty fixes */
- s = xs_replace_i(s, "<br><br><blockquote>", "<br><blockquote>");
- s = xs_replace_i(s, "</blockquote><br>", "</blockquote>");
- s = xs_replace_i(s, "</pre><br>", "</pre>");
- {
- /* traditional emoticons */
- xs *d = emojis();
- int c = 0;
- const char *k, *v;
- while (xs_dict_next(d, &k, &v, &c)) {
- const char *t = NULL;
- /* is it an URL to an image? */
- if (xs_startswith(v, "https:/" "/") && xs_startswith((t = xs_mime_by_ext(v)), "image/")) {
- if (tag && xs_str_in(s, k) != -1) {
- /* add the emoji to the tag list */
- xs *e = xs_dict_new();
- xs *i = xs_dict_new();
- xs *u = xs_str_utctime(0, ISO_DATE_SPEC);
- e = xs_dict_append(e, "id", v);
- e = xs_dict_append(e, "type", "Emoji");
- e = xs_dict_append(e, "name", k);
- e = xs_dict_append(e, "updated", u);
- i = xs_dict_append(i, "type", "Image");
- i = xs_dict_append(i, "mediaType", t);
- i = xs_dict_append(i, "url", v);
- e = xs_dict_append(e, "icon", i);
- *tag = xs_list_append(*tag, e);
- }
- }
- else
- s = xs_replace_i(s, k, v);
- }
- }
- return s;
- }
/* HTML tag names that sanitize() lets through; NULL-terminated */
const char *valid_tags[] = {
    "a",    "p",    "br",   "br/",    "blockquote",
    "ul",   "ol",   "li",   "cite",   "small",
    "span", "i",    "b",    "u",      "s",
    "pre",  "code", "em",   "strong", "hr",
    "img",  "del",  "bdi",
    NULL
};
- xs_str *sanitize(const char *content)
- /* cleans dangerous HTML output */
- {
- xs_str *s = xs_str_new(NULL);
- xs *sl;
- int n = 0;
- char *p;
- const char *v;
- sl = xs_regex_split(content, "</?[^>]+>");
- p = sl;
- n = 0;
- while (xs_list_iter(&p, &v)) {
- if (n & 0x1) {
- xs *s1 = xs_strip_i(xs_crop_i(xs_dup(v), v[1] == '/' ? 2 : 1, -1));
- xs *l1 = xs_split_n(s1, " ", 1);
- xs *tag = xs_tolower_i(xs_dup(xs_list_get(l1, 0)));
- xs *s2 = NULL;
- int i;
- /* check if it's one of the valid tags */
- for (i = 0; valid_tags[i]; i++) {
- if (strcmp(tag, valid_tags[i]) == 0)
- break;
- }
- if (valid_tags[i]) {
- /* accepted tag: rebuild it with only the accepted elements */
- xs *el = xs_regex_select(v, "(src|href|rel|class|target)=\"[^\"]*\"");
- xs *s3 = xs_join(el, " ");
- s2 = xs_fmt("<%s%s%s%s>",
- v[1] == '/' ? "/" : "", tag, xs_list_len(el) ? " " : "", s3);
- s = xs_str_cat(s, s2);
- } else {
- /* treat end of divs as paragraph breaks */
- if (strcmp(v, "</div>"))
- s = xs_str_cat(s, "<p>");
- }
- }
- else {
- /* non-tag */
- s = xs_str_cat(s, v);
- }
- n++;
- }
- return s;
- }
- xs_str *encode_html(const char *str)
- /* escapes html characters */
- {
- xs_str *encoded = xs_html_encode((char *)str);
- /* Restore only <br>. Probably safe. Let's hope nothing goes wrong with this. */
- encoded = xs_replace_i(encoded, "<br>", "<br>");
- return encoded;
- }
|