diff options
Diffstat (limited to 'rss.c')
| -rw-r--r-- | rss.c | 274 |
1 files changed, 274 insertions, 0 deletions
| @@ -0,0 +1,274 @@ | |||
| 1 | /* snac - A simple, minimalistic ActivityPub instance */ | ||
| 2 | /* copyright (c) 2025 grunfink et al. / MIT license */ | ||
| 3 | |||
| 4 | #include "xs.h" | ||
| 5 | #include "xs_html.h" | ||
| 6 | #include "xs_regex.h" | ||
| 7 | #include "xs_time.h" | ||
| 8 | #include "xs_match.h" | ||
| 9 | #include "xs_curl.h" | ||
| 10 | #include "xs_openssl.h" | ||
| 11 | #include "xs_json.h" | ||
| 12 | |||
| 13 | #include "snac.h" | ||
| 14 | |||
| 15 | xs_str *rss_from_timeline(snac *user, const xs_list *timeline, | ||
| 16 | const char *title, const char *link, const char *desc) | ||
| 17 | /* converts a timeline to rss */ | ||
| 18 | { | ||
| 19 | xs_html *rss = xs_html_tag("rss", | ||
| 20 | xs_html_attr("xmlns:content", "http:/" "/purl.org/rss/1.0/modules/content/"), | ||
| 21 | xs_html_attr("version", "2.0"), | ||
| 22 | xs_html_attr("xmlns:atom", "http:/" "/www.w3.org/2005/Atom")); | ||
| 23 | |||
| 24 | xs_html *channel = xs_html_tag("channel", | ||
| 25 | xs_html_tag("title", | ||
| 26 | xs_html_text(title)), | ||
| 27 | xs_html_tag("language", | ||
| 28 | xs_html_text("en")), | ||
| 29 | xs_html_tag("link", | ||
| 30 | xs_html_text(link)), | ||
| 31 | xs_html_sctag("atom:link", | ||
| 32 | xs_html_attr("href", link), | ||
| 33 | xs_html_attr("rel", "self"), | ||
| 34 | xs_html_attr("type", "application/rss+xml")), | ||
| 35 | xs_html_tag("generator", | ||
| 36 | xs_html_text(USER_AGENT)), | ||
| 37 | xs_html_tag("description", | ||
| 38 | xs_html_text(desc))); | ||
| 39 | |||
| 40 | xs_html_add(rss, channel); | ||
| 41 | |||
| 42 | int cnt = 0; | ||
| 43 | const char *v; | ||
| 44 | |||
| 45 | xs_list_foreach(timeline, v) { | ||
| 46 | xs *msg = NULL; | ||
| 47 | |||
| 48 | if (user) { | ||
| 49 | if (!valid_status(timeline_get_by_md5(user, v, &msg))) | ||
| 50 | continue; | ||
| 51 | } | ||
| 52 | else { | ||
| 53 | if (!valid_status(object_get_by_md5(v, &msg))) | ||
| 54 | continue; | ||
| 55 | } | ||
| 56 | |||
| 57 | const char *id = xs_dict_get(msg, "id"); | ||
| 58 | const char *content = xs_dict_get(msg, "content"); | ||
| 59 | const char *published = xs_dict_get(msg, "published"); | ||
| 60 | |||
| 61 | if (user && !xs_startswith(id, user->actor)) | ||
| 62 | continue; | ||
| 63 | |||
| 64 | if (!id || !content || !published) | ||
| 65 | continue; | ||
| 66 | |||
| 67 | /* create a title with the first line of the content */ | ||
| 68 | xs *title = xs_replace(content, "<br>", "\n"); | ||
| 69 | title = xs_regex_replace_i(title, "<[^>]+>", " "); | ||
| 70 | title = xs_regex_replace_i(title, "&[^;]+;", " "); | ||
| 71 | int i; | ||
| 72 | |||
| 73 | for (i = 0; title[i] && title[i] != '\n' && i < 50; i++); | ||
| 74 | |||
| 75 | if (title[i] != '\0') { | ||
| 76 | title[i] = '\0'; | ||
| 77 | title = xs_str_cat(title, "..."); | ||
| 78 | } | ||
| 79 | |||
| 80 | title = xs_strip_i(title); | ||
| 81 | |||
| 82 | /* convert the date */ | ||
| 83 | time_t t = xs_parse_iso_date(published, 0); | ||
| 84 | xs *rss_date = xs_str_utctime(t, "%a, %d %b %Y %T +0000"); | ||
| 85 | |||
| 86 | /* if it's the first one, add it to the header */ | ||
| 87 | if (cnt == 0) | ||
| 88 | xs_html_add(channel, | ||
| 89 | xs_html_tag("lastBuildDate", | ||
| 90 | xs_html_text(rss_date))); | ||
| 91 | |||
| 92 | xs_html_add(channel, | ||
| 93 | xs_html_tag("item", | ||
| 94 | xs_html_tag("title", | ||
| 95 | xs_html_text(title)), | ||
| 96 | xs_html_tag("link", | ||
| 97 | xs_html_text(id)), | ||
| 98 | xs_html_tag("guid", | ||
| 99 | xs_html_text(id)), | ||
| 100 | xs_html_tag("pubDate", | ||
| 101 | xs_html_text(rss_date)), | ||
| 102 | xs_html_tag("description", | ||
| 103 | xs_html_text(content)))); | ||
| 104 | |||
| 105 | cnt++; | ||
| 106 | } | ||
| 107 | |||
| 108 | return xs_html_render_s(rss, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"); | ||
| 109 | } | ||
| 110 | |||
| 111 | |||
| 112 | void rss_to_timeline(snac *user, const char *url) | ||
| 113 | /* reads an RSS and inserts all ActivityPub posts into the user's timeline */ | ||
| 114 | { | ||
| 115 | if (!xs_startswith(url, "https:/") && !xs_startswith(url, "http:/")) | ||
| 116 | return; | ||
| 117 | |||
| 118 | xs *hdrs = xs_dict_new(); | ||
| 119 | hdrs = xs_dict_set(hdrs, "accept", "application/rss+xml"); | ||
| 120 | hdrs = xs_dict_set(hdrs, "user-agent", USER_AGENT); | ||
| 121 | |||
| 122 | /* get the RSS metadata */ | ||
| 123 | xs *md5 = xs_md5_hex(url, strlen(url)); | ||
| 124 | xs *rss_md_fn = xs_fmt("%s/rss", user->basedir); | ||
| 125 | mkdirx(rss_md_fn); | ||
| 126 | rss_md_fn = xs_str_cat(rss_md_fn, "/", md5, ".json"); | ||
| 127 | |||
| 128 | xs *rss_md = NULL; | ||
| 129 | const char *etag = NULL; | ||
| 130 | |||
| 131 | FILE *f; | ||
| 132 | if ((f = fopen(rss_md_fn, "r")) != NULL) { | ||
| 133 | rss_md = xs_json_load(f); | ||
| 134 | fclose(f); | ||
| 135 | |||
| 136 | etag = xs_dict_get(rss_md, "etag"); | ||
| 137 | |||
| 138 | if (xs_is_string(etag)) | ||
| 139 | hdrs = xs_dict_set(hdrs, "if-none-match", etag); | ||
| 140 | } | ||
| 141 | |||
| 142 | if (rss_md == NULL) | ||
| 143 | rss_md = xs_dict_new(); | ||
| 144 | |||
| 145 | xs *payload = NULL; | ||
| 146 | int status; | ||
| 147 | int p_size; | ||
| 148 | |||
| 149 | xs *rsp = xs_http_request("GET", url, hdrs, NULL, 0, &status, &payload, &p_size, 0); | ||
| 150 | |||
| 151 | snac_log(user, xs_fmt("parsing RSS %s %d", url, status)); | ||
| 152 | |||
| 153 | if (!valid_status(status) || !xs_is_string(payload)) | ||
| 154 | return; | ||
| 155 | |||
| 156 | /* not an RSS? done */ | ||
| 157 | const char *ctype = xs_dict_get(rsp, "content-type"); | ||
| 158 | if (!xs_is_string(ctype) || xs_str_in(ctype, "application/rss+xml") == -1) | ||
| 159 | return; | ||
| 160 | |||
| 161 | /* yes, parsing is done with regexes (now I have two problems blah blah blah) */ | ||
| 162 | xs *links = xs_regex_select(payload, "<link>[^<]+</link>"); | ||
| 163 | const char *link; | ||
| 164 | |||
| 165 | xs_list_foreach(links, link) { | ||
| 166 | xs *l = xs_replace(link, "<link>", ""); | ||
| 167 | char *p = strchr(l, '<'); | ||
| 168 | |||
| 169 | if (p == NULL) | ||
| 170 | continue; | ||
| 171 | *p = '\0'; | ||
| 172 | |||
| 173 | /* skip this same URL */ | ||
| 174 | if (strcmp(l, url) == 0) | ||
| 175 | continue; | ||
| 176 | |||
| 177 | /* skip crap */ | ||
| 178 | if (!xs_startswith(l, "https:/") && !xs_startswith(l, "http:/")) | ||
| 179 | continue; | ||
| 180 | |||
| 181 | snac_debug(user, 1, xs_fmt("RSS link: %s", l)); | ||
| 182 | |||
| 183 | if (timeline_here(user, l)) { | ||
| 184 | snac_debug(user, 1, xs_fmt("RSS entry already in timeline %s", l)); | ||
| 185 | continue; | ||
| 186 | } | ||
| 187 | |||
| 188 | /* special trick for Mastodon: convert from the alternate format */ | ||
| 189 | if (strchr(l, '@') != NULL) { | ||
| 190 | xs *l2 = xs_split(l, "/"); | ||
| 191 | |||
| 192 | if (xs_list_len(l2) == 5) { | ||
| 193 | const char *uid = xs_list_get(l2, 3); | ||
| 194 | if (*uid == '@') { | ||
| 195 | xs *guessed_id = xs_fmt("https:/" "/%s/users/%s/statuses/%s", | ||
| 196 | xs_list_get(l2, 2), uid + 1, xs_list_get(l2, -1)); | ||
| 197 | |||
| 198 | if (timeline_here(user, guessed_id)) { | ||
| 199 | snac_debug(user, 1, xs_fmt("RSS entry already in timeline (alt) %s", guessed_id)); | ||
| 200 | continue; | ||
| 201 | } | ||
| 202 | } | ||
| 203 | } | ||
| 204 | } | ||
| 205 | |||
| 206 | xs *obj = NULL; | ||
| 207 | |||
| 208 | if (!valid_status(object_get(l, &obj))) { | ||
| 209 | /* object is not here: bring it */ | ||
| 210 | if (!valid_status(activitypub_request(user, l, &obj))) | ||
| 211 | continue; | ||
| 212 | } | ||
| 213 | |||
| 214 | if (xs_is_dict(obj)) { | ||
| 215 | const char *id = xs_dict_get(obj, "id"); | ||
| 216 | const char *type = xs_dict_get(obj, "type"); | ||
| 217 | const char *attr_to = get_atto(obj); | ||
| 218 | |||
| 219 | if (!xs_is_string(id) || !xs_is_string(type) || !xs_is_string(attr_to)) | ||
| 220 | continue; | ||
| 221 | |||
| 222 | if (!xs_match(type, POSTLIKE_OBJECT_TYPE)) | ||
| 223 | continue; | ||
| 224 | |||
| 225 | if (timeline_here(user, id)) { | ||
| 226 | snac_debug(user, 1, xs_fmt("RSS entry already in timeline (id) %s", id)); | ||
| 227 | continue; | ||
| 228 | } | ||
| 229 | |||
| 230 | enqueue_actor_refresh(user, attr_to, 0); | ||
| 231 | |||
| 232 | timeline_add(user, id, obj); | ||
| 233 | |||
| 234 | snac_log(user, xs_fmt("new '%s' (RSS) %s %s", type, attr_to, id)); | ||
| 235 | } | ||
| 236 | } | ||
| 237 | |||
| 238 | /* update the RSS metadata */ | ||
| 239 | etag = xs_dict_get(rsp, "etag"); | ||
| 240 | |||
| 241 | if (xs_is_string(etag)) { | ||
| 242 | rss_md = xs_dict_set(rss_md, "etag", etag); | ||
| 243 | rss_md = xs_dict_set(rss_md, "url", url); | ||
| 244 | if ((f = fopen(rss_md_fn, "w")) != NULL) { | ||
| 245 | xs_json_dump(rss_md, 4, f); | ||
| 246 | fclose(f); | ||
| 247 | } | ||
| 248 | } | ||
| 249 | } | ||
| 250 | |||
| 251 | |||
| 252 | void rss_poll_hashtags(void) | ||
| 253 | /* parses all RSS from all users */ | ||
| 254 | { | ||
| 255 | xs *list = user_list(); | ||
| 256 | const char *uid; | ||
| 257 | |||
| 258 | xs_list_foreach(list, uid) { | ||
| 259 | snac user; | ||
| 260 | |||
| 261 | if (user_open(&user, uid)) { | ||
| 262 | const xs_list *rss = xs_dict_get(user.config, "followed_hashtags"); | ||
| 263 | |||
| 264 | if (xs_is_list(rss)) { | ||
| 265 | const char *url; | ||
| 266 | |||
| 267 | xs_list_foreach(rss, url) | ||
| 268 | rss_to_timeline(&user, url); | ||
| 269 | } | ||
| 270 | |||
| 271 | user_free(&user); | ||
| 272 | } | ||
| 273 | } | ||
| 274 | } | ||