From 97ff66116b7633066375bbc6cc88be5b8587453b Mon Sep 17 00:00:00 2001 From: grunfink Date: Wed, 28 May 2025 07:48:47 +0200 Subject: New file rss.c. --- rss.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 rss.c (limited to 'rss.c') diff --git a/rss.c b/rss.c new file mode 100644 index 0000000..61620fa --- /dev/null +++ b/rss.c @@ -0,0 +1,105 @@ +/* snac - A simple, minimalistic ActivityPub instance */ +/* copyright (c) 2025 grunfink et al. / MIT license */ + +#include "xs.h" +#include "xs_html.h" +#include "xs_regex.h" +#include "xs_time.h" + +#include "snac.h" + +xs_str *rss_from_timeline(snac *user, const xs_list *timeline, + const char *title, const char *link, const char *desc) +/* converts a timeline to rss */ +{ + xs_html *rss = xs_html_tag("rss", + xs_html_attr("xmlns:content", "http:/" "/purl.org/rss/1.0/modules/content/"), + xs_html_attr("version", "2.0"), + xs_html_attr("xmlns:atom", "http:/" "/www.w3.org/2005/Atom")); + + xs_html *channel = xs_html_tag("channel", + xs_html_tag("title", + xs_html_text(title)), + xs_html_tag("language", + xs_html_text("en")), + xs_html_tag("link", + xs_html_text(link)), + xs_html_sctag("atom:link", + xs_html_attr("href", link), + xs_html_attr("rel", "self"), + xs_html_attr("type", "application/rss+xml")), + xs_html_tag("generator", + xs_html_text(USER_AGENT)), + xs_html_tag("description", + xs_html_text(desc))); + + xs_html_add(rss, channel); + + int cnt = 0; + const char *v; + + xs_list_foreach(timeline, v) { + xs *msg = NULL; + + if (user) { + if (!valid_status(timeline_get_by_md5(user, v, &msg))) + continue; + } + else { + if (!valid_status(object_get_by_md5(v, &msg))) + continue; + } + + const char *id = xs_dict_get(msg, "id"); + const char *content = xs_dict_get(msg, "content"); + const char *published = xs_dict_get(msg, "published"); + + if (user && !xs_startswith(id, user->actor)) + continue; + + if (!id || !content || !published) + 
continue; + + /* create a title with the first line of the content */ + xs *title = xs_replace(content, "<br>", "\n"); + title = xs_regex_replace_i(title, "<[^>]+>", " "); + title = xs_regex_replace_i(title, "&[^;]+;", " "); + int i; + + for (i = 0; title[i] && title[i] != '\n' && i < 50; i++); + + if (title[i] != '\0') { + title[i] = '\0'; + title = xs_str_cat(title, "..."); + } + + title = xs_strip_i(title); + + /* convert the date */ + time_t t = xs_parse_iso_date(published, 0); + xs *rss_date = xs_str_utctime(t, "%a, %d %b %Y %T +0000"); + + /* if it's the first one, add it to the header */ + if (cnt == 0) + xs_html_add(channel, + xs_html_tag("lastBuildDate", + xs_html_text(rss_date))); + + xs_html_add(channel, + xs_html_tag("item", + xs_html_tag("title", + xs_html_text(title)), + xs_html_tag("link", + xs_html_text(id)), + xs_html_tag("guid", + xs_html_text(id)), + xs_html_tag("pubDate", + xs_html_text(rss_date)), + xs_html_tag("description", + xs_html_text(content)))); + + cnt++; + } + + return xs_html_render_s(rss, "\n"); +} -- cgit v1.2.3 From b783f287c8b3e77bbd1eb94892ea645ba05e8770 Mon Sep 17 00:00:00 2001 From: grunfink Date: Wed, 28 May 2025 07:56:44 +0200 Subject: New function rss_to_timeline(). 
--- rss.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) (limited to 'rss.c') diff --git a/rss.c b/rss.c index 61620fa..a3a092e 100644 --- a/rss.c +++ b/rss.c @@ -5,6 +5,8 @@ #include "xs_html.h" #include "xs_regex.h" #include "xs_time.h" +#include "xs_match.h" +#include "xs_curl.h" #include "snac.h" @@ -103,3 +105,125 @@ xs_str *rss_from_timeline(snac *user, const xs_list *timeline, return xs_html_render_s(rss, "\n"); } + + +void rss_to_timeline(snac *user, const char *url) +/* reads an RSS and inserts all ActivityPub posts into the user's timeline */ +{ + xs *hdrs = xs_dict_new(); + hdrs = xs_dict_set(hdrs, "accept", "application/rss+xml"); + hdrs = xs_dict_set(hdrs, "user-agent", USER_AGENT); + + xs *payload = NULL; + int status; + int p_size; + + xs *rsp = xs_http_request("GET", url, hdrs, NULL, 0, &status, &payload, &p_size, 0); + + if (!valid_status(status) || !xs_is_string(payload)) + return; + + /* not an RSS? done */ + const char *ctype = xs_dict_get(rsp, "content-type"); + if (!xs_is_string(ctype) || xs_str_in(ctype, "application/rss+xml") == -1) + return; + + snac_log(user, xs_fmt("parsing RSS %s", url)); + + /* yes, parsing is done with regexes (now I have two problems blah blah blah) */ + xs *links = xs_regex_select(payload, "<link>[^<]+</link>"); + const char *link; + + xs_list_foreach(links, link) { + xs *l = xs_replace(link, "<link>", ""); + char *p = strchr(l, '<'); + + if (p == NULL) + continue; + *p = '\0'; + + /* skip this same URL */ + if (strcmp(l, url) == 0) + continue; + + snac_debug(user, 1, xs_fmt("RSS link: %s", l)); + + if (timeline_here(user, l)) { + snac_debug(user, 1, xs_fmt("RSS entry already in timeline %s", l)); + continue; + } + + /* special trick for Mastodon: convert from the alternate format */ + if (strchr(l, '@') != NULL) { + xs *l2 = xs_split(l, "/"); + + if (xs_list_len(l2) == 5) { + const char *uid = xs_list_get(l2, 3); + if (*uid == '@') { + xs *guessed_id = xs_fmt("https:/" 
"/%s/users/%s/statuses/%s", + xs_list_get(l2, 2), uid + 1, xs_list_get(l2, -1)); + + if (timeline_here(user, guessed_id)) { + snac_debug(user, 1, xs_fmt("RSS entry already in timeline (alt) %s", guessed_id)); + continue; + } + } + } + } + + xs *obj = NULL; + + if (!valid_status(object_get(l, &obj))) { + /* object is not here: bring it */ + if (!valid_status(activitypub_request(user, l, &obj))) + continue; + } + + if (xs_is_dict(obj)) { + const char *id = xs_dict_get(obj, "id"); + const char *type = xs_dict_get(obj, "type"); + const char *attr_to = get_atto(obj); + + if (!xs_is_string(id) || !xs_is_string(type) || !xs_is_string(attr_to)) + continue; + + if (!xs_match(type, POSTLIKE_OBJECT_TYPE)) + continue; + + if (timeline_here(user, id)) { + snac_debug(user, 1, xs_fmt("RSS entry already in timeline (id) %s", id)); + continue; + } + + if (!valid_status(actor_request(user, attr_to, NULL))) + continue; + + timeline_add(user, id, obj); + } + } +} + + +void rss_process(void) +/* parses all RSS from all users */ +{ + xs *list = user_list(); + const char *uid; + + xs_list_foreach(list, uid) { + snac user; + + if (user_open(&user, uid)) { + const xs_list *rss = xs_dict_get(user.config, "rss"); + + if (xs_is_list(rss)) { + const char *url; + + xs_list_foreach(rss, url) + rss_to_timeline(&user, url); + } + + user_free(&user); + } + } +} -- cgit v1.2.3 From 9f8cd38ee72d93dec626b1e926f969cb75fd3094 Mon Sep 17 00:00:00 2001 From: grunfink Date: Wed, 28 May 2025 08:00:47 +0200 Subject: Disabled rss_process() by now. 
--- rss.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'rss.c') diff --git a/rss.c b/rss.c index a3a092e..b31ffd3 100644 --- a/rss.c +++ b/rss.c @@ -207,6 +207,7 @@ void rss_to_timeline(snac *user, const char *url) void rss_process(void) /* parses all RSS from all users */ { +#if 0 xs *list = user_list(); const char *uid; @@ -226,4 +227,5 @@ void rss_process(void) user_free(&user); } } +#endif } -- cgit v1.2.3 From a1369b39c1bd3d2036af12368997648454ca5564 Mon Sep 17 00:00:00 2001 From: grunfink Date: Wed, 28 May 2025 09:07:19 +0200 Subject: Activated hashtag RSS polling. --- rss.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'rss.c') diff --git a/rss.c b/rss.c index b31ffd3..ce3e184 100644 --- a/rss.c +++ b/rss.c @@ -110,6 +110,9 @@ xs_str *rss_from_timeline(snac *user, const xs_list *timeline, void rss_to_timeline(snac *user, const char *url) /* reads an RSS and inserts all ActivityPub posts into the user's timeline */ { + if (!xs_startswith(url, "https:/") && !xs_startswith(url, "http:/")) + return; + xs *hdrs = xs_dict_new(); hdrs = xs_dict_set(hdrs, "accept", "application/rss+xml"); hdrs = xs_dict_set(hdrs, "user-agent", USER_AGENT); @@ -204,10 +207,9 @@ void rss_to_timeline(snac *user, const char *url) } -void rss_process(void) +void rss_poll_hashtags(void) /* parses all RSS from all users */ { -#if 0 xs *list = user_list(); const char *uid; @@ -215,7 +217,7 @@ void rss_process(void) snac user; if (user_open(&user, uid)) { - const xs_list *rss = xs_dict_get(user.config, "rss"); + const xs_list *rss = xs_dict_get(user.config, "followed_hashtags"); if (xs_is_list(rss)) { const char *url; @@ -227,5 +229,4 @@ void rss_process(void) user_free(&user); } } -#endif } -- cgit v1.2.3 From 77b9cbe6c3064c1656fa5375f7d2d498e34a34e6 Mon Sep 17 00:00:00 2001 From: grunfink Date: Wed, 28 May 2025 09:39:07 +0200 Subject: Added more checks. 
--- rss.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'rss.c') diff --git a/rss.c b/rss.c index ce3e184..8798cac 100644 --- a/rss.c +++ b/rss.c @@ -149,6 +149,10 @@ void rss_to_timeline(snac *user, const char *url) if (strcmp(l, url) == 0) continue; + /* skip crap */ + if (!xs_startswith(l, "https:/") && !xs_startswith(l, "http:/")) + return; + snac_debug(user, 1, xs_fmt("RSS link: %s", l)); if (timeline_here(user, l)) { -- cgit v1.2.3 From b3067987354ca8512979eef87d3de89adb5f741d Mon Sep 17 00:00:00 2001 From: grunfink Date: Thu, 29 May 2025 17:33:41 +0200 Subject: Fixed typo in rss_to_timeline(). --- rss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'rss.c') diff --git a/rss.c b/rss.c index 8798cac..ed0bf05 100644 --- a/rss.c +++ b/rss.c @@ -151,7 +151,7 @@ void rss_to_timeline(snac *user, const char *url) /* skip crap */ if (!xs_startswith(l, "https:/") && !xs_startswith(l, "http:/")) - return; + continue; snac_debug(user, 1, xs_fmt("RSS link: %s", l)); -- cgit v1.2.3 From e030fe6c5054c0a9b76a55adc80bb81d7a73fcc3 Mon Sep 17 00:00:00 2001 From: grunfink Date: Thu, 29 May 2025 18:03:23 +0200 Subject: Use HTTP caching (etag / if-none-match) in RSS downloads. 
--- rss.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'rss.c') diff --git a/rss.c b/rss.c index ed0bf05..9d582f7 100644 --- a/rss.c +++ b/rss.c @@ -7,6 +7,8 @@ #include "xs_time.h" #include "xs_match.h" #include "xs_curl.h" +#include "xs_openssl.h" +#include "xs_json.h" #include "snac.h" @@ -117,12 +119,37 @@ void rss_to_timeline(snac *user, const char *url) hdrs = xs_dict_set(hdrs, "accept", "application/rss+xml"); hdrs = xs_dict_set(hdrs, "user-agent", USER_AGENT); + /* get the RSS metadata */ + xs *md5 = xs_md5_hex(url, strlen(url)); + xs *rss_md_fn = xs_fmt("%s/rss", user->basedir); + mkdirx(rss_md_fn); + rss_md_fn = xs_str_cat(rss_md_fn, "/", md5, ".json"); + + xs *rss_md = NULL; + const char *etag = NULL; + + FILE *f; + if ((f = fopen(rss_md_fn, "r")) != NULL) { + rss_md = xs_json_load(f); + fclose(f); + + etag = xs_dict_get(rss_md, "etag"); + + if (xs_is_string(etag)) + hdrs = xs_dict_set(hdrs, "if-none-match", etag); + } + + if (rss_md == NULL) + rss_md = xs_dict_new(); + xs *payload = NULL; int status; int p_size; xs *rsp = xs_http_request("GET", url, hdrs, NULL, 0, &status, &payload, &p_size, 0); + snac_log(user, xs_fmt("parsing RSS %s %d", url, status)); + if (!valid_status(status) || !xs_is_string(payload)) return; @@ -131,8 +158,6 @@ void rss_to_timeline(snac *user, const char *url) if (!xs_is_string(ctype) || xs_str_in(ctype, "application/rss+xml") == -1) return; - snac_log(user, xs_fmt("parsing RSS %s", url)); - /* yes, parsing is done with regexes (now I have two problems blah blah blah) */ xs *links = xs_regex_select(payload, "<link>[^<]+</link>"); const char *link; @@ -208,6 +233,17 @@ void rss_to_timeline(snac *user, const char *url) timeline_add(user, id, obj); } } + + /* update the RSS metadata */ + etag = xs_dict_get(rsp, "etag"); + + if (xs_is_string(etag)) { + rss_md = xs_dict_set(rss_md, "etag", etag); + if ((f = fopen(rss_md_fn, "w")) != NULL) { + xs_json_dump(rss_md, 4, f); + fclose(f); + 
} + } } -- cgit v1.2.3 From 714ff17e3ccc55a9e70e03ee7b91cd1dd75e134b Mon Sep 17 00:00:00 2001 From: grunfink Date: Thu, 29 May 2025 22:52:39 +0200 Subject: Also store the url in the RSS metadata. --- rss.c | 1 + 1 file changed, 1 insertion(+) (limited to 'rss.c') diff --git a/rss.c b/rss.c index 9d582f7..8d55370 100644 --- a/rss.c +++ b/rss.c @@ -239,6 +239,7 @@ void rss_to_timeline(snac *user, const char *url) if (xs_is_string(etag)) { rss_md = xs_dict_set(rss_md, "etag", etag); + rss_md = xs_dict_set(rss_md, "url", url); if ((f = fopen(rss_md_fn, "w")) != NULL) { xs_json_dump(rss_md, 4, f); fclose(f); -- cgit v1.2.3 From cd51d5310c8fa08a27018b40e6afdd1eb3d6e12e Mon Sep 17 00:00:00 2001 From: grunfink Date: Fri, 30 May 2025 09:47:23 +0200 Subject: Call enqueue_actor_fresh() from rss_to_timeline(). --- rss.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'rss.c') diff --git a/rss.c b/rss.c index 8d55370..f72dac4 100644 --- a/rss.c +++ b/rss.c @@ -227,10 +227,11 @@ void rss_to_timeline(snac *user, const char *url) continue; } - if (!valid_status(actor_request(user, attr_to, NULL))) - continue; + enqueue_actor_refresh(user, attr_to, 0); timeline_add(user, id, obj); + + snac_log(user, xs_fmt("new '%s' (RSS) %s %s", type, attr_to, id)); } } -- cgit v1.2.3