From 97ff66116b7633066375bbc6cc88be5b8587453b Mon Sep 17 00:00:00 2001
From: grunfink
Date: Wed, 28 May 2025 07:48:47 +0200
Subject: New file rss.c.
---
rss.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 105 insertions(+)
create mode 100644 rss.c
(limited to 'rss.c')
diff --git a/rss.c b/rss.c
new file mode 100644
index 0000000..61620fa
--- /dev/null
+++ b/rss.c
@@ -0,0 +1,105 @@
+/* snac - A simple, minimalistic ActivityPub instance */
+/* copyright (c) 2025 grunfink et al. / MIT license */
+
+#include "xs.h"
+#include "xs_html.h"
+#include "xs_regex.h"
+#include "xs_time.h"
+
+#include "snac.h"
+
+xs_str *rss_from_timeline(snac *user, const xs_list *timeline,
+                        const char *title, const char *link, const char *desc)
+/* converts a timeline (a list of object md5 ids) to RSS 2.0 and returns the rendered document */
+{
+    xs_html *rss = xs_html_tag("rss",
+        xs_html_attr("xmlns:content", "http:/" "/purl.org/rss/1.0/modules/content/"),
+        xs_html_attr("version", "2.0"),
+        xs_html_attr("xmlns:atom", "http:/" "/www.w3.org/2005/Atom"));
+
+    xs_html *channel = xs_html_tag("channel",
+        xs_html_tag("title",
+            xs_html_text(title)),
+        xs_html_tag("language",
+            xs_html_text("en")),
+        xs_html_tag("link",
+            xs_html_text(link)),
+        xs_html_sctag("atom:link",
+            xs_html_attr("href", link),
+            xs_html_attr("rel", "self"),
+            xs_html_attr("type", "application/rss+xml")),
+        xs_html_tag("generator",
+            xs_html_text(USER_AGENT)),
+        xs_html_tag("description",
+            xs_html_text(desc)));
+
+    xs_html_add(rss, channel);
+
+    int cnt = 0;
+    const char *v;
+
+    xs_list_foreach(timeline, v) {
+        xs *msg = NULL;
+        /* entries that cannot be loaded are skipped */
+        if (user) {
+            if (!valid_status(timeline_get_by_md5(user, v, &msg)))
+                continue;
+        }
+        else {
+            if (!valid_status(object_get_by_md5(v, &msg)))
+                continue;
+        }
+
+        const char *id = xs_dict_get(msg, "id");
+        const char *content = xs_dict_get(msg, "content");
+        const char *published = xs_dict_get(msg, "published");
+        /* incomplete objects are skipped (checked first, because id is used right below) */
+        if (!id || !content || !published)
+            continue;
+        /* when a user is given, only posts whose id starts with the user's actor are kept */
+        if (user && !xs_startswith(id, user->actor))
+            continue;
+
+        /* create a title with the first line of the content (tags and entities blanked, at most 50 bytes) */
+        xs *title = xs_replace(content, "<br>", "\n");
+        title = xs_regex_replace_i(title, "<[^>]+>", " ");
+        title = xs_regex_replace_i(title, "&[^;]+;", " ");
+        int i;
+
+        for (i = 0; title[i] && title[i] != '\n' && i < 50; i++);
+
+        if (title[i] != '\0') {
+            title[i] = '\0';
+            title = xs_str_cat(title, "...");
+        }
+
+        title = xs_strip_i(title);
+
+        /* convert the date */
+        time_t t = xs_parse_iso_date(published, 0);
+        xs *rss_date = xs_str_utctime(t, "%a, %d %b %Y %T +0000");
+
+        /* if it's the first one, add it to the header */
+        if (cnt == 0)
+            xs_html_add(channel,
+                xs_html_tag("lastBuildDate",
+                    xs_html_text(rss_date)));
+
+        xs_html_add(channel,
+            xs_html_tag("item",
+                xs_html_tag("title",
+                    xs_html_text(title)),
+                xs_html_tag("link",
+                    xs_html_text(id)),
+                xs_html_tag("guid",
+                    xs_html_text(id)),
+                xs_html_tag("pubDate",
+                    xs_html_text(rss_date)),
+                xs_html_tag("description",
+                    xs_html_text(content))));
+
+        cnt++;
+    }
+
+    return xs_html_render_s(rss, "\n");
+}
--
cgit v1.2.3
From b783f287c8b3e77bbd1eb94892ea645ba05e8770 Mon Sep 17 00:00:00 2001
From: grunfink
Date: Wed, 28 May 2025 07:56:44 +0200
Subject: New function rss_to_timeline().
---
rss.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 124 insertions(+)
(limited to 'rss.c')
diff --git a/rss.c b/rss.c
index 61620fa..a3a092e 100644
--- a/rss.c
+++ b/rss.c
@@ -5,6 +5,8 @@
#include "xs_html.h"
#include "xs_regex.h"
#include "xs_time.h"
+#include "xs_match.h"
+#include "xs_curl.h"
#include "snac.h"
@@ -103,3 +105,125 @@ xs_str *rss_from_timeline(snac *user, const xs_list *timeline,
return xs_html_render_s(rss, "\n");
}
+
+
+void rss_to_timeline(snac *user, const char *url)
+/* reads an RSS and inserts all ActivityPub posts into the user's timeline */
+{
+    xs *hdrs = xs_dict_new();
+    hdrs = xs_dict_set(hdrs, "accept", "application/rss+xml");
+    hdrs = xs_dict_set(hdrs, "user-agent", USER_AGENT);
+
+    xs *payload = NULL;
+    int status;
+    int p_size;
+
+    xs *rsp = xs_http_request("GET", url, hdrs, NULL, 0, &status, &payload, &p_size, 0);
+
+    if (!valid_status(status) || !xs_is_string(payload))
+        return;
+
+    /* not an RSS (according to the response content-type)? done */
+    const char *ctype = xs_dict_get(rsp, "content-type");
+    if (!xs_is_string(ctype) || xs_str_in(ctype, "application/rss+xml") == -1)
+        return;
+
+    snac_log(user, xs_fmt("parsing RSS %s", url));
+
+    /* yes, parsing is done with regexes (now I have two problems blah blah blah) */
+    xs *links = xs_regex_select(payload, "<link>[^<]+</link>");
+    const char *link;
+
+    xs_list_foreach(links, link) {
+        xs *l = xs_replace(link, "<link>", "");
+        char *p = strchr(l, '<');
+
+        if (p == NULL)
+            continue;
+        *p = '\0';    /* cut at the closing tag, leaving only the URL */
+
+        /* skip this same URL (a link that is the feed address itself) */
+        if (strcmp(l, url) == 0)
+            continue;
+
+        snac_debug(user, 1, xs_fmt("RSS link: %s", l));
+
+        if (timeline_here(user, l)) {
+            snac_debug(user, 1, xs_fmt("RSS entry already in timeline %s", l));
+            continue;
+        }
+
+        /* special trick for Mastodon: convert from the alternate format (host/@uid/ID to host/users/uid/statuses/ID) */
+        if (strchr(l, '@') != NULL) {
+            xs *l2 = xs_split(l, "/");
+
+            if (xs_list_len(l2) == 5) {
+                const char *uid = xs_list_get(l2, 3);
+                if (*uid == '@') {
+                    xs *guessed_id = xs_fmt("https:/" "/%s/users/%s/statuses/%s",
+                        xs_list_get(l2, 2), uid + 1, xs_list_get(l2, -1));
+
+                    if (timeline_here(user, guessed_id)) {
+                        snac_debug(user, 1, xs_fmt("RSS entry already in timeline (alt) %s", guessed_id));
+                        continue;
+                    }
+                }
+            }
+        }
+
+        xs *obj = NULL;
+
+        if (!valid_status(object_get(l, &obj))) {
+            /* object is not here: bring it */
+            if (!valid_status(activitypub_request(user, l, &obj)))
+                continue;
+        }
+
+        if (xs_is_dict(obj)) {
+            const char *id = xs_dict_get(obj, "id");
+            const char *type = xs_dict_get(obj, "type");
+            const char *attr_to = get_atto(obj);
+
+            if (!xs_is_string(id) || !xs_is_string(type) || !xs_is_string(attr_to))
+                continue;
+
+            if (!xs_match(type, POSTLIKE_OBJECT_TYPE))
+                continue;
+
+            if (timeline_here(user, id)) {
+                snac_debug(user, 1, xs_fmt("RSS entry already in timeline (id) %s", id));
+                continue;
+            }
+
+            if (!valid_status(actor_request(user, attr_to, NULL)))
+                continue;
+
+            timeline_add(user, id, obj);
+        }
+    }
+}
+
+
+void rss_process(void)
+/* parses all RSS from all users */
+{
+ xs *list = user_list();
+ const char *uid;
+
+ xs_list_foreach(list, uid) { /* one pass per uid returned by user_list() */
+ snac user;
+
+ if (user_open(&user, uid)) {
+ const xs_list *rss = xs_dict_get(user.config, "rss");
+
+ if (xs_is_list(rss)) {
+ const char *url;
+ /* a missing or non-list "rss" config value never reaches this point */
+ xs_list_foreach(rss, url)
+ rss_to_timeline(&user, url);
+ }
+ /* pairs with the successful user_open() above */
+ user_free(&user);
+ }
+ }
+}
--
cgit v1.2.3
From 9f8cd38ee72d93dec626b1e926f969cb75fd3094 Mon Sep 17 00:00:00 2001
From: grunfink
Date: Wed, 28 May 2025 08:00:47 +0200
Subject: Disabled rss_process() for now.
---
rss.c | 2 ++
1 file changed, 2 insertions(+)
(limited to 'rss.c')
diff --git a/rss.c b/rss.c
index a3a092e..b31ffd3 100644
--- a/rss.c
+++ b/rss.c
@@ -207,6 +207,7 @@ void rss_to_timeline(snac *user, const char *url)
void rss_process(void)
/* parses all RSS from all users */
{
+#if 0
xs *list = user_list();
const char *uid;
@@ -226,4 +227,5 @@ void rss_process(void)
user_free(&user);
}
}
+#endif
}
--
cgit v1.2.3
From a1369b39c1bd3d2036af12368997648454ca5564 Mon Sep 17 00:00:00 2001
From: grunfink
Date: Wed, 28 May 2025 09:07:19 +0200
Subject: Activated hashtag RSS polling.
---
rss.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
(limited to 'rss.c')
diff --git a/rss.c b/rss.c
index b31ffd3..ce3e184 100644
--- a/rss.c
+++ b/rss.c
@@ -110,6 +110,9 @@ xs_str *rss_from_timeline(snac *user, const xs_list *timeline,
void rss_to_timeline(snac *user, const char *url)
/* reads an RSS and inserts all ActivityPub posts into the user's timeline */
{
+ if (!xs_startswith(url, "https:/") && !xs_startswith(url, "http:/"))
+ return;
+
xs *hdrs = xs_dict_new();
hdrs = xs_dict_set(hdrs, "accept", "application/rss+xml");
hdrs = xs_dict_set(hdrs, "user-agent", USER_AGENT);
@@ -204,10 +207,9 @@ void rss_to_timeline(snac *user, const char *url)
}
-void rss_process(void)
+void rss_poll_hashtags(void)
/* parses all RSS from all users */
{
-#if 0
xs *list = user_list();
const char *uid;
@@ -215,7 +217,7 @@ void rss_process(void)
snac user;
if (user_open(&user, uid)) {
- const xs_list *rss = xs_dict_get(user.config, "rss");
+ const xs_list *rss = xs_dict_get(user.config, "followed_hashtags");
if (xs_is_list(rss)) {
const char *url;
@@ -227,5 +229,4 @@ void rss_process(void)
user_free(&user);
}
}
-#endif
}
--
cgit v1.2.3
From 77b9cbe6c3064c1656fa5375f7d2d498e34a34e6 Mon Sep 17 00:00:00 2001
From: grunfink
Date: Wed, 28 May 2025 09:39:07 +0200
Subject: Added more checks.
---
rss.c | 4 ++++
1 file changed, 4 insertions(+)
(limited to 'rss.c')
diff --git a/rss.c b/rss.c
index ce3e184..8798cac 100644
--- a/rss.c
+++ b/rss.c
@@ -149,6 +149,10 @@ void rss_to_timeline(snac *user, const char *url)
if (strcmp(l, url) == 0)
continue;
+ /* skip crap */
+ if (!xs_startswith(l, "https:/") && !xs_startswith(l, "http:/"))
+ return;
+
snac_debug(user, 1, xs_fmt("RSS link: %s", l));
if (timeline_here(user, l)) {
--
cgit v1.2.3
From b3067987354ca8512979eef87d3de89adb5f741d Mon Sep 17 00:00:00 2001
From: grunfink
Date: Thu, 29 May 2025 17:33:41 +0200
Subject: Fixed typo in rss_to_timeline().
---
rss.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'rss.c')
diff --git a/rss.c b/rss.c
index 8798cac..ed0bf05 100644
--- a/rss.c
+++ b/rss.c
@@ -151,7 +151,7 @@ void rss_to_timeline(snac *user, const char *url)
/* skip crap */
if (!xs_startswith(l, "https:/") && !xs_startswith(l, "http:/"))
- return;
+ continue;
snac_debug(user, 1, xs_fmt("RSS link: %s", l));
--
cgit v1.2.3
From e030fe6c5054c0a9b76a55adc80bb81d7a73fcc3 Mon Sep 17 00:00:00 2001
From: grunfink
Date: Thu, 29 May 2025 18:03:23 +0200
Subject: Use HTTP caching (etag / if-none-match) in RSS downloads.
---
rss.c | 40 ++++++++++++++++++++++++++++++++++++++--
1 file changed, 38 insertions(+), 2 deletions(-)
(limited to 'rss.c')
diff --git a/rss.c b/rss.c
index ed0bf05..9d582f7 100644
--- a/rss.c
+++ b/rss.c
@@ -7,6 +7,8 @@
#include "xs_time.h"
#include "xs_match.h"
#include "xs_curl.h"
+#include "xs_openssl.h"
+#include "xs_json.h"
#include "snac.h"
@@ -117,12 +119,37 @@ void rss_to_timeline(snac *user, const char *url)
hdrs = xs_dict_set(hdrs, "accept", "application/rss+xml");
hdrs = xs_dict_set(hdrs, "user-agent", USER_AGENT);
+ /* get the RSS metadata */
+ xs *md5 = xs_md5_hex(url, strlen(url));
+ xs *rss_md_fn = xs_fmt("%s/rss", user->basedir);
+ mkdirx(rss_md_fn);
+ rss_md_fn = xs_str_cat(rss_md_fn, "/", md5, ".json");
+
+ xs *rss_md = NULL;
+ const char *etag = NULL;
+
+ FILE *f;
+ if ((f = fopen(rss_md_fn, "r")) != NULL) {
+ rss_md = xs_json_load(f);
+ fclose(f);
+
+ etag = xs_dict_get(rss_md, "etag");
+
+ if (xs_is_string(etag))
+ hdrs = xs_dict_set(hdrs, "if-none-match", etag);
+ }
+
+ if (rss_md == NULL)
+ rss_md = xs_dict_new();
+
xs *payload = NULL;
int status;
int p_size;
xs *rsp = xs_http_request("GET", url, hdrs, NULL, 0, &status, &payload, &p_size, 0);
+ snac_log(user, xs_fmt("parsing RSS %s %d", url, status));
+
if (!valid_status(status) || !xs_is_string(payload))
return;
@@ -131,8 +158,6 @@ void rss_to_timeline(snac *user, const char *url)
if (!xs_is_string(ctype) || xs_str_in(ctype, "application/rss+xml") == -1)
return;
- snac_log(user, xs_fmt("parsing RSS %s", url));
-
/* yes, parsing is done with regexes (now I have two problems blah blah blah) */
xs *links = xs_regex_select(payload, "<link>[^<]+</link>");
const char *link;
@@ -208,6 +233,17 @@ void rss_to_timeline(snac *user, const char *url)
timeline_add(user, id, obj);
}
}
+
+ /* update the RSS metadata */
+ etag = xs_dict_get(rsp, "etag");
+
+ if (xs_is_string(etag)) {
+ rss_md = xs_dict_set(rss_md, "etag", etag);
+ if ((f = fopen(rss_md_fn, "w")) != NULL) {
+ xs_json_dump(rss_md, 4, f);
+ fclose(f);
+ }
+ }
}
--
cgit v1.2.3
From 714ff17e3ccc55a9e70e03ee7b91cd1dd75e134b Mon Sep 17 00:00:00 2001
From: grunfink
Date: Thu, 29 May 2025 22:52:39 +0200
Subject: Also store the url in the RSS metadata.
---
rss.c | 1 +
1 file changed, 1 insertion(+)
(limited to 'rss.c')
diff --git a/rss.c b/rss.c
index 9d582f7..8d55370 100644
--- a/rss.c
+++ b/rss.c
@@ -239,6 +239,7 @@ void rss_to_timeline(snac *user, const char *url)
if (xs_is_string(etag)) {
rss_md = xs_dict_set(rss_md, "etag", etag);
+ rss_md = xs_dict_set(rss_md, "url", url);
if ((f = fopen(rss_md_fn, "w")) != NULL) {
xs_json_dump(rss_md, 4, f);
fclose(f);
--
cgit v1.2.3
From cd51d5310c8fa08a27018b40e6afdd1eb3d6e12e Mon Sep 17 00:00:00 2001
From: grunfink
Date: Fri, 30 May 2025 09:47:23 +0200
Subject: Call enqueue_actor_refresh() from rss_to_timeline().
---
rss.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
(limited to 'rss.c')
diff --git a/rss.c b/rss.c
index 8d55370..f72dac4 100644
--- a/rss.c
+++ b/rss.c
@@ -227,10 +227,11 @@ void rss_to_timeline(snac *user, const char *url)
continue;
}
- if (!valid_status(actor_request(user, attr_to, NULL)))
- continue;
+ enqueue_actor_refresh(user, attr_to, 0);
timeline_add(user, id, obj);
+
+ snac_log(user, xs_fmt("new '%s' (RSS) %s %s", type, attr_to, id));
}
}
--
cgit v1.2.3