1 files changed, 274 insertions, 0 deletions
diff --git a/rss.c b/rss.c
new file mode 100644
index 0000000..f72dac4
--- /dev/null
+++ b/rss.c
@@ -0,0 +1,274 @@
+/* snac - A simple, minimalistic ActivityPub instance */
+/* copyright (c) 2025 grunfink et al. / MIT license */
+#include "xs.h"
+#include "xs_html.h"
+#include "xs_regex.h"
+#include "xs_time.h"
+#include "xs_match.h"
+#include "xs_curl.h"
+#include "xs_openssl.h"
+#include "xs_json.h"
+#include "snac.h"
+xs_str *rss_from_timeline(snac *user, const xs_list *timeline,
+                        const char *title, const char *link, const char *desc)
+/* Converts a timeline to an RSS 2.0 document.
+   timeline is a list of md5 identifiers; each one is resolved with
+   timeline_get_by_md5() when user is not NULL and with
+   object_get_by_md5() otherwise. When a user is given, only entries
+   whose id starts with user->actor are emitted.
+   title, link and desc fill the channel header.
+   Returns the document rendered by xs_html_render_s(), prefixed
+   with the XML declaration. */
+{
+    xs_html *rss = xs_html_tag("rss",
+        xs_html_attr("xmlns:content", "http:/" "/purl.org/rss/1.0/modules/content/"),
+        xs_html_attr("version",       "2.0"),
+        xs_html_attr("xmlns:atom",    "http:/" "/www.w3.org/2005/Atom"));
+    xs_html *channel = xs_html_tag("channel",
+        xs_html_tag("title",
+            xs_html_text(title)),
+        xs_html_tag("language",
+            xs_html_text("en")),
+        xs_html_tag("link",
+            xs_html_text(link)),
+        xs_html_sctag("atom:link",
+            xs_html_attr("href", link),
+            xs_html_attr("rel", "self"),
+            xs_html_attr("type", "application/rss+xml")),
+        xs_html_tag("generator",
+            xs_html_text(USER_AGENT)),
+        xs_html_tag("description",
+            xs_html_text(desc)));
+    xs_html_add(rss, channel);
+    int cnt = 0;
+    const char *v;
+    xs_list_foreach(timeline, v) {
+        xs *msg = NULL;
+        if (user) {
+            if (!valid_status(timeline_get_by_md5(user, v, &msg)))
+                continue;
+        }
+        else {
+            if (!valid_status(object_get_by_md5(v, &msg)))
+                continue;
+        }
+        const char *id        = xs_dict_get(msg, "id");
+        const char *content   = xs_dict_get(msg, "content");
+        const char *published = xs_dict_get(msg, "published");
+        /* validate the fields before any use: a missing or non-string
+           id must never reach xs_startswith() */
+        if (!xs_is_string(id) || !xs_is_string(content) || !xs_is_string(published))
+            continue;
+        /* with a user, skip everything not authored by that actor */
+        if (user && !xs_startswith(id, user->actor))
+            continue;
+        /* create a title with the first line of the content
+           (named item_title to not shadow the channel title argument) */
+        xs *item_title = xs_replace(content, "<br>", "\n");
+        item_title = xs_regex_replace_i(item_title, "<[^>]+>", " ");
+        item_title = xs_regex_replace_i(item_title, "&[^;]+;", " ");
+        int i;
+        /* stop at the end of the first line or after 50 bytes */
+        for (i = 0; item_title[i] && item_title[i] != '\n' && i < 50; i++);
+        if (item_title[i] != '\0') {
+            /* never cut in the middle of a UTF-8 multibyte sequence:
+               back up while standing on a continuation byte (10xxxxxx) */
+            while (i > 0 && ((unsigned char)item_title[i] & 0xc0) == 0x80)
+                i--;
+            item_title[i] = '\0';
+            item_title = xs_str_cat(item_title, "...");
+        }
+        item_title = xs_strip_i(item_title);
+        /* convert the date */
+        time_t t = xs_parse_iso_date(published, 0);
+        xs *rss_date = xs_str_utctime(t, "%a, %d %b %Y %T +0000");
+        /* if it's the first one, add it to the header */
+        if (cnt == 0)
+            xs_html_add(channel,
+                xs_html_tag("lastBuildDate",
+                    xs_html_text(rss_date)));
+        xs_html_add(channel,
+            xs_html_tag("item",
+                xs_html_tag("title",
+                    xs_html_text(item_title)),
+                xs_html_tag("link",
+                    xs_html_text(id)),
+                xs_html_tag("guid",
+                    xs_html_text(id)),
+                xs_html_tag("pubDate",
+                    xs_html_text(rss_date)),
+                xs_html_tag("description",
+                    xs_html_text(content))));
+        cnt++;
+    }
+    return xs_html_render_s(rss, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
+}
+void rss_to_timeline(snac *user, const char *url)
+/* Reads the RSS feed at url and inserts the ActivityPub posts it links
+   to into the user's timeline.
+   Per-feed metadata (the last seen ETag) is kept in the file
+   rss/{md5 of url}.json under user->basedir and sent back as
+   If-None-Match on the next request.
+   Nothing is done if url is not an http(s) string, the request does
+   not return a valid status, or the response is not served as
+   application/rss+xml. */
+{
+    /* url comes from user configuration: accept only real strings */
+    if (!xs_is_string(url))
+        return;
+    if (!xs_startswith(url, "https:/") && !xs_startswith(url, "http:/"))
+        return;
+    xs *hdrs = xs_dict_new();
+    hdrs = xs_dict_set(hdrs, "accept",     "application/rss+xml");
+    hdrs = xs_dict_set(hdrs, "user-agent", USER_AGENT);
+    /* get the RSS metadata */
+    xs *md5 = xs_md5_hex(url, strlen(url));
+    xs *rss_md_fn = xs_fmt("%s/rss", user->basedir);
+    mkdirx(rss_md_fn);
+    rss_md_fn = xs_str_cat(rss_md_fn, "/", md5, ".json");
+    xs *rss_md = NULL;
+    const char *etag = NULL;
+    FILE *f;
+    if ((f = fopen(rss_md_fn, "r")) != NULL) {
+        xs *loaded = xs_json_load(f);
+        fclose(f);
+        /* trust the file only if it really holds a JSON object;
+           anything else is dropped and rebuilt from scratch */
+        if (xs_is_dict(loaded)) {
+            /* move ownership to rss_md so it outlives this scope */
+            rss_md = loaded;
+            loaded = NULL;
+            etag = xs_dict_get(rss_md, "etag");
+            if (xs_is_string(etag))
+                hdrs = xs_dict_set(hdrs, "if-none-match", etag);
+        }
+    }
+    if (rss_md == NULL)
+        rss_md = xs_dict_new();
+    xs *payload = NULL;
+    int status;
+    int p_size;
+    xs *rsp = xs_http_request("GET", url, hdrs, NULL, 0, &status, &payload, &p_size, 0);
+    snac_log(user, xs_fmt("parsing RSS %s %d", url, status));
+    if (!valid_status(status) || !xs_is_string(payload))
+        return;
+    /* not an RSS? done */
+    const char *ctype = xs_dict_get(rsp, "content-type");
+    if (!xs_is_string(ctype) || xs_str_in(ctype, "application/rss+xml") == -1)
+        return;
+    /* yes, parsing is done with regexes (now I have two problems blah blah blah) */
+    xs *links = xs_regex_select(payload, "<link>[^<]+</link>");
+    const char *link;
+    xs_list_foreach(links, link) {
+        /* strip the opening tag, then cut at the closing one */
+        xs *l = xs_replace(link, "<link>", "");
+        char *p = strchr(l, '<');
+        if (p == NULL)
+            continue;
+        *p = '\0';
+        /* skip this same URL */
+        if (strcmp(l, url) == 0)
+            continue;
+        /* skip crap */
+        if (!xs_startswith(l, "https:/") && !xs_startswith(l, "http:/"))
+            continue;
+        snac_debug(user, 1, xs_fmt("RSS link: %s", l));
+        if (timeline_here(user, l)) {
+            snac_debug(user, 1, xs_fmt("RSS entry already in timeline %s", l));
+            continue;
+        }
+        /* special trick for Mastodon: convert from the alternate format
+           (scheme://host/@uid/status-id, i.e. 5 parts when split by '/')
+           to the guessed canonical /users/uid/statuses/status-id form */
+        if (strchr(l, '@') != NULL) {
+            xs *l2 = xs_split(l, "/");
+            if (xs_list_len(l2) == 5) {
+                const char *uid = xs_list_get(l2, 3);
+                if (*uid == '@') {
+                    xs *guessed_id = xs_fmt("https:/" "/%s/users/%s/statuses/%s",
+                        xs_list_get(l2, 2), uid + 1, xs_list_get(l2, -1));
+                    if (timeline_here(user, guessed_id)) {
+                        snac_debug(user, 1, xs_fmt("RSS entry already in timeline (alt) %s", guessed_id));
+                        continue;
+                    }
+                }
+            }
+        }
+        xs *obj = NULL;
+        if (!valid_status(object_get(l, &obj))) {
+            /* object is not here: bring it */
+            if (!valid_status(activitypub_request(user, l, &obj)))
+                continue;
+        }
+        if (xs_is_dict(obj)) {
+            const char *id      = xs_dict_get(obj, "id");
+            const char *type    = xs_dict_get(obj, "type");
+            const char *attr_to = get_atto(obj);
+            if (!xs_is_string(id) || !xs_is_string(type) || !xs_is_string(attr_to))
+                continue;
+            if (!xs_match(type, POSTLIKE_OBJECT_TYPE))
+                continue;
+            /* the link may differ from the object's real id: check again */
+            if (timeline_here(user, id)) {
+                snac_debug(user, 1, xs_fmt("RSS entry already in timeline (id) %s", id));
+                continue;
+            }
+            enqueue_actor_refresh(user, attr_to, 0);
+            timeline_add(user, id, obj);
+            snac_log(user, xs_fmt("new '%s' (RSS) %s %s", type, attr_to, id));
+        }
+    }
+    /* update the RSS metadata (only when the server provided an ETag) */
+    etag = xs_dict_get(rsp, "etag");
+    if (xs_is_string(etag)) {
+        rss_md = xs_dict_set(rss_md, "etag", etag);
+        rss_md = xs_dict_set(rss_md, "url", url);
+        if ((f = fopen(rss_md_fn, "w")) != NULL) {
+            xs_json_dump(rss_md, 4, f);
+            fclose(f);
+        }
+    }
+}
+void rss_poll_hashtags(void)
+/* Walks every user returned by user_list() and feeds each entry of
+   its "followed_hashtags" configuration list to rss_to_timeline() */
+{
+    xs *all_users = user_list();
+    const char *uid;
+    xs_list_foreach(all_users, uid) {
+        snac user;
+        /* users that cannot be opened are silently skipped */
+        if (!user_open(&user, uid))
+            continue;
+        const xs_list *feeds = xs_dict_get(user.config, "followed_hashtags");
+        if (xs_is_list(feeds)) {
+            const char *feed_url;
+            /* entries are passed as they are; the callee decides what to do */
+            xs_list_foreach(feeds, feed_url) {
+                rss_to_timeline(&user, feed_url);
+            }
+        }
+        user_free(&user);
+    }
+}

diff --git a/rss.c b/rss.c new file mode 100644 index 0000000..f72dac4 --- /dev/null +++ b/rss.c
@@ -0,0 +1,274 @@
	1	/* snac - A simple, minimalistic ActivityPub instance */
	2	/* copyright (c) 2025 grunfink et al. / MIT license */
	3
	4	#include "xs.h"
	5	#include "xs_html.h"
	6	#include "xs_regex.h"
	7	#include "xs_time.h"
	8	#include "xs_match.h"
	9	#include "xs_curl.h"
	10	#include "xs_openssl.h"
	11	#include "xs_json.h"
	12
	13	#include "snac.h"
	14
	15	xs_str *rss_from_timeline(snac *user, const xs_list *timeline,
	16	const char *title, const char *link, const char *desc)
	17	/* converts a timeline to rss */
	18	{
	19	xs_html *rss = xs_html_tag("rss",
	20	xs_html_attr("xmlns:content", "http:/" "/purl.org/rss/1.0/modules/content/"),
	21	xs_html_attr("version", "2.0"),
	22	xs_html_attr("xmlns:atom", "http:/" "/www.w3.org/2005/Atom"));
	23
	24	xs_html *channel = xs_html_tag("channel",
	25	xs_html_tag("title",
	26	xs_html_text(title)),
	27	xs_html_tag("language",
	28	xs_html_text("en")),
	29	xs_html_tag("link",
	30	xs_html_text(link)),
	31	xs_html_sctag("atom:link",
	32	xs_html_attr("href", link),
	33	xs_html_attr("rel", "self"),
	34	xs_html_attr("type", "application/rss+xml")),
	35	xs_html_tag("generator",
	36	xs_html_text(USER_AGENT)),
	37	xs_html_tag("description",
	38	xs_html_text(desc)));
	39
	40	xs_html_add(rss, channel);
	41
	42	int cnt = 0;
	43	const char *v;
	44
	45	xs_list_foreach(timeline, v) {
	46	xs *msg = NULL;
	47
	48	if (user) {
	49	if (!valid_status(timeline_get_by_md5(user, v, &msg)))
	50	continue;
	51	}
	52	else {
	53	if (!valid_status(object_get_by_md5(v, &msg)))
	54	continue;
	55	}
	56
	57	const char *id = xs_dict_get(msg, "id");
	58	const char *content = xs_dict_get(msg, "content");
	59	const char *published = xs_dict_get(msg, "published");
	60
	61	if (user && !xs_startswith(id, user->actor))
	62	continue;
	63
	64	if (!id || !content || !published)
	65	continue;
	66
	67	/* create a title with the first line of the content */
	68	xs *title = xs_replace(content, "<br>", "\n");
	69	title = xs_regex_replace_i(title, "<[^>]+>", " ");
	70	title = xs_regex_replace_i(title, "&[^;]+;", " ");
	71	int i;
	72
	73	for (i = 0; title[i] && title[i] != '\n' && i < 50; i++);
	74
	75	if (title[i] != '\0') {
	76	title[i] = '\0';
	77	title = xs_str_cat(title, "...");
	78	}
	79
	80	title = xs_strip_i(title);
	81
	82	/* convert the date */
	83	time_t t = xs_parse_iso_date(published, 0);
	84	xs *rss_date = xs_str_utctime(t, "%a, %d %b %Y %T +0000");
	85
	86	/* if it's the first one, add it to the header */
	87	if (cnt == 0)
	88	xs_html_add(channel,
	89	xs_html_tag("lastBuildDate",
	90	xs_html_text(rss_date)));
	91
	92	xs_html_add(channel,
	93	xs_html_tag("item",
	94	xs_html_tag("title",
	95	xs_html_text(title)),
	96	xs_html_tag("link",
	97	xs_html_text(id)),
	98	xs_html_tag("guid",
	99	xs_html_text(id)),
	100	xs_html_tag("pubDate",
	101	xs_html_text(rss_date)),
	102	xs_html_tag("description",
	103	xs_html_text(content))));
	104
	105	cnt++;
	106	}
	107
	108	return xs_html_render_s(rss, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
	109	}
	110
	111
	112	void rss_to_timeline(snac *user, const char *url)
	113	/* reads an RSS and inserts all ActivityPub posts into the user's timeline */
	114	{
	115	if (!xs_startswith(url, "https:/") && !xs_startswith(url, "http:/"))
	116	return;
	117
	118	xs *hdrs = xs_dict_new();
	119	hdrs = xs_dict_set(hdrs, "accept", "application/rss+xml");
	120	hdrs = xs_dict_set(hdrs, "user-agent", USER_AGENT);
	121
	122	/* get the RSS metadata */
	123	xs *md5 = xs_md5_hex(url, strlen(url));
	124	xs *rss_md_fn = xs_fmt("%s/rss", user->basedir);
	125	mkdirx(rss_md_fn);
	126	rss_md_fn = xs_str_cat(rss_md_fn, "/", md5, ".json");
	127
	128	xs *rss_md = NULL;
	129	const char *etag = NULL;
	130
	131	FILE *f;
	132	if ((f = fopen(rss_md_fn, "r")) != NULL) {
	133	rss_md = xs_json_load(f);
	134	fclose(f);
	135
	136	etag = xs_dict_get(rss_md, "etag");
	137
	138	if (xs_is_string(etag))
	139	hdrs = xs_dict_set(hdrs, "if-none-match", etag);
	140	}
	141
	142	if (rss_md == NULL)
	143	rss_md = xs_dict_new();
	144
	145	xs *payload = NULL;
	146	int status;
	147	int p_size;
	148
	149	xs *rsp = xs_http_request("GET", url, hdrs, NULL, 0, &status, &payload, &p_size, 0);
	150
	151	snac_log(user, xs_fmt("parsing RSS %s %d", url, status));
	152
	153	if (!valid_status(status) || !xs_is_string(payload))
	154	return;
	155
	156	/* not an RSS? done */
	157	const char *ctype = xs_dict_get(rsp, "content-type");
	158	if (!xs_is_string(ctype) || xs_str_in(ctype, "application/rss+xml") == -1)
	159	return;
	160
	161	/* yes, parsing is done with regexes (now I have two problems blah blah blah) */
	162	xs *links = xs_regex_select(payload, "<link>[^<]+</link>");
	163	const char *link;
	164
	165	xs_list_foreach(links, link) {
	166	xs *l = xs_replace(link, "<link>", "");
	167	char *p = strchr(l, '<');
	168
	169	if (p == NULL)
	170	continue;
	171	*p = '\0';
	172
	173	/* skip this same URL */
	174	if (strcmp(l, url) == 0)
	175	continue;
	176
	177	/* skip crap */
	178	if (!xs_startswith(l, "https:/") && !xs_startswith(l, "http:/"))
	179	continue;
	180
	181	snac_debug(user, 1, xs_fmt("RSS link: %s", l));
	182
	183	if (timeline_here(user, l)) {
	184	snac_debug(user, 1, xs_fmt("RSS entry already in timeline %s", l));
	185	continue;
	186	}
	187
	188	/* special trick for Mastodon: convert from the alternate format */
	189	if (strchr(l, '@') != NULL) {
	190	xs *l2 = xs_split(l, "/");
	191
	192	if (xs_list_len(l2) == 5) {
	193	const char *uid = xs_list_get(l2, 3);
	194	if (*uid == '@') {
	195	xs *guessed_id = xs_fmt("https:/" "/%s/users/%s/statuses/%s",
	196	xs_list_get(l2, 2), uid + 1, xs_list_get(l2, -1));
	197
	198	if (timeline_here(user, guessed_id)) {
	199	snac_debug(user, 1, xs_fmt("RSS entry already in timeline (alt) %s", guessed_id));
	200	continue;
	201	}
	202	}
	203	}
	204	}
	205
	206	xs *obj = NULL;
	207
	208	if (!valid_status(object_get(l, &obj))) {
	209	/* object is not here: bring it */
	210	if (!valid_status(activitypub_request(user, l, &obj)))
	211	continue;
	212	}
	213
	214	if (xs_is_dict(obj)) {
	215	const char *id = xs_dict_get(obj, "id");
	216	const char *type = xs_dict_get(obj, "type");
	217	const char *attr_to = get_atto(obj);
	218
	219	if (!xs_is_string(id) || !xs_is_string(type) || !xs_is_string(attr_to))
	220	continue;
	221
	222	if (!xs_match(type, POSTLIKE_OBJECT_TYPE))
	223	continue;
	224
	225	if (timeline_here(user, id)) {
	226	snac_debug(user, 1, xs_fmt("RSS entry already in timeline (id) %s", id));
	227	continue;
	228	}
	229
	230	enqueue_actor_refresh(user, attr_to, 0);
	231
	232	timeline_add(user, id, obj);
	233
	234	snac_log(user, xs_fmt("new '%s' (RSS) %s %s", type, attr_to, id));
	235	}
	236	}
	237
	238	/* update the RSS metadata */
	239	etag = xs_dict_get(rsp, "etag");
	240
	241	if (xs_is_string(etag)) {
	242	rss_md = xs_dict_set(rss_md, "etag", etag);
	243	rss_md = xs_dict_set(rss_md, "url", url);
	244	if ((f = fopen(rss_md_fn, "w")) != NULL) {
	245	xs_json_dump(rss_md, 4, f);
	246	fclose(f);
	247	}
	248	}
	249	}
	250
	251
	252	void rss_poll_hashtags(void)
	253	/* parses all RSS from all users */
	254	{
	255	xs *list = user_list();
	256	const char *uid;
	257
	258	xs_list_foreach(list, uid) {
	259	snac user;
	260
	261	if (user_open(&user, uid)) {
	262	const xs_list *rss = xs_dict_get(user.config, "followed_hashtags");
	263
	264	if (xs_is_list(rss)) {
	265	const char *url;
	266
	267	xs_list_foreach(rss, url)
	268	rss_to_timeline(&user, url);
	269	}
	270
	271	user_free(&user);
	272	}
	273	}
	274	}