extract inline images from MFM and Markdown

2025-06-14 19:45:12 -04:00 · 2025-06-14 19:45:12 -04:00 · c3a6ba93ca
commit c3a6ba93ca
parent 5e46efe60d
3 changed files with 165 additions and 14 deletions
--- a/packages/backend/src/core/activitypub/misc/extract-media-from-mfm.ts
+++ b/packages/backend/src/core/activitypub/misc/extract-media-from-mfm.ts
@ -0,0 +1,61 @@
+/*
+ * SPDX-FileCopyrightText: hazelnoot and other Sharkey contributors
+ * SPDX-License-Identifier: AGPL-3.0-only
+ */
+
+import { parse, inspect, extract } from 'mfm-js';
+import type { IApDocument } from '@/core/activitypub/type.js';
+import type { MfmNode, MfmText } from 'mfm-js';
+
+/**
+ * Finds MFM nodes representing inline media (links flagged as images) and returns them as simulated AP documents, de-duplicated by URL.
+ * Returns an empty array if the input cannot be parsed, or no media was found.
+ * @param mfm Input MFM to analyze.
+ */
+export function extractMediaFromMfm(mfm: string): IApDocument[] {
+	const nodes = parseMfm(mfm);
+	if (nodes == null) return []; // parseMfm yields null when parsing threw
+
+	const attachments = new Map<string, IApDocument>(); // keyed by URL, so a repeated URL keeps only its most recent entry
+
+	inspect(nodes, node => {
+		if (node.type === 'link' && node.props.image) { // only links flagged as images count as media
+			const alt: string[] = []; // text fragments collected from the link's children, joined below
+
+			inspect(node.children, node => { // this inner `node` shadows the link node inside the callback
+				switch (node.type) {
+					case 'text':
+						alt.push(node.props.text);
+						break;
+					case 'unicodeEmoji':
+						alt.push(node.props.emoji);
+						break;
+					case 'emojiCode':
+						alt.push(':'); // rebuild the ":name:" shortcode form
+						alt.push(node.props.name);
+						alt.push(':');
+						break;
+				} // no default case: any other node type adds nothing to the alt text
+			});
+
+			attachments.set(node.props.url, {
+				type: 'Image',
+				url: node.props.url,
+				name: alt.length > 0
+					? alt.join('')
+					: null, // no fragments were collected, so the document gets a null name
+			});
+		}
+	});
+
+	return Array.from(attachments.values()); // Map iteration follows first-insertion order of each URL
+}
+
+function parseMfm(mfm: string): MfmNode[] | null { // wraps parse(): returns the parsed nodes, or null instead of throwing
+	try {
+		return parse(mfm);
+	} catch {
+		// Don't worry about invalid MFM - any error thrown by parse() is swallowed and reported to the caller as null
+		return null;
+	}
+}
--- a/packages/backend/src/core/activitypub/models/ApNoteService.ts
+++ b/packages/backend/src/core/activitypub/models/ApNoteService.ts
@ -29,6 +29,7 @@ import { IdentifiableError } from '@/misc/identifiable-error.js';
 import { isRetryableError } from '@/misc/is-retryable-error.js';
 import { renderInlineError } from '@/misc/render-inline-error.js';
 import { extractMediaFromHtml } from '@/core/activitypub/misc/extract-media-from-html.js';
+import { extractMediaFromMfm } from '@/core/activitypub/misc/extract-media-from-mfm.js';
 import { getContentByType } from '@/core/activitypub/misc/get-content-by-type.js';
 import { getOneApId, getApId, validPost, isEmoji, getApType, isApObject, isDocument, IApDocument, isLink } from '../type.js';
 import { ApLoggerService } from '../ApLoggerService.js';
@ -724,20 +725,17 @@ export class ApNoteService {
 			}
 		}

-		// Extract inline media from markdown content.
-		// TODO We first need to implement support for "!" prefix in sfm-js.
-		//  That will be implemented as part of https://activitypub.software/TransFem-org/Sharkey/-/issues/1105
-		// const markdownContent =
-		// 	getContentByType(note, 'text/x.misskeymarkdown') ??
-		// 	getContentByType(note, 'text/markdown');
-		// if (markdownContent) {
-		// 	for (const attach of extractMediaFromMarkdown(markdownContent)) {
-		// 		if (hasUrl(attach)) {
-		// 			attach.sensitive ??= note.sensitive;
-		// 			attachments.set(attach.url, attach);
-		// 		}
-		// 	}
-		// }
+		// Extract inline media from MFM / markdown content.
+		const mfmContent =
+			getContentByType(note, 'text/x.misskeymarkdown') ??
+			getContentByType(note, 'text/markdown');
+		if (mfmContent) {
+			for (const attach of extractMediaFromMfm(mfmContent)) {
+				if (hasUrl(attach)) {
+					attachments.set(attach.url, attach);
+				}
+			}
+		}

 		// Some software (Peertube) attaches a thumbnail under "icon" instead of "attachment"
 		const icon = getBestIcon(note);
--- a/packages/backend/test/unit/core/activitypub/misc/extract-media-from-mfm.ts
+++ b/packages/backend/test/unit/core/activitypub/misc/extract-media-from-mfm.ts
@ -0,0 +1,92 @@
+/*
+ * SPDX-FileCopyrightText: hazelnoot and other Sharkey contributors
+ * SPDX-License-Identifier: AGPL-3.0-only
+ */
+
+import { extractMediaFromMfm } from '@/core/activitypub/misc/extract-media-from-mfm.js';
+
+describe(extractMediaFromMfm, () => {
+	it('should return empty for empty input', () => {
+		const result = extractMediaFromMfm('');
+		expect(result).toEqual([]);
+	});
+
+	it('should return empty for invalid input', () => {
+		const result = extractMediaFromMfm('*broken markdown\0'); // NOTE(review): unclear whether this input really makes the parser throw - confirm
+		expect(result).toEqual([]);
+	});
+
+	it('should extract all image links', () => {
+		const result = extractMediaFromMfm(`
+			![1](https://example.com/images/1.png)
+			![](https://example.com/images/2.png)
+			**![3](https://example.com/images/3.png)**
+		`);
+
+		expect(result).toEqual([
+			{
+				type: 'Image',
+				url: 'https://example.com/images/1.png',
+				name: '1',
+			},
+			{
+				type: 'Image',
+				url: 'https://example.com/images/2.png',
+				name: null, // empty alt text is expected to produce a null name
+			},
+			{
+				type: 'Image',
+				url: 'https://example.com/images/3.png',
+				name: '3', // image links nested inside bold markup are still expected to be found
+			},
+		]);
+	});
+
+	it('should ignore regular links', () => {
+		const result = extractMediaFromMfm(`
+			[1](https://example.com/images/1.png)
+			[](https://example.com/images/2.png)
+			**[3](https://example.com/images/3.png)**
+		`);
+
+		expect(result).toEqual([]); // links written without the "!" prefix must not be reported as media
+	});
+
+	it('should ignore silent links', () => {
+		const result = extractMediaFromMfm(`
+			?[1](https://example.com/images/1.png)
+			?[](https://example.com/images/2.png)
+			**?[3](https://example.com/images/3.png)**
+		`);
+
+		expect(result).toEqual([]); // links written with the "?" prefix must not be reported as media
+	});
+
+	it('should extract complex text', () => {
+		const result = extractMediaFromMfm('![this is an **image** with *complex* text! :owo: 💙](https://example.com/image.png)');
+
+		expect(result).toEqual([
+			{
+				type: 'Image',
+				url: 'https://example.com/image.png',
+				name: 'this is an image with complex text! :owo: 💙', // formatting markers dropped; emoji code and unicode emoji kept
+			},
+		]);
+	});
+
+	it('should de-duplicate images', () => {
+		const result = extractMediaFromMfm(`
+			![1](https://example.com/images/1.png)
+			![](https://example.com/images/1.png)
+			**![3](https://example.com/images/1.png)**
+		`);
+
+		expect(result).toEqual([
+			{
+				type: 'Image',
+				url: 'https://example.com/images/1.png',
+				name: '3', // the last occurrence of a repeated URL overwrites the earlier ones
+			},
+		]);
+	});
+});