extract inline images from MFM and Markdown

This commit is contained in:
Hazelnoot 2025-06-14 19:45:12 -04:00
parent 5e46efe60d
commit c3a6ba93ca
3 changed files with 165 additions and 14 deletions

View file

@ -0,0 +1,61 @@
/*
* SPDX-FileCopyrightText: hazelnoot and other Sharkey contributors
* SPDX-License-Identifier: AGPL-3.0-only
*/
import { parse, inspect, extract } from 'mfm-js';
import type { IApDocument } from '@/core/activitypub/type.js';
import type { MfmNode, MfmText } from 'mfm-js';
/**
 * Scans MFM source for image links (`![alt](url)`) and describes each one as a simulated AP document of type `Image`.
 * The text, unicode emoji, and `:emoji_code:` nodes found inside a link are concatenated to form the document name;
 * the name is null when the link contains none of them.
 * Documents are keyed by URL, so a URL that appears more than once yields a single document carrying the name of its last occurrence.
 * Returns an empty array if the input cannot be parsed, or no media was found.
 * @param mfm Input MFM to analyze.
 */
export function extractMediaFromMfm(mfm: string): IApDocument[] {
	const parsed = parseMfm(mfm);
	if (parsed == null) {
		return [];
	}

	// Keyed by URL so that repeated images collapse into one entry (last occurrence wins).
	const byUrl = new Map<string, IApDocument>();

	inspect(parsed, candidate => {
		// Only links flagged as images are treated as media.
		if (candidate.type !== 'link' || !candidate.props.image) {
			return;
		}

		// Gather the alt text from every nested node that carries readable content.
		const pieces: string[] = [];
		inspect(candidate.children, child => {
			if (child.type === 'text') {
				pieces.push(child.props.text);
			} else if (child.type === 'unicodeEmoji') {
				pieces.push(child.props.emoji);
			} else if (child.type === 'emojiCode') {
				// Re-wrap the emoji name in colons so it reads like the original shortcode.
				pieces.push(':', child.props.name, ':');
			}
		});

		const { url } = candidate.props;
		const name = pieces.length > 0 ? pieces.join('') : null;
		byUrl.set(url, { type: 'Image', url, name });
	});

	return [...byUrl.values()];
}
/**
 * Parses MFM source into its node tree, converting any exception raised by the parser into a null result.
 * @param mfm MFM source to parse.
 * @returns The parsed nodes, or null if parsing threw.
 */
function parseMfm(mfm: string): MfmNode[] | null {
	let nodes: MfmNode[] | null = null;
	try {
		nodes = parse(mfm);
	} catch {
		// Invalid MFM is not treated as an error; the caller simply receives null.
	}
	return nodes;
}

View file

@ -29,6 +29,7 @@ import { IdentifiableError } from '@/misc/identifiable-error.js';
import { isRetryableError } from '@/misc/is-retryable-error.js';
import { renderInlineError } from '@/misc/render-inline-error.js';
import { extractMediaFromHtml } from '@/core/activitypub/misc/extract-media-from-html.js';
import { extractMediaFromMfm } from '@/core/activitypub/misc/extract-media-from-mfm.js';
import { getContentByType } from '@/core/activitypub/misc/get-content-by-type.js';
import { getOneApId, getApId, validPost, isEmoji, getApType, isApObject, isDocument, IApDocument, isLink } from '../type.js';
import { ApLoggerService } from '../ApLoggerService.js';
@ -724,20 +725,17 @@ export class ApNoteService {
}
}
// Extract inline media from markdown content.
// TODO We first need to implement support for "!" prefix in sfm-js.
// That will be implemented as part of https://activitypub.software/TransFem-org/Sharkey/-/issues/1105
// const markdownContent =
// getContentByType(note, 'text/x.misskeymarkdown') ??
// getContentByType(note, 'text/markdown');
// if (markdownContent) {
// for (const attach of extractMediaFromMarkdown(markdownContent)) {
// if (hasUrl(attach)) {
// attach.sensitive ??= note.sensitive;
// attachments.set(attach.url, attach);
// }
// }
// }
// Extract inline media from MFM / markdown content.
const mfmContent =
getContentByType(note, 'text/x.misskeymarkdown') ??
getContentByType(note, 'text/markdown');
if (mfmContent) {
for (const attach of extractMediaFromMfm(mfmContent)) {
if (hasUrl(attach)) {
attachments.set(attach.url, attach);
}
}
}
// Some software (Peertube) attaches a thumbnail under "icon" instead of "attachment"
const icon = getBestIcon(note);

View file

@ -0,0 +1,92 @@
/*
* SPDX-FileCopyrightText: hazelnoot and other Sharkey contributors
* SPDX-License-Identifier: AGPL-3.0-only
*/
import { extractMediaFromMfm } from '@/core/activitypub/misc/extract-media-from-mfm.js';
// Spec suite for extractMediaFromMfm: checks which link syntaxes are reported as images,
// how the image name is assembled from the link text, and how repeated URLs are collapsed.
describe(extractMediaFromMfm, () => {
// An empty string contains no links, so nothing is extracted.
it('should return empty for empty input', () => {
const result = extractMediaFromMfm('');
expect(result).toEqual([]);
});
// NOTE(review): it is not visible from here whether the parser actually throws on this input;
// the expectation holds either way, because the input contains no image link.
it('should return empty for invalid input', () => {
const result = extractMediaFromMfm('*broken markdown\0');
expect(result).toEqual([]);
});
// Covers three shapes: an image with alt text, an image with empty alt text (name is null),
// and an image nested inside bold markup.
it('should extract all image links', () => {
const result = extractMediaFromMfm(`
![1](https://example.com/images/1.png)
![](https://example.com/images/2.png)
**![3](https://example.com/images/3.png)**
`);
expect(result).toEqual([
{
type: 'Image',
url: 'https://example.com/images/1.png',
name: '1',
},
{
type: 'Image',
url: 'https://example.com/images/2.png',
name: null,
},
{
type: 'Image',
url: 'https://example.com/images/3.png',
name: '3',
},
]);
});
// Same fixtures as above but without the "!" prefix: plain links must not be reported as media.
it('should ignore regular links', () => {
const result = extractMediaFromMfm(`
[1](https://example.com/images/1.png)
[](https://example.com/images/2.png)
**[3](https://example.com/images/3.png)**
`);
expect(result).toEqual([]);
});
// Same fixtures with the "?" prefix (silent links): these must not be reported as media either.
it('should ignore silent links', () => {
const result = extractMediaFromMfm(`
?[1](https://example.com/images/1.png)
?[](https://example.com/images/2.png)
**?[3](https://example.com/images/3.png)**
`);
expect(result).toEqual([]);
});
// Formatting markup inside the alt text is dropped from the name, while emoji shortcodes
// (with their colons) and unicode emoji are kept.
it('should extract complex text', () => {
const result = extractMediaFromMfm('![this is an **image** with *complex* text! :owo: 💙](https://example.com/image.png)');
expect(result).toEqual([
{
type: 'Image',
url: 'https://example.com/image.png',
name: 'this is an image with complex text! :owo: 💙',
},
]);
});
// All three links share one URL, so a single document is expected; its name comes from the
// last occurrence ('3'), not the first ('1') or the empty one.
it('should de-duplicate images', () => {
const result = extractMediaFromMfm(`
![1](https://example.com/images/1.png)
![](https://example.com/images/1.png)
**![3](https://example.com/images/1.png)**
`);
expect(result).toEqual([
{
type: 'Image',
url: 'https://example.com/images/1.png',
name: '3',
},
]);
});
});