merge: Extract inline media from remote posts (resolves #1074, #1104, and #1105) (!1113)

View MR for information: https://activitypub.software/TransFem-org/Sharkey/-/merge_requests/1113

Closes #1074, #1104, and #1105

Approved-by: dakkar <dakkar@thenautilus.net>
Approved-by: Marie <github@yuugi.dev>
This commit is contained in:
Hazelnoot 2025-06-19 23:36:44 +00:00
commit 8926ba06a6
16 changed files with 960 additions and 181 deletions

View file

@ -0,0 +1,297 @@
/*
* SPDX-FileCopyrightText: hazelnoot and other Sharkey contributors
* SPDX-License-Identifier: AGPL-3.0-only
*/
import { extractMediaFromHtml } from '@/core/activitypub/misc/extract-media-from-html.js';
describe(extractMediaFromHtml, () => {
it('should return empty for invalid input', () => {
const result = extractMediaFromHtml('<broken html');
expect(result).toEqual([]);
});
it('should return empty for empty input', () => {
const result = extractMediaFromHtml('');
expect(result).toEqual([]);
});
it('should return empty for input without attachments', () => {
const result = extractMediaFromHtml('<div>No media here!</div>');
expect(result).toEqual([]);
});
it('should extract img tags', () => {
const result = extractMediaFromHtml('<img src="https://example.com/img.png" alt=""/>');
expect(result).toEqual([{
type: 'Image',
url: 'https://example.com/img.png',
name: null,
}]);
});
it('should ignore img tags without src', () => {
const result = extractMediaFromHtml('<img alt=""/>');
expect(result).toEqual([]);
});
it('should extract picture tags with img', () => {
const result = extractMediaFromHtml('<picture><img src="https://example.com/picture.png" alt=""/></picture>');
expect(result).toEqual([{
type: 'Image',
url: 'https://example.com/picture.png',
name: null,
}]);
});
it('should ignore picture tags without img', () => {
const result = extractMediaFromHtml('<picture><source src="https://example.com/picture.png"/></picture>');
expect(result).toEqual([]);
});
it('should ignore picture tags without src', () => {
const result = extractMediaFromHtml('<picture><source/><img alt=""/></picture>');
expect(result).toEqual([]);
});
it('should extract object tags', () => {
const result = extractMediaFromHtml('<object data="https://example.com/object.dat"></object>');
expect(result).toEqual([{
type: 'Document',
url: 'https://example.com/object.dat',
name: null,
}]);
});
it('should ignore object tags without data', () => {
const result = extractMediaFromHtml('<object></object>');
expect(result).toEqual([]);
});
it('should extract object tags with img fallback', () => {
const result = extractMediaFromHtml('<object><img src="https://example.com/object.png" alt=""/></object>');
expect(result).toEqual([{
type: 'Image',
url: 'https://example.com/object.png',
name: null,
}]);
});
it('should ignore object tags with empty img fallback', () => {
const result = extractMediaFromHtml('<object><img alt=""/></object>');
expect(result).toEqual([]);
});
it('should extract embed tags', () => {
const result = extractMediaFromHtml('<embed src="https://example.com/embed.dat"/>');
expect(result).toEqual([{
type: 'Document',
url: 'https://example.com/embed.dat',
name: null,
}]);
});
it('should ignore embed tags without src', () => {
const result = extractMediaFromHtml('<embed/>');
expect(result).toEqual([]);
});
it('should extract audio tags', () => {
const result = extractMediaFromHtml('<audio src="https://example.com/audio.mp3"></audio>');
expect(result).toEqual([{
type: 'Audio',
url: 'https://example.com/audio.mp3',
name: null,
}]);
});
it('should ignore audio tags without src', () => {
const result = extractMediaFromHtml('<audio></audio>');
expect(result).toEqual([]);
});
it('should extract video tags', () => {
const result = extractMediaFromHtml('<video src="https://example.com/video.mp4"></video>');
expect(result).toEqual([{
type: 'Video',
url: 'https://example.com/video.mp4',
name: null,
}]);
});
it('should ignore video tags without src', () => {
const result = extractMediaFromHtml('<video></video>');
expect(result).toEqual([]);
});
it('should extract alt text from alt property', () => {
const result = extractMediaFromHtml(`
<img src="https://example.com/img.png" alt="img tag" title="wrong"/>
<picture><img src="https://example.com/picture.png" alt="picture tag" title="wrong"/></picture>
<object data="https://example.com/object-1.dat" alt="object tag" title="wrong"></object>
<object><img src="https://example.com/object-2.png" alt="object tag" title="wrong"/></object>
<embed src="https://example.com/embed.dat" alt="embed tag" title="wrong"/>
<audio src="https://example.com/audio.mp3" alt="audio tag" title="wrong"/>
<video src="https://example.com/video.mp4" alt="video tag" title="wrong"/>
`);
expect(result).toEqual([
{
type: 'Image',
url: 'https://example.com/img.png',
name: 'img tag',
},
{
type: 'Image',
url: 'https://example.com/picture.png',
name: 'picture tag',
},
{
type: 'Image',
url: 'https://example.com/object-2.png',
name: 'object tag',
},
{
type: 'Document',
url: 'https://example.com/object-1.dat',
name: 'object tag',
},
{
type: 'Document',
url: 'https://example.com/embed.dat',
name: 'embed tag',
},
{
type: 'Audio',
url: 'https://example.com/audio.mp3',
name: 'audio tag',
},
{
type: 'Video',
url: 'https://example.com/video.mp4',
name: 'video tag',
},
]);
});
it('should extract alt text from title property', () => {
const result = extractMediaFromHtml(`
<img src="https://example.com/img.png" title="img tag"/>
<picture><img src="https://example.com/picture.png" title="picture tag"/></picture>
<object data="https://example.com/object-1.dat" title="object tag"></object>
<object><img src="https://example.com/object-2.png" title="object tag"/></object>
<embed src="https://example.com/embed.dat" title="embed tag"/>
<audio src="https://example.com/audio.mp3" title="audio tag"/>
<video src="https://example.com/video.mp4" title="video tag"/>
`);
expect(result).toEqual([
{
type: 'Image',
url: 'https://example.com/img.png',
name: 'img tag',
},
{
type: 'Image',
url: 'https://example.com/picture.png',
name: 'picture tag',
},
{
type: 'Image',
url: 'https://example.com/object-2.png',
name: 'object tag',
},
{
type: 'Document',
url: 'https://example.com/object-1.dat',
name: 'object tag',
},
{
type: 'Document',
url: 'https://example.com/embed.dat',
name: 'embed tag',
},
{
type: 'Audio',
url: 'https://example.com/audio.mp3',
name: 'audio tag',
},
{
type: 'Video',
url: 'https://example.com/video.mp4',
name: 'video tag',
},
]);
});
it('should ignore missing alt text', () => {
const result = extractMediaFromHtml(`
<img src="https://example.com/img.png"/>
<picture><img src="https://example.com/picture.png"/></picture>
<object data="https://example.com/object-1.dat"></object>
<object><img src="https://example.com/object-2.png"/></object>
<embed src="https://example.com/embed.dat"/>
<audio src="https://example.com/audio.mp3"/>
<video src="https://example.com/video.mp4"/>
`);
expect(result).toEqual([
{
type: 'Image',
url: 'https://example.com/img.png',
name: null,
},
{
type: 'Image',
url: 'https://example.com/picture.png',
name: null,
},
{
type: 'Image',
url: 'https://example.com/object-2.png',
name: null,
},
{
type: 'Document',
url: 'https://example.com/object-1.dat',
name: null,
},
{
type: 'Document',
url: 'https://example.com/embed.dat',
name: null,
},
{
type: 'Audio',
url: 'https://example.com/audio.mp3',
name: null,
},
{
type: 'Video',
url: 'https://example.com/video.mp4',
name: null,
},
]);
});
it('should de-duplicate attachments', () => {
const result = extractMediaFromHtml(`
<img src="https://example.com/1.png" alt="img 1"/>
<img src="https://example.com/2.png" alt="img 2"/>
<embed src="https://example.com/1.png" alt="embed 1"/>
`);
expect(result).toEqual([
{
type: 'Document',
url: 'https://example.com/1.png',
name: 'embed 1',
},
{
type: 'Image',
url: 'https://example.com/2.png',
name: 'img 2',
},
]);
});
});

View file

@ -0,0 +1,92 @@
/*
* SPDX-FileCopyrightText: hazelnoot and other Sharkey contributors
* SPDX-License-Identifier: AGPL-3.0-only
*/
import { extractMediaFromMfm } from '@/core/activitypub/misc/extract-media-from-mfm.js';
describe(extractMediaFromMfm, () => {
it('should return empty for empty input', () => {
const result = extractMediaFromMfm('');
expect(result).toEqual([]);
});
it('should return empty for invalid input', () => {
const result = extractMediaFromMfm('*broken markdown\0');
expect(result).toEqual([]);
});
it('should extract all image links', () => {
const result = extractMediaFromMfm(`
![1](https://example.com/images/1.png)
![](https://example.com/images/2.png)
**![3](https://example.com/images/3.png)**
`);
expect(result).toEqual([
{
type: 'Image',
url: 'https://example.com/images/1.png',
name: '1',
},
{
type: 'Image',
url: 'https://example.com/images/2.png',
name: null,
},
{
type: 'Image',
url: 'https://example.com/images/3.png',
name: '3',
},
]);
});
it('should ignore regular links', () => {
const result = extractMediaFromMfm(`
[1](https://example.com/images/1.png)
[](https://example.com/images/2.png)
**[3](https://example.com/images/3.png)**
`);
expect(result).toEqual([]);
});
it('should ignore silent links', () => {
const result = extractMediaFromMfm(`
?[1](https://example.com/images/1.png)
?[](https://example.com/images/2.png)
**?[3](https://example.com/images/3.png)**
`);
expect(result).toEqual([]);
});
it('should extract complex text', () => {
const result = extractMediaFromMfm('![this is an **image** with *complex* text! :owo: 💙](https://example.com/image.png)');
expect(result).toEqual([
{
type: 'Image',
url: 'https://example.com/image.png',
name: 'this is an image with complex text! :owo: 💙',
},
]);
});
it('should de-duplicate images', () => {
const result = extractMediaFromMfm(`
![1](https://example.com/images/1.png)
![](https://example.com/images/1.png)
**![3](https://example.com/images/1.png)**
`);
expect(result).toEqual([
{
type: 'Image',
url: 'https://example.com/images/1.png',
name: '3',
},
]);
});
});

View file

@ -0,0 +1,167 @@
/*
* SPDX-FileCopyrightText: hazelnoot and other Sharkey contributors
* SPDX-License-Identifier: AGPL-3.0-only
*/
import { getContentByType } from '@/core/activitypub/misc/get-content-by-type.js';
describe(getContentByType, () => {
describe('when permissive', () => {
it('should return source.content when it matches', () => {
const obj = {
source: {
content: 'source content',
},
_misskey_content: 'misskey content',
content: 'native content',
mediaType: 'text/x.misskeYMarkdown, text/markdown',
};
const content = getContentByType(obj, 'text/x.misskeymarkdown', true);
expect(content).toBe('source content');
});
it('should return _misskey_content when it matches', () => {
const obj = {
source: {
content: 'source content',
mediaType: 'text/plain',
},
_misskey_content: 'misskey content',
content: 'native content',
mediaType: 'text/x.misskeYMarkdown, text/markdown',
};
const content = getContentByType(obj, 'text/x.misskeymarkdown', true);
expect(content).toBe('misskey content');
});
it('should return content when it matches', () => {
const obj = {
source: {
content: 'source content',
mediaType: 'text/plain',
},
_misskey_content: null,
content: 'native content',
mediaType: 'text/x.misskeYMarkdown, text/markdown',
};
const content = getContentByType(obj, 'text/x.misskeymarkdown', true);
expect(content).toBe('native content');
});
it('should return null when nothing matches', () => {
const obj = {
source: {
content: 'source content',
mediaType: 'text/plain',
},
_misskey_content: null,
content: 'native content',
mediaType: 'text/plain',
};
const content = getContentByType(obj, 'text/x.misskeymarkdown', true);
expect(content).toBe(null);
});
it('should return null for invalid inputs', () => {
const objects = [
{},
{ source: 'nope' },
{ content: null },
{ _misskey_content: 123 },
];
const results = objects.map(c => getContentByType(c, 'text/misskeymarkdown', true));
const expected = objects.map(() => null);
expect(results).toEqual(expected);
});
});
describe('when not permissive', () => {
it('should return source.content when it matches', () => {
const obj = {
source: {
content: 'source content',
mediaType: 'text/x.misskeymarkdown',
},
_misskey_content: 'misskey content',
content: 'native content',
mediaType: 'text/x.misskeymarkdown',
};
const content = getContentByType(obj, 'text/x.misskeymarkdown');
expect(content).toBe('source content');
});
it('should return _misskey_content when it matches', () => {
const obj = {
source: {
content: 'source content',
mediaType: 'text/plain',
},
_misskey_content: 'misskey content',
content: 'native content',
mediaType: 'text/x.misskeymarkdown',
};
const content = getContentByType(obj, 'text/x.misskeymarkdown');
expect(content).toBe('misskey content');
});
it('should return content when it matches', () => {
const obj = {
source: {
content: 'source content',
mediaType: 'text/plain',
},
_misskey_content: null,
content: 'native content',
mediaType: 'text/x.misskeymarkdown',
};
const content = getContentByType(obj, 'text/x.misskeymarkdown');
expect(content).toBe('native content');
});
it('should return null when nothing matches', () => {
const obj = {
source: {
content: 'source content',
mediaType: 'text/plain',
},
_misskey_content: null,
content: 'native content',
mediaType: 'text/plain',
};
const content = getContentByType(obj, 'text/x.misskeymarkdown');
expect(content).toBe(null);
});
it('should return null for invalid inputs', () => {
const objects = [
{},
{ source: 'nope' },
{ content: null },
{ _misskey_content: 123 },
];
const results = objects.map(c => getContentByType(c, 'text/misskeymarkdown'));
const expected = objects.map(() => null);
expect(results).toEqual(expected);
});
});
});