diff --git a/packages/backend/src/core/activitypub/misc/extract-media-from-html.ts b/packages/backend/src/core/activitypub/misc/extract-media-from-html.ts
new file mode 100644
index 0000000000..9e898e965b
--- /dev/null
+++ b/packages/backend/src/core/activitypub/misc/extract-media-from-html.ts
@@ -0,0 +1,83 @@
+/*
+ * SPDX-FileCopyrightText: hazelnoot and other Sharkey contributors
+ * SPDX-License-Identifier: AGPL-3.0-only
+ */
+
+import { load as cheerio } from 'cheerio/slim';
+import type { IApDocument } from '@/core/activitypub/type.js';
+import type { CheerioAPI } from 'cheerio/slim';
+
+/**
+ * Finds HTML elements representing inline media and returns them as simulated AP documents.
+ * Returns an empty array if the input cannot be parsed, or no media was found.
+ * @param html Input HTML to analyze.
+ */
+export function extractMediaFromHtml(html: string): IApDocument[] {
+ const $ = parseHtml(html);
+ if (!$) return [];
+
+ const attachments: IApDocument[] = [];
+
+ // tags, including and fallback elements
+ // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/img
+ $('img[src]')
+ .toArray()
+ .forEach(img => attachments.push({
+ type: 'Image',
+ url: img.attribs.src,
+ name: img.attribs.alt || img.attribs.title || null,
+ }));
+
+ // tags
+ // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/object
+ $('object[data]')
+ .toArray()
+ .forEach(object => attachments.push({
+ type: 'Document',
+ url: object.attribs.data,
+ name: object.attribs.alt || object.attribs.title || null,
+ }));
+
+ // tags
+ // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/embed
+ $('embed[src]')
+ .toArray()
+ .forEach(embed => attachments.push({
+ type: 'Document',
+ url: embed.attribs.src,
+ name: embed.attribs.alt || embed.attribs.title || null,
+ }));
+
+ // tags
+ // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/audio
+ $('audio[src]')
+ .toArray()
+ .forEach(audio => attachments.push({
+ type: 'Audio',
+ url: audio.attribs.src,
+ name: audio.attribs.alt || audio.attribs.title || null,
+ }));
+
+ // tags
+ // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/video
+ $('video[src]')
+ .toArray()
+ .forEach(audio => attachments.push({
+ type: 'Video',
+ url: audio.attribs.src,
+ name: audio.attribs.alt || audio.attribs.title || null,
+ }));
+
+ // TODO support ? We would need to extract it directly from the HTML and save to a temp file.
+
+ return attachments;
+}
+
+function parseHtml(html: string): CheerioAPI | null {
+ try {
+ return cheerio(html);
+ } catch {
+ // Don't worry about invalid HTML
+ return null;
+ }
+}
diff --git a/packages/backend/src/core/activitypub/misc/get-content-by-type.ts b/packages/backend/src/core/activitypub/misc/get-content-by-type.ts
new file mode 100644
index 0000000000..1b0d9f8d04
--- /dev/null
+++ b/packages/backend/src/core/activitypub/misc/get-content-by-type.ts
@@ -0,0 +1,74 @@
+/*
+ * SPDX-FileCopyrightText: hazelnoot and other Sharkey contributors
+ * SPDX-License-Identifier: AGPL-3.0-only
+ */
+
+import type { IPost } from '@/core/activitypub/type.js';
+import { toArray } from '@/misc/prelude/array.js';
+
+/**
+ * Gets content of a specified media type from a provided object.
+ *
+ * Optionally supports a "permissive" mode which enables the following changes:
+ * 1. MIME types are checked in a case-insensitive manner.
+ * 2. MIME types are matched based on inclusion, not strict equality.
+ * 3. A candidate content is considered to match if it has no specified MIME type.
+ *
+ * Note: this method is written defensively to protect against malform remote objects.
+ * When extending or modifying it, please be sure to work with "unknown" type and validate everything.
+ *
+ * Note: the logic in this method is carefully ordered to match the selection priority of existing code in ApNoteService.
+ * Please do not re-arrange it without testing!
+ * New checks can be added to the end of the method to safely extend the existing logic.
+ *
+ * @param object AP object to extract content from.
+ * @param mimeType MIME type to look for.
+ * @param permissive Enables permissive mode, as described above. Defaults to false (disabled).
+ */
+export function getContentByType(object: IPost | Record, mimeType: string, permissive = false): string | null {
+ // Case 1: Extended "source" property
+ if (object.source && typeof(object.source) === 'object') {
+ // "source" is permitted to be an array, though no implementations are known to do this yet.
+ const sources = toArray(object.source) as Record[];
+ for (const source of sources) {
+ if (typeof (source.content) === 'string' && checkMediaType(source.mediaType)) {
+ return source.content;
+ }
+ }
+ }
+
+ // Case 2: Special case for MFM
+ if (typeof(object._misskey_content) === 'string' && mimeType === 'text/x.misskeymarkdown') {
+ return object._misskey_content;
+ }
+
+ // Case 3: AP native "content" property
+ if (typeof(object.content) === 'string' && checkMediaType(object.mediaType)) {
+ return object.content;
+ }
+
+ return null;
+
+ // Checks if the provided media type matches the input parameters.
+ function checkMediaType(mediaType: unknown): boolean {
+ if (typeof(mediaType) === 'string') {
+ // Strict match
+ if (mediaType === mimeType) {
+ return true;
+ }
+
+ // Permissive match
+ if (permissive && mediaType.toLowerCase().includes(mimeType)) {
+ return true;
+ }
+ }
+
+ // Permissive fallback match
+ if (permissive && mediaType == null) {
+ return true;
+ }
+
+ // No match
+ return false;
+ }
+}
diff --git a/packages/backend/src/core/activitypub/models/ApNoteService.ts b/packages/backend/src/core/activitypub/models/ApNoteService.ts
index 300cdbd0a0..aa0f6d0140 100644
--- a/packages/backend/src/core/activitypub/models/ApNoteService.ts
+++ b/packages/backend/src/core/activitypub/models/ApNoteService.ts
@@ -5,7 +5,6 @@
import { forwardRef, Inject, Injectable } from '@nestjs/common';
import { In } from 'typeorm';
-import { load as cheerio } from 'cheerio/slim';
import { UnrecoverableError } from 'bullmq';
import { DI } from '@/di-symbols.js';
import type { UsersRepository, PollsRepository, EmojisRepository, NotesRepository, MiMeta } from '@/models/_.js';
@@ -28,6 +27,8 @@ import { checkHttps } from '@/misc/check-https.js';
import { IdentifiableError } from '@/misc/identifiable-error.js';
import { isRetryableError } from '@/misc/is-retryable-error.js';
import { renderInlineError } from '@/misc/render-inline-error.js';
+import { extractMediaFromHtml } from '@/core/activitypub/misc/extract-media-from-html.js';
+import { getContentByType } from '@/core/activitypub/misc/get-content-by-type.js';
import { getOneApId, getApId, validPost, isEmoji, getApType, isApObject, isDocument, IApDocument, isLink } from '../type.js';
import { ApLoggerService } from '../ApLoggerService.js';
import { ApMfmService } from '../ApMfmService.js';
@@ -42,7 +43,6 @@ import { ApQuestionService } from './ApQuestionService.js';
import { ApImageService } from './ApImageService.js';
import type { Resolver } from '../ApResolverService.js';
import type { IObject, IPost } from '../type.js';
-import type { CheerioAPI } from 'cheerio/slim';
@Injectable()
export class ApNoteService {
@@ -208,12 +208,8 @@ export class ApNoteService {
const cw = note.summary === '' ? null : note.summary;
// テキストのパース
- let text: string | null = null;
- if (note.source?.mediaType === 'text/x.misskeymarkdown' && typeof note.source.content === 'string') {
- text = note.source.content;
- } else if (typeof note._misskey_content !== 'undefined') {
- text = note._misskey_content;
- } else if (typeof note.content === 'string') {
+ let text = getContentByType(note, 'text/x.misskeymarkdown');
+ if (text == null && typeof note.content === 'string') {
text = this.apMfmService.htmlToMfm(note.content, note.tag);
}
@@ -251,31 +247,9 @@ export class ApNoteService {
}
// 添付ファイル
- const files: MiDriveFile[] = [];
-
- for (const attach of toArray(note.attachment)) {
- attach.sensitive ??= note.sensitive;
- const file = await this.apImageService.resolveImage(actor, attach);
- if (file) files.push(file);
- }
-
- // Some software (Peertube) attaches a thumbnail under "icon" instead of "attachment"
- const icon = getBestIcon(note);
- if (icon) {
- icon.sensitive ??= note.sensitive;
- const file = await this.apImageService.resolveImage(actor, icon);
- if (file) files.push(file);
- }
-
- // Extract inline media from note content.
- // Don't use source.content, _misskey_content, or anything else because those aren't HTML.
- if (note.content) {
- for (const attach of extractInlineMedia(note.content)) {
- attach.sensitive ??= note.sensitive;
- const file = await this.apImageService.resolveImage(actor, attach);
- if (file) files.push(file);
- }
- }
+ // Note: implementation moved to getAttachment function to avoid duplication.
+ // Please copy any upstream changes to that method! (It's in the bottom of this class)
+ const files = await this.getAttachments(note, actor);
// リプライ
const reply: MiNote | null = note.inReplyTo
@@ -424,12 +398,8 @@ export class ApNoteService {
const cw = note.summary === '' ? null : note.summary;
// テキストのパース
- let text: string | null = null;
- if (note.source?.mediaType === 'text/x.misskeymarkdown' && typeof note.source.content === 'string') {
- text = note.source.content;
- } else if (typeof note._misskey_content !== 'undefined') {
- text = note._misskey_content;
- } else if (typeof note.content === 'string') {
+ let text = getContentByType(note, 'text/x.misskeymarkdown');
+ if (text == null && typeof note.content === 'string') {
text = this.apMfmService.htmlToMfm(note.content, note.tag);
}
@@ -459,31 +429,7 @@ export class ApNoteService {
}
// 添付ファイル
- const files: MiDriveFile[] = [];
-
- for (const attach of toArray(note.attachment)) {
- attach.sensitive ??= note.sensitive;
- const file = await this.apImageService.resolveImage(actor, attach);
- if (file) files.push(file);
- }
-
- // Some software (Peertube) attaches a thumbnail under "icon" instead of "attachment"
- const icon = getBestIcon(note);
- if (icon) {
- icon.sensitive ??= note.sensitive;
- const file = await this.apImageService.resolveImage(actor, icon);
- if (file) files.push(file);
- }
-
- // Extract inline media from note content.
- // Don't use source.content, _misskey_content, or anything else because those aren't HTML.
- if (note.content) {
- for (const attach of extractInlineMedia(note.content)) {
- attach.sensitive ??= note.sensitive;
- const file = await this.apImageService.resolveImage(actor, attach);
- if (file) files.push(file);
- }
- }
+ const files = await this.getAttachments(note, actor);
// リプライ
const reply: MiNote | null = note.inReplyTo
@@ -744,6 +690,55 @@ export class ApNoteService {
// Permanent error - return null
return null;
}
+
+ /**
+ * Extracts and saves all media attachments from the provided note.
+ * Returns an array of all the created files.
+ * TODO: suppress errors and set a processError entry instead.
+ * TODO: run in parallel (with promiseLimit!)
+ */
+ private async getAttachments(note: IPost, actor: MiRemoteUser): Promise {
+ const files: MiDriveFile[] = [];
+
+ for (const attach of toArray(note.attachment)) {
+ attach.sensitive ??= note.sensitive;
+ const file = await this.apImageService.resolveImage(actor, attach);
+ if (file) files.push(file);
+ }
+
+ // Some software (Peertube) attaches a thumbnail under "icon" instead of "attachment"
+ const icon = getBestIcon(note);
+ if (icon) {
+ icon.sensitive ??= note.sensitive;
+ const file = await this.apImageService.resolveImage(actor, icon);
+ if (file) files.push(file);
+ }
+
+ // Extract inline media from markdown content.
+ // Don't use source.content, _misskey_content, or anything else because those aren't HTML.
+ const htmlContent = getContentByType(note, 'text/html');
+ if (htmlContent) {
+ for (const attach of extractMediaFromHtml(htmlContent)) {
+ attach.sensitive ??= note.sensitive;
+ const file = await this.apImageService.resolveImage(actor, attach);
+ if (file) files.push(file);
+ }
+ }
+
+ // Extract inline media from markdown content.
+ // TODO We first need to implement support for "!" prefix in sfm-js.
+ // That will be implemented as part of https://activitypub.software/TransFem-org/Sharkey/-/issues/1105
+ // const markdownContent = getContentByType(note, 'text/markdown') || text;
+ // if (markdownContent) {
+ // for (const attach of extractMediaFromMarkdown(markdownContent)) {
+ // attach.sensitive ??= note.sensitive;
+ // const file = await this.apImageService.resolveImage(actor, attach);
+ // if (file) files.push(file);
+ // }
+ // }
+
+ return files;
+ }
}
function getBestIcon(note: IObject): IObject | null {
@@ -764,72 +759,3 @@ function getBestIcon(note: IObject): IObject | null {
}, null as IApDocument | null) ?? null;
}
-function extractInlineMedia(html: string): IApDocument[] {
- const $ = parseHtml(html);
- if (!$) return [];
-
- const attachments: IApDocument[] = [];
-
- // tags, including and fallback elements
- // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/img
- $('img[src]')
- .toArray()
- .forEach(img => attachments.push({
- type: 'Image',
- url: img.attribs.src,
- name: img.attribs.alt || img.attribs.title || null,
- }));
-
- // tags
- // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/object
- $('object[data]')
- .toArray()
- .forEach(object => attachments.push({
- type: 'Document',
- url: object.attribs.data,
- name: object.attribs.alt || object.attribs.title || null,
- }));
-
- // tags
- // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/embed
- $('embed[src]')
- .toArray()
- .forEach(embed => attachments.push({
- type: 'Document',
- url: embed.attribs.src,
- name: embed.attribs.alt || embed.attribs.title || null,
- }));
-
- // tags
- // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/audio
- $('audio[src]')
- .toArray()
- .forEach(audio => attachments.push({
- type: 'Audio',
- url: audio.attribs.src,
- name: audio.attribs.alt || audio.attribs.title || null,
- }));
-
- // tags
- // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/video
- $('video[src]')
- .toArray()
- .forEach(audio => attachments.push({
- type: 'Video',
- url: audio.attribs.src,
- name: audio.attribs.alt || audio.attribs.title || null,
- }));
-
- // TODO support ? we will need to extract it directly from the HTML.
-
- return attachments;
-}
-
-function parseHtml(html: string): CheerioAPI | null {
- try {
- return cheerio(html);
- } catch {
- // Don't worry about invalid HTML
- return null;
- }
-}