factor out attachment code to reduce duplication and allow testing

This commit is contained in:
Hazelnoot 2025-06-13 12:56:14 -04:00
parent b0b0218b75
commit 5430b00f72
3 changed files with 216 additions and 133 deletions

View file

@ -0,0 +1,83 @@
/*
* SPDX-FileCopyrightText: hazelnoot and other Sharkey contributors
* SPDX-License-Identifier: AGPL-3.0-only
*/
import { load as cheerio } from 'cheerio/slim';
import type { IApDocument } from '@/core/activitypub/type.js';
import type { CheerioAPI } from 'cheerio/slim';
/**
 * Scans an HTML string for inline media elements and converts each one into a simulated AP document.
 * Results are grouped by element kind in a fixed order: <img>, <object>, <embed>, <audio>, then <video>.
 * Each document's name is taken from the element's "alt" attribute, falling back to "title", then null.
 * Returns an empty array if the input cannot be parsed, or no media was found.
 * @param html Input HTML to analyze.
 */
export function extractMediaFromHtml(html: string): IApDocument[] {
	const $ = parseHtml(html);
	if (!$) return [];

	// Each rule is [CSS selector, attribute holding the media URL, AP document type to emit].
	// The order of this table determines the order of the returned attachments.
	const rules = [
		// <img> tags, including <picture> and <object> fallback elements
		// https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/img
		['img[src]', 'src', 'Image'],

		// <object> tags
		// https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/object
		['object[data]', 'data', 'Document'],

		// <embed> tags
		// https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/embed
		['embed[src]', 'src', 'Document'],

		// <audio> tags
		// https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/audio
		['audio[src]', 'src', 'Audio'],

		// <video> tags
		// https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/video
		['video[src]', 'src', 'Video'],
	] as const;

	const attachments: IApDocument[] = [];
	for (const [selector, urlAttribute, type] of rules) {
		for (const element of $(selector).toArray()) {
			attachments.push({
				type,
				url: element.attribs[urlAttribute],
				name: element.attribs.alt || element.attribs.title || null,
			});
		}
	}

	// TODO support <svg>? We would need to extract it directly from the HTML and save to a temp file.

	return attachments;
}
/**
 * Loads an HTML string into cheerio.
 * Returns null instead of propagating the error if loading throws.
 * @param html HTML to load.
 */
function parseHtml(html: string): CheerioAPI | null {
	let parsed: CheerioAPI | null = null;

	try {
		parsed = cheerio(html);
	} catch {
		// Don't worry about invalid HTML - the caller receives null and treats it as "no media".
	}

	return parsed;
}

View file

@ -0,0 +1,74 @@
/*
* SPDX-FileCopyrightText: hazelnoot and other Sharkey contributors
* SPDX-License-Identifier: AGPL-3.0-only
*/
import type { IPost } from '@/core/activitypub/type.js';
import { toArray } from '@/misc/prelude/array.js';
/**
 * Gets content of a specified media type from a provided object.
 *
 * Optionally supports a "permissive" mode which enables the following changes:
 * 1. MIME types are checked in a case-insensitive manner (both the candidate and the requested type are lower-cased).
 * 2. MIME types are matched based on inclusion, not strict equality.
 * 3. A candidate content is considered to match if it has no specified MIME type.
 *
 * Note: this method is written defensively to protect against malformed remote objects.
 * When extending or modifying it, please be sure to work with "unknown" type and validate everything.
 *
 * Note: the logic in this method is carefully ordered to match the selection priority of existing code in ApNoteService.
 * Please do not re-arrange it without testing!
 * New checks can be added to the end of the method to safely extend the existing logic.
 *
 * @param object AP object to extract content from.
 * @param mimeType MIME type to look for.
 * @param permissive Enables permissive mode, as described above. Defaults to false (disabled).
 * @returns The first matching content string, or null if no candidate matched.
 */
export function getContentByType(object: IPost | Record<string, unknown>, mimeType: string, permissive = false): string | null {
	// Lower-cased once up front so permissive matching is case-insensitive on BOTH sides of the comparison.
	const lowerMimeType = mimeType.toLowerCase();

	// Case 1: Extended "source" property
	if (object.source && typeof(object.source) === 'object') {
		// "source" is permitted to be an array, though no implementations are known to do this yet.
		const sources = toArray(object.source) as unknown[];
		for (const source of sources) {
			// Array entries are remote-controlled and may be null or primitives - skip anything that isn't an object.
			if (!isRecord(source)) continue;

			if (typeof(source.content) === 'string' && checkMediaType(source.mediaType)) {
				return source.content;
			}
		}
	}

	// Case 2: Special case for MFM
	if (typeof(object._misskey_content) === 'string' && mimeType === 'text/x.misskeymarkdown') {
		return object._misskey_content;
	}

	// Case 3: AP native "content" property
	if (typeof(object.content) === 'string' && checkMediaType(object.mediaType)) {
		return object.content;
	}

	return null;

	// Checks if the provided media type matches the input parameters.
	function checkMediaType(mediaType: unknown): boolean {
		if (typeof(mediaType) === 'string') {
			// Strict match
			if (mediaType === mimeType) {
				return true;
			}

			// Permissive match: case-insensitive inclusion
			if (permissive && mediaType.toLowerCase().includes(lowerMimeType)) {
				return true;
			}
		}

		// Permissive fallback match: candidate has no media type at all
		if (permissive && mediaType == null) {
			return true;
		}

		// No match
		return false;
	}

	// Narrows an unknown value to a non-null object whose properties can be read safely.
	function isRecord(value: unknown): value is Record<string, unknown> {
		return typeof(value) === 'object' && value !== null;
	}
}

View file

@ -5,7 +5,6 @@
import { forwardRef, Inject, Injectable } from '@nestjs/common';
import { In } from 'typeorm';
import { load as cheerio } from 'cheerio/slim';
import { UnrecoverableError } from 'bullmq';
import { DI } from '@/di-symbols.js';
import type { UsersRepository, PollsRepository, EmojisRepository, NotesRepository, MiMeta } from '@/models/_.js';
@ -28,6 +27,8 @@ import { checkHttps } from '@/misc/check-https.js';
import { IdentifiableError } from '@/misc/identifiable-error.js';
import { isRetryableError } from '@/misc/is-retryable-error.js';
import { renderInlineError } from '@/misc/render-inline-error.js';
import { extractMediaFromHtml } from '@/core/activitypub/misc/extract-media-from-html.js';
import { getContentByType } from '@/core/activitypub/misc/get-content-by-type.js';
import { getOneApId, getApId, validPost, isEmoji, getApType, isApObject, isDocument, IApDocument, isLink } from '../type.js';
import { ApLoggerService } from '../ApLoggerService.js';
import { ApMfmService } from '../ApMfmService.js';
@ -42,7 +43,6 @@ import { ApQuestionService } from './ApQuestionService.js';
import { ApImageService } from './ApImageService.js';
import type { Resolver } from '../ApResolverService.js';
import type { IObject, IPost } from '../type.js';
import type { CheerioAPI } from 'cheerio/slim';
@Injectable()
export class ApNoteService {
@ -208,12 +208,8 @@ export class ApNoteService {
const cw = note.summary === '' ? null : note.summary;
// テキストのパース
let text: string | null = null;
if (note.source?.mediaType === 'text/x.misskeymarkdown' && typeof note.source.content === 'string') {
text = note.source.content;
} else if (typeof note._misskey_content !== 'undefined') {
text = note._misskey_content;
} else if (typeof note.content === 'string') {
let text = getContentByType(note, 'text/x.misskeymarkdown');
if (text == null && typeof note.content === 'string') {
text = this.apMfmService.htmlToMfm(note.content, note.tag);
}
@ -251,31 +247,9 @@ export class ApNoteService {
}
// 添付ファイル
const files: MiDriveFile[] = [];
for (const attach of toArray(note.attachment)) {
attach.sensitive ??= note.sensitive;
const file = await this.apImageService.resolveImage(actor, attach);
if (file) files.push(file);
}
// Some software (Peertube) attaches a thumbnail under "icon" instead of "attachment"
const icon = getBestIcon(note);
if (icon) {
icon.sensitive ??= note.sensitive;
const file = await this.apImageService.resolveImage(actor, icon);
if (file) files.push(file);
}
// Extract inline media from note content.
// Don't use source.content, _misskey_content, or anything else because those aren't HTML.
if (note.content) {
for (const attach of extractInlineMedia(note.content)) {
attach.sensitive ??= note.sensitive;
const file = await this.apImageService.resolveImage(actor, attach);
if (file) files.push(file);
}
}
// Note: implementation moved to the getAttachments method to avoid duplication.
// Please copy any upstream changes to that method! (It's at the bottom of this class.)
const files = await this.getAttachments(note, actor);
// リプライ
const reply: MiNote | null = note.inReplyTo
@ -424,12 +398,8 @@ export class ApNoteService {
const cw = note.summary === '' ? null : note.summary;
// テキストのパース
let text: string | null = null;
if (note.source?.mediaType === 'text/x.misskeymarkdown' && typeof note.source.content === 'string') {
text = note.source.content;
} else if (typeof note._misskey_content !== 'undefined') {
text = note._misskey_content;
} else if (typeof note.content === 'string') {
let text = getContentByType(note, 'text/x.misskeymarkdown');
if (text == null && typeof note.content === 'string') {
text = this.apMfmService.htmlToMfm(note.content, note.tag);
}
@ -459,31 +429,7 @@ export class ApNoteService {
}
// 添付ファイル
const files: MiDriveFile[] = [];
for (const attach of toArray(note.attachment)) {
attach.sensitive ??= note.sensitive;
const file = await this.apImageService.resolveImage(actor, attach);
if (file) files.push(file);
}
// Some software (Peertube) attaches a thumbnail under "icon" instead of "attachment"
const icon = getBestIcon(note);
if (icon) {
icon.sensitive ??= note.sensitive;
const file = await this.apImageService.resolveImage(actor, icon);
if (file) files.push(file);
}
// Extract inline media from note content.
// Don't use source.content, _misskey_content, or anything else because those aren't HTML.
if (note.content) {
for (const attach of extractInlineMedia(note.content)) {
attach.sensitive ??= note.sensitive;
const file = await this.apImageService.resolveImage(actor, attach);
if (file) files.push(file);
}
}
const files = await this.getAttachments(note, actor);
// リプライ
const reply: MiNote | null = note.inReplyTo
@ -744,6 +690,55 @@ export class ApNoteService {
// Permanent error - return null
return null;
}
/**
 * Extracts and saves all media attachments from the provided note.
 * Sources are processed one at a time, in this order: "attachment" entries, the best "icon" (if any),
 * and finally media elements embedded in the note's HTML content.
 * Any media object without its own "sensitive" value inherits the note's value before it is resolved.
 * Returns an array of every file that resolveImage produced.
 * TODO: suppress errors and set a processError entry instead.
 * TODO: run in parallel (with promiseLimit!)
 */
private async getAttachments(note: IPost, actor: MiRemoteUser): Promise<MiDriveFile[]> {
	const files: MiDriveFile[] = [];

	// Shared routine for every attachment source below: inherit sensitivity, resolve, and collect the result.
	const resolveInto = async (media: IObject): Promise<void> => {
		media.sensitive ??= note.sensitive;
		const resolved = await this.apImageService.resolveImage(actor, media);
		if (resolved) files.push(resolved);
	};

	// Regular attachments
	for (const attachment of toArray(note.attachment)) {
		await resolveInto(attachment);
	}

	// Some software (Peertube) attaches a thumbnail under "icon" instead of "attachment"
	const bestIcon = getBestIcon(note);
	if (bestIcon) {
		await resolveInto(bestIcon);
	}

	// Extract inline media from HTML content.
	// Don't use source.content, _misskey_content, or anything else because those aren't HTML.
	const html = getContentByType(note, 'text/html');
	if (html) {
		for (const inlineMedia of extractMediaFromHtml(html)) {
			await resolveInto(inlineMedia);
		}
	}

	// Extract inline media from markdown content.
	// TODO We first need to implement support for "!" prefix in sfm-js.
	//   That will be implemented as part of https://activitypub.software/TransFem-org/Sharkey/-/issues/1105
	// const markdownContent = getContentByType(note, 'text/markdown') || text;
	// if (markdownContent) {
	// 	for (const attach of extractMediaFromMarkdown(markdownContent)) {
	// 		attach.sensitive ??= note.sensitive;
	// 		const file = await this.apImageService.resolveImage(actor, attach);
	// 		if (file) files.push(file);
	// 	}
	// }

	return files;
}
}
function getBestIcon(note: IObject): IObject | null {
@ -764,72 +759,3 @@ function getBestIcon(note: IObject): IObject | null {
}, null as IApDocument | null) ?? null;
}
/**
 * Collects inline media elements from an HTML string as simulated AP documents.
 * Output order is: <img>, <object>, <embed>, <audio>, then <video> elements.
 * Returns an empty array if the HTML could not be parsed.
 */
function extractInlineMedia(html: string): IApDocument[] {
	const $ = parseHtml(html);
	if (!$) return [];

	// <img> tags, including <picture> and <object> fallback elements
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/img
	const images = $('img[src]').toArray().map((element): IApDocument => ({
		type: 'Image',
		url: element.attribs.src,
		name: element.attribs.alt || element.attribs.title || null,
	}));

	// <object> tags
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/object
	const objects = $('object[data]').toArray().map((element): IApDocument => ({
		type: 'Document',
		url: element.attribs.data,
		name: element.attribs.alt || element.attribs.title || null,
	}));

	// <embed> tags
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/embed
	const embeds = $('embed[src]').toArray().map((element): IApDocument => ({
		type: 'Document',
		url: element.attribs.src,
		name: element.attribs.alt || element.attribs.title || null,
	}));

	// <audio> tags
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/audio
	const audios = $('audio[src]').toArray().map((element): IApDocument => ({
		type: 'Audio',
		url: element.attribs.src,
		name: element.attribs.alt || element.attribs.title || null,
	}));

	// <video> tags
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/video
	const videos = $('video[src]').toArray().map((element): IApDocument => ({
		type: 'Video',
		url: element.attribs.src,
		name: element.attribs.alt || element.attribs.title || null,
	}));

	// TODO support <svg>? we will need to extract it directly from the HTML.

	return [...images, ...objects, ...embeds, ...audios, ...videos];
}
/**
 * Wraps cheerio's loader so that a failure to load the input yields null rather than an exception.
 */
function parseHtml(html: string): CheerioAPI | null {
	let api: CheerioAPI | null;

	try {
		api = cheerio(html);
	} catch {
		// Don't worry about invalid HTML
		api = null;
	}

	return api;
}