improving link parser

This commit is contained in:
otsmr 2026-01-21 23:14:23 +01:00
parent 15c5a44b7d
commit f5d4f97c02
11 changed files with 276 additions and 246 deletions

View file

@ -8,13 +8,14 @@ import 'package:html/parser.dart';
import 'package:http/http.dart' as http;
import 'package:twonly/src/utils/log.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/base.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/html_parser.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/json_ld_parser.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/og_parser.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/other_parser.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/twitter_parser.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/html.parser.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/json_ld.parser.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/mastodon.parser.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/og.parser.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/other.parser.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/twitter.parser.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/util.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/youtube_parser.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/youtube.parser.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/utils.dart';
Future<Metadata?> getMetadata(String link) async {
@ -81,7 +82,7 @@ Future<Metadata?> getInfo(
final document = responseToDocument(response);
if (document == null) return info;
final data_ = _parse(document, url: url);
final data_ = _parse(document, url);
return data_;
} catch (error) {
@ -103,83 +104,44 @@ Document? responseToDocument(http.Response response) {
return document;
}
Metadata _parse(Document? document, {String? url}) {
final output = Metadata();
Metadata _parse(Document? document, String url) {
final output = Metadata()..url = url;
final parsers = [
_openGraph(document),
_twitterCard(document),
_youtubeCard(document),
_jsonLdSchema(document),
_htmlMeta(document),
_otherParser(document),
final allParsers = [
// start with vendor specific to parse the vendor type
MastodonParser(document),
YoutubeParser(document, url),
TwitterParser(document, url),
JsonLdParser(document),
OpenGraphParser(document),
HtmlMetaParser(document),
OtherParser(document),
];
for (final p in parsers) {
if (p == null) continue;
output.title ??= p.title;
output.desc ??= p.desc;
output.image ??= p.image;
output.siteName ??= p.siteName;
output.url ??= p.url ?? url;
if (output.hasAllMetadata) break;
for (final parser in allParsers) {
try {
output.vendor ??= parser.vendor;
output.title ??= parser.title;
output.desc ??= parser.desc;
if (output.vendor == Vendor.twitterPosting) {
if (output.image == null) {
if (parser.image?.contains('/media/') ?? false) {
output.image ??= parser.image;
}
}
} else {
output.image ??= parser.image;
}
output.siteName ??= parser.siteName;
output.publishDate ??= parser.publishDate;
output.likeAction ??= parser.likeAction;
output.shareAction ??= parser.shareAction;
if (output.hasAllMetadata) break;
} catch (e) {
Log.error(e);
}
final url_ = output.url ?? url;
final image = output.image;
if (url_ != null && image != null) {
output.image = Uri.parse(url_).resolve(image).toString();
}
return output;
}
/// Runs [OpenGraphParser] over [document].
///
/// Returns the parsed [Metadata], or `null` when the parser throws.
Metadata? _openGraph(Document? document) {
  Metadata? parsed;
  try {
    parsed = OpenGraphParser(document).parse();
  } catch (_) {
    // Any failure is reported to the caller as "no result" (null).
  }
  return parsed;
}
/// Runs [HtmlMetaParser] over [document].
///
/// Returns the parsed [Metadata], or `null` when the parser throws.
Metadata? _htmlMeta(Document? document) {
  Metadata? parsed;
  try {
    parsed = HtmlMetaParser(document).parse();
  } catch (_) {
    // Any failure is reported to the caller as "no result" (null).
  }
  return parsed;
}
/// Runs [JsonLdParser] over [document].
///
/// Returns the parsed [Metadata], or `null` when the parser throws.
Metadata? _jsonLdSchema(Document? document) {
  Metadata? parsed;
  try {
    parsed = JsonLdParser(document).parse();
  } catch (_) {
    // Any failure is reported to the caller as "no result" (null).
  }
  return parsed;
}
/// Runs [YoutubeParser] over [document].
///
/// Returns the parsed [Metadata], or `null` when the parser throws.
Metadata? _youtubeCard(Document? document) {
  Metadata? parsed;
  try {
    parsed = YoutubeParser(document).parse();
  } catch (_) {
    // Any failure is reported to the caller as "no result" (null).
  }
  return parsed;
}
/// Runs [TwitterParser] over [document].
///
/// Returns the parsed [Metadata], or `null` when the parser throws.
Metadata? _twitterCard(Document? document) {
  Metadata? parsed;
  try {
    parsed = TwitterParser(document).parse();
  } catch (_) {
    // Any failure is reported to the caller as "no result" (null).
  }
  return parsed;
}
/// Runs [OtherParser] over [document].
///
/// Returns the parsed [Metadata], or `null` when the parser throws.
Metadata? _otherParser(Document? document) {
  Metadata? parsed;
  try {
    parsed = OtherParser(document).parse();
  } catch (_) {
    // Any failure is reported to the caller as "no result" (null).
  }
  return parsed;
}

View file

@ -1,29 +1,29 @@
/// Site-specific kinds of link targets that a parser can tag a link with.
enum Vendor {
  /// A Mastodon post; reported by `MastodonParser`.
  mastodonSocialMediaPosting,

  /// A YouTube video; reported by `YoutubeParser`.
  youtubeVideo,

  /// A post on X (formerly Twitter); reported by `TwitterParser`.
  twitterPosting,
}
mixin BaseMetaInfo {
late String url;
String? title;
String? desc;
String? image;
String? url;
String? siteName;
Vendor? vendor;
DateTime? publishDate;
int? likeAction; // https://schema.org/LikeAction
int? shareAction; // https://schema.org/ShareAction
/// Returns `true` if any parameter other than [url] is filled.
bool get hasData =>
((title?.isNotEmpty ?? false) && title != 'null') ||
((desc?.isNotEmpty ?? false) && desc != 'null') ||
((image?.isNotEmpty ?? false) && image != 'null');
Metadata parse() {
return Metadata()
..title = title
..desc = desc
..image = image
..url = url
..siteName = siteName;
}
}
/// Container class for Metadata.
class Metadata with BaseMetaInfo {
Metadata();
bool get hasAllMetadata {
return title != null && desc != null && image != null && url != null;
return title != null && desc != null && image != null;
}
}

View file

@ -34,7 +34,4 @@ class HtmlMetaParser with BaseMetaInfo {
?.querySelector("meta[name='site_name']")
?.attributes
.get('content');
@override
String toString() => parse().toString();
}

View file

@ -0,0 +1,98 @@
import 'dart:convert';
import 'package:html/dom.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/base.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/og.parser.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/util.dart';
/// Parses [Metadata] from `json-ld` data in `<script>` tags.
///
/// Only the first `<script type='application/ld+json'>` element inside the
/// document head is read. When that script holds a JSON array, the first JSON
/// object of the array is used. Every getter returns `null` when the script is
/// missing, cannot be decoded, or the requested value has an unexpected type.
class JsonLdParser with BaseMetaInfo {
  JsonLdParser(this.document) {
    _parseToJson(document);
  }

  /// The [Document] to parse.
  Document? document;

  /// The decoded JSON-LD object, or `null` when none could be read.
  Map<String, dynamic>? _jsonData;

  /// Decodes the JSON-LD script of [document] into [_jsonData].
  void _parseToJson(Document? document) {
    try {
      final data = document?.head
          ?.querySelector("script[type='application/ld+json']")
          ?.innerHtml;
      if (data == null) return;
      // For multiline json file
      // Replacing all new line characters with empty space
      // before performing json decode on data
      final decoded = jsonDecode(data.replaceAll('\n', ' '));
      if (decoded is Map<String, dynamic>) {
        _jsonData = decoded;
      } else if (decoded is List) {
        // The script may hold an array of objects; use the first object.
        for (final entry in decoded) {
          if (entry is Map<String, dynamic>) {
            _jsonData = entry;
            break;
          }
        }
      }
    } catch (_) {
      // Best effort: a script that cannot be read or decoded leaves
      // [_jsonData] unset, so every getter returns null.
    }
  }

  /// Returns the value stored under [key] if it is a [String].
  String? _string(String key) {
    final value = _jsonData?[key];
    return value is String ? value : null;
  }

  /// Returns the `userInteractionCount` of the first `interactionStatistic`
  /// entry whose `interactionType` equals [interactionType].
  ///
  /// Accepts a single statistic object as well as a list of them, and a count
  /// encoded as a number or as a numeric string.
  int? _interactionCount(String interactionType) {
    final raw = _jsonData?['interactionStatistic'];
    final counters = raw is List ? raw : <Object?>[raw];
    for (final counter in counters) {
      if (counter is! Map) continue;
      if (counter['interactionType'] != interactionType) continue;
      final count = counter['userInteractionCount'];
      if (count is num && count.isFinite) return count.toInt();
      if (count is String) return int.tryParse(count.trim());
      return null;
    }
    return null;
  }

  /// Get the [Metadata.title] from the `name` property, falling back to
  /// `headline`.
  @override
  String? get title => _string('name') ?? _string('headline');

  /// Get the share count from the `https://schema.org/ShareAction` statistic.
  @override
  int? get shareAction => _interactionCount('https://schema.org/ShareAction');

  /// Get the like count from the `https://schema.org/LikeAction` statistic.
  @override
  int? get likeAction => _interactionCount('https://schema.org/LikeAction');

  /// Get the [Metadata.desc] from the `description` property.
  @override
  String? get desc => _string('description');

  /// Get the [Metadata.image] from the `logo` property, falling back to
  /// `image`.
  @override
  String? get image {
    final data = _jsonData;
    return _imgResultToStr(
      data?.getDynamic('logo') ?? data?.getDynamic('image'),
    );
  }

  /// JSON LD does not have a siteName property, so we get it from
  /// [og:site_name] if available.
  @override
  String? get siteName => OpenGraphParser(document).siteName;

  /// Reduces an image value to a URL string.
  ///
  /// Handles a plain string, a non-empty list (its first element is used) and
  /// an object carrying the URL under `url`.
  String? _imgResultToStr(dynamic result) {
    var candidate = result;
    if (candidate is List && candidate.isNotEmpty) candidate = candidate.first;
    if (candidate is Map) candidate = candidate['url'];
    if (candidate is String) return candidate;
    return null;
  }
}

View file

@ -1,80 +0,0 @@
import 'dart:convert';
import 'package:html/dom.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/base.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/og_parser.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/util.dart';
/// Parses [Metadata] from `json-ld` data in `<script>` tags.
///
/// Only the first `<script type='application/ld+json'>` element inside the
/// document head is read. When that script holds a JSON array, the first JSON
/// object of the array is used by every getter.
class JsonLdParser with BaseMetaInfo {
  /// Decodes the JSON-LD script of [document].
  ///
  /// Decoding errors raised by [jsonDecode] are not caught here.
  JsonLdParser(this.document) {
    _jsonData = _parseToJson(document);
  }

  /// The [Document] to parse.
  Document? document;

  /// The decoded script content, or `null` when the document has no script.
  dynamic _jsonData;

  dynamic _parseToJson(Document? document) {
    final data = document?.head
        ?.querySelector("script[type='application/ld+json']")
        ?.innerHtml;
    if (data == null) return null;
    // For multiline json file
    // Replacing all new line characters with empty space
    // before performing json decode on data
    return jsonDecode(data.replaceAll('\n', ' '));
  }

  /// The JSON object the getters read from.
  ///
  /// [jsonDecode] produces a `List<dynamic>` for arrays, so the elements are
  /// checked one by one instead of testing the list's type argument.
  Map<String, dynamic>? get _root {
    final data = _jsonData;
    if (data is Map<String, dynamic>) return data;
    if (data is List) {
      for (final entry in data) {
        if (entry is Map<String, dynamic>) return entry;
      }
    }
    return null;
  }

  /// Returns the value stored under [key] if it is a [String].
  String? _string(String key) {
    final value = _root?[key];
    return value is String ? value : null;
  }

  /// Get the [Metadata.title] from the `name` property, falling back to
  /// `headline`.
  @override
  String? get title => _string('name') ?? _string('headline');

  /// Get the [Metadata.desc] from the `description` property.
  @override
  String? get desc => _string('description');

  /// Get the [Metadata.image] from the `logo` property, falling back to
  /// `image`.
  @override
  String? get image {
    final root = _root;
    if (root == null) return null;
    return _imgResultToStr(
      root.getDynamic('logo') ?? root.getDynamic('image'),
    );
  }

  /// JSON LD does not have a siteName property, so we get it from
  /// [og:site_name] if available.
  @override
  String? get siteName => OpenGraphParser(document).siteName;

  /// Reduces an image value to a string; a non-empty list yields its first
  /// element.
  String? _imgResultToStr(dynamic result) {
    if (result is List && result.isNotEmpty) result = result.first;
    if (result is String) return result;
    return null;
  }

  @override
  String toString() => parse().toString();
}

View file

@ -0,0 +1,15 @@
import 'package:html/dom.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/base.dart';
/// Detects whether a document is a Mastodon post; only [vendor] is provided.
class MastodonParser with BaseMetaInfo {
  MastodonParser(this._document);

  final Document? _document;

  /// [Vendor.mastodonSocialMediaPosting] when the serialized document head
  /// contains both the Mastodon repository marker and `SocialMediaPosting`,
  /// otherwise `null`.
  @override
  Vendor? get vendor {
    final headHtml = _document?.head?.innerHtml;
    if (headHtml == null) return null;

    final namesMastodonRepo =
        headHtml.contains('"repository":"mastodon/mastodon"');
    final mentionsPosting = headHtml.contains('SocialMediaPosting');
    if (namesMastodonRepo && mentionsPosting) {
      return Vendor.mastodonSocialMediaPosting;
    }
    return null;
  }
}

View file

@ -25,11 +25,4 @@ class OpenGraphParser with BaseMetaInfo {
/// Get [Metadata.siteName] from 'og:site_name'.
@override
String? get siteName => getProperty(_document, property: 'og:site_name');
/// Get [Metadata.url] from 'og:url'.
@override
String? get url => getProperty(_document, property: 'og:url');
@override
String toString() => parse().toString();
}

View file

@ -7,33 +7,21 @@ import 'util.dart';
/// Reads metadata through [getProperty] using the `name` attribute and the
/// plain keys `title`, `description`, `image`, `site_name` and `url`.
class OtherParser with BaseMetaInfo {
  OtherParser(this._document);

  /// The [Document] to be parsed.
  final Document? _document;

  /// Looks up [key] with [getProperty], matching on the `name` attribute.
  String? _byName(String key) =>
      getProperty(_document, attribute: 'name', property: key);

  /// Get [Metadata.title] from 'title'.
  @override
  String? get title => _byName('title');

  /// Get [Metadata.desc] from 'description'.
  @override
  String? get desc => _byName('description');

  /// Get [Metadata.image] from 'image'.
  @override
  String? get image => _byName('image');

  /// Get [Metadata.siteName] from 'site_name'.
  @override
  String? get siteName => _byName('site_name');

  /// Get [Metadata.url] from 'url'.
  @override
  String? get url => _byName('url');

  @override
  String toString() => parse().toString();
}

View file

@ -1,23 +1,18 @@
import 'package:html/dom.dart';
import 'base.dart';
import 'og_parser.dart';
import 'util.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/base.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/util.dart';
/// Parses [Metadata] from `<meta property='twitter:*'>` tags.
class TwitterParser with BaseMetaInfo {
TwitterParser(this._document);
/// The [Document] to parse.
TwitterParser(this._document, this._url);
final Document? _document;
final String _url;
/// Get [Metadata.title] from 'twitter:title'
@override
String? get title =>
getProperty(_document, attribute: 'name', property: 'twitter:title') ??
getProperty(_document, property: 'twitter:title');
/// Get [Metadata.desc] from 'twitter:description'
@override
String? get desc =>
getProperty(
@ -27,22 +22,14 @@ class TwitterParser with BaseMetaInfo {
) ??
getProperty(_document, property: 'twitter:description');
/// Get [Metadata.image] from 'twitter:image'
@override
String? get image =>
getProperty(_document, attribute: 'name', property: 'twitter:image') ??
getProperty(_document, property: 'twitter:image');
/// Twitter Cards do not have a siteName property, so we get it from
/// [og:site_name] if available.
@override
String? get siteName => OpenGraphParser(_document).siteName;
/// Twitter Cards do not have a url property, so we get the url from
/// [og:url] if available.
@override
String? get url => OpenGraphParser(_document).url;
@override
String toString() => parse().toString();
Vendor? get vendor =>
_url.startsWith('https://x.com/') && _url.contains('/status/')
? Vendor.twitterPosting
: null;
}

View file

@ -4,14 +4,18 @@ import 'base.dart';
import 'util.dart';
class YoutubeParser with BaseMetaInfo {
YoutubeParser(this.document) {
YoutubeParser(this.document, this.url) {
_jsonData = _parseToJson(document);
}
@override
String url;
Document? document;
dynamic _jsonData;
dynamic _parseToJson(Document? document) {
try {
final data = document?.outerHtml
.replaceAll('<html><head></head><body>', '')
.replaceAll('</body></html>', '');
@ -21,9 +25,11 @@ class YoutubeParser with BaseMetaInfo {
// before performing json decode on data
final d = jsonDecode(data.replaceAll('\n', ' '));
return d;
} catch (e) {
return null;
}
}
/// Get the [Metadata.title] from the [<title>] tag
@override
String? get title {
final data = _jsonData;
@ -35,7 +41,6 @@ class YoutubeParser with BaseMetaInfo {
return null;
}
/// Get the [Metadata.image] from the first <img> tag in the body
@override
String? get image {
final data = _jsonData;
@ -59,22 +64,13 @@ class YoutubeParser with BaseMetaInfo {
}
@override
String? get url {
final data = _jsonData;
if (data is List<Map<String, dynamic>>) {
return data.first['provider_url'] as String?;
} else if (data is Map) {
return data.get('provider_url');
}
return null;
}
Vendor? get vendor => (Uri.parse(url).host.contains('youtube.com'))
? Vendor.youtubeVideo
: null;
String? _imgResultToStr(dynamic result) {
if (result is List && result.isNotEmpty) result = result.first;
if (result is String) return result;
return null;
}
@override
String toString() => parse().toString();
}

View file

@ -1,20 +1,30 @@
import 'package:flutter_test/flutter_test.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parse_link.dart';
import 'package:twonly/src/views/camera/share_image_editor/layers/link_preview/parser/base.dart';
class LinkParserTest {
LinkParserTest({
required this.url,
required this.title,
required this.siteName,
required this.desc,
required this.url,
this.desc,
this.image,
this.siteName,
this.vendor,
this.publishDate,
this.likeAction,
this.shareAction,
});
String title;
String? desc;
String? image;
String url;
String? siteName;
final String url;
final String title;
final String siteName;
final String desc;
final String? image;
Vendor? vendor;
DateTime? publishDate;
int? likeAction; // https://schema.org/LikeAction
int? shareAction;
}
void main() {
@ -27,6 +37,9 @@ void main() {
desc: 'Attached: 1 image',
image:
'https://files.mastodon.social/media_attachments/files/115/883/317/526/523/824/original/6fa7ef90ec68f1f1.jpg',
vendor: Vendor.mastodonSocialMediaPosting,
shareAction: 90,
likeAction: 290,
),
LinkParserTest(
url: 'https://chaos.social/@netzpolitik_feed/115921534467938262',
@ -36,7 +49,53 @@ void main() {
'Die EU-Kommission erkennt Open Source als entscheidend für die digitale Souveränität an und wünscht sich mehr Kommerzialisierung. Bis April will Brüssel eine neue Strategie veröffentlichen. In einer laufenden Konsultation bekräftigen Stimmen aus ganz Europa, welche Vorteile sie in offenem Quellcode sehen.\n'
'\n'
'https://netzpolitik.org/2026/konsultation-eu-kommission-arbeitet-an-neuer-open-source-strategie/',
vendor: Vendor.mastodonSocialMediaPosting,
shareAction: 70,
likeAction: 90,
),
LinkParserTest(
title: 'Kuketz-Blog 🛡 (@kuketzblog@social.tchncs.de)',
url: 'https://social.tchncs.de/@kuketzblog/115898752560771936',
siteName: 'Mastodon',
desc:
'AWS verspricht jetzt »Souveränität« mit einem »europäischen« Cloud-Angebot Standort Deutschland, großes Vertrauens-Theater.\n'
'\n'
'Nur: Souveränität ist keine Postleitzahl. Wenn der Anbieter Amazon heißt, bleibt es dasselbe Märchen mit neuem Umschlag: Der Cloud Act, FISA etc. gilt trotzdem. US-Recht schlägt Geografie. Das Gerede von »Souveränität« ist kein Konzept, sondern Marketing.\n'
'\n'
'https://www.heise.de/news/AWS-verspricht-Souveraenitaet-mit-europaeischem-Cloudangebot-11141800.html',
vendor: Vendor.mastodonSocialMediaPosting,
shareAction: 15,
likeAction: 190,
),
LinkParserTest(
title:
'David Kriesel: Traue keinem Scan, den du nicht selbst gefälscht hast',
url: 'https://www.youtube.com/watch?v=7FeqF1-Z1g0',
siteName: 'YouTube',
vendor: Vendor.youtubeVideo,
image: 'https://i.ytimg.com/vi/7FeqF1-Z1g0/hqdefault.jpg',
),
LinkParserTest(
title: 'netzpolitik.org (@netzpolitik_org) on X',
url: 'https://x.com/netzpolitik_org/status/1782791019412529665',
siteName: 'X (formerly Twitter)',
desc:
'Jetzt ist wirklich Schluss: Wir verlassen als Redaktion das zur Plattform für Rechtsradikale verkommene Twitter und freuen uns, wenn ihr uns woanders folgt.\n'
'\n'
'https://t.co/8W0hGly5bL',
vendor: Vendor.twitterPosting,
),
LinkParserTest(
title: 'netzpolitik.org (@netzpolitik_org) on X',
url: 'https://x.com/netzpolitik_org/status/1162346968124968960',
siteName: 'X (formerly Twitter)',
desc:
'Weil unsere Datenanalyse zum Twitter-Account von Maaßen rechte Millieus und ihre Verbindungen offengelegt hat, haben wir einen rechten Shitstorm an der Backe. Klar ist: Wir lassen uns nicht einschüchtern und freuen uns auf Unterstützung! \n'
'\n'
'https://t.co/MQZ7ulHakF',
image: 'https://pbs.twimg.com/media/ECF8Z5KWwAIBZ6o.jpg:large',
vendor: Vendor.twitterPosting,
)
];
for (final testCase in testCases) {
@ -44,7 +103,22 @@ void main() {
expect(metadata.title, testCase.title);
expect(metadata.siteName, testCase.siteName);
expect(metadata.desc, testCase.desc);
expect(metadata.url, testCase.url);
expect(metadata.image, testCase.image);
expect(metadata.vendor, testCase.vendor, reason: metadata.url);
if (testCase.shareAction != null) {
expect(
metadata.shareAction,
greaterThanOrEqualTo(testCase.shareAction!),
);
}
// Guard on likeAction itself: checking shareAction here would dereference a
// null likeAction for cases that only set shareAction, and would skip the
// check for cases that only set likeAction.
final expectedLikes = testCase.likeAction;
if (expectedLikes != null) {
  expect(
    metadata.likeAction,
    greaterThanOrEqualTo(expectedLikes),
  );
}
expect(metadata.publishDate, testCase.publishDate);
}
});
}