]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/telegram.py
[ie/matchtv] Fix extractor (#10190)
[yt-dlp.git] / yt_dlp / extractor / telegram.py
CommitLineData
96b9e9cf 1import re
2
5bcccbfe 3from .common import InfoExtractor
96b9e9cf 4from ..utils import (
5 clean_html,
6 format_field,
7 get_element_by_class,
8 parse_duration,
9 parse_qs,
10 traverse_obj,
11 unified_timestamp,
12 update_url_query,
13 url_basename,
14)
5bcccbfe
HTL
15
16
class TelegramEmbedIE(InfoExtractor):
    """Extract the video(s) attached to a public ``t.me/<channel>/<id>`` post.

    The post's embed frame is downloaded and scanned for video players. A
    post carrying several videos can be returned as a playlist; otherwise
    the single entry whose id equals the message id in the URL is returned.
    """

    IE_NAME = 'telegram:embed'
    _VALID_URL = r'https?://t\.me/(?P<channel_id>[^/]+)/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://t.me/europa_press/613',
        'md5': 'dd707708aea958c11a590e8068825f22',
        'info_dict': {
            'id': '613',
            'ext': 'mp4',
            'title': 'md5:6ce2d7e8d56eda16d80607b23db7b252',
            'description': 'md5:6ce2d7e8d56eda16d80607b23db7b252',
            'channel_id': 'europa_press',
            'channel': 'Europa Press ✔',
            'thumbnail': r're:^https?://.+',
            'timestamp': 1635631203,
            'upload_date': '20211030',
            'duration': 61,
        },
    }, {
        # 2-video post
        'url': 'https://t.me/vorposte/29342',
        'info_dict': {
            'id': 'vorposte-29342',
            'title': 'Форпост 29342',
            'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
        },
        'playlist_count': 2,
        'params': {
            'skip_download': True,
        },
    }, {
        # 2-video post with --no-playlist
        'url': 'https://t.me/vorposte/29343',
        'md5': '1724e96053c18e788c8464038876e245',
        'info_dict': {
            'id': '29343',
            'ext': 'mp4',
            'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
            'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
            'channel_id': 'vorposte',
            'channel': 'Форпост',
            'thumbnail': r're:^https?://.+',
            'timestamp': 1666384480,
            'upload_date': '20221021',
            'duration': 35,
        },
        'params': {
            'noplaylist': True,
        },
    }, {
        # 2-video post with 'single' query param
        'url': 'https://t.me/vorposte/29342?single',
        'md5': 'd20b202f1e41400a9f43201428add18f',
        'info_dict': {
            'id': '29342',
            'ext': 'mp4',
            'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
            'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
            'channel_id': 'vorposte',
            'channel': 'Форпост',
            'thumbnail': r're:^https?://.+',
            'timestamp': 1666384480,
            'upload_date': '20221021',
            'duration': 33,
        },
    }]

    def _real_extract(self, url):
        """Build the result for a Telegram post URL.

        Returns a playlist result when ``self._yes_playlist`` accepts the
        computed playlist id, otherwise the first collected entry whose
        ``id`` equals the message id from the URL (``None`` if there is no
        such entry).
        """
        channel_id, msg_id = self._match_valid_url(url).group('channel_id', 'id')
        # NOTE(review): the empty list for 'single' presumably strips that
        # parameter from the request so the frame lists every video of the
        # post -- confirm against the query-merging helper.
        embed = self._download_webpage(
            url, msg_id, note='Downloading embed frame',
            query={'embed': '1', 'single': []})

        def text_of(css_class):
            # Plain text of the first element with this class, flattened onto
            # a single line; None when the element is absent or empty.
            raw = clean_html(get_element_by_class(css_class, embed))
            if not raw:
                return None
            return raw.replace('\n', ' ')

        description = text_of('tgme_widget_message_text')
        author = text_of('tgme_widget_message_author')
        posted_at = unified_timestamp(self._search_regex(
            r'<time[^>]*datetime="([^"]*)"', embed, 'timestamp', fatal=False))

        # Metadata that belongs to the post and is copied onto every entry.
        message = {
            'title': description or '',
            'description': description,
            'channel': author,
            'channel_id': channel_id,
            'timestamp': posted_at,
        }

        videos = []
        # One fragment per video player: from its opening <a> tag up to the
        # first closing </time> tag that follows it.
        player_fragments = re.findall(
            r'<a class="tgme_widget_message_video_player(?s:.+?)</time>', embed)
        for fragment in player_fragments:
            media_url = self._search_regex(
                r'<video[^>]+src="([^"]+)"', fragment, 'video URL', fatal=False)
            page_url = self._search_regex(
                r'<a class="tgme_widget_message_video_player[^>]+href="([^"]+)"',
                fragment, 'webpage URL', fatal=False)
            # Both the media URL and the per-video page URL are required.
            if not (media_url and page_url):
                continue

            entry = {
                'id': url_basename(page_url),
                'webpage_url': update_url_query(page_url, {'single': True}),
                'duration': parse_duration(self._search_regex(
                    r'<time[^>]+duration[^>]*>([\d:]+)</time>', fragment, 'duration', fatal=False)),
                'thumbnail': self._search_regex(
                    r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)',
                    fragment, 'thumbnail', fatal=False),
                'formats': [{
                    'url': media_url,
                    'ext': 'mp4',
                }],
            }
            entry.update(message)
            videos.append(entry)

        # A playlist id is only offered when the post holds several videos
        # and the URL itself carries no 'single' query parameter.
        is_multi_video = len(videos) > 1 and 'single' not in parse_qs(url, keep_blank_values=True)
        playlist_id = f'{channel_id}-{msg_id}' if is_multi_video else None

        if not self._yes_playlist(playlist_id, msg_id):
            return traverse_obj(videos, lambda _, x: x['id'] == msg_id, get_all=False)

        playlist_title = format_field(message, 'channel', f'%s {msg_id}')
        return self.playlist_result(videos, playlist_id, playlist_title, description)