]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/nitter.py
[extractor/youtube] Support `/live/` URL
[yt-dlp.git] / yt_dlp / extractor / nitter.py
CommitLineData
bb8a73a0 1from .common import InfoExtractor
2from ..compat import compat_urlparse
3from ..utils import (
4 parse_count,
bb8a73a0 5 unified_timestamp,
6 remove_end,
7 determine_ext,
8)
9import re
a4ddaf23 10import random
bb8a73a0 11
12
class NitterIE(InfoExtractor):
    """Extractor for single-tweet videos served through Nitter instances.

    A URL is accepted when its host is one of the instances listed below and
    its path has the form ``/<uploader_id>/status/<numeric id>``.
    """

    # Taken from https://github.com/zedeus/nitter/wiki/Instances

    NON_HTTP_INSTANCES = (
        '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
        'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
        'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
        'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
        'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
        'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
        '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
        'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
        'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
        'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
        'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
        'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
        'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
        'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
        'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
        'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
        'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',

        'nitter.i2p',
        'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',

        'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
    )

    HTTP_INSTANCES = (
        'nitter.42l.fr',
        'nitter.pussthecat.org',
        'nitter.nixnet.services',
        'nitter.fdn.fr',
        'nitter.1d4.us',
        'nitter.kavin.rocks',
        'nitter.unixfox.eu',
        'nitter.domain.glass',
        'nitter.eu',
        'nitter.namazso.eu',
        'nitter.actionsack.com',
        'birdsite.xanny.family',
        'nitter.hu',
        'twitr.gq',
        'nitter.moomoo.me',
        'nittereu.moomoo.me',
        'bird.from.tf',
        'nitter.it',
        'twitter.censors.us',
        'twitter.grimneko.de',
        'nitter.alefvanoon.xyz',
        'n.hyperborea.cloud',
        'nitter.ca',
        'twitter.076.ne.jp',
        'twitter.mstdn.social',
        'nitter.fly.dev',
        'notabird.site',
        'nitter.weiler.rocks',
        'nitter.silkky.cloud',
        'nitter.sethforprivacy.com',
        'nttr.stream',
        'nitter.cutelab.space',
        'nitter.nl',
        'nitter.mint.lgbt',
        'nitter.bus-hit.me',
        'fuckthesacklers.network',
        'nitter.govt.land',
        'nitter.datatunnel.xyz',
        'nitter.esmailelbob.xyz',
        'tw.artemislena.eu',
        'de.nttr.stream',
        'nitter.winscloud.net',
        'nitter.tiekoetter.com',
        'nitter.spaceint.fr',
        'twtr.bch.bar',
        'nitter.exonip.de',
        'nitter.mastodon.pro',
        'nitter.notraxx.ch',


        # not in the list anymore
        'nitter.skrep.in',
        'nitter.snopyta.org',
    )

    DEAD_INSTANCES = (
        # maintenance
        'nitter.ethibox.fr',

        # official, rate limited
        'nitter.net',
        # offline
        'is-nitter.resolv.ee',
        'lu-nitter.resolv.ee',
        'nitter.13ad.de',
        'nitter.40two.app',
        'nitter.cattube.org',
        'nitter.cc',
        'nitter.dark.fail',
        'nitter.himiko.cloud',
        'nitter.koyu.space',
        'nitter.mailstation.de',
        'nitter.mastodont.cat',
        'nitter.tedomum.net',
        'nitter.tokhmi.xyz',
        'nitter.weaponizedhumiliation.com',
        'nitter.vxempire.xyz',
        'tweet.lambda.dance',
    )

    # Every known host, dead ones included, so that old links still match _VALID_URL.
    INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES

    # Host names are escaped because they contain dots.
    _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
    _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
    # Chosen once, when the class body is evaluated; only used to build the test URLs below.
    current_instance = random.choice(HTTP_INSTANCES)

    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'md5:7890a9277da4639ab624dd899424c5d8',
                'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20201008',
                'timestamp': 1602183720,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # normal video
            'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 're:^Le *Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': f'https://{current_instance}/Le___Doc',
                'upload_date': '20200829',
                'timestamp': 1598711340,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': 'md5:8290664aabb43b9189145c008386bf12',
                'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': f'https://{current_instance}/mozilla',
                'upload_date': '20201027',
                'timestamp': 1603820940,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
        }, {  # not the first tweet but main-tweet
            'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
            'info_dict': {
                'id': '1354848277481414657',
                'ext': 'mp4',
                'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
                'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20210128',
                'timestamp': 1611855960,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            }
        }
    ]

    def _real_extract(self, url):
        """Download the tweet page at *url* and build its info dict.

        The video URL is mandatory (its search is fatal); every other field is
        looked up non-fatally and may end up as None.
        """
        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
        parsed_url = compat_urlparse.urlparse(url)
        base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'

        # NOTE(review): presumably makes the instance expose HLS playback - confirm against Nitter.
        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
        full_webpage = webpage = self._download_webpage(url, video_id)

        # The requested tweet may sit in the middle of a thread: restrict the
        # per-tweet searches to the markup from the "main-tweet" marker onwards.
        main_tweet_start = full_webpage.find('class="main-tweet"')
        if main_tweet_start > 0:
            webpage = full_webpage[main_tweet_start:]

        # The page gives a path relative to the instance root.
        video_path = self._html_search_regex(
            r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
        video_url = f'{base_url}{video_path}'
        ext = determine_ext(video_url)

        if ext == 'unknown_video':
            # No recognisable file extension: handle the URL as an HLS manifest.
            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
        else:
            formats = [{
                'url': video_url,
                'ext': ext
            }]

        title = description = self._og_search_description(full_webpage) or self._html_search_regex(
            r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)

        # Prefer the handle shown on the page; fall back to the one from the URL.
        uploader_id = self._html_search_regex(
            r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id

        uploader = self._html_search_regex(
            r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
        if uploader:
            title = f'{uploader} - {title}'

        # Each statistic is the text following the span with the matching icon class.
        counts = {}
        for field, icon in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment')):
            raw_count = self._html_search_regex(
                fr'<span[^>]+class="icon-{icon}[^>]*></span>([^<]*)</div>',
                webpage, f'{field} count', fatal=False)
            # An icon with no text after it is taken as zero; a missing icon
            # leaves raw_count as None, which is passed on to parse_count().
            counts[f'{field}_count'] = 0 if raw_count == '' else parse_count(raw_count)

        thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url')
        if not thumbnail:
            poster = self._html_search_regex(
                r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
            # Only build a URL when a poster was actually found; formatting None
            # into the string would yield a bogus "<base_url>None" thumbnail.
            thumbnail = remove_end(f'{base_url}{poster}', '%3Asmall') if poster else None

        # One entry per size suffix, appended to the base thumbnail URL.
        thumbnails = [
            {'id': size, 'url': f'{thumbnail}%3A{size}'}
            for size in ('thumb', 'small', 'large', 'medium', 'orig')
        ] if thumbnail else []

        # The "·" separator is removed from the date title before it is parsed.
        date = self._html_search_regex(
            r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
            webpage, 'upload date', default='').replace('·', '')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'timestamp': unified_timestamp(date),
            'uploader_id': uploader_id,
            'uploader_url': f'{base_url}/{uploader_id}',
            'formats': formats,
            'thumbnails': thumbnails,
            'thumbnail': thumbnail,
            **counts,
        }