]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/nitter.py
[ie/crunchyroll] Fix stream extraction (#10005)
[yt-dlp.git] / yt_dlp / extractor / nitter.py
CommitLineData
bb8a73a0 1from .common import InfoExtractor
2from ..compat import compat_urlparse
3from ..utils import (
4 parse_count,
bb8a73a0 5 unified_timestamp,
6 remove_end,
7 determine_ext,
8)
9import re
a4ddaf23 10import random
bb8a73a0 11
12
class NitterIE(InfoExtractor):
    # Extractor for single-tweet pages served by Nitter instances, i.e. URLs of
    # the form ``https://<instance>/<user>/status/<tweet id>`` (see _VALID_URL).

    # Taken from https://github.com/zedeus/nitter/wiki/Instances

    NON_HTTP_INSTANCES = (
        '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
        'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
        'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
        'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
        'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
        'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
        '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
        'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
        'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
        'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
        'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
        'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
        'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
        'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
        'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
        'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
        'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',

        'nitter.i2p',
        'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',

        'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
    )

    # NOTE(review): 'tweet.lambda.dance' and 'nitter.tokhmi.xyz' are listed both
    # here and in DEAD_INSTANCES below; since `current_instance` is drawn from
    # this tuple, confirm which list they actually belong to.
    HTTP_INSTANCES = (
        'nitter.lacontrevoie.fr',
        'nitter.fdn.fr',
        'nitter.1d4.us',
        'nitter.kavin.rocks',
        'nitter.unixfox.eu',
        'nitter.domain.glass',
        'nitter.namazso.eu',
        'birdsite.xanny.family',
        'nitter.moomoo.me',
        'bird.trom.tf',
        'nitter.it',
        'twitter.censors.us',
        'nitter.grimneko.de',
        'twitter.076.ne.jp',
        'nitter.fly.dev',
        'notabird.site',
        'nitter.weiler.rocks',
        'nitter.sethforprivacy.com',
        'nitter.cutelab.space',
        'nitter.nl',
        'nitter.mint.lgbt',
        'nitter.bus-hit.me',
        'nitter.esmailelbob.xyz',
        'tw.artemislena.eu',
        'nitter.winscloud.net',
        'nitter.tiekoetter.com',
        'nitter.spaceint.fr',
        'nitter.privacy.com.de',
        'nitter.poast.org',
        'nitter.bird.froth.zone',
        'nitter.dcs0.hu',
        'twitter.dr460nf1r3.org',
        'nitter.garudalinux.org',
        'twitter.femboy.hu',
        'nitter.cz',
        'nitter.privacydev.net',
        'nitter.evil.site',
        'tweet.lambda.dance',
        'nitter.kylrth.com',
        'nitter.foss.wtf',
        'nitter.priv.pw',
        'nitter.tokhmi.xyz',
        'nitter.catalyst.sx',
        'unofficialbird.com',
        'nitter.projectsegfau.lt',
        'nitter.eu.projectsegfau.lt',
        'singapore.unofficialbird.com',
        'canada.unofficialbird.com',
        'india.unofficialbird.com',
        'nederland.unofficialbird.com',
        'uk.unofficialbird.com',
        'n.l5.ca',
        'nitter.slipfox.xyz',
        'nitter.soopy.moe',
        'nitter.qwik.space',
        'read.whatever.social',
        'nitter.rawbit.ninja',
        'nt.vern.cc',
        'ntr.odyssey346.dev',
        'nitter.ir',
        'nitter.privacytools.io',
        'nitter.sneed.network',
        'n.sneed.network',
        'nitter.manasiwibi.com',
        'nitter.smnz.de',
        'nitter.twei.space',
        'nitter.inpt.fr',
        'nitter.d420.de',
        'nitter.caioalonso.com',
        'nitter.at',
        'nitter.drivet.xyz',
        'nitter.pw',
        'nitter.nicfab.eu',
        'bird.habedieeh.re',
        'nitter.hostux.net',
        'nitter.adminforge.de',
        'nitter.platypush.tech',
        'nitter.mask.sh',
        'nitter.pufe.org',
        'nitter.us.projectsegfau.lt',
        'nitter.arcticfoxes.net',
        't.com.sb',
        'nitter.kling.gg',
        'nitter.ktachibana.party',
        'nitter.riverside.rocks',
        'nitter.girlboss.ceo',
        'nitter.lunar.icu',
        'twitter.moe.ngo',
        'nitter.freedit.eu',
        'ntr.frail.duckdns.org',
        'nitter.librenode.org',
        'n.opnxng.com',
        'nitter.plus.st',
    )

    DEAD_INSTANCES = (
        # maintenance
        'nitter.ethibox.fr',

        # official, rate limited
        'nitter.net',
        # offline
        'is-nitter.resolv.ee',
        'lu-nitter.resolv.ee',
        'nitter.13ad.de',
        'nitter.40two.app',
        'nitter.cattube.org',
        'nitter.cc',
        'nitter.dark.fail',
        'nitter.himiko.cloud',
        'nitter.koyu.space',
        'nitter.mailstation.de',
        'nitter.mastodont.cat',
        'nitter.tedomum.net',
        'nitter.tokhmi.xyz',
        'nitter.weaponizedhumiliation.com',
        'nitter.vxempire.xyz',
        'tweet.lambda.dance',
        'nitter.ca',
        'nitter.42l.fr',
        'nitter.pussthecat.org',
        'nitter.nixnet.services',
        'nitter.eu',
        'nitter.actionsack.com',
        'nitter.hu',
        'twitr.gq',
        'nittereu.moomoo.me',
        'bird.from.tf',
        'twitter.grimneko.de',
        'nitter.alefvanoon.xyz',
        'n.hyperborea.cloud',
        'twitter.mstdn.social',
        'nitter.silkky.cloud',
        'nttr.stream',
        'fuckthesacklers.network',
        'nitter.govt.land',
        'nitter.datatunnel.xyz',
        'de.nttr.stream',
        'twtr.bch.bar',
        'nitter.exonip.de',
        'nitter.mastodon.pro',
        'nitter.notraxx.ch',
        'nitter.skrep.in',
        'nitter.snopyta.org',
    )

    # Every known host name, including the dead ones, is accepted by _VALID_URL
    INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES

    _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
    _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
    # A random HTTP instance is picked once, when the class body is executed,
    # and used to build the URLs in _TESTS
    current_instance = random.choice(HTTP_INSTANCES)

    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'md5:7890a9277da4639ab624dd899424c5d8',
                'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20201008',
                'timestamp': 1602183720,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # normal video
            'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 're:^Le *Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': f'https://{current_instance}/Le___Doc',
                'upload_date': '20200829',
                'timestamp': 1598711340,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': 'md5:8290664aabb43b9189145c008386bf12',
                'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': f'https://{current_instance}/mozilla',
                'upload_date': '20201027',
                'timestamp': 1603820940,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
        }, {  # not the first tweet but main-tweet
            'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
            'info_dict': {
                'id': '1354848277481414657',
                'ext': 'mp4',
                'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
                'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20210128',
                'timestamp': 1611855960,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            }
        }, {  # no OpenGraph title
            'url': f'https://{current_instance}/LocalBateman/status/1678455464038735895#m',
            'info_dict': {
                'id': '1678455464038735895',
                'ext': 'mp4',
                'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?',
                'description': 'Local man, what did Romanians ever do to you?',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Your Typical Local Man',
                'uploader_id': 'LocalBateman',
                'uploader_url': f'https://{current_instance}/LocalBateman',
                'upload_date': '20230710',
                'timestamp': 1689009900,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
            'params': {'skip_download': 'm3u8'},
        }
    ]

    def _real_extract(self, url):
        """Download the tweet page at *url* and build its info dict.

        The returned dict carries ``id``, ``title``, ``description``,
        ``uploader``, ``uploader_id``, ``uploader_url``, ``timestamp``,
        ``formats``, ``thumbnail``, ``thumbnails`` and the four
        ``view``/``like``/``repost``/``comment`` ``*_count`` fields.

        The video URL search is the only lookup not marked non-fatal or given
        a default, so it is presumably the one that aborts extraction when the
        page has no video (TODO confirm against ``InfoExtractor``).
        """
        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
        parsed_url = compat_urlparse.urlparse(url)
        base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'

        # The cookie has to be in place before the page is requested
        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
        full_webpage = webpage = self._download_webpage(url, video_id)

        # Restrict the per-tweet searches to the markup from the main tweet
        # onwards; page-wide metadata (OpenGraph) still uses the full page.
        main_tweet_start = full_webpage.find('class="main-tweet"')
        if main_tweet_start != -1:
            webpage = full_webpage[main_tweet_start:]

        # The matched value is appended to the instance base URL
        video_path = self._html_search_regex(
            r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
        video_url = f'{base_url}{video_path}'
        ext = determine_ext(video_url)

        if ext == 'unknown_video':
            # No recognisable extension: handled as an m3u8 (HLS) manifest
            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
        else:
            formats = [{
                'url': video_url,
                'ext': ext,
            }]

        title = description = self._og_search_description(full_webpage, default=None) or self._html_search_regex(
            r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)

        # Prefer the user name shown on the page, fall back to the one from the URL
        uploader_id = self._html_search_regex(
            r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id

        uploader = self._html_search_regex(
            r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
        # Only prefix an existing title; formatting a missing one would yield
        # the literal text "<uploader> - None"
        if uploader and title:
            title = f'{uploader} - {title}'

        counts = {}
        for field, icon in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment')):
            count = self._html_search_regex(
                fr'<span[^>]+class="icon-{icon}[^>]*></span>([^<]*)</div>',
                webpage, f'{field} count', fatal=False)
            # An icon that is present but has no number next to it counts as zero
            counts[f'{field}_count'] = 0 if count == '' else parse_count(count)

        thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url') or None
        if not thumbnail:
            poster = self._html_search_regex(
                r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
            # Build the fallback URL only when a poster was actually found, so
            # that a miss never produces "<base_url>None"
            if poster:
                thumbnail = remove_end(f'{base_url}{poster}', '%3Asmall')

        # Size variants are derived by appending "%3A<size>" to the thumbnail URL;
        # without a thumbnail there is nothing to derive them from
        thumbnails = [
            {'id': size, 'url': f'{thumbnail}%3A{size}'}
            for size in ('thumb', 'small', 'large', 'medium', 'orig')
        ] if thumbnail else []

        date = self._html_search_regex(
            r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
            webpage, 'upload date', default='').replace('·', '')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'timestamp': unified_timestamp(date),
            'uploader_id': uploader_id,
            'uploader_url': f'{base_url}/{uploader_id}',
            'formats': formats,
            'thumbnails': thumbnails,
            'thumbnail': thumbnail,
            **counts,
        }