]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/nitter.py
[ie/matchtv] Fix extractor (#10190)
[yt-dlp.git] / yt_dlp / extractor / nitter.py
CommitLineData
e897bd82
SS
1import random
2import re
add96eb9 3import urllib.parse
e897bd82 4
bb8a73a0 5from .common import InfoExtractor
bb8a73a0 6from ..utils import (
e897bd82 7 determine_ext,
bb8a73a0 8 parse_count,
bb8a73a0 9 remove_end,
e897bd82 10 unified_timestamp,
bb8a73a0 11)
bb8a73a0 12
13
class NitterIE(InfoExtractor):
    """Extract the video attached to a single tweet shown on a Nitter instance.

    URLs are matched against the known instance host names listed below and
    must have the form ``https://<instance>/<uploader_id>/status/<id>``.
    """

    # Taken from https://github.com/zedeus/nitter/wiki/Instances

    # Instances reachable only through .onion / .i2p host names.
    NON_HTTP_INSTANCES = (
        '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
        'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
        'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
        'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
        'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
        'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
        '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
        'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
        'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
        'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
        'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
        'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
        'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
        'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
        'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
        'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
        'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',

        'nitter.i2p',
        'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',

        'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
    )

    # Instances reachable over regular HTTP(S).
    HTTP_INSTANCES = (
        'nitter.lacontrevoie.fr',
        'nitter.fdn.fr',
        'nitter.1d4.us',
        'nitter.kavin.rocks',
        'nitter.unixfox.eu',
        'nitter.domain.glass',
        'nitter.namazso.eu',
        'birdsite.xanny.family',
        'nitter.moomoo.me',
        'bird.trom.tf',
        'nitter.it',
        'twitter.censors.us',
        'nitter.grimneko.de',
        'twitter.076.ne.jp',
        'nitter.fly.dev',
        'notabird.site',
        'nitter.weiler.rocks',
        'nitter.sethforprivacy.com',
        'nitter.cutelab.space',
        'nitter.nl',
        'nitter.mint.lgbt',
        'nitter.bus-hit.me',
        'nitter.esmailelbob.xyz',
        'tw.artemislena.eu',
        'nitter.winscloud.net',
        'nitter.tiekoetter.com',
        'nitter.spaceint.fr',
        'nitter.privacy.com.de',
        'nitter.poast.org',
        'nitter.bird.froth.zone',
        'nitter.dcs0.hu',
        'twitter.dr460nf1r3.org',
        'nitter.garudalinux.org',
        'twitter.femboy.hu',
        'nitter.cz',
        'nitter.privacydev.net',
        'nitter.evil.site',
        'tweet.lambda.dance',
        'nitter.kylrth.com',
        'nitter.foss.wtf',
        'nitter.priv.pw',
        'nitter.tokhmi.xyz',
        'nitter.catalyst.sx',
        'unofficialbird.com',
        'nitter.projectsegfau.lt',
        'nitter.eu.projectsegfau.lt',
        'singapore.unofficialbird.com',
        'canada.unofficialbird.com',
        'india.unofficialbird.com',
        'nederland.unofficialbird.com',
        'uk.unofficialbird.com',
        'n.l5.ca',
        'nitter.slipfox.xyz',
        'nitter.soopy.moe',
        'nitter.qwik.space',
        'read.whatever.social',
        'nitter.rawbit.ninja',
        'nt.vern.cc',
        'ntr.odyssey346.dev',
        'nitter.ir',
        'nitter.privacytools.io',
        'nitter.sneed.network',
        'n.sneed.network',
        'nitter.manasiwibi.com',
        'nitter.smnz.de',
        'nitter.twei.space',
        'nitter.inpt.fr',
        'nitter.d420.de',
        'nitter.caioalonso.com',
        'nitter.at',
        'nitter.drivet.xyz',
        'nitter.pw',
        'nitter.nicfab.eu',
        'bird.habedieeh.re',
        'nitter.hostux.net',
        'nitter.adminforge.de',
        'nitter.platypush.tech',
        'nitter.mask.sh',
        'nitter.pufe.org',
        'nitter.us.projectsegfau.lt',
        'nitter.arcticfoxes.net',
        't.com.sb',
        'nitter.kling.gg',
        'nitter.ktachibana.party',
        'nitter.riverside.rocks',
        'nitter.girlboss.ceo',
        'nitter.lunar.icu',
        'twitter.moe.ngo',
        'nitter.freedit.eu',
        'ntr.frail.duckdns.org',
        'nitter.librenode.org',
        'n.opnxng.com',
        'nitter.plus.st',
    )

    # Instances that are still matched by _VALID_URL but are not expected to work.
    DEAD_INSTANCES = (
        # maintenance
        'nitter.ethibox.fr',

        # official, rate limited
        'nitter.net',
        # offline
        'is-nitter.resolv.ee',
        'lu-nitter.resolv.ee',
        'nitter.13ad.de',
        'nitter.40two.app',
        'nitter.cattube.org',
        'nitter.cc',
        'nitter.dark.fail',
        'nitter.himiko.cloud',
        'nitter.koyu.space',
        'nitter.mailstation.de',
        'nitter.mastodont.cat',
        'nitter.tedomum.net',
        'nitter.tokhmi.xyz',
        'nitter.weaponizedhumiliation.com',
        'nitter.vxempire.xyz',
        'tweet.lambda.dance',
        'nitter.ca',
        'nitter.42l.fr',
        'nitter.pussthecat.org',
        'nitter.nixnet.services',
        'nitter.eu',
        'nitter.actionsack.com',
        'nitter.hu',
        'twitr.gq',
        'nittereu.moomoo.me',
        'bird.from.tf',
        'twitter.grimneko.de',
        'nitter.alefvanoon.xyz',
        'n.hyperborea.cloud',
        'twitter.mstdn.social',
        'nitter.silkky.cloud',
        'nttr.stream',
        'fuckthesacklers.network',
        'nitter.govt.land',
        'nitter.datatunnel.xyz',
        'de.nttr.stream',
        'twtr.bch.bar',
        'nitter.exonip.de',
        'nitter.mastodon.pro',
        'nitter.notraxx.ch',
        'nitter.skrep.in',
        'nitter.snopyta.org',
    )

    INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES

    # Alternation of every known host name, escaped for use inside _VALID_URL.
    _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
    _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
    # Host used to build the test URLs below. A few hosts are listed in both
    # HTTP_INSTANCES and DEAD_INSTANCES; those are excluded so the tests are
    # never pointed at an instance this file itself marks as dead.
    # (Set arithmetic is used because a comprehension body cannot see
    # class-level names.)
    current_instance = random.choice(sorted(set(HTTP_INSTANCES) - set(DEAD_INSTANCES)))

    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'md5:7890a9277da4639ab624dd899424c5d8',
                'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20201008',
                'timestamp': 1602183720,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # normal video
            'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 're:^Le *Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': f'https://{current_instance}/Le___Doc',
                'upload_date': '20200829',
                'timestamp': 1598711340,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': 'md5:8290664aabb43b9189145c008386bf12',
                'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': f'https://{current_instance}/mozilla',
                'upload_date': '20201027',
                'timestamp': 1603820940,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
        }, {  # not the first tweet but main-tweet
            'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
            'info_dict': {
                'id': '1354848277481414657',
                'ext': 'mp4',
                'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
                'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20210128',
                'timestamp': 1611855960,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # no OpenGraph title
            'url': f'https://{current_instance}/LocalBateman/status/1678455464038735895#m',
            'info_dict': {
                'id': '1678455464038735895',
                'ext': 'mp4',
                'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?',
                'description': 'Local man, what did Romanians ever do to you?',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Your Typical Local Man',
                'uploader_id': 'LocalBateman',
                'uploader_url': f'https://{current_instance}/LocalBateman',
                'upload_date': '20230710',
                'timestamp': 1689009900,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
            'params': {'skip_download': 'm3u8'},
        },
    ]

    def _real_extract(self, url):
        """Download the tweet page at *url* and build its info dict.

        The video URL lookup is the only search done without ``fatal=False``
        or a ``default``, so a page without a video is expected to fail
        there; every other field is best-effort and may be ``None``.

        Returns a dict with ``id``, ``title``, ``description``, ``uploader``,
        ``uploader_id``, ``uploader_url``, ``timestamp``, ``formats``,
        ``thumbnail``, ``thumbnails`` and the ``*_count`` fields.
        """
        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
        parsed_url = urllib.parse.urlparse(url)
        base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'

        # NOTE(review): presumably this cookie makes the instance expose HLS
        # playback URLs in the page - confirm against Nitter's preferences.
        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
        full_webpage = webpage = self._download_webpage(url, video_id)

        # Restrict tweet-specific searches to the markup from the main tweet
        # onwards, so earlier tweets on the same page are not picked up.
        main_tweet_start = full_webpage.find('class="main-tweet"')
        if main_tweet_start != -1:
            webpage = full_webpage[main_tweet_start:]

        video_path = self._html_search_regex(
            r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
        video_url = f'{base_url}{video_path}'
        ext = determine_ext(video_url)

        if ext == 'unknown_video':
            # No recognisable file extension: treat the URL as an HLS manifest.
            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
        else:
            formats = [{
                'url': video_url,
                'ext': ext,
            }]

        description = self._og_search_description(full_webpage, default=None) or self._html_search_regex(
            r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)

        # Prefer the handle shown on the page; fall back to the one in the URL.
        uploader_id = self._html_search_regex(
            r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id

        uploader = self._html_search_regex(
            r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)

        # "<uploader> - <description>", using only the parts that were found
        # so a missing description never yields a literal "None" in the title.
        title = ' - '.join(filter(None, (uploader, description))) or None

        counts = {}
        for field, icon in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment')):
            raw_count = self._html_search_regex(
                fr'<span[^>]+class="icon-{icon}[^>]*></span>([^<]*)</div>',
                webpage, f'{field} count', fatal=False)
            # An icon followed by no text is reported as zero; a missing
            # icon leaves the lookup result to parse_count.
            counts[f'{field}_count'] = 0 if raw_count == '' else parse_count(raw_count)

        thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url')
        if not thumbnail:
            poster = self._html_search_regex(
                r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
            # Only build a URL when a poster was actually found; otherwise
            # the result would be the bogus '<base_url>None'.
            if poster:
                thumbnail = remove_end(f'{base_url}{poster}', '%3Asmall')

        # Size variants are addressed by appending '%3A<size>' to the base URL.
        thumbnails = [
            {'id': id_, 'url': f'{thumbnail}%3A{id_}'}
            for id_ in ('thumb', 'small', 'large', 'medium', 'orig')
        ] if thumbnail else []

        date = self._html_search_regex(
            r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
            webpage, 'upload date', default='').replace('·', '')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'timestamp': unified_timestamp(date),
            'uploader_id': uploader_id,
            'uploader_url': f'{base_url}/{uploader_id}',
            'formats': formats,
            'thumbnails': thumbnails,
            'thumbnail': thumbnail,
            **counts,
        }