]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/nitter.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / nitter.py
1 from .common import InfoExtractor
2 from ..compat import compat_urlparse
3 from ..utils import (
4 parse_count,
5 unified_timestamp,
6 remove_end,
7 determine_ext,
8 )
9 import re
10 import random
11
12
class NitterIE(InfoExtractor):
    """Extractor for videos attached to tweets served by Nitter instances.

    Matches URLs of the form ``https?://<instance>/<uploader_id>/status/<id>``
    where ``<instance>`` is any host listed in ``INSTANCES``.
    """

    # Taken from https://github.com/zedeus/nitter/wiki/Instances

    # Instances reachable only through Tor (.onion) or I2P (.i2p)
    NON_HTTP_INSTANCES = (
        '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
        'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
        'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
        'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
        'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
        'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
        '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
        'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
        'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
        'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
        'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
        'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
        'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
        'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
        'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
        'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
        'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',

        'nitter.i2p',
        'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',

        'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
    )

    # Instances served over the regular web
    HTTP_INSTANCES = (
        'nitter.lacontrevoie.fr',
        'nitter.fdn.fr',
        'nitter.1d4.us',
        'nitter.kavin.rocks',
        'nitter.unixfox.eu',
        'nitter.domain.glass',
        'nitter.namazso.eu',
        'birdsite.xanny.family',
        'nitter.moomoo.me',
        'bird.trom.tf',
        'nitter.it',
        'twitter.censors.us',
        'nitter.grimneko.de',
        'twitter.076.ne.jp',
        'nitter.fly.dev',
        'notabird.site',
        'nitter.weiler.rocks',
        'nitter.sethforprivacy.com',
        'nitter.cutelab.space',
        'nitter.nl',
        'nitter.mint.lgbt',
        'nitter.bus-hit.me',
        'nitter.esmailelbob.xyz',
        'tw.artemislena.eu',
        'nitter.winscloud.net',
        'nitter.tiekoetter.com',
        'nitter.spaceint.fr',
        'nitter.privacy.com.de',
        'nitter.poast.org',
        'nitter.bird.froth.zone',
        'nitter.dcs0.hu',
        'twitter.dr460nf1r3.org',
        'nitter.garudalinux.org',
        'twitter.femboy.hu',
        'nitter.cz',
        'nitter.privacydev.net',
        'nitter.evil.site',
        'tweet.lambda.dance',
        'nitter.kylrth.com',
        'nitter.foss.wtf',
        'nitter.priv.pw',
        'nitter.tokhmi.xyz',
        'nitter.catalyst.sx',
        'unofficialbird.com',
        'nitter.projectsegfau.lt',
        'nitter.eu.projectsegfau.lt',
        'singapore.unofficialbird.com',
        'canada.unofficialbird.com',
        'india.unofficialbird.com',
        'nederland.unofficialbird.com',
        'uk.unofficialbird.com',
        'n.l5.ca',
        'nitter.slipfox.xyz',
        'nitter.soopy.moe',
        'nitter.qwik.space',
        'read.whatever.social',
        'nitter.rawbit.ninja',
        'nt.vern.cc',
        'ntr.odyssey346.dev',
        'nitter.ir',
        'nitter.privacytools.io',
        'nitter.sneed.network',
        'n.sneed.network',
        'nitter.manasiwibi.com',
        'nitter.smnz.de',
        'nitter.twei.space',
        'nitter.inpt.fr',
        'nitter.d420.de',
        'nitter.caioalonso.com',
        'nitter.at',
        'nitter.drivet.xyz',
        'nitter.pw',
        'nitter.nicfab.eu',
        'bird.habedieeh.re',
        'nitter.hostux.net',
        'nitter.adminforge.de',
        'nitter.platypush.tech',
        'nitter.mask.sh',
        'nitter.pufe.org',
        'nitter.us.projectsegfau.lt',
        'nitter.arcticfoxes.net',
        't.com.sb',
        'nitter.kling.gg',
        'nitter.ktachibana.party',
        'nitter.riverside.rocks',
        'nitter.girlboss.ceo',
        'nitter.lunar.icu',
        'twitter.moe.ngo',
        'nitter.freedit.eu',
        'ntr.frail.duckdns.org',
        'nitter.librenode.org',
        'n.opnxng.com',
        'nitter.plus.st',
    )

    # Instances still accepted in URLs but not expected to respond
    DEAD_INSTANCES = (
        # maintenance
        'nitter.ethibox.fr',

        # official, rate limited
        'nitter.net',
        # offline
        'is-nitter.resolv.ee',
        'lu-nitter.resolv.ee',
        'nitter.13ad.de',
        'nitter.40two.app',
        'nitter.cattube.org',
        'nitter.cc',
        'nitter.dark.fail',
        'nitter.himiko.cloud',
        'nitter.koyu.space',
        'nitter.mailstation.de',
        'nitter.mastodont.cat',
        'nitter.tedomum.net',
        'nitter.tokhmi.xyz',
        'nitter.weaponizedhumiliation.com',
        'nitter.vxempire.xyz',
        'tweet.lambda.dance',
        'nitter.ca',
        'nitter.42l.fr',
        'nitter.pussthecat.org',
        'nitter.nixnet.services',
        'nitter.eu',
        'nitter.actionsack.com',
        'nitter.hu',
        'twitr.gq',
        'nittereu.moomoo.me',
        'bird.from.tf',
        'twitter.grimneko.de',
        'nitter.alefvanoon.xyz',
        'n.hyperborea.cloud',
        'twitter.mstdn.social',
        'nitter.silkky.cloud',
        'nttr.stream',
        'fuckthesacklers.network',
        'nitter.govt.land',
        'nitter.datatunnel.xyz',
        'de.nttr.stream',
        'twtr.bch.bar',
        'nitter.exonip.de',
        'nitter.mastodon.pro',
        'nitter.notraxx.ch',
        'nitter.skrep.in',
        'nitter.snopyta.org',
    )

    INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES

    _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
    _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
    # Host used to build the test URLs below, picked at random on import.
    # A few hosts ('nitter.tokhmi.xyz', 'tweet.lambda.dance') are listed in both
    # HTTP_INSTANCES and DEAD_INSTANCES, so hosts marked dead are excluded here.
    # set arithmetic (not a comprehension) is used because comprehensions in a
    # class body cannot see sibling class attributes.
    current_instance = random.choice(sorted(set(HTTP_INSTANCES) - set(DEAD_INSTANCES)))

    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'md5:7890a9277da4639ab624dd899424c5d8',
                'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20201008',
                'timestamp': 1602183720,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # normal video
            'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 're:^Le *Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': f'https://{current_instance}/Le___Doc',
                'upload_date': '20200829',
                'timestamp': 1598711340,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': 'md5:8290664aabb43b9189145c008386bf12',
                'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': f'https://{current_instance}/mozilla',
                'upload_date': '20201027',
                'timestamp': 1603820940,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
        }, {  # not the first tweet but main-tweet
            'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
            'info_dict': {
                'id': '1354848277481414657',
                'ext': 'mp4',
                'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
                'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20210128',
                'timestamp': 1611855960,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            }
        }, {  # no OpenGraph title
            'url': f'https://{current_instance}/LocalBateman/status/1678455464038735895#m',
            'info_dict': {
                'id': '1678455464038735895',
                'ext': 'mp4',
                'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?',
                'description': 'Local man, what did Romanians ever do to you?',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Your Typical Local Man',
                'uploader_id': 'LocalBateman',
                'uploader_url': f'https://{current_instance}/LocalBateman',
                'upload_date': '20230710',
                'timestamp': 1689009900,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
            'params': {'skip_download': 'm3u8'},
        }
    ]

    def _real_extract(self, url):
        """Scrape a Nitter status page and return the info dict for its video.

        The page is fetched from the same instance the URL points at; relative
        media and thumbnail paths found in the HTML are resolved against that
        instance. A missing video URL is fatal; every other field is
        best-effort and is left as None (or an empty list) when not found.
        """
        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
        parsed_url = compat_urlparse.urlparse(url)
        base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'

        # NOTE(review): presumably makes the instance expose HLS playlists in
        # the player markup - confirm against Nitter's preferences handling.
        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
        full_webpage = webpage = self._download_webpage(url, video_id)

        # Restrict tweet-level scraping to the main tweet onwards so that
        # earlier tweets on the page are not picked up instead.
        main_tweet_start = full_webpage.find('class="main-tweet"')
        if main_tweet_start != -1:
            webpage = full_webpage[main_tweet_start:]

        video_path = self._html_search_regex(
            r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
        video_url = f'{base_url}{video_path}'
        ext = determine_ext(video_url)

        if ext == 'unknown_video':
            # No recognisable file extension: treat the URL as an HLS manifest
            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
        else:
            formats = [{
                'url': video_url,
                'ext': ext,
            }]

        description = (
            self._og_search_description(full_webpage, default=None)
            or self._html_search_regex(
                r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False))

        # Prefer the handle shown on the page over the one taken from the URL
        uploader_id = self._html_search_regex(
            r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id

        uploader = self._html_search_regex(
            r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)

        # "<uploader> - <description>", using only the parts that were found so
        # a missing description never yields a title like "Name - None".
        title = ' - '.join(filter(None, (uploader, description))) or None

        counts = {}
        for field, icon in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment')):
            count = self._html_search_regex(
                fr'<span[^>]+class="icon-{icon}[^>]*></span>([^<]*)</div>',
                webpage, f'{field} count', fatal=False)
            # An icon followed by no text is reported as zero
            counts[f'{field}_count'] = 0 if count == '' else parse_count(count)

        # default=None: a missing og:image is not worth a warning because the
        # <video poster> attribute is tried next.
        thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url', default=None)
        if not thumbnail:
            poster_path = self._html_search_regex(
                r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
            if poster_path:
                # Strip the size suffix so the size variants below can be appended
                thumbnail = remove_end(f'{base_url}{poster_path}', '%3Asmall')

        # Only derive size variants when there is a base URL to derive them from
        thumbnails = [
            {'id': size, 'url': f'{thumbnail}%3A{size}'}
            for size in ('thumb', 'small', 'large', 'medium', 'orig')
        ] if thumbnail else []

        date = self._html_search_regex(
            r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
            webpage, 'upload date', default='').replace('·', '')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'timestamp': unified_timestamp(date),
            'uploader_id': uploader_id,
            'uploader_url': f'{base_url}/{uploader_id}',
            'formats': formats,
            'thumbnails': thumbnails,
            'thumbnail': thumbnail,
            **counts,
        }