1 from .common
import InfoExtractor
2 from ..compat
import compat_urlparse
13 class NitterIE(InfoExtractor
):
14 # Taken from https://github.com/zedeus/nitter/wiki/Instances
16 NON_HTTP_INSTANCES
= (
17 '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
18 'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
19 'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
20 'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
21 'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
22 'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
23 '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
24 'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
25 'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
26 'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
27 'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
28 'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
29 'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
30 'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
31 'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
32 'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
33 'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',
36 'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',
38 'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
42 'nitter.lacontrevoie.fr',
47 'nitter.domain.glass',
49 'birdsite.xanny.family',
58 'nitter.weiler.rocks',
59 'nitter.sethforprivacy.com',
60 'nitter.cutelab.space',
64 'nitter.esmailelbob.xyz',
66 'nitter.winscloud.net',
67 'nitter.tiekoetter.com',
69 'nitter.privacy.com.de',
71 'nitter.bird.froth.zone',
73 'twitter.dr460nf1r3.org',
74 'nitter.garudalinux.org',
77 'nitter.privacydev.net',
86 'nitter.projectsegfau.lt',
87 'nitter.eu.projectsegfau.lt',
88 'singapore.unofficialbird.com',
89 'canada.unofficialbird.com',
90 'india.unofficialbird.com',
91 'nederland.unofficialbird.com',
92 'uk.unofficialbird.com',
97 'read.whatever.social',
98 'nitter.rawbit.ninja',
100 'ntr.odyssey346.dev',
102 'nitter.privacytools.io',
103 'nitter.sneed.network',
105 'nitter.manasiwibi.com',
110 'nitter.caioalonso.com',
117 'nitter.adminforge.de',
118 'nitter.platypush.tech',
121 'nitter.us.projectsegfau.lt',
122 'nitter.arcticfoxes.net',
125 'nitter.ktachibana.party',
126 'nitter.riverside.rocks',
127 'nitter.girlboss.ceo',
131 'ntr.frail.duckdns.org',
132 'nitter.librenode.org',
141 # official, rate limited
144 'is-nitter.resolv.ee',
145 'lu-nitter.resolv.ee',
148 'nitter.cattube.org',
151 'nitter.himiko.cloud',
153 'nitter.mailstation.de',
154 'nitter.mastodont.cat',
155 'nitter.tedomum.net',
157 'nitter.weaponizedhumiliation.com',
158 'nitter.vxempire.xyz',
159 'tweet.lambda.dance',
162 'nitter.pussthecat.org',
163 'nitter.nixnet.services',
165 'nitter.actionsack.com',
168 'nittereu.moomoo.me',
170 'twitter.grimneko.de',
171 'nitter.alefvanoon.xyz',
172 'n.hyperborea.cloud',
173 'twitter.mstdn.social',
174 'nitter.silkky.cloud',
176 'fuckthesacklers.network',
178 'nitter.datatunnel.xyz',
182 'nitter.mastodon.pro',
185 'nitter.snopyta.org',
188 INSTANCES
= NON_HTTP_INSTANCES
+ HTTP_INSTANCES
+ DEAD_INSTANCES
190 _INSTANCES_RE
= f
'(?:{"|".join(map(re.escape, INSTANCES))})'
191 _VALID_URL
= fr
'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
192 current_instance
= random
.choice(HTTP_INSTANCES
)
196 # GIF (wrapped in mp4)
197 'url': f
'https://{current_instance}/firefox/status/1314279897502629888#m',
199 'id': '1314279897502629888',
201 'title': 'md5:7890a9277da4639ab624dd899424c5d8',
202 'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
203 'thumbnail': r
're:^https?://.*\.jpg$',
204 'uploader': 'Firefox 🔥',
205 'uploader_id': 'firefox',
206 'uploader_url': f
'https://{current_instance}/firefox',
207 'upload_date': '20201008',
208 'timestamp': 1602183720,
211 'comment_count': int,
214 'url': f
'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
216 'id': '1299715685392756737',
218 'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
219 'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
220 'thumbnail': r
're:^https?://.*\.jpg$',
221 'uploader': 're:^Le *Doc',
222 'uploader_id': 'Le___Doc',
223 'uploader_url': f
'https://{current_instance}/Le___Doc',
224 'upload_date': '20200829',
225 'timestamp': 1598711340,
229 'comment_count': int,
231 }, { # video embed in a "Streaming Political Ads" box
232 'url': f
'https://{current_instance}/mozilla/status/1321147074491092994#m',
234 'id': '1321147074491092994',
236 'title': 'md5:8290664aabb43b9189145c008386bf12',
237 'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
238 'thumbnail': r
're:^https?://.*\.jpg$',
239 'uploader': 'Mozilla',
240 'uploader_id': 'mozilla',
241 'uploader_url': f
'https://{current_instance}/mozilla',
242 'upload_date': '20201027',
243 'timestamp': 1603820940,
247 'comment_count': int,
249 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
250 }, { # not the first tweet but main-tweet
251 'url': f
'https://{current_instance}/firefox/status/1354848277481414657#m',
253 'id': '1354848277481414657',
255 'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
256 'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
257 'thumbnail': r
're:^https?://.*\.jpg$',
258 'uploader': 'Firefox 🔥',
259 'uploader_id': 'firefox',
260 'uploader_url': f
'https://{current_instance}/firefox',
261 'upload_date': '20210128',
262 'timestamp': 1611855960,
266 'comment_count': int,
268 }, { # no OpenGraph title
269 'url': f
'https://{current_instance}/LocalBateman/status/1678455464038735895#m',
271 'id': '1678455464038735895',
273 'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?',
274 'description': 'Local man, what did Romanians ever do to you?',
275 'thumbnail': r
're:^https?://.*\.jpg$',
276 'uploader': 'Your Typical Local Man',
277 'uploader_id': 'LocalBateman',
278 'uploader_url': f
'https://{current_instance}/LocalBateman',
279 'upload_date': '20230710',
280 'timestamp': 1689009900,
284 'comment_count': int,
286 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
287 'params': {'skip_download': 'm3u8'}
,
291 def _real_extract(self
, url
):
292 video_id
, uploader_id
= self
._match
_valid
_url
(url
).group('id', 'uploader_id')
293 parsed_url
= compat_urlparse
.urlparse(url
)
294 base_url
= f
'{parsed_url.scheme}://{parsed_url.netloc}'
296 self
._set
_cookie
(parsed_url
.netloc
, 'hlsPlayback', 'on')
297 full_webpage
= webpage
= self
._download
_webpage
(url
, video_id
)
299 main_tweet_start
= full_webpage
.find('class="main-tweet"')
300 if main_tweet_start
> 0:
301 webpage
= full_webpage
[main_tweet_start
:]
303 video_url
= '%s%s' % (base_url
, self
._html
_search
_regex
(
304 r
'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage
, 'video url'))
305 ext
= determine_ext(video_url
)
307 if ext
== 'unknown_video':
308 formats
= self
._extract
_m
3u8_formats
(video_url
, video_id
, ext
='mp4')
315 title
= description
= self
._og
_search
_description
(full_webpage
, default
=None) or self
._html
_search
_regex
(
316 r
'<div class="tweet-content[^>]+>([^<]+)</div>', webpage
, 'title', fatal
=False)
318 uploader_id
= self
._html
_search
_regex
(
319 r
'<a class="username"[^>]+title="@([^"]+)"', webpage
, 'uploader id', fatal
=False) or uploader_id
321 uploader
= self
._html
_search
_regex
(
322 r
'<a class="fullname"[^>]+title="([^"]+)"', webpage
, 'uploader name', fatal
=False)
324 title
= f
'{uploader} - {title}'
327 f
'{x[0]}_count': self
._html
_search
_regex
(
328 fr
'<span[^>]+class="icon-{x[1]}[^>]*></span>([^<]*)</div>',
329 webpage
, f
'{x[0]} count', fatal
=False)
330 for x
in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment'))
332 counts
= {field: 0 if count == '' else parse_count(count) for field, count in counts.items()}
335 self
._html
_search
_meta
('og:image', full_webpage
, 'thumbnail url')
336 or remove_end('%s%s' % (base_url
, self
._html
_search
_regex
(
337 r
'<video[^>]+poster="([^"]+)"', webpage
, 'thumbnail url', fatal
=False)), '%3Asmall'))
340 {'id': id, 'url': f'{thumbnail}
%3A{id}
'}
341 for id in ('thumb
', 'small
', 'large
', 'medium
', 'orig
')
344 date = self._html_search_regex(
345 r'<span
[^
>]+class="tweet-date"[^
>]*><a
[^
>]+title
="([^"]+)"',
346 webpage, 'upload date', default='').replace('·', '')
351 'description': description,
352 'uploader': uploader,
353 'timestamp': unified_timestamp(date),
354 'uploader_id': uploader_id,
355 'uploader_url': f'{base_url}/{uploader_id}',
357 'thumbnails': thumbnails,
358 'thumbnail': thumbnail,