1 from .common
import InfoExtractor
2 from ..compat
import compat_urlparse
13 class NitterIE(InfoExtractor
):
14 # Taken from https://github.com/zedeus/nitter/wiki/Instances
16 NON_HTTP_INSTANCES
= (
17 '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
18 'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
19 'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
20 'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
21 'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
22 'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
23 '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
24 'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
25 'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
26 'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
27 'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
28 'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
29 'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
30 'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
31 'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
32 'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
33 'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',
36 'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',
38 'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
42 'nitter.lacontrevoie.fr',
47 'nitter.domain.glass',
49 'birdsite.xanny.family',
58 'nitter.weiler.rocks',
59 'nitter.sethforprivacy.com',
60 'nitter.cutelab.space',
64 'nitter.esmailelbob.xyz',
66 'nitter.winscloud.net',
67 'nitter.tiekoetter.com',
69 'nitter.privacy.com.de',
71 'nitter.bird.froth.zone',
73 'twitter.dr460nf1r3.org',
74 'nitter.garudalinux.org',
77 'nitter.privacydev.net',
86 'nitter.projectsegfau.lt',
87 'nitter.eu.projectsegfau.lt',
88 'singapore.unofficialbird.com',
89 'canada.unofficialbird.com',
90 'india.unofficialbird.com',
91 'nederland.unofficialbird.com',
92 'uk.unofficialbird.com',
97 'read.whatever.social',
98 'nitter.rawbit.ninja',
100 'ntr.odyssey346.dev',
102 'nitter.privacytools.io',
103 'nitter.sneed.network',
105 'nitter.manasiwibi.com',
110 'nitter.caioalonso.com',
117 'nitter.adminforge.de',
118 'nitter.platypush.tech',
121 'nitter.us.projectsegfau.lt',
122 'nitter.arcticfoxes.net',
125 'nitter.ktachibana.party',
126 'nitter.riverside.rocks',
127 'nitter.girlboss.ceo',
131 'ntr.frail.duckdns.org',
132 'nitter.librenode.org',
141 # official, rate limited
144 'is-nitter.resolv.ee',
145 'lu-nitter.resolv.ee',
148 'nitter.cattube.org',
151 'nitter.himiko.cloud',
153 'nitter.mailstation.de',
154 'nitter.mastodont.cat',
155 'nitter.tedomum.net',
157 'nitter.weaponizedhumiliation.com',
158 'nitter.vxempire.xyz',
159 'tweet.lambda.dance',
162 'nitter.pussthecat.org',
163 'nitter.nixnet.services',
165 'nitter.actionsack.com',
168 'nittereu.moomoo.me',
170 'twitter.grimneko.de',
171 'nitter.alefvanoon.xyz',
172 'n.hyperborea.cloud',
173 'twitter.mstdn.social',
174 'nitter.silkky.cloud',
176 'fuckthesacklers.network',
178 'nitter.datatunnel.xyz',
182 'nitter.mastodon.pro',
185 'nitter.snopyta.org',
188 INSTANCES
= NON_HTTP_INSTANCES
+ HTTP_INSTANCES
+ DEAD_INSTANCES
190 _INSTANCES_RE
= f
'(?:{"|".join(map(re.escape, INSTANCES))})'
191 _VALID_URL
= fr
'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
192 current_instance
= random
.choice(HTTP_INSTANCES
)
196 # GIF (wrapped in mp4)
197 'url': f
'https://{current_instance}/firefox/status/1314279897502629888#m',
199 'id': '1314279897502629888',
201 'title': 'md5:7890a9277da4639ab624dd899424c5d8',
202 'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
203 'thumbnail': r
're:^https?://.*\.jpg$',
204 'uploader': 'Firefox 🔥',
205 'uploader_id': 'firefox',
206 'uploader_url': f
'https://{current_instance}/firefox',
207 'upload_date': '20201008',
208 'timestamp': 1602183720,
211 'comment_count': int,
214 'url': f
'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
216 'id': '1299715685392756737',
218 'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
219 'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
220 'thumbnail': r
're:^https?://.*\.jpg$',
221 'uploader': 're:^Le *Doc',
222 'uploader_id': 'Le___Doc',
223 'uploader_url': f
'https://{current_instance}/Le___Doc',
224 'upload_date': '20200829',
225 'timestamp': 1598711340,
229 'comment_count': int,
231 }, { # video embed in a "Streaming Political Ads" box
232 'url': f
'https://{current_instance}/mozilla/status/1321147074491092994#m',
234 'id': '1321147074491092994',
236 'title': 'md5:8290664aabb43b9189145c008386bf12',
237 'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
238 'thumbnail': r
're:^https?://.*\.jpg$',
239 'uploader': 'Mozilla',
240 'uploader_id': 'mozilla',
241 'uploader_url': f
'https://{current_instance}/mozilla',
242 'upload_date': '20201027',
243 'timestamp': 1603820940,
247 'comment_count': int,
249 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
250 }, { # not the first tweet but main-tweet
251 'url': f
'https://{current_instance}/firefox/status/1354848277481414657#m',
253 'id': '1354848277481414657',
255 'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
256 'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
257 'thumbnail': r
're:^https?://.*\.jpg$',
258 'uploader': 'Firefox 🔥',
259 'uploader_id': 'firefox',
260 'uploader_url': f
'https://{current_instance}/firefox',
261 'upload_date': '20210128',
262 'timestamp': 1611855960,
266 'comment_count': int,
271 def _real_extract(self
, url
):
272 video_id
, uploader_id
= self
._match
_valid
_url
(url
).group('id', 'uploader_id')
273 parsed_url
= compat_urlparse
.urlparse(url
)
274 base_url
= f
'{parsed_url.scheme}://{parsed_url.netloc}'
276 self
._set
_cookie
(parsed_url
.netloc
, 'hlsPlayback', 'on')
277 full_webpage
= webpage
= self
._download
_webpage
(url
, video_id
)
279 main_tweet_start
= full_webpage
.find('class="main-tweet"')
280 if main_tweet_start
> 0:
281 webpage
= full_webpage
[main_tweet_start
:]
283 video_url
= '%s%s' % (base_url
, self
._html
_search
_regex
(
284 r
'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage
, 'video url'))
285 ext
= determine_ext(video_url
)
287 if ext
== 'unknown_video':
288 formats
= self
._extract
_m
3u8_formats
(video_url
, video_id
, ext
='mp4')
295 title
= description
= self
._og
_search
_description
(full_webpage
) or self
._html
_search
_regex
(
296 r
'<div class="tweet-content[^>]+>([^<]+)</div>', webpage
, 'title', fatal
=False)
298 uploader_id
= self
._html
_search
_regex
(
299 r
'<a class="username"[^>]+title="@([^"]+)"', webpage
, 'uploader id', fatal
=False) or uploader_id
301 uploader
= self
._html
_search
_regex
(
302 r
'<a class="fullname"[^>]+title="([^"]+)"', webpage
, 'uploader name', fatal
=False)
304 title
= f
'{uploader} - {title}'
307 f
'{x[0]}_count': self
._html
_search
_regex
(
308 fr
'<span[^>]+class="icon-{x[1]}[^>]*></span>([^<]*)</div>',
309 webpage
, f
'{x[0]} count', fatal
=False)
310 for x
in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment'))
312 counts
= {field: 0 if count == '' else parse_count(count) for field, count in counts.items()}
315 self
._html
_search
_meta
('og:image', full_webpage
, 'thumbnail url')
316 or remove_end('%s%s' % (base_url
, self
._html
_search
_regex
(
317 r
'<video[^>]+poster="([^"]+)"', webpage
, 'thumbnail url', fatal
=False)), '%3Asmall'))
320 {'id': id, 'url': f'{thumbnail}
%3A{id}
'}
321 for id in ('thumb
', 'small
', 'large
', 'medium
', 'orig
')
324 date = self._html_search_regex(
325 r'<span
[^
>]+class="tweet-date"[^
>]*><a
[^
>]+title
="([^"]+)"',
326 webpage, 'upload date', default='').replace('·', '')
331 'description': description,
332 'uploader': uploader,
333 'timestamp': unified_timestamp(date),
334 'uploader_id': uploader_id,
335 'uploader_url': f'{base_url}/{uploader_id}',
337 'thumbnails': thumbnails,
338 'thumbnail': thumbnail,