]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/nitter.py
[compat, networking] Deprecate old functions (#2861)
[yt-dlp.git] / yt_dlp / extractor / nitter.py
CommitLineData
bb8a73a0 1from .common import InfoExtractor
2from ..compat import compat_urlparse
3from ..utils import (
4 parse_count,
bb8a73a0 5 unified_timestamp,
6 remove_end,
7 determine_ext,
8)
9import re
a4ddaf23 10import random
bb8a73a0 11
12
class NitterIE(InfoExtractor):
    """Extractor for videos attached to tweets viewed through a Nitter instance."""

    # Taken from https://github.com/zedeus/nitter/wiki/Instances

    # Hidden-service / I2P hosts; never picked for `current_instance`.
    NON_HTTP_INSTANCES = (
        '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
        'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
        'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
        'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
        'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
        'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
        '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
        'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
        'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
        'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
        'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
        'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
        'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
        'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
        'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
        'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
        'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',

        'nitter.i2p',
        'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',

        'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
    )

    # Clearnet hosts; `current_instance` is drawn from this tuple.
    HTTP_INSTANCES = (
        'nitter.lacontrevoie.fr',
        'nitter.fdn.fr',
        'nitter.1d4.us',
        'nitter.kavin.rocks',
        'nitter.unixfox.eu',
        'nitter.domain.glass',
        'nitter.namazso.eu',
        'birdsite.xanny.family',
        'nitter.moomoo.me',
        'bird.trom.tf',
        'nitter.it',
        'twitter.censors.us',
        'nitter.grimneko.de',
        'twitter.076.ne.jp',
        'nitter.fly.dev',
        'notabird.site',
        'nitter.weiler.rocks',
        'nitter.sethforprivacy.com',
        'nitter.cutelab.space',
        'nitter.nl',
        'nitter.mint.lgbt',
        'nitter.bus-hit.me',
        'nitter.esmailelbob.xyz',
        'tw.artemislena.eu',
        'nitter.winscloud.net',
        'nitter.tiekoetter.com',
        'nitter.spaceint.fr',
        'nitter.privacy.com.de',
        'nitter.poast.org',
        'nitter.bird.froth.zone',
        'nitter.dcs0.hu',
        'twitter.dr460nf1r3.org',
        'nitter.garudalinux.org',
        'twitter.femboy.hu',
        'nitter.cz',
        'nitter.privacydev.net',
        'nitter.evil.site',
        'tweet.lambda.dance',
        'nitter.kylrth.com',
        'nitter.foss.wtf',
        'nitter.priv.pw',
        'nitter.tokhmi.xyz',
        'nitter.catalyst.sx',
        'unofficialbird.com',
        'nitter.projectsegfau.lt',
        'nitter.eu.projectsegfau.lt',
        'singapore.unofficialbird.com',
        'canada.unofficialbird.com',
        'india.unofficialbird.com',
        'nederland.unofficialbird.com',
        'uk.unofficialbird.com',
        'n.l5.ca',
        'nitter.slipfox.xyz',
        'nitter.soopy.moe',
        'nitter.qwik.space',
        'read.whatever.social',
        'nitter.rawbit.ninja',
        'nt.vern.cc',
        'ntr.odyssey346.dev',
        'nitter.ir',
        'nitter.privacytools.io',
        'nitter.sneed.network',
        'n.sneed.network',
        'nitter.manasiwibi.com',
        'nitter.smnz.de',
        'nitter.twei.space',
        'nitter.inpt.fr',
        'nitter.d420.de',
        'nitter.caioalonso.com',
        'nitter.at',
        'nitter.drivet.xyz',
        'nitter.pw',
        'nitter.nicfab.eu',
        'bird.habedieeh.re',
        'nitter.hostux.net',
        'nitter.adminforge.de',
        'nitter.platypush.tech',
        'nitter.mask.sh',
        'nitter.pufe.org',
        'nitter.us.projectsegfau.lt',
        'nitter.arcticfoxes.net',
        't.com.sb',
        'nitter.kling.gg',
        'nitter.ktachibana.party',
        'nitter.riverside.rocks',
        'nitter.girlboss.ceo',
        'nitter.lunar.icu',
        'twitter.moe.ngo',
        'nitter.freedit.eu',
        'ntr.frail.duckdns.org',
        'nitter.librenode.org',
        'n.opnxng.com',
        'nitter.plus.st',
    )

    # Hosts still matched by _VALID_URL but never picked for `current_instance`.
    # NOTE(review): 'nitter.tokhmi.xyz' and 'tweet.lambda.dance' are listed both
    # here and in HTTP_INSTANCES - confirm which list is the current one.
    DEAD_INSTANCES = (
        # maintenance
        'nitter.ethibox.fr',

        # official, rate limited
        'nitter.net',
        # offline
        'is-nitter.resolv.ee',
        'lu-nitter.resolv.ee',
        'nitter.13ad.de',
        'nitter.40two.app',
        'nitter.cattube.org',
        'nitter.cc',
        'nitter.dark.fail',
        'nitter.himiko.cloud',
        'nitter.koyu.space',
        'nitter.mailstation.de',
        'nitter.mastodont.cat',
        'nitter.tedomum.net',
        'nitter.tokhmi.xyz',
        'nitter.weaponizedhumiliation.com',
        'nitter.vxempire.xyz',
        'tweet.lambda.dance',
        'nitter.ca',
        'nitter.42l.fr',
        'nitter.pussthecat.org',
        'nitter.nixnet.services',
        'nitter.eu',
        'nitter.actionsack.com',
        'nitter.hu',
        'twitr.gq',
        'nittereu.moomoo.me',
        'bird.from.tf',
        'twitter.grimneko.de',
        'nitter.alefvanoon.xyz',
        'n.hyperborea.cloud',
        'twitter.mstdn.social',
        'nitter.silkky.cloud',
        'nttr.stream',
        'fuckthesacklers.network',
        'nitter.govt.land',
        'nitter.datatunnel.xyz',
        'de.nttr.stream',
        'twtr.bch.bar',
        'nitter.exonip.de',
        'nitter.mastodon.pro',
        'nitter.notraxx.ch',
        'nitter.skrep.in',
        'nitter.snopyta.org',
    )

    INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES

    # Every known host is escaped and OR-ed together so that only listed
    # instances are matched by this extractor.
    _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
    _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
    # Host used to build the test URLs below; chosen at random once, when the
    # class body is executed.
    current_instance = random.choice(HTTP_INSTANCES)

    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'md5:7890a9277da4639ab624dd899424c5d8',
                'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20201008',
                'timestamp': 1602183720,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # normal video
            'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 're:^Le *Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': f'https://{current_instance}/Le___Doc',
                'upload_date': '20200829',
                'timestamp': 1598711340,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': 'md5:8290664aabb43b9189145c008386bf12',
                'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': f'https://{current_instance}/mozilla',
                'upload_date': '20201027',
                'timestamp': 1603820940,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
        }, {  # not the first tweet but main-tweet
            'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
            'info_dict': {
                'id': '1354848277481414657',
                'ext': 'mp4',
                'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
                'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20210128',
                'timestamp': 1611855960,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        },
    ]

    def _real_extract(self, url):
        """Download the tweet page at *url* and build its info dict.

        The returned dict carries the tweet id, title, description, uploader
        details, timestamp, formats, thumbnails and the view/like/repost/
        comment counts scraped from the page.
        """
        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
        parsed_url = compat_urlparse.urlparse(url)
        base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'

        # NOTE(review): presumably makes the instance expose HLS playback URLs
        # in the page markup - confirm against Nitter's preferences handling.
        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
        full_webpage = webpage = self._download_webpage(url, video_id)

        # Restrict scraping to the main tweet so that earlier tweets of the
        # same thread do not win the regex searches below. str.find() signals
        # "not found" with -1, so compare against that explicitly.
        main_tweet_start = full_webpage.find('class="main-tweet"')
        if main_tweet_start != -1:
            webpage = full_webpage[main_tweet_start:]

        # urljoin() keeps an already-absolute URL intact and inserts the
        # separating slash when needed, unlike plain concatenation.
        video_url = compat_urlparse.urljoin(base_url, self._html_search_regex(
            r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url'))
        ext = determine_ext(video_url)

        if ext == 'unknown_video':
            # No recognisable extension: treat the URL as an HLS manifest.
            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
        else:
            formats = [{
                'url': video_url,
                'ext': ext,
            }]

        description = self._og_search_description(full_webpage) or self._html_search_regex(
            r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)

        uploader_id = self._html_search_regex(
            r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id

        uploader = self._html_search_regex(
            r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)

        # Only combine the parts that were actually found, so a missing
        # description never produces a title such as "<uploader> - None".
        if uploader and description:
            title = f'{uploader} - {description}'
        else:
            title = description or uploader

        counts = {}
        for field, icon in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment')):
            raw_count = self._html_search_regex(
                fr'<span[^>]+class="icon-{icon}[^>]*></span>([^<]*)</div>',
                webpage, f'{field} count', fatal=False)
            # An empty counter element is reported as zero.
            counts[f'{field}_count'] = 0 if raw_count == '' else parse_count(raw_count)

        thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url')
        if not thumbnail:
            poster = self._html_search_regex(
                r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
            # Without this guard a missing poster would be formatted into the
            # bogus URL "<base_url>None".
            if poster:
                thumbnail = remove_end(compat_urlparse.urljoin(base_url, poster), '%3Asmall')

        # Size variants are addressed by appending an URL-encoded ":<size>".
        thumbnails = [
            {'id': size, 'url': f'{thumbnail}%3A{size}'}
            for size in ('thumb', 'small', 'large', 'medium', 'orig')
        ] if thumbnail else []

        date = self._html_search_regex(
            r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
            webpage, 'upload date', default='').replace('·', '')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'timestamp': unified_timestamp(date),
            'uploader_id': uploader_id,
            'uploader_url': f'{base_url}/{uploader_id}',
            'formats': formats,
            'thumbnails': thumbnails,
            'thumbnail': thumbnail,
            **counts,
        }