]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/nitter.py
[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409)
[yt-dlp.git] / yt_dlp / extractor / nitter.py
CommitLineData
e897bd82
SS
1import random
2import re
3
bb8a73a0 4from .common import InfoExtractor
5from ..compat import compat_urlparse
6from ..utils import (
e897bd82 7 determine_ext,
bb8a73a0 8 parse_count,
bb8a73a0 9 remove_end,
e897bd82 10 unified_timestamp,
bb8a73a0 11)
bb8a73a0 12
13
class NitterIE(InfoExtractor):
    """Extractor for video tweets served through Nitter front-end instances.

    A URL is accepted when its host is one of the instances listed below and
    its path has the form ``/<uploader_id>/status/<id>``.
    """

    # Taken from https://github.com/zedeus/nitter/wiki/Instances

    # Instances on overlay networks (.onion / .i2p hosts).
    NON_HTTP_INSTANCES = (
        '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
        'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
        'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
        'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
        'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
        'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
        '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
        'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
        'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
        'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
        'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
        'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
        'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
        'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
        'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
        'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
        'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',

        'nitter.i2p',
        'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',

        'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
    )

    # Regular web instances; the test URLs below are built from a random one.
    # NOTE(review): 'nitter.tokhmi.xyz' and 'tweet.lambda.dance' are also
    # listed in DEAD_INSTANCES -- confirm which list is correct.
    HTTP_INSTANCES = (
        'nitter.lacontrevoie.fr',
        'nitter.fdn.fr',
        'nitter.1d4.us',
        'nitter.kavin.rocks',
        'nitter.unixfox.eu',
        'nitter.domain.glass',
        'nitter.namazso.eu',
        'birdsite.xanny.family',
        'nitter.moomoo.me',
        'bird.trom.tf',
        'nitter.it',
        'twitter.censors.us',
        'nitter.grimneko.de',
        'twitter.076.ne.jp',
        'nitter.fly.dev',
        'notabird.site',
        'nitter.weiler.rocks',
        'nitter.sethforprivacy.com',
        'nitter.cutelab.space',
        'nitter.nl',
        'nitter.mint.lgbt',
        'nitter.bus-hit.me',
        'nitter.esmailelbob.xyz',
        'tw.artemislena.eu',
        'nitter.winscloud.net',
        'nitter.tiekoetter.com',
        'nitter.spaceint.fr',
        'nitter.privacy.com.de',
        'nitter.poast.org',
        'nitter.bird.froth.zone',
        'nitter.dcs0.hu',
        'twitter.dr460nf1r3.org',
        'nitter.garudalinux.org',
        'twitter.femboy.hu',
        'nitter.cz',
        'nitter.privacydev.net',
        'nitter.evil.site',
        'tweet.lambda.dance',
        'nitter.kylrth.com',
        'nitter.foss.wtf',
        'nitter.priv.pw',
        'nitter.tokhmi.xyz',
        'nitter.catalyst.sx',
        'unofficialbird.com',
        'nitter.projectsegfau.lt',
        'nitter.eu.projectsegfau.lt',
        'singapore.unofficialbird.com',
        'canada.unofficialbird.com',
        'india.unofficialbird.com',
        'nederland.unofficialbird.com',
        'uk.unofficialbird.com',
        'n.l5.ca',
        'nitter.slipfox.xyz',
        'nitter.soopy.moe',
        'nitter.qwik.space',
        'read.whatever.social',
        'nitter.rawbit.ninja',
        'nt.vern.cc',
        'ntr.odyssey346.dev',
        'nitter.ir',
        'nitter.privacytools.io',
        'nitter.sneed.network',
        'n.sneed.network',
        'nitter.manasiwibi.com',
        'nitter.smnz.de',
        'nitter.twei.space',
        'nitter.inpt.fr',
        'nitter.d420.de',
        'nitter.caioalonso.com',
        'nitter.at',
        'nitter.drivet.xyz',
        'nitter.pw',
        'nitter.nicfab.eu',
        'bird.habedieeh.re',
        'nitter.hostux.net',
        'nitter.adminforge.de',
        'nitter.platypush.tech',
        'nitter.mask.sh',
        'nitter.pufe.org',
        'nitter.us.projectsegfau.lt',
        'nitter.arcticfoxes.net',
        't.com.sb',
        'nitter.kling.gg',
        'nitter.ktachibana.party',
        'nitter.riverside.rocks',
        'nitter.girlboss.ceo',
        'nitter.lunar.icu',
        'twitter.moe.ngo',
        'nitter.freedit.eu',
        'ntr.frail.duckdns.org',
        'nitter.librenode.org',
        'n.opnxng.com',
        'nitter.plus.st',
    )

    # Instances that are still matched by _VALID_URL but never picked for tests.
    DEAD_INSTANCES = (
        # maintenance
        'nitter.ethibox.fr',

        # official, rate limited
        'nitter.net',
        # offline
        'is-nitter.resolv.ee',
        'lu-nitter.resolv.ee',
        'nitter.13ad.de',
        'nitter.40two.app',
        'nitter.cattube.org',
        'nitter.cc',
        'nitter.dark.fail',
        'nitter.himiko.cloud',
        'nitter.koyu.space',
        'nitter.mailstation.de',
        'nitter.mastodont.cat',
        'nitter.tedomum.net',
        'nitter.tokhmi.xyz',
        'nitter.weaponizedhumiliation.com',
        'nitter.vxempire.xyz',
        'tweet.lambda.dance',
        'nitter.ca',
        'nitter.42l.fr',
        'nitter.pussthecat.org',
        'nitter.nixnet.services',
        'nitter.eu',
        'nitter.actionsack.com',
        'nitter.hu',
        'twitr.gq',
        'nittereu.moomoo.me',
        'bird.from.tf',
        'twitter.grimneko.de',
        'nitter.alefvanoon.xyz',
        'n.hyperborea.cloud',
        'twitter.mstdn.social',
        'nitter.silkky.cloud',
        'nttr.stream',
        'fuckthesacklers.network',
        'nitter.govt.land',
        'nitter.datatunnel.xyz',
        'de.nttr.stream',
        'twtr.bch.bar',
        'nitter.exonip.de',
        'nitter.mastodon.pro',
        'nitter.notraxx.ch',
        'nitter.skrep.in',
        'nitter.snopyta.org',
    )

    # Every known host, alive or not, is accepted as a valid URL host.
    INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES

    _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
    _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
    # Chosen once at class-definition time; all test URLs share this host.
    current_instance = random.choice(HTTP_INSTANCES)

    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'md5:7890a9277da4639ab624dd899424c5d8',
                'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20201008',
                'timestamp': 1602183720,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # normal video
            'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 're:^Le *Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': f'https://{current_instance}/Le___Doc',
                'upload_date': '20200829',
                'timestamp': 1598711340,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': 'md5:8290664aabb43b9189145c008386bf12',
                'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': f'https://{current_instance}/mozilla',
                'upload_date': '20201027',
                'timestamp': 1603820940,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
        }, {  # not the first tweet but main-tweet
            'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
            'info_dict': {
                'id': '1354848277481414657',
                'ext': 'mp4',
                'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
                'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20210128',
                'timestamp': 1611855960,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            }
        }, {  # no OpenGraph title
            'url': f'https://{current_instance}/LocalBateman/status/1678455464038735895#m',
            'info_dict': {
                'id': '1678455464038735895',
                'ext': 'mp4',
                'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?',
                'description': 'Local man, what did Romanians ever do to you?',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Your Typical Local Man',
                'uploader_id': 'LocalBateman',
                'uploader_url': f'https://{current_instance}/LocalBateman',
                'upload_date': '20230710',
                'timestamp': 1689009900,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
            'params': {'skip_download': 'm3u8'},
        }
    ]

292 def _real_extract(self, url):
510809f1 293 video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
bb8a73a0 294 parsed_url = compat_urlparse.urlparse(url)
510809f1 295 base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'
bb8a73a0 296
297 self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
510809f1 298 full_webpage = webpage = self._download_webpage(url, video_id)
a4ddaf23 299
300 main_tweet_start = full_webpage.find('class="main-tweet"')
301 if main_tweet_start > 0:
302 webpage = full_webpage[main_tweet_start:]
bb8a73a0 303
510809f1 304 video_url = '%s%s' % (base_url, self._html_search_regex(
305 r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url'))
bb8a73a0 306 ext = determine_ext(video_url)
307
308 if ext == 'unknown_video':
309 formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
310 else:
311 formats = [{
312 'url': video_url,
313 'ext': ext
314 }]
315
a83da371 316 title = description = self._og_search_description(full_webpage, default=None) or self._html_search_regex(
510809f1 317 r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)
bb8a73a0 318
510809f1 319 uploader_id = self._html_search_regex(
320 r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id
bb8a73a0 321
510809f1 322 uploader = self._html_search_regex(
323 r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
324 if uploader:
325 title = f'{uploader} - {title}'
bb8a73a0 326
510809f1 327 counts = {
328 f'{x[0]}_count': self._html_search_regex(
329 fr'<span[^>]+class="icon-{x[1]}[^>]*></span>([^<]*)</div>',
330 webpage, f'{x[0]} count', fatal=False)
331 for x in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment'))
332 }
333 counts = {field: 0 if count == '' else parse_count(count) for field, count in counts.items()}
bb8a73a0 334
510809f1 335 thumbnail = (
336 self._html_search_meta('og:image', full_webpage, 'thumbnail url')
337 or remove_end('%s%s' % (base_url, self._html_search_regex(
338 r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)), '%3Asmall'))
339
340 thumbnails = [
341 {'id': id, 'url': f'{thumbnail}%3A{id}'}
342 for id in ('thumb', 'small', 'large', 'medium', 'orig')
343 ]
344
345 date = self._html_search_regex(
346 r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
347 webpage, 'upload date', default='').replace('·', '')
bb8a73a0 348
349 return {
350 'id': video_id,
351 'title': title,
352 'description': description,
353 'uploader': uploader,
510809f1 354 'timestamp': unified_timestamp(date),
bb8a73a0 355 'uploader_id': uploader_id,
510809f1 356 'uploader_url': f'{base_url}/{uploader_id}',
bb8a73a0 357 'formats': formats,
358 'thumbnails': thumbnails,
359 'thumbnail': thumbnail,
510809f1 360 **counts,
bb8a73a0 361 }