jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/nitter.py
[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409)
[yt-dlp.git] / yt_dlp / extractor / nitter.py
1 import random
2 import re
3
4 from .common import InfoExtractor
5 from ..compat import compat_urlparse
6 from ..utils import (
7 determine_ext,
8 parse_count,
9 remove_end,
10 unified_timestamp,
11 )
12
13
class NitterIE(InfoExtractor):
    """Extractor for tweets containing video on the Nitter instances listed below."""

    # Taken from https://github.com/zedeus/nitter/wiki/Instances

    # Hosts on the Tor (.onion) and I2P (.i2p) networks.
    NON_HTTP_INSTANCES = (
        '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
        'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
        'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
        'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
        'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
        'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
        '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
        'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
        'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
        'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
        'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
        'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
        'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
        'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
        'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
        'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
        'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',

        'nitter.i2p',
        'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',

        'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
    )

    # Regular web hosts; the tests below pick one of these at random.
    HTTP_INSTANCES = (
        'nitter.lacontrevoie.fr',
        'nitter.fdn.fr',
        'nitter.1d4.us',
        'nitter.kavin.rocks',
        'nitter.unixfox.eu',
        'nitter.domain.glass',
        'nitter.namazso.eu',
        'birdsite.xanny.family',
        'nitter.moomoo.me',
        'bird.trom.tf',
        'nitter.it',
        'twitter.censors.us',
        'nitter.grimneko.de',
        'twitter.076.ne.jp',
        'nitter.fly.dev',
        'notabird.site',
        'nitter.weiler.rocks',
        'nitter.sethforprivacy.com',
        'nitter.cutelab.space',
        'nitter.nl',
        'nitter.mint.lgbt',
        'nitter.bus-hit.me',
        'nitter.esmailelbob.xyz',
        'tw.artemislena.eu',
        'nitter.winscloud.net',
        'nitter.tiekoetter.com',
        'nitter.spaceint.fr',
        'nitter.privacy.com.de',
        'nitter.poast.org',
        'nitter.bird.froth.zone',
        'nitter.dcs0.hu',
        'twitter.dr460nf1r3.org',
        'nitter.garudalinux.org',
        'twitter.femboy.hu',
        'nitter.cz',
        'nitter.privacydev.net',
        'nitter.evil.site',
        'tweet.lambda.dance',
        'nitter.kylrth.com',
        'nitter.foss.wtf',
        'nitter.priv.pw',
        'nitter.tokhmi.xyz',
        'nitter.catalyst.sx',
        'unofficialbird.com',
        'nitter.projectsegfau.lt',
        'nitter.eu.projectsegfau.lt',
        'singapore.unofficialbird.com',
        'canada.unofficialbird.com',
        'india.unofficialbird.com',
        'nederland.unofficialbird.com',
        'uk.unofficialbird.com',
        'n.l5.ca',
        'nitter.slipfox.xyz',
        'nitter.soopy.moe',
        'nitter.qwik.space',
        'read.whatever.social',
        'nitter.rawbit.ninja',
        'nt.vern.cc',
        'ntr.odyssey346.dev',
        'nitter.ir',
        'nitter.privacytools.io',
        'nitter.sneed.network',
        'n.sneed.network',
        'nitter.manasiwibi.com',
        'nitter.smnz.de',
        'nitter.twei.space',
        'nitter.inpt.fr',
        'nitter.d420.de',
        'nitter.caioalonso.com',
        'nitter.at',
        'nitter.drivet.xyz',
        'nitter.pw',
        'nitter.nicfab.eu',
        'bird.habedieeh.re',
        'nitter.hostux.net',
        'nitter.adminforge.de',
        'nitter.platypush.tech',
        'nitter.mask.sh',
        'nitter.pufe.org',
        'nitter.us.projectsegfau.lt',
        'nitter.arcticfoxes.net',
        't.com.sb',
        'nitter.kling.gg',
        'nitter.ktachibana.party',
        'nitter.riverside.rocks',
        'nitter.girlboss.ceo',
        'nitter.lunar.icu',
        'twitter.moe.ngo',
        'nitter.freedit.eu',
        'ntr.frail.duckdns.org',
        'nitter.librenode.org',
        'n.opnxng.com',
        'nitter.plus.st',
    )

    # Still matched by _VALID_URL, but never chosen for the tests.
    # NOTE(review): 'nitter.tokhmi.xyz' and 'tweet.lambda.dance' are also listed
    # in HTTP_INSTANCES - confirm which list each one belongs to.
    DEAD_INSTANCES = (
        # maintenance
        'nitter.ethibox.fr',

        # official, rate limited
        'nitter.net',
        # offline
        'is-nitter.resolv.ee',
        'lu-nitter.resolv.ee',
        'nitter.13ad.de',
        'nitter.40two.app',
        'nitter.cattube.org',
        'nitter.cc',
        'nitter.dark.fail',
        'nitter.himiko.cloud',
        'nitter.koyu.space',
        'nitter.mailstation.de',
        'nitter.mastodont.cat',
        'nitter.tedomum.net',
        'nitter.tokhmi.xyz',
        'nitter.weaponizedhumiliation.com',
        'nitter.vxempire.xyz',
        'tweet.lambda.dance',
        'nitter.ca',
        'nitter.42l.fr',
        'nitter.pussthecat.org',
        'nitter.nixnet.services',
        'nitter.eu',
        'nitter.actionsack.com',
        'nitter.hu',
        'twitr.gq',
        'nittereu.moomoo.me',
        'bird.from.tf',
        'twitter.grimneko.de',
        'nitter.alefvanoon.xyz',
        'n.hyperborea.cloud',
        'twitter.mstdn.social',
        'nitter.silkky.cloud',
        'nttr.stream',
        'fuckthesacklers.network',
        'nitter.govt.land',
        'nitter.datatunnel.xyz',
        'de.nttr.stream',
        'twtr.bch.bar',
        'nitter.exonip.de',
        'nitter.mastodon.pro',
        'nitter.notraxx.ch',
        'nitter.skrep.in',
        'nitter.snopyta.org',
    )

    # Every known host, whatever its network or status.
    INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES

    # Alternation of all escaped host names, used to anchor _VALID_URL.
    _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
    _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
    # Chosen once at class-definition time; all test URLs below use this host.
    current_instance = random.choice(HTTP_INSTANCES)

    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'md5:7890a9277da4639ab624dd899424c5d8',
                'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20201008',
                'timestamp': 1602183720,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # normal video
            'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 're:^Le *Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': f'https://{current_instance}/Le___Doc',
                'upload_date': '20200829',
                'timestamp': 1598711340,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': 'md5:8290664aabb43b9189145c008386bf12',
                'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': f'https://{current_instance}/mozilla',
                'upload_date': '20201027',
                'timestamp': 1603820940,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
        }, {  # not the first tweet but main-tweet
            'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
            'info_dict': {
                'id': '1354848277481414657',
                'ext': 'mp4',
                'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
                'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20210128',
                'timestamp': 1611855960,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # no OpenGraph title
            'url': f'https://{current_instance}/LocalBateman/status/1678455464038735895#m',
            'info_dict': {
                'id': '1678455464038735895',
                'ext': 'mp4',
                'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?',
                'description': 'Local man, what did Romanians ever do to you?',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Your Typical Local Man',
                'uploader_id': 'LocalBateman',
                'uploader_url': f'https://{current_instance}/LocalBateman',
                'upload_date': '20230710',
                'timestamp': 1689009900,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
            'params': {'skip_download': 'm3u8'},
        },
    ]

    def _real_extract(self, url):
        """Build the info dict for the tweet page at *url*.

        The page is downloaded once. Video URL, uploader, counters, poster and
        date are read from the markup starting at the ``main-tweet`` element
        when it is present, while the OpenGraph description and image are read
        from the full page.
        """
        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
        parsed_url = compat_urlparse.urlparse(url)
        base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'

        # NOTE(review): presumably makes the instance expose HLS playback URLs - confirm.
        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
        full_webpage = webpage = self._download_webpage(url, video_id)

        # str.find() signals "not found" with -1, so compare against that
        # sentinel instead of relying on the marker never sitting at offset 0.
        main_tweet_start = full_webpage.find('class="main-tweet"')
        if main_tweet_start != -1:
            webpage = full_webpage[main_tweet_start:]

        # The page carries a host-relative media path; prefix it with the instance.
        video_path = self._html_search_regex(
            r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
        video_url = f'{base_url}{video_path}'
        ext = determine_ext(video_url)

        if ext == 'unknown_video':
            # No recognisable extension: treat the URL as an HLS manifest.
            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
        else:
            formats = [{
                'url': video_url,
                'ext': ext,
            }]

        title = description = self._og_search_description(full_webpage, default=None) or self._html_search_regex(
            r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)

        # Prefer the handle shown on the page; fall back to the one from the URL.
        uploader_id = self._html_search_regex(
            r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id

        uploader = self._html_search_regex(
            r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
        if uploader:
            title = f'{uploader} - {title}'

        # Each counter is the text following its icon <span>.
        counts = {
            f'{field}_count': self._html_search_regex(
                fr'<span[^>]+class="icon-{icon}[^>]*></span>([^<]*)</div>',
                webpage, f'{field} count', fatal=False)
            for field, icon in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment'))
        }
        # An empty counter is reported as zero; a missing one stays None via parse_count.
        counts = {field: 0 if count == '' else parse_count(count) for field, count in counts.items()}

        thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url')
        if not thumbnail:
            poster = self._html_search_regex(
                r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
            # Only build a URL when a poster was found: formatting a missing
            # (None) result would produce a bogus "<base_url>None" thumbnail.
            thumbnail = remove_end(f'{base_url}{poster}', '%3Asmall') if poster else None

        # Size variants are addressed by appending an URL-encoded ":<size>" suffix.
        thumbnails = [
            {'id': size, 'url': f'{thumbnail}%3A{size}'}
            for size in ('thumb', 'small', 'large', 'medium', 'orig')
        ] if thumbnail else []

        date = self._html_search_regex(
            r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
            webpage, 'upload date', default='').replace('·', '')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'timestamp': unified_timestamp(date),
            'uploader_id': uploader_id,
            'uploader_url': f'{base_url}/{uploader_id}',
            'formats': formats,
            'thumbnails': thumbnails,
            'thumbnail': thumbnail,
            **counts,
        }