]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/nitter.py
[extractor/rutube] Extract chapters from description (#6345)
[yt-dlp.git] / yt_dlp / extractor / nitter.py
1 from .common import InfoExtractor
2 from ..compat import compat_urlparse
3 from ..utils import (
4 parse_count,
5 unified_timestamp,
6 remove_end,
7 determine_ext,
8 )
9 import re
10 import random
11
12
class NitterIE(InfoExtractor):
    """Extract the video attached to a status page served by a Nitter instance.

    The set of recognised hosts is the union of the three tuples below; the
    URL pattern captures the uploader handle and the numeric status id.
    """

    # Taken from https://github.com/zedeus/nitter/wiki/Instances

    # Hosts reachable only through an overlay network (.onion / .i2p).
    NON_HTTP_INSTANCES = (
        '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
        'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
        'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
        'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
        'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
        'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
        '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
        'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
        'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
        'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
        'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
        'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
        'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
        'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
        'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
        'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
        'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',

        'nitter.i2p',
        'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',

        'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
    )

    # Regular hosts; one of these is picked at random for the test URLs.
    HTTP_INSTANCES = (
        'nitter.lacontrevoie.fr',
        'nitter.fdn.fr',
        'nitter.1d4.us',
        'nitter.kavin.rocks',
        'nitter.unixfox.eu',
        'nitter.domain.glass',
        'nitter.namazso.eu',
        'birdsite.xanny.family',
        'nitter.moomoo.me',
        'bird.trom.tf',
        'nitter.it',
        'twitter.censors.us',
        'nitter.grimneko.de',
        'twitter.076.ne.jp',
        'nitter.fly.dev',
        'notabird.site',
        'nitter.weiler.rocks',
        'nitter.sethforprivacy.com',
        'nitter.cutelab.space',
        'nitter.nl',
        'nitter.mint.lgbt',
        'nitter.bus-hit.me',
        'nitter.esmailelbob.xyz',
        'tw.artemislena.eu',
        'nitter.winscloud.net',
        'nitter.tiekoetter.com',
        'nitter.spaceint.fr',
        'nitter.privacy.com.de',
        'nitter.poast.org',
        'nitter.bird.froth.zone',
        'nitter.dcs0.hu',
        'twitter.dr460nf1r3.org',
        'nitter.garudalinux.org',
        'twitter.femboy.hu',
        'nitter.cz',
        'nitter.privacydev.net',
        'nitter.evil.site',
        'tweet.lambda.dance',
        'nitter.kylrth.com',
        'nitter.foss.wtf',
        'nitter.priv.pw',
        'nitter.tokhmi.xyz',
        'nitter.catalyst.sx',
        'unofficialbird.com',
        'nitter.projectsegfau.lt',
        'nitter.eu.projectsegfau.lt',
        'singapore.unofficialbird.com',
        'canada.unofficialbird.com',
        'india.unofficialbird.com',
        'nederland.unofficialbird.com',
        'uk.unofficialbird.com',
        'n.l5.ca',
        'nitter.slipfox.xyz',
        'nitter.soopy.moe',
        'nitter.qwik.space',
        'read.whatever.social',
        'nitter.rawbit.ninja',
        'nt.vern.cc',
        'ntr.odyssey346.dev',
        'nitter.ir',
        'nitter.privacytools.io',
        'nitter.sneed.network',
        'n.sneed.network',
        'nitter.manasiwibi.com',
        'nitter.smnz.de',
        'nitter.twei.space',
        'nitter.inpt.fr',
        'nitter.d420.de',
        'nitter.caioalonso.com',
        'nitter.at',
        'nitter.drivet.xyz',
        'nitter.pw',
        'nitter.nicfab.eu',
        'bird.habedieeh.re',
        'nitter.hostux.net',
        'nitter.adminforge.de',
        'nitter.platypush.tech',
        'nitter.mask.sh',
        'nitter.pufe.org',
        'nitter.us.projectsegfau.lt',
        'nitter.arcticfoxes.net',
        't.com.sb',
        'nitter.kling.gg',
        'nitter.ktachibana.party',
        'nitter.riverside.rocks',
        'nitter.girlboss.ceo',
        'nitter.lunar.icu',
        'twitter.moe.ngo',
        'nitter.freedit.eu',
        'ntr.frail.duckdns.org',
        'nitter.librenode.org',
        'n.opnxng.com',
        'nitter.plus.st',
    )

    # Hosts kept only so that their URLs are still matched by _VALID_URL.
    # NOTE(review): 'nitter.tokhmi.xyz' and 'tweet.lambda.dance' are listed
    # both here and in HTTP_INSTANCES - confirm which list is correct.
    DEAD_INSTANCES = (
        # maintenance
        'nitter.ethibox.fr',

        # official, rate limited
        'nitter.net',
        # offline
        'is-nitter.resolv.ee',
        'lu-nitter.resolv.ee',
        'nitter.13ad.de',
        'nitter.40two.app',
        'nitter.cattube.org',
        'nitter.cc',
        'nitter.dark.fail',
        'nitter.himiko.cloud',
        'nitter.koyu.space',
        'nitter.mailstation.de',
        'nitter.mastodont.cat',
        'nitter.tedomum.net',
        'nitter.tokhmi.xyz',
        'nitter.weaponizedhumiliation.com',
        'nitter.vxempire.xyz',
        'tweet.lambda.dance',
        'nitter.ca',
        'nitter.42l.fr',
        'nitter.pussthecat.org',
        'nitter.nixnet.services',
        'nitter.eu',
        'nitter.actionsack.com',
        'nitter.hu',
        'twitr.gq',
        'nittereu.moomoo.me',
        'bird.from.tf',
        'twitter.grimneko.de',
        'nitter.alefvanoon.xyz',
        'n.hyperborea.cloud',
        'twitter.mstdn.social',
        'nitter.silkky.cloud',
        'nttr.stream',
        'fuckthesacklers.network',
        'nitter.govt.land',
        'nitter.datatunnel.xyz',
        'de.nttr.stream',
        'twtr.bch.bar',
        'nitter.exonip.de',
        'nitter.mastodon.pro',
        'nitter.notraxx.ch',
        'nitter.skrep.in',
        'nitter.snopyta.org',
    )

    INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES

    # Alternation of every known host, escaped so dots match literally.
    _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
    _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
    # Chosen once, when the class body is executed; the tests below use it.
    current_instance = random.choice(HTTP_INSTANCES)

    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'md5:7890a9277da4639ab624dd899424c5d8',
                'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20201008',
                'timestamp': 1602183720,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # normal video
            'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 're:^Le *Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': f'https://{current_instance}/Le___Doc',
                'upload_date': '20200829',
                'timestamp': 1598711340,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': 'md5:8290664aabb43b9189145c008386bf12',
                'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': f'https://{current_instance}/mozilla',
                'upload_date': '20201027',
                'timestamp': 1603820940,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
        }, {  # not the first tweet but main-tweet
            'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
            'info_dict': {
                'id': '1354848277481414657',
                'ext': 'mp4',
                'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
                'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20210128',
                'timestamp': 1611855960,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            }
        }
    ]

    def _real_extract(self, url):
        """Download the status page at *url* and build the info dict.

        Returns a dict with the id, title, description, uploader details,
        timestamp, formats, thumbnail(s) and the view/like/repost/comment
        counts scraped from the page. The video URL lookup is the only
        fatal one; every other field is looked up non-fatally and may be
        None when missing.
        """
        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
        parsed_url = compat_urlparse.urlparse(url)
        base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'

        # presumably asks the instance to expose HLS playback in the page - TODO confirm
        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
        full_webpage = webpage = self._download_webpage(url, video_id)

        # When the page marks a main tweet, scrape per-tweet data from that
        # point on; page-level metadata is still read from the full page.
        main_tweet_start = full_webpage.find('class="main-tweet"')
        if main_tweet_start != -1:
            webpage = full_webpage[main_tweet_start:]

        video_path = self._html_search_regex(
            r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
        video_url = f'{base_url}{video_path}'
        ext = determine_ext(video_url)

        if ext == 'unknown_video':
            # No recognisable extension: handled as an HLS manifest.
            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
        else:
            formats = [{
                'url': video_url,
                'ext': ext
            }]

        title = description = self._og_search_description(full_webpage) or self._html_search_regex(
            r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)

        uploader_id = self._html_search_regex(
            r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id

        uploader = self._html_search_regex(
            r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
        # Only prefix when there is a title; otherwise the f-string would
        # yield the literal text "<uploader> - None".
        if uploader and title:
            title = f'{uploader} - {title}'

        counts = {}
        for field, icon in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment')):
            raw_count = self._html_search_regex(
                fr'<span[^>]+class="icon-{icon}[^>]*></span>([^<]*)</div>',
                webpage, f'{field} count', fatal=False)
            # An icon with no number next to it is reported as zero.
            counts[f'{field}_count'] = 0 if raw_count == '' else parse_count(raw_count)

        thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url')
        if not thumbnail:
            poster = self._html_search_regex(
                r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
            # Build the URL only when a poster was found; formatting None
            # into the string would produce a bogus "<base_url>None" URL.
            if poster:
                thumbnail = remove_end(f'{base_url}{poster}', '%3Asmall')

        # One variant per size suffix, appended as an URL-encoded ":<size>".
        thumbnails = [
            {'id': size, 'url': f'{thumbnail}%3A{size}'}
            for size in ('thumb', 'small', 'large', 'medium', 'orig')
        ] if thumbnail else []

        date = self._html_search_regex(
            r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
            webpage, 'upload date', default='').replace('·', '')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'timestamp': unified_timestamp(date),
            'uploader_id': uploader_id,
            'uploader_url': f'{base_url}/{uploader_id}',
            'formats': formats,
            'thumbnails': thumbnails,
            'thumbnail': thumbnail,
            **counts,
        }