]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/nitter.py
[youtube] Add extractor-arg to skip auto-translated subs
[yt-dlp.git] / yt_dlp / extractor / nitter.py
CommitLineData
bb8a73a0 1# coding: utf-8
2from __future__ import unicode_literals
3
4from .common import InfoExtractor
5from ..compat import compat_urlparse
6from ..utils import (
7 parse_count,
bb8a73a0 8 unified_timestamp,
9 remove_end,
10 determine_ext,
11)
12import re
a4ddaf23 13import random
bb8a73a0 14
15
class NitterIE(InfoExtractor):
    """Extract the video attached to a tweet viewed through a Nitter instance.

    URLs are matched against the host lists below and must have the form
    ``https://<instance>/<uploader_id>/status/<numeric id>``.
    """

    # Taken from https://github.com/zedeus/nitter/wiki/Instances

    NON_HTTP_INSTANCES = (
        '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
        'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
        'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
        'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
        'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
        'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
        '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
        'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
        'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
        'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
        'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
        'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
        'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
        'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
        'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
        'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
        'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',

        'nitter.i2p',
        'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',

        'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
    )

    HTTP_INSTANCES = (
        'nitter.42l.fr',
        'nitter.pussthecat.org',
        'nitter.nixnet.services',
        'nitter.fdn.fr',
        'nitter.1d4.us',
        'nitter.kavin.rocks',
        'nitter.unixfox.eu',
        'nitter.domain.glass',
        'nitter.eu',
        'nitter.namazso.eu',
        'nitter.actionsack.com',
        'birdsite.xanny.family',
        'nitter.hu',
        'twitr.gq',
        'nitter.moomoo.me',
        'nittereu.moomoo.me',
        'bird.from.tf',
        'nitter.it',
        'twitter.censors.us',
        'twitter.grimneko.de',
        'nitter.alefvanoon.xyz',
        'n.hyperborea.cloud',
        'nitter.ca',
        'twitter.076.ne.jp',
        'twitter.mstdn.social',
        'nitter.fly.dev',
        'notabird.site',
        'nitter.weiler.rocks',
        'nitter.silkky.cloud',
        'nitter.sethforprivacy.com',
        'nttr.stream',
        'nitter.cutelab.space',
        'nitter.nl',
        'nitter.mint.lgbt',
        'nitter.bus-hit.me',
        'fuckthesacklers.network',
        'nitter.govt.land',
        'nitter.datatunnel.xyz',
        'nitter.esmailelbob.xyz',
        'tw.artemislena.eu',
        'de.nttr.stream',
        'nitter.winscloud.net',
        'nitter.tiekoetter.com',
        'nitter.spaceint.fr',
        'twtr.bch.bar',
        'nitter.exonip.de',
        'nitter.mastodon.pro',
        'nitter.notraxx.ch',

        # not in the list anymore
        'nitter.skrep.in',
        'nitter.snopyta.org',
    )

    DEAD_INSTANCES = (
        # maintenance
        'nitter.ethibox.fr',

        # official, rate limited
        'nitter.net',
        # offline
        'is-nitter.resolv.ee',
        'lu-nitter.resolv.ee',
        'nitter.13ad.de',
        'nitter.40two.app',
        'nitter.cattube.org',
        'nitter.cc',
        'nitter.dark.fail',
        'nitter.himiko.cloud',
        'nitter.koyu.space',
        'nitter.mailstation.de',
        'nitter.mastodont.cat',
        'nitter.tedomum.net',
        'nitter.tokhmi.xyz',
        'nitter.weaponizedhumiliation.com',
        'nitter.vxempire.xyz',
        'tweet.lambda.dance',
    )

    # Every known host, including dead ones, is accepted by _VALID_URL.
    INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES

    _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
    _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
    # One HTTP instance is picked at class-creation time and used to build the
    # test URLs below, so the tests hit a different instance on each run.
    current_instance = random.choice(HTTP_INSTANCES)

    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'md5:7890a9277da4639ab624dd899424c5d8',
                'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20201008',
                'timestamp': 1602183720,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # normal video
            'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 're:^Le *Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': f'https://{current_instance}/Le___Doc',
                'upload_date': '20200829',
                'timestamp': 1598711340,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': 'md5:8290664aabb43b9189145c008386bf12',
                'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': f'https://{current_instance}/mozilla',
                'upload_date': '20201027',
                'timestamp': 1603820940,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
        }, {  # not the first tweet but main-tweet
            'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
            'info_dict': {
                'id': '1354848277481414657',
                'ext': 'mp4',
                'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
                'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20210128',
                'timestamp': 1611855960,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            }
        }
    ]

    def _real_extract(self, url):
        """Download the tweet page at *url* and build the info dict for its video.

        The video URL is mandatory (the search for it is fatal); title,
        uploader, counts, thumbnail and date are looked up non-fatally and
        may be missing from the result.
        """
        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
        parsed_url = compat_urlparse.urlparse(url)
        base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'

        # NOTE(review): presumably asks the instance to serve HLS playback
        # markup; the cookie's effect is not visible from this file.
        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
        full_webpage = webpage = self._download_webpage(url, video_id)

        # When the page marks a main tweet, restrict the per-tweet searches to
        # the markup from that point on so earlier tweets in the thread are
        # not picked up. str.find() returns -1 when the marker is absent.
        main_tweet_start = full_webpage.find('class="main-tweet"')
        if main_tweet_start != -1:
            webpage = full_webpage[main_tweet_start:]

        video_path = self._html_search_regex(
            r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
        video_url = f'{base_url}{video_path}'
        ext = determine_ext(video_url)

        if ext == 'unknown_video':
            # No recognisable extension: treat the URL as an HLS manifest.
            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
        else:
            formats = [{
                'url': video_url,
                'ext': ext
            }]

        title = description = self._og_search_description(full_webpage) or self._html_search_regex(
            r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)

        # Prefer the username shown on the page; fall back to the one in the URL.
        uploader_id = self._html_search_regex(
            r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id

        uploader = self._html_search_regex(
            r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
        if uploader:
            title = f'{uploader} - {title}'

        # Map each output field to the icon class that precedes its number.
        raw_counts = {
            f'{field}_count': self._html_search_regex(
                fr'<span[^>]+class="icon-{icon}[^>]*></span>([^<]*)</div>',
                webpage, f'{field} count', fatal=False)
            for field, icon in (
                ('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment'))
        }
        # An icon followed by no text is reported as a count of zero.
        counts = {
            field: 0 if count == '' else parse_count(count)
            for field, count in raw_counts.items()
        }

        thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url') or None
        if thumbnail is None:
            poster = self._html_search_regex(
                r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
            # Only build the fallback when a poster was actually found;
            # formatting a missing match would yield a bogus '<base_url>None'.
            if poster:
                # Drop a trailing '%3Asmall' so the size variants below can
                # be appended to the bare URL.
                thumbnail = remove_end(f'{base_url}{poster}', '%3Asmall')

        thumbnails = None
        if thumbnail:
            thumbnails = [
                {'id': size, 'url': f'{thumbnail}%3A{size}'}
                for size in ('thumb', 'small', 'large', 'medium', 'orig')
            ]

        date = self._html_search_regex(
            r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
            webpage, 'upload date', default='').replace('·', '')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'timestamp': unified_timestamp(date),
            'uploader_id': uploader_id,
            'uploader_url': f'{base_url}/{uploader_id}',
            'formats': formats,
            'thumbnails': thumbnails,
            'thumbnail': thumbnail,
            **counts,
        }