yt_dlp/extractor/nitter.py

   1 from .common import InfoExtractor
   2 from ..compat import compat_urlparse
   3 from ..utils import (
   4     parse_count,
   5     unified_timestamp,
   6     remove_end,
   7     determine_ext,
   8 )
   9 import re
  10 import random
  11
  12
  13 class NitterIE(InfoExtractor):
  14     # Taken from https://github.com/zedeus/nitter/wiki/Instances
  15
  16     NON_HTTP_INSTANCES = (
  17         '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
  18         'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
  19         'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
  20         'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
  21         'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
  22         'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
  23         '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
  24         'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
  25         'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
  26         'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
  27         'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
  28         'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
  29         'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
  30         'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
  31         'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
  32         'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
  33         'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',
  34
  35         'nitter.i2p',
  36         'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',
  37
  38         'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
  39     )
  40
  41     HTTP_INSTANCES = (
  42         'nitter.lacontrevoie.fr',
  43         'nitter.fdn.fr',
  44         'nitter.1d4.us',
  45         'nitter.kavin.rocks',
  46         'nitter.unixfox.eu',
  47         'nitter.domain.glass',
  48         'nitter.namazso.eu',
  49         'birdsite.xanny.family',
  50         'nitter.moomoo.me',
  51         'bird.trom.tf',
  52         'nitter.it',
  53         'twitter.censors.us',
  54         'nitter.grimneko.de',
  55         'twitter.076.ne.jp',
  56         'nitter.fly.dev',
  57         'notabird.site',
  58         'nitter.weiler.rocks',
  59         'nitter.sethforprivacy.com',
  60         'nitter.cutelab.space',
  61         'nitter.nl',
  62         'nitter.mint.lgbt',
  63         'nitter.bus-hit.me',
  64         'nitter.esmailelbob.xyz',
  65         'tw.artemislena.eu',
  66         'nitter.winscloud.net',
  67         'nitter.tiekoetter.com',
  68         'nitter.spaceint.fr',
  69         'nitter.privacy.com.de',
  70         'nitter.poast.org',
  71         'nitter.bird.froth.zone',
  72         'nitter.dcs0.hu',
  73         'twitter.dr460nf1r3.org',
  74         'nitter.garudalinux.org',
  75         'twitter.femboy.hu',
  76         'nitter.cz',
  77         'nitter.privacydev.net',
  78         'nitter.evil.site',
  79         'tweet.lambda.dance',
  80         'nitter.kylrth.com',
  81         'nitter.foss.wtf',
  82         'nitter.priv.pw',
  83         'nitter.tokhmi.xyz',
  84         'nitter.catalyst.sx',
  85         'unofficialbird.com',
  86         'nitter.projectsegfau.lt',
  87         'nitter.eu.projectsegfau.lt',
  88         'singapore.unofficialbird.com',
  89         'canada.unofficialbird.com',
  90         'india.unofficialbird.com',
  91         'nederland.unofficialbird.com',
  92         'uk.unofficialbird.com',
  93         'n.l5.ca',
  94         'nitter.slipfox.xyz',
  95         'nitter.soopy.moe',
  96         'nitter.qwik.space',
  97         'read.whatever.social',
  98         'nitter.rawbit.ninja',
  99         'nt.vern.cc',
 100         'ntr.odyssey346.dev',
 101         'nitter.ir',
 102         'nitter.privacytools.io',
 103         'nitter.sneed.network',
 104         'n.sneed.network',
 105         'nitter.manasiwibi.com',
 106         'nitter.smnz.de',
 107         'nitter.twei.space',
 108         'nitter.inpt.fr',
 109         'nitter.d420.de',
 110         'nitter.caioalonso.com',
 111         'nitter.at',
 112         'nitter.drivet.xyz',
 113         'nitter.pw',
 114         'nitter.nicfab.eu',
 115         'bird.habedieeh.re',
 116         'nitter.hostux.net',
 117         'nitter.adminforge.de',
 118         'nitter.platypush.tech',
 119         'nitter.mask.sh',
 120         'nitter.pufe.org',
 121         'nitter.us.projectsegfau.lt',
 122         'nitter.arcticfoxes.net',
 123         't.com.sb',
 124         'nitter.kling.gg',
 125         'nitter.ktachibana.party',
 126         'nitter.riverside.rocks',
 127         'nitter.girlboss.ceo',
 128         'nitter.lunar.icu',
 129         'twitter.moe.ngo',
 130         'nitter.freedit.eu',
 131         'ntr.frail.duckdns.org',
 132         'nitter.librenode.org',
 133         'n.opnxng.com',
 134         'nitter.plus.st',
 135     )
 136
 137     DEAD_INSTANCES = (
 138         # maintenance
 139         'nitter.ethibox.fr',
 140
 141         # official, rate limited
 142         'nitter.net',
 143         # offline
 144         'is-nitter.resolv.ee',
 145         'lu-nitter.resolv.ee',
 146         'nitter.13ad.de',
 147         'nitter.40two.app',
 148         'nitter.cattube.org',
 149         'nitter.cc',
 150         'nitter.dark.fail',
 151         'nitter.himiko.cloud',
 152         'nitter.koyu.space',
 153         'nitter.mailstation.de',
 154         'nitter.mastodont.cat',
 155         'nitter.tedomum.net',
 156         'nitter.tokhmi.xyz',
 157         'nitter.weaponizedhumiliation.com',
 158         'nitter.vxempire.xyz',
 159         'tweet.lambda.dance',
 160         'nitter.ca',
 161         'nitter.42l.fr',
 162         'nitter.pussthecat.org',
 163         'nitter.nixnet.services',
 164         'nitter.eu',
 165         'nitter.actionsack.com',
 166         'nitter.hu',
 167         'twitr.gq',
 168         'nittereu.moomoo.me',
 169         'bird.from.tf',
 170         'twitter.grimneko.de',
 171         'nitter.alefvanoon.xyz',
 172         'n.hyperborea.cloud',
 173         'twitter.mstdn.social',
 174         'nitter.silkky.cloud',
 175         'nttr.stream',
 176         'fuckthesacklers.network',
 177         'nitter.govt.land',
 178         'nitter.datatunnel.xyz',
 179         'de.nttr.stream',
 180         'twtr.bch.bar',
 181         'nitter.exonip.de',
 182         'nitter.mastodon.pro',
 183         'nitter.notraxx.ch',
 184         'nitter.skrep.in',
 185         'nitter.snopyta.org',
 186     )
 187
 188     INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES
 189
 190     _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
 191     _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
 192     current_instance = random.choice(HTTP_INSTANCES)
 193
 194     _TESTS = [
 195         {
 196             # GIF (wrapped in mp4)
 197             'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
 198             'info_dict': {
 199                 'id': '1314279897502629888',
 200                 'ext': 'mp4',
 201                 'title': 'md5:7890a9277da4639ab624dd899424c5d8',
 202                 'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
 203                 'thumbnail': r're:^https?://.*\.jpg$',
 204                 'uploader': 'Firefox 🔥',
 205                 'uploader_id': 'firefox',
 206                 'uploader_url': f'https://{current_instance}/firefox',
 207                 'upload_date': '20201008',
 208                 'timestamp': 1602183720,
 209                 'like_count': int,
 210                 'repost_count': int,
 211                 'comment_count': int,
 212             },
 213         }, {  # normal video
 214             'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
 215             'info_dict': {
 216                 'id': '1299715685392756737',
 217                 'ext': 'mp4',
 218                 'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
 219                 'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
 220                 'thumbnail': r're:^https?://.*\.jpg$',
 221                 'uploader': 're:^Le *Doc',
 222                 'uploader_id': 'Le___Doc',
 223                 'uploader_url': f'https://{current_instance}/Le___Doc',
 224                 'upload_date': '20200829',
 225                 'timestamp': 1598711340,
 226                 'view_count': int,
 227                 'like_count': int,
 228                 'repost_count': int,
 229                 'comment_count': int,
 230             },
 231         }, {  # video embed in a "Streaming Political Ads" box
 232             'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
 233             'info_dict': {
 234                 'id': '1321147074491092994',
 235                 'ext': 'mp4',
 236                 'title': 'md5:8290664aabb43b9189145c008386bf12',
 237                 'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
 238                 'thumbnail': r're:^https?://.*\.jpg$',
 239                 'uploader': 'Mozilla',
 240                 'uploader_id': 'mozilla',
 241                 'uploader_url': f'https://{current_instance}/mozilla',
 242                 'upload_date': '20201027',
 243                 'timestamp': 1603820940,
 244                 'view_count': int,
 245                 'like_count': int,
 246                 'repost_count': int,
 247                 'comment_count': int,
 248             },
 249             'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
 250         }, {  # not the first tweet but main-tweet
 251             'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
 252             'info_dict': {
 253                 'id': '1354848277481414657',
 254                 'ext': 'mp4',
 255                 'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
 256                 'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
 257                 'thumbnail': r're:^https?://.*\.jpg$',
 258                 'uploader': 'Firefox 🔥',
 259                 'uploader_id': 'firefox',
 260                 'uploader_url': f'https://{current_instance}/firefox',
 261                 'upload_date': '20210128',
 262                 'timestamp': 1611855960,
 263                 'view_count': int,
 264                 'like_count': int,
 265                 'repost_count': int,
 266                 'comment_count': int,
 267             }
 268         }, {  # no OpenGraph title
 269             'url': f'https://{current_instance}/LocalBateman/status/1678455464038735895#m',
 270             'info_dict': {
 271                 'id': '1678455464038735895',
 272                 'ext': 'mp4',
 273                 'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?',
 274                 'description': 'Local man, what did Romanians ever do to you?',
 275                 'thumbnail': r're:^https?://.*\.jpg$',
 276                 'uploader': 'Your Typical Local Man',
 277                 'uploader_id': 'LocalBateman',
 278                 'uploader_url': f'https://{current_instance}/LocalBateman',
 279                 'upload_date': '20230710',
 280                 'timestamp': 1689009900,
 281                 'view_count': int,
 282                 'like_count': int,
 283                 'repost_count': int,
 284                 'comment_count': int,
 285             },
 286             'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
 287             'params': {'skip_download': 'm3u8'},
 288         }
 289     ]
 290
 291     def _real_extract(self, url):
 292         video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
 293         parsed_url = compat_urlparse.urlparse(url)
 294         base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'
 295
 296         self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
 297         full_webpage = webpage = self._download_webpage(url, video_id)
 298
 299         main_tweet_start = full_webpage.find('class="main-tweet"')
 300         if main_tweet_start > 0:
 301             webpage = full_webpage[main_tweet_start:]
 302
 303         video_url = '%s%s' % (base_url, self._html_search_regex(
 304             r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url'))
 305         ext = determine_ext(video_url)
 306
 307         if ext == 'unknown_video':
 308             formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
 309         else:
 310             formats = [{
 311                 'url': video_url,
 312                 'ext': ext
 313             }]
 314
 315         title = description = self._og_search_description(full_webpage, default=None) or self._html_search_regex(
 316             r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)
 317
 318         uploader_id = self._html_search_regex(
 319             r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id
 320
 321         uploader = self._html_search_regex(
 322             r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
 323         if uploader:
 324             title = f'{uploader} - {title}'
 325
 326         counts = {
 327             f'{x[0]}_count': self._html_search_regex(
 328                 fr'<span[^>]+class="icon-{x[1]}[^>]*></span>([^<]*)</div>',
 329                 webpage, f'{x[0]} count', fatal=False)
 330             for x in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment'))
 331         }
 332         counts = {field: 0 if count == '' else parse_count(count) for field, count in counts.items()}
 333
 334         thumbnail = (
 335             self._html_search_meta('og:image', full_webpage, 'thumbnail url')
 336             or remove_end('%s%s' % (base_url, self._html_search_regex(
 337                 r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)), '%3Asmall'))
 338
 339         thumbnails = [
 340             {'id': id, 'url': f'{thumbnail}%3A{id}'}
 341             for id in ('thumb', 'small', 'large', 'medium', 'orig')
 342         ]
 343
 344         date = self._html_search_regex(
 345             r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
 346             webpage, 'upload date', default='').replace('·', '')
 347
 348         return {
 349             'id': video_id,
 350             'title': title,
 351             'description': description,
 352             'uploader': uploader,
 353             'timestamp': unified_timestamp(date),
 354             'uploader_id': uploader_id,
 355             'uploader_url': f'{base_url}/{uploader_id}',
 356             'formats': formats,
 357             'thumbnails': thumbnails,
 358             'thumbnail': thumbnail,
 359             **counts,
 360         }