yt_dlp/extractor/nitter.py

   1 import random
   2 import re
   3
   4 from .common import InfoExtractor
   5 from ..compat import compat_urlparse
   6 from ..utils import (
   7     determine_ext,
   8     parse_count,
   9     remove_end,
  10     unified_timestamp,
  11 )
  12
  13
  14 class NitterIE(InfoExtractor):
  15     # Taken from https://github.com/zedeus/nitter/wiki/Instances
  16
  17     NON_HTTP_INSTANCES = (
  18         '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
  19         'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
  20         'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
  21         'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
  22         'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
  23         'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
  24         '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
  25         'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
  26         'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
  27         'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
  28         'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
  29         'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
  30         'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
  31         'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
  32         'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
  33         'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
  34         'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',
  35
  36         'nitter.i2p',
  37         'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',
  38
  39         'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
  40     )
  41
  42     HTTP_INSTANCES = (
  43         'nitter.lacontrevoie.fr',
  44         'nitter.fdn.fr',
  45         'nitter.1d4.us',
  46         'nitter.kavin.rocks',
  47         'nitter.unixfox.eu',
  48         'nitter.domain.glass',
  49         'nitter.namazso.eu',
  50         'birdsite.xanny.family',
  51         'nitter.moomoo.me',
  52         'bird.trom.tf',
  53         'nitter.it',
  54         'twitter.censors.us',
  55         'nitter.grimneko.de',
  56         'twitter.076.ne.jp',
  57         'nitter.fly.dev',
  58         'notabird.site',
  59         'nitter.weiler.rocks',
  60         'nitter.sethforprivacy.com',
  61         'nitter.cutelab.space',
  62         'nitter.nl',
  63         'nitter.mint.lgbt',
  64         'nitter.bus-hit.me',
  65         'nitter.esmailelbob.xyz',
  66         'tw.artemislena.eu',
  67         'nitter.winscloud.net',
  68         'nitter.tiekoetter.com',
  69         'nitter.spaceint.fr',
  70         'nitter.privacy.com.de',
  71         'nitter.poast.org',
  72         'nitter.bird.froth.zone',
  73         'nitter.dcs0.hu',
  74         'twitter.dr460nf1r3.org',
  75         'nitter.garudalinux.org',
  76         'twitter.femboy.hu',
  77         'nitter.cz',
  78         'nitter.privacydev.net',
  79         'nitter.evil.site',
  80         'tweet.lambda.dance',
  81         'nitter.kylrth.com',
  82         'nitter.foss.wtf',
  83         'nitter.priv.pw',
  84         'nitter.tokhmi.xyz',
  85         'nitter.catalyst.sx',
  86         'unofficialbird.com',
  87         'nitter.projectsegfau.lt',
  88         'nitter.eu.projectsegfau.lt',
  89         'singapore.unofficialbird.com',
  90         'canada.unofficialbird.com',
  91         'india.unofficialbird.com',
  92         'nederland.unofficialbird.com',
  93         'uk.unofficialbird.com',
  94         'n.l5.ca',
  95         'nitter.slipfox.xyz',
  96         'nitter.soopy.moe',
  97         'nitter.qwik.space',
  98         'read.whatever.social',
  99         'nitter.rawbit.ninja',
 100         'nt.vern.cc',
 101         'ntr.odyssey346.dev',
 102         'nitter.ir',
 103         'nitter.privacytools.io',
 104         'nitter.sneed.network',
 105         'n.sneed.network',
 106         'nitter.manasiwibi.com',
 107         'nitter.smnz.de',
 108         'nitter.twei.space',
 109         'nitter.inpt.fr',
 110         'nitter.d420.de',
 111         'nitter.caioalonso.com',
 112         'nitter.at',
 113         'nitter.drivet.xyz',
 114         'nitter.pw',
 115         'nitter.nicfab.eu',
 116         'bird.habedieeh.re',
 117         'nitter.hostux.net',
 118         'nitter.adminforge.de',
 119         'nitter.platypush.tech',
 120         'nitter.mask.sh',
 121         'nitter.pufe.org',
 122         'nitter.us.projectsegfau.lt',
 123         'nitter.arcticfoxes.net',
 124         't.com.sb',
 125         'nitter.kling.gg',
 126         'nitter.ktachibana.party',
 127         'nitter.riverside.rocks',
 128         'nitter.girlboss.ceo',
 129         'nitter.lunar.icu',
 130         'twitter.moe.ngo',
 131         'nitter.freedit.eu',
 132         'ntr.frail.duckdns.org',
 133         'nitter.librenode.org',
 134         'n.opnxng.com',
 135         'nitter.plus.st',
 136     )
 137
 138     DEAD_INSTANCES = (
 139         # maintenance
 140         'nitter.ethibox.fr',
 141
 142         # official, rate limited
 143         'nitter.net',
 144         # offline
 145         'is-nitter.resolv.ee',
 146         'lu-nitter.resolv.ee',
 147         'nitter.13ad.de',
 148         'nitter.40two.app',
 149         'nitter.cattube.org',
 150         'nitter.cc',
 151         'nitter.dark.fail',
 152         'nitter.himiko.cloud',
 153         'nitter.koyu.space',
 154         'nitter.mailstation.de',
 155         'nitter.mastodont.cat',
 156         'nitter.tedomum.net',
 157         'nitter.tokhmi.xyz',
 158         'nitter.weaponizedhumiliation.com',
 159         'nitter.vxempire.xyz',
 160         'tweet.lambda.dance',
 161         'nitter.ca',
 162         'nitter.42l.fr',
 163         'nitter.pussthecat.org',
 164         'nitter.nixnet.services',
 165         'nitter.eu',
 166         'nitter.actionsack.com',
 167         'nitter.hu',
 168         'twitr.gq',
 169         'nittereu.moomoo.me',
 170         'bird.from.tf',
 171         'twitter.grimneko.de',
 172         'nitter.alefvanoon.xyz',
 173         'n.hyperborea.cloud',
 174         'twitter.mstdn.social',
 175         'nitter.silkky.cloud',
 176         'nttr.stream',
 177         'fuckthesacklers.network',
 178         'nitter.govt.land',
 179         'nitter.datatunnel.xyz',
 180         'de.nttr.stream',
 181         'twtr.bch.bar',
 182         'nitter.exonip.de',
 183         'nitter.mastodon.pro',
 184         'nitter.notraxx.ch',
 185         'nitter.skrep.in',
 186         'nitter.snopyta.org',
 187     )
 188
 189     INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES
 190
 191     _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
 192     _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
 193     current_instance = random.choice(HTTP_INSTANCES)
 194
 195     _TESTS = [
 196         {
 197             # GIF (wrapped in mp4)
 198             'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
 199             'info_dict': {
 200                 'id': '1314279897502629888',
 201                 'ext': 'mp4',
 202                 'title': 'md5:7890a9277da4639ab624dd899424c5d8',
 203                 'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
 204                 'thumbnail': r're:^https?://.*\.jpg$',
 205                 'uploader': 'Firefox 🔥',
 206                 'uploader_id': 'firefox',
 207                 'uploader_url': f'https://{current_instance}/firefox',
 208                 'upload_date': '20201008',
 209                 'timestamp': 1602183720,
 210                 'like_count': int,
 211                 'repost_count': int,
 212                 'comment_count': int,
 213             },
 214         }, {  # normal video
 215             'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
 216             'info_dict': {
 217                 'id': '1299715685392756737',
 218                 'ext': 'mp4',
 219                 'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
 220                 'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
 221                 'thumbnail': r're:^https?://.*\.jpg$',
 222                 'uploader': 're:^Le *Doc',
 223                 'uploader_id': 'Le___Doc',
 224                 'uploader_url': f'https://{current_instance}/Le___Doc',
 225                 'upload_date': '20200829',
 226                 'timestamp': 1598711340,
 227                 'view_count': int,
 228                 'like_count': int,
 229                 'repost_count': int,
 230                 'comment_count': int,
 231             },
 232         }, {  # video embed in a "Streaming Political Ads" box
 233             'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
 234             'info_dict': {
 235                 'id': '1321147074491092994',
 236                 'ext': 'mp4',
 237                 'title': 'md5:8290664aabb43b9189145c008386bf12',
 238                 'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
 239                 'thumbnail': r're:^https?://.*\.jpg$',
 240                 'uploader': 'Mozilla',
 241                 'uploader_id': 'mozilla',
 242                 'uploader_url': f'https://{current_instance}/mozilla',
 243                 'upload_date': '20201027',
 244                 'timestamp': 1603820940,
 245                 'view_count': int,
 246                 'like_count': int,
 247                 'repost_count': int,
 248                 'comment_count': int,
 249             },
 250             'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
 251         }, {  # not the first tweet but main-tweet
 252             'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
 253             'info_dict': {
 254                 'id': '1354848277481414657',
 255                 'ext': 'mp4',
 256                 'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
 257                 'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
 258                 'thumbnail': r're:^https?://.*\.jpg$',
 259                 'uploader': 'Firefox 🔥',
 260                 'uploader_id': 'firefox',
 261                 'uploader_url': f'https://{current_instance}/firefox',
 262                 'upload_date': '20210128',
 263                 'timestamp': 1611855960,
 264                 'view_count': int,
 265                 'like_count': int,
 266                 'repost_count': int,
 267                 'comment_count': int,
 268             }
 269         }, {  # no OpenGraph title
 270             'url': f'https://{current_instance}/LocalBateman/status/1678455464038735895#m',
 271             'info_dict': {
 272                 'id': '1678455464038735895',
 273                 'ext': 'mp4',
 274                 'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?',
 275                 'description': 'Local man, what did Romanians ever do to you?',
 276                 'thumbnail': r're:^https?://.*\.jpg$',
 277                 'uploader': 'Your Typical Local Man',
 278                 'uploader_id': 'LocalBateman',
 279                 'uploader_url': f'https://{current_instance}/LocalBateman',
 280                 'upload_date': '20230710',
 281                 'timestamp': 1689009900,
 282                 'view_count': int,
 283                 'like_count': int,
 284                 'repost_count': int,
 285                 'comment_count': int,
 286             },
 287             'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
 288             'params': {'skip_download': 'm3u8'},
 289         }
 290     ]
 291
 292     def _real_extract(self, url):
 293         video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
 294         parsed_url = compat_urlparse.urlparse(url)
 295         base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'
 296
 297         self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
 298         full_webpage = webpage = self._download_webpage(url, video_id)
 299
 300         main_tweet_start = full_webpage.find('class="main-tweet"')
 301         if main_tweet_start > 0:
 302             webpage = full_webpage[main_tweet_start:]
 303
 304         video_url = '%s%s' % (base_url, self._html_search_regex(
 305             r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url'))
 306         ext = determine_ext(video_url)
 307
 308         if ext == 'unknown_video':
 309             formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
 310         else:
 311             formats = [{
 312                 'url': video_url,
 313                 'ext': ext
 314             }]
 315
 316         title = description = self._og_search_description(full_webpage, default=None) or self._html_search_regex(
 317             r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)
 318
 319         uploader_id = self._html_search_regex(
 320             r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id
 321
 322         uploader = self._html_search_regex(
 323             r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
 324         if uploader:
 325             title = f'{uploader} - {title}'
 326
 327         counts = {
 328             f'{x[0]}_count': self._html_search_regex(
 329                 fr'<span[^>]+class="icon-{x[1]}[^>]*></span>([^<]*)</div>',
 330                 webpage, f'{x[0]} count', fatal=False)
 331             for x in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment'))
 332         }
 333         counts = {field: 0 if count == '' else parse_count(count) for field, count in counts.items()}
 334
 335         thumbnail = (
 336             self._html_search_meta('og:image', full_webpage, 'thumbnail url')
 337             or remove_end('%s%s' % (base_url, self._html_search_regex(
 338                 r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)), '%3Asmall'))
 339
 340         thumbnails = [
 341             {'id': id, 'url': f'{thumbnail}%3A{id}'}
 342             for id in ('thumb', 'small', 'large', 'medium', 'orig')
 343         ]
 344
 345         date = self._html_search_regex(
 346             r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
 347             webpage, 'upload date', default='').replace('·', '')
 348
 349         return {
 350             'id': video_id,
 351             'title': title,
 352             'description': description,
 353             'uploader': uploader,
 354             'timestamp': unified_timestamp(date),
 355             'uploader_id': uploader_id,
 356             'uploader_url': f'{base_url}/{uploader_id}',
 357             'formats': formats,
 358             'thumbnails': thumbnails,
 359             'thumbnail': thumbnail,
 360             **counts,
 361         }