[yt-dlp.git] / yt_dlp / extractor / nitter.py

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    parse_count,
    unified_timestamp,
    remove_end,
    determine_ext,
)
import re
import random


class NitterIE(InfoExtractor):
    """Extractor for videos embedded in tweets served by Nitter front-ends.

    The host part of ``_VALID_URL`` is built from the instance lists below, so
    only URLs on a listed instance are matched.
    """

    # Taken from https://github.com/zedeus/nitter/wiki/Instances

    # Instances on overlay networks (Tor .onion / I2P). They are matched by
    # _VALID_URL but are never picked as the instance used by _TESTS.
    NON_HTTP_INSTANCES = (
        '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
        'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
        'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
        'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
        'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
        'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
        '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
        'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
        'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
        'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
        'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
        'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
        'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
        'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
        'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
        'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
        'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',

        'nitter.i2p',
        'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',

        'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
    )

    # Clearnet instances; `current_instance` (used by _TESTS) is drawn from here.
    # NOTE(review): 'tweet.lambda.dance' and 'nitter.tokhmi.xyz' are also listed
    # in DEAD_INSTANCES below - confirm which list is stale. Until then the
    # tests may randomly be pointed at an instance that is recorded as offline.
    HTTP_INSTANCES = (
        'nitter.lacontrevoie.fr',
        'nitter.fdn.fr',
        'nitter.1d4.us',
        'nitter.kavin.rocks',
        'nitter.unixfox.eu',
        'nitter.domain.glass',
        'nitter.namazso.eu',
        'birdsite.xanny.family',
        'nitter.moomoo.me',
        'bird.trom.tf',
        'nitter.it',
        'twitter.censors.us',
        'nitter.grimneko.de',
        'twitter.076.ne.jp',
        'nitter.fly.dev',
        'notabird.site',
        'nitter.weiler.rocks',
        'nitter.sethforprivacy.com',
        'nitter.cutelab.space',
        'nitter.nl',
        'nitter.mint.lgbt',
        'nitter.bus-hit.me',
        'nitter.esmailelbob.xyz',
        'tw.artemislena.eu',
        'nitter.winscloud.net',
        'nitter.tiekoetter.com',
        'nitter.spaceint.fr',
        'nitter.privacy.com.de',
        'nitter.poast.org',
        'nitter.bird.froth.zone',
        'nitter.dcs0.hu',
        'twitter.dr460nf1r3.org',
        'nitter.garudalinux.org',
        'twitter.femboy.hu',
        'nitter.cz',
        'nitter.privacydev.net',
        'nitter.evil.site',
        'tweet.lambda.dance',
        'nitter.kylrth.com',
        'nitter.foss.wtf',
        'nitter.priv.pw',
        'nitter.tokhmi.xyz',
        'nitter.catalyst.sx',
        'unofficialbird.com',
        'nitter.projectsegfau.lt',
        'nitter.eu.projectsegfau.lt',
        'singapore.unofficialbird.com',
        'canada.unofficialbird.com',
        'india.unofficialbird.com',
        'nederland.unofficialbird.com',
        'uk.unofficialbird.com',
        'n.l5.ca',
        'nitter.slipfox.xyz',
        'nitter.soopy.moe',
        'nitter.qwik.space',
        'read.whatever.social',
        'nitter.rawbit.ninja',
        'nt.vern.cc',
        'ntr.odyssey346.dev',
        'nitter.ir',
        'nitter.privacytools.io',
        'nitter.sneed.network',
        'n.sneed.network',
        'nitter.manasiwibi.com',
        'nitter.smnz.de',
        'nitter.twei.space',
        'nitter.inpt.fr',
        'nitter.d420.de',
        'nitter.caioalonso.com',
        'nitter.at',
        'nitter.drivet.xyz',
        'nitter.pw',
        'nitter.nicfab.eu',
        'bird.habedieeh.re',
        'nitter.hostux.net',
        'nitter.adminforge.de',
        'nitter.platypush.tech',
        'nitter.mask.sh',
        'nitter.pufe.org',
        'nitter.us.projectsegfau.lt',
        'nitter.arcticfoxes.net',
        't.com.sb',
        'nitter.kling.gg',
        'nitter.ktachibana.party',
        'nitter.riverside.rocks',
        'nitter.girlboss.ceo',
        'nitter.lunar.icu',
        'twitter.moe.ngo',
        'nitter.freedit.eu',
        'ntr.frail.duckdns.org',
        'nitter.librenode.org',
        'n.opnxng.com',
        'nitter.plus.st',
    )

    # Instances recorded as unusable. They stay in INSTANCES, so URLs on these
    # hosts are still matched by _VALID_URL.
    DEAD_INSTANCES = (
        # maintenance
        'nitter.ethibox.fr',

        # official, rate limited
        'nitter.net',
        # offline
        'is-nitter.resolv.ee',
        'lu-nitter.resolv.ee',
        'nitter.13ad.de',
        'nitter.40two.app',
        'nitter.cattube.org',
        'nitter.cc',
        'nitter.dark.fail',
        'nitter.himiko.cloud',
        'nitter.koyu.space',
        'nitter.mailstation.de',
        'nitter.mastodont.cat',
        'nitter.tedomum.net',
        'nitter.tokhmi.xyz',
        'nitter.weaponizedhumiliation.com',
        'nitter.vxempire.xyz',
        'tweet.lambda.dance',
        'nitter.ca',
        'nitter.42l.fr',
        'nitter.pussthecat.org',
        'nitter.nixnet.services',
        'nitter.eu',
        'nitter.actionsack.com',
        'nitter.hu',
        'twitr.gq',
        'nittereu.moomoo.me',
        'bird.from.tf',
        'twitter.grimneko.de',
        'nitter.alefvanoon.xyz',
        'n.hyperborea.cloud',
        'twitter.mstdn.social',
        'nitter.silkky.cloud',
        'nttr.stream',
        'fuckthesacklers.network',
        'nitter.govt.land',
        'nitter.datatunnel.xyz',
        'de.nttr.stream',
        'twtr.bch.bar',
        'nitter.exonip.de',
        'nitter.mastodon.pro',
        'nitter.notraxx.ch',
        'nitter.skrep.in',
        'nitter.snopyta.org',
    )

    # Every known host, in the order the alternation below is built from.
    INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES

    # Host names are regex-escaped so that '.' only matches a literal dot.
    _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
    _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
    # Chosen once, when the class body is executed; every test URL below uses it.
    current_instance = random.choice(HTTP_INSTANCES)

    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'md5:7890a9277da4639ab624dd899424c5d8',
                'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20201008',
                'timestamp': 1602183720,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # normal video
            'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 're:^Le *Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': f'https://{current_instance}/Le___Doc',
                'upload_date': '20200829',
                'timestamp': 1598711340,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': 'md5:8290664aabb43b9189145c008386bf12',
                'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': f'https://{current_instance}/mozilla',
                'upload_date': '20201027',
                'timestamp': 1603820940,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
        }, {  # not the first tweet but main-tweet
            'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
            'info_dict': {
                'id': '1354848277481414657',
                'ext': 'mp4',
                'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
                'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20210128',
                'timestamp': 1611855960,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            }
        }, {  # no OpenGraph title
            'url': f'https://{current_instance}/LocalBateman/status/1678455464038735895#m',
            'info_dict': {
                'id': '1678455464038735895',
                'ext': 'mp4',
                'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?',
                'description': 'Local man, what did Romanians ever do to you?',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Your Typical Local Man',
                'uploader_id': 'LocalBateman',
                'uploader_url': f'https://{current_instance}/LocalBateman',
                'upload_date': '20230710',
                'timestamp': 1689009900,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
            'params': {'skip_download': 'm3u8'},
        }
    ]

    def _real_extract(self, url):
        """Scrape the video and its metadata from a Nitter tweet page.

        The page is downloaded once. When it contains a ``class="main-tweet"``
        marker, everything before that marker is dropped for the per-tweet
        lookups, so they start at the main tweet. OpenGraph lookups still use
        the full page.

        :param url: tweet URL matching ``_VALID_URL``.
        :return: info dict with ``id``, ``title``, ``description``,
            ``uploader``, ``uploader_id``, ``uploader_url``, ``timestamp``,
            ``formats``, ``thumbnail``, ``thumbnails`` and the
            ``view``/``like``/``repost``/``comment`` ``_count`` fields.
            Optional fields are ``None`` (``thumbnails`` is empty) when the
            page does not provide them.

        The page download and the video URL lookup are the only steps not
        marked non-fatal, so those are the ones that abort extraction.
        """
        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
        parsed_url = compat_urlparse.urlparse(url)
        base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'

        # NOTE(review): presumably opts in to Nitter's HLS playback so the page
        # exposes stream URLs - confirm against Nitter's preferences handling.
        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
        full_webpage = webpage = self._download_webpage(url, video_id)

        # str.find() signals "not found" with -1, so compare against that
        # sentinel rather than relying on the marker never sitting at offset 0.
        main_tweet_start = full_webpage.find('class="main-tweet"')
        if main_tweet_start != -1:
            webpage = full_webpage[main_tweet_start:]

        video_url = '%s%s' % (base_url, self._html_search_regex(
            r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url'))
        ext = determine_ext(video_url)

        # A URL without a recognisable extension is treated as an HLS manifest.
        if ext == 'unknown_video':
            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
        else:
            formats = [{
                'url': video_url,
                'ext': ext
            }]

        # Prefer the OpenGraph description; fall back to the tweet body markup.
        description = self._og_search_description(full_webpage, default=None) or self._html_search_regex(
            r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)

        # The handle shown on the page wins over the one taken from the URL.
        uploader_id = self._html_search_regex(
            r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id

        uploader = self._html_search_regex(
            r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)

        # Title is "<uploader> - <text>"; never interpolate a missing part,
        # which would otherwise produce a literal "None" in the title.
        title = description
        if uploader:
            title = f'{uploader} - {description}' if description else uploader

        counts = {}
        for field, icon in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment')):
            raw_count = self._html_search_regex(
                fr'<span[^>]+class="icon-{icon}[^>]*></span>([^<]*)</div>',
                webpage, f'{field} count', fatal=False)
            # An icon with no number next to it means zero; a missing icon
            # (raw_count is None) is left for parse_count to handle.
            counts[f'{field}_count'] = 0 if raw_count == '' else parse_count(raw_count)

        thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url')
        if not thumbnail:
            poster = self._html_search_regex(
                r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
            # Only build a URL when a poster was actually found; formatting a
            # missing match would yield a bogus "<base_url>None" thumbnail.
            if poster:
                thumbnail = remove_end(f'{base_url}{poster}', '%3Asmall')

        # Size variants are addressed by appending an URL-encoded ":<size>".
        thumbnails = [
            {'id': size, 'url': f'{thumbnail}%3A{size}'}
            for size in ('thumb', 'small', 'large', 'medium', 'orig')
        ] if thumbnail else []

        # Drop the '·' separator from the date title before parsing it.
        date = self._html_search_regex(
            r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
            webpage, 'upload date', default='').replace('·', '')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'timestamp': unified_timestamp(date),
            'uploader_id': uploader_id,
            'uploader_url': f'{base_url}/{uploader_id}',
            'formats': formats,
            'thumbnails': thumbnails,
            'thumbnail': thumbnail,
            **counts,
        }
Commit	Line	Data
bb8a73a0	1	from .common import InfoExtractor
	2	from ..compat import compat_urlparse
	3	from ..utils import (
	4	parse_count,
bb8a73a0	5	unified_timestamp,
	6	remove_end,
	7	determine_ext,
	8	)
	9	import re
a4ddaf23	10	import random
bb8a73a0	11
	12
	13	class NitterIE(InfoExtractor):
	14	# Taken from https://github.com/zedeus/nitter/wiki/Instances
a4ddaf23	15
	16	NON_HTTP_INSTANCES = (
	17	'3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
	18	'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
	19	'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
	20	'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
	21	'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
	22	'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
	23	'26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
510809f1	24	'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
	25	'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
	26	'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
	27	'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
	28	'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
	29	'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
	30	'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
	31	'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
	32	'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
	33	'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',
a4ddaf23	34
	35	'nitter.i2p',
	36	'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',
	37
	38	'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
	39	)
	40
	41	HTTP_INSTANCES = (
a9189510	42	'nitter.lacontrevoie.fr',
a4ddaf23	43	'nitter.fdn.fr',
	44	'nitter.1d4.us',
	45	'nitter.kavin.rocks',
a4ddaf23	46	'nitter.unixfox.eu',
a4ddaf23	47	'nitter.domain.glass',
a4ddaf23	48	'nitter.namazso.eu',
a4ddaf23	49	'birdsite.xanny.family',
510809f1	50	'nitter.moomoo.me',
a9189510	51	'bird.trom.tf',
510809f1	52	'nitter.it',
510809f1	53	'twitter.censors.us',
a9189510	54	'nitter.grimneko.de',
510809f1	55	'twitter.076.ne.jp',
510809f1	56	'nitter.fly.dev',
	57	'notabird.site',
	58	'nitter.weiler.rocks',
510809f1	59	'nitter.sethforprivacy.com',
510809f1	60	'nitter.cutelab.space',
	61	'nitter.nl',
	62	'nitter.mint.lgbt',
	63	'nitter.bus-hit.me',
510809f1	64	'nitter.esmailelbob.xyz',
510809f1	65	'tw.artemislena.eu',
510809f1	66	'nitter.winscloud.net',
	67	'nitter.tiekoetter.com',
	68	'nitter.spaceint.fr',
a9189510 O	69	'nitter.privacy.com.de',
	70	'nitter.poast.org',
	71	'nitter.bird.froth.zone',
	72	'nitter.dcs0.hu',
	73	'twitter.dr460nf1r3.org',
	74	'nitter.garudalinux.org',
	75	'twitter.femboy.hu',
	76	'nitter.cz',
	77	'nitter.privacydev.net',
	78	'nitter.evil.site',
	79	'tweet.lambda.dance',
	80	'nitter.kylrth.com',
	81	'nitter.foss.wtf',
	82	'nitter.priv.pw',
	83	'nitter.tokhmi.xyz',
	84	'nitter.catalyst.sx',
	85	'unofficialbird.com',
	86	'nitter.projectsegfau.lt',
	87	'nitter.eu.projectsegfau.lt',
	88	'singapore.unofficialbird.com',
	89	'canada.unofficialbird.com',
	90	'india.unofficialbird.com',
	91	'nederland.unofficialbird.com',
	92	'uk.unofficialbird.com',
	93	'n.l5.ca',
	94	'nitter.slipfox.xyz',
	95	'nitter.soopy.moe',
	96	'nitter.qwik.space',
	97	'read.whatever.social',
	98	'nitter.rawbit.ninja',
	99	'nt.vern.cc',
	100	'ntr.odyssey346.dev',
	101	'nitter.ir',
	102	'nitter.privacytools.io',
	103	'nitter.sneed.network',
	104	'n.sneed.network',
	105	'nitter.manasiwibi.com',
	106	'nitter.smnz.de',
	107	'nitter.twei.space',
	108	'nitter.inpt.fr',
	109	'nitter.d420.de',
	110	'nitter.caioalonso.com',
	111	'nitter.at',
	112	'nitter.drivet.xyz',
	113	'nitter.pw',
	114	'nitter.nicfab.eu',
	115	'bird.habedieeh.re',
	116	'nitter.hostux.net',
	117	'nitter.adminforge.de',
	118	'nitter.platypush.tech',
	119	'nitter.mask.sh',
	120	'nitter.pufe.org',
	121	'nitter.us.projectsegfau.lt',
	122	'nitter.arcticfoxes.net',
	123	't.com.sb',
	124	'nitter.kling.gg',
	125	'nitter.ktachibana.party',
	126	'nitter.riverside.rocks',
	127	'nitter.girlboss.ceo',
	128	'nitter.lunar.icu',
	129	'twitter.moe.ngo',
	130	'nitter.freedit.eu',
	131	'ntr.frail.duckdns.org',
	132	'nitter.librenode.org',
133	'n.opnxng.com',
134	'nitter.plus.st',
a4ddaf23	135	)
	136
	137	DEAD_INSTANCES = (
	138	# maintenance
	139	'nitter.ethibox.fr',
	140
	141	# official, rate limited
	142	'nitter.net',
	143	# offline
510809f1	144	'is-nitter.resolv.ee',
510809f1	145	'lu-nitter.resolv.ee',
a4ddaf23	146	'nitter.13ad.de',
510809f1	147	'nitter.40two.app',
	148	'nitter.cattube.org',
	149	'nitter.cc',
	150	'nitter.dark.fail',
	151	'nitter.himiko.cloud',
	152	'nitter.koyu.space',
	153	'nitter.mailstation.de',
	154	'nitter.mastodont.cat',
	155	'nitter.tedomum.net',
	156	'nitter.tokhmi.xyz',
a4ddaf23	157	'nitter.weaponizedhumiliation.com',
510809f1	158	'nitter.vxempire.xyz',
510809f1	159	'tweet.lambda.dance',
a9189510 O	160	'nitter.ca',
	161	'nitter.42l.fr',
	162	'nitter.pussthecat.org',
	163	'nitter.nixnet.services',
	164	'nitter.eu',
	165	'nitter.actionsack.com',
	166	'nitter.hu',
	167	'twitr.gq',
	168	'nittereu.moomoo.me',
	169	'bird.from.tf',
	170	'twitter.grimneko.de',
	171	'nitter.alefvanoon.xyz',
	172	'n.hyperborea.cloud',
	173	'twitter.mstdn.social',
	174	'nitter.silkky.cloud',
	175	'nttr.stream',
	176	'fuckthesacklers.network',
	177	'nitter.govt.land',
	178	'nitter.datatunnel.xyz',
	179	'de.nttr.stream',
	180	'twtr.bch.bar',
	181	'nitter.exonip.de',
	182	'nitter.mastodon.pro',
	183	'nitter.notraxx.ch',
	184	'nitter.skrep.in',
	185	'nitter.snopyta.org',
a4ddaf23	186	)
	187
	188	INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES
bb8a73a0	189
510809f1	190	_INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
510809f1	191	_VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
a4ddaf23	192	current_instance = random.choice(HTTP_INSTANCES)
a4ddaf23	193
bb8a73a0	194	_TESTS = [
	195	{
	196	# GIF (wrapped in mp4)
510809f1	197	'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
bb8a73a0	198	'info_dict': {
	199	'id': '1314279897502629888',
	200	'ext': 'mp4',
510809f1	201	'title': 'md5:7890a9277da4639ab624dd899424c5d8',
510809f1	202	'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
bb8a73a0	203	'thumbnail': r're:^https?://.*\.jpg$',
	204	'uploader': 'Firefox 🔥',
	205	'uploader_id': 'firefox',
510809f1	206	'uploader_url': f'https://{current_instance}/firefox',
bb8a73a0	207	'upload_date': '20201008',
bb8a73a0	208	'timestamp': 1602183720,
510809f1	209	'like_count': int,
	210	'repost_count': int,
	211	'comment_count': int,
bb8a73a0	212	},
bb8a73a0	213	}, { # normal video
510809f1	214	'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
bb8a73a0	215	'info_dict': {
	216	'id': '1299715685392756737',
	217	'ext': 'mp4',
510809f1	218	'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
a4ddaf23	219	'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
bb8a73a0	220	'thumbnail': r're:^https?://.*\.jpg$',
510809f1	221	'uploader': 're:^Le *Doc',
bb8a73a0	222	'uploader_id': 'Le___Doc',
510809f1	223	'uploader_url': f'https://{current_instance}/Le___Doc',
bb8a73a0	224	'upload_date': '20200829',
510809f1	225	'timestamp': 1598711340,
bb8a73a0	226	'view_count': int,
	227	'like_count': int,
	228	'repost_count': int,
	229	'comment_count': int,
	230	},
	231	}, { # video embed in a "Streaming Political Ads" box
510809f1	232	'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
bb8a73a0	233	'info_dict': {
	234	'id': '1321147074491092994',
	235	'ext': 'mp4',
510809f1	236	'title': 'md5:8290664aabb43b9189145c008386bf12',
510809f1	237	'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
bb8a73a0	238	'thumbnail': r're:^https?://.*\.jpg$',
	239	'uploader': 'Mozilla',
	240	'uploader_id': 'mozilla',
510809f1	241	'uploader_url': f'https://{current_instance}/mozilla',
bb8a73a0	242	'upload_date': '20201027',
510809f1	243	'timestamp': 1603820940,
	244	'view_count': int,
	245	'like_count': int,
	246	'repost_count': int,
	247	'comment_count': int,
bb8a73a0	248	},
510809f1	249	'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
a4ddaf23	250	}, { # not the first tweet but main-tweet
510809f1	251	'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
a4ddaf23	252	'info_dict': {
510809f1	253	'id': '1354848277481414657',
a4ddaf23	254	'ext': 'mp4',
510809f1	255	'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
510809f1	256	'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
a4ddaf23	257	'thumbnail': r're:^https?://.*\.jpg$',
510809f1	258	'uploader': 'Firefox 🔥',
	259	'uploader_id': 'firefox',
	260	'uploader_url': f'https://{current_instance}/firefox',
	261	'upload_date': '20210128',
	262	'timestamp': 1611855960,
	263	'view_count': int,
	264	'like_count': int,
	265	'repost_count': int,
	266	'comment_count': int,
a4ddaf23	267	}
a83da371 A	268	}, { # no OpenGraph title
	269	'url': f'https://{current_instance}/LocalBateman/status/1678455464038735895#m',
	270	'info_dict': {
	271	'id': '1678455464038735895',
	272	'ext': 'mp4',
	273	'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?',
	274	'description': 'Local man, what did Romanians ever do to you?',
	275	'thumbnail': r're:^https?://.*\.jpg$',
	276	'uploader': 'Your Typical Local Man',
	277	'uploader_id': 'LocalBateman',
	278	'uploader_url': f'https://{current_instance}/LocalBateman',
	279	'upload_date': '20230710',
	280	'timestamp': 1689009900,
	281	'view_count': int,
	282	'like_count': int,
	283	'repost_count': int,
	284	'comment_count': int,
	285	},
	286	'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
	287	'params': {'skip_download': 'm3u8'},
a4ddaf23	288	}
bb8a73a0	289	]
	290
	291	def _real_extract(self, url):
510809f1	292	video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
bb8a73a0	293	parsed_url = compat_urlparse.urlparse(url)
510809f1	294	base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'
bb8a73a0	295
bb8a73a0	296	self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
510809f1	297	full_webpage = webpage = self._download_webpage(url, video_id)
a4ddaf23	298
	299	main_tweet_start = full_webpage.find('class="main-tweet"')
	300	if main_tweet_start > 0:
	301	webpage = full_webpage[main_tweet_start:]
bb8a73a0	302
510809f1	303	video_url = '%s%s' % (base_url, self._html_search_regex(
510809f1	304	r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url'))
bb8a73a0	305	ext = determine_ext(video_url)
	306
	307	if ext == 'unknown_video':
	308	formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
	309	else:
	310	formats = [{
	311	'url': video_url,
	312	'ext': ext
	313	}]
	314
a83da371	315	title = description = self._og_search_description(full_webpage, default=None) or self._html_search_regex(
510809f1	316	r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)
bb8a73a0	317
510809f1	318	uploader_id = self._html_search_regex(
510809f1	319	r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id
bb8a73a0	320
510809f1	321	uploader = self._html_search_regex(
	322	r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
	323	if uploader:
	324	title = f'{uploader} - {title}'
bb8a73a0	325
510809f1	326	counts = {
	327	f'{x[0]}_count': self._html_search_regex(
	328	fr'<span[^>]+class="icon-{x[1]}[^>]*></span>([^<]*)</div>',
	329	webpage, f'{x[0]} count', fatal=False)
	330	for x in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment'))
	331	}
	332	counts = {field: 0 if count == '' else parse_count(count) for field, count in counts.items()}
bb8a73a0	333
510809f1	334	thumbnail = (
	335	self._html_search_meta('og:image', full_webpage, 'thumbnail url')
	336	or remove_end('%s%s' % (base_url, self._html_search_regex(
	337	r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)), '%3Asmall'))
	338
	339	thumbnails = [
	340	{'id': id, 'url': f'{thumbnail}%3A{id}'}
	341	for id in ('thumb', 'small', 'large', 'medium', 'orig')
	342	]
	343
	344	date = self._html_search_regex(
	345	r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
	346	webpage, 'upload date', default='').replace('·', '')
bb8a73a0	347
	348	return {
	349	'id': video_id,
	350	'title': title,
	351	'description': description,
	352	'uploader': uploader,
510809f1	353	'timestamp': unified_timestamp(date),
bb8a73a0	354	'uploader_id': uploader_id,
510809f1	355	'uploader_url': f'{base_url}/{uploader_id}',
bb8a73a0	356	'formats': formats,
	357	'thumbnails': thumbnails,
	358	'thumbnail': thumbnail,
510809f1	359	**counts,
bb8a73a0	360	}