[yt-dlp.git] / yt_dlp / extractor / nitter.py

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    parse_count,
    unified_timestamp,
    remove_end,
    determine_ext,
)
import re
import random


class NitterIE(InfoExtractor):
    """Extractor for video tweets served by Nitter instances.

    Matches ``https://<instance>/<uploader_id>/status/<id>`` URLs, where
    ``<instance>`` is any host listed in ``INSTANCES`` below.
    """

    # Taken from https://github.com/zedeus/nitter/wiki/Instances

    # Hosts on the .onion / .i2p pseudo-TLDs
    NON_HTTP_INSTANCES = (
        '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
        'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
        'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
        'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
        'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
        'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
        '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
        'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
        'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
        'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
        'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
        'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
        'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
        'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
        'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
        'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
        'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',

        'nitter.i2p',
        'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',

        'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
    )

    # Regular web hosts; the test URLs below are built from a random one of these
    HTTP_INSTANCES = (
        'nitter.lacontrevoie.fr',
        'nitter.fdn.fr',
        'nitter.1d4.us',
        'nitter.kavin.rocks',
        'nitter.unixfox.eu',
        'nitter.domain.glass',
        'nitter.namazso.eu',
        'birdsite.xanny.family',
        'nitter.moomoo.me',
        'bird.trom.tf',
        'nitter.it',
        'twitter.censors.us',
        'nitter.grimneko.de',
        'twitter.076.ne.jp',
        'nitter.fly.dev',
        'notabird.site',
        'nitter.weiler.rocks',
        'nitter.sethforprivacy.com',
        'nitter.cutelab.space',
        'nitter.nl',
        'nitter.mint.lgbt',
        'nitter.bus-hit.me',
        'nitter.esmailelbob.xyz',
        'tw.artemislena.eu',
        'nitter.winscloud.net',
        'nitter.tiekoetter.com',
        'nitter.spaceint.fr',
        'nitter.privacy.com.de',
        'nitter.poast.org',
        'nitter.bird.froth.zone',
        'nitter.dcs0.hu',
        'twitter.dr460nf1r3.org',
        'nitter.garudalinux.org',
        'twitter.femboy.hu',
        'nitter.cz',
        'nitter.privacydev.net',
        'nitter.evil.site',
        'tweet.lambda.dance',
        'nitter.kylrth.com',
        'nitter.foss.wtf',
        'nitter.priv.pw',
        'nitter.tokhmi.xyz',
        'nitter.catalyst.sx',
        'unofficialbird.com',
        'nitter.projectsegfau.lt',
        'nitter.eu.projectsegfau.lt',
        'singapore.unofficialbird.com',
        'canada.unofficialbird.com',
        'india.unofficialbird.com',
        'nederland.unofficialbird.com',
        'uk.unofficialbird.com',
        'n.l5.ca',
        'nitter.slipfox.xyz',
        'nitter.soopy.moe',
        'nitter.qwik.space',
        'read.whatever.social',
        'nitter.rawbit.ninja',
        'nt.vern.cc',
        'ntr.odyssey346.dev',
        'nitter.ir',
        'nitter.privacytools.io',
        'nitter.sneed.network',
        'n.sneed.network',
        'nitter.manasiwibi.com',
        'nitter.smnz.de',
        'nitter.twei.space',
        'nitter.inpt.fr',
        'nitter.d420.de',
        'nitter.caioalonso.com',
        'nitter.at',
        'nitter.drivet.xyz',
        'nitter.pw',
        'nitter.nicfab.eu',
        'bird.habedieeh.re',
        'nitter.hostux.net',
        'nitter.adminforge.de',
        'nitter.platypush.tech',
        'nitter.mask.sh',
        'nitter.pufe.org',
        'nitter.us.projectsegfau.lt',
        'nitter.arcticfoxes.net',
        't.com.sb',
        'nitter.kling.gg',
        'nitter.ktachibana.party',
        'nitter.riverside.rocks',
        'nitter.girlboss.ceo',
        'nitter.lunar.icu',
        'twitter.moe.ngo',
        'nitter.freedit.eu',
        'ntr.frail.duckdns.org',
        'nitter.librenode.org',
        'n.opnxng.com',
        'nitter.plus.st',
    )

    # Still matched by _VALID_URL, but never picked for the test URLs.
    # NOTE(review): 'nitter.tokhmi.xyz' and 'tweet.lambda.dance' are listed
    # both here and in HTTP_INSTANCES - confirm which list they belong in.
    DEAD_INSTANCES = (
        # maintenance
        'nitter.ethibox.fr',

        # official, rate limited
        'nitter.net',
        # offline
        'is-nitter.resolv.ee',
        'lu-nitter.resolv.ee',
        'nitter.13ad.de',
        'nitter.40two.app',
        'nitter.cattube.org',
        'nitter.cc',
        'nitter.dark.fail',
        'nitter.himiko.cloud',
        'nitter.koyu.space',
        'nitter.mailstation.de',
        'nitter.mastodont.cat',
        'nitter.tedomum.net',
        'nitter.tokhmi.xyz',
        'nitter.weaponizedhumiliation.com',
        'nitter.vxempire.xyz',
        'tweet.lambda.dance',
        'nitter.ca',
        'nitter.42l.fr',
        'nitter.pussthecat.org',
        'nitter.nixnet.services',
        'nitter.eu',
        'nitter.actionsack.com',
        'nitter.hu',
        'twitr.gq',
        'nittereu.moomoo.me',
        'bird.from.tf',
        'twitter.grimneko.de',
        'nitter.alefvanoon.xyz',
        'n.hyperborea.cloud',
        'twitter.mstdn.social',
        'nitter.silkky.cloud',
        'nttr.stream',
        'fuckthesacklers.network',
        'nitter.govt.land',
        'nitter.datatunnel.xyz',
        'de.nttr.stream',
        'twtr.bch.bar',
        'nitter.exonip.de',
        'nitter.mastodon.pro',
        'nitter.notraxx.ch',
        'nitter.skrep.in',
        'nitter.snopyta.org',
    )

    INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES

    # Alternation of every known host, regex-escaped
    _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})'
    _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
    # Chosen once, when the class body is executed; used only to build _TESTS
    current_instance = random.choice(HTTP_INSTANCES)

    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'md5:7890a9277da4639ab624dd899424c5d8',
                'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20201008',
                'timestamp': 1602183720,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # normal video
            'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 're:^Le *Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': f'https://{current_instance}/Le___Doc',
                'upload_date': '20200829',
                'timestamp': 1598711340,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': 'md5:8290664aabb43b9189145c008386bf12',
                'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': f'https://{current_instance}/mozilla',
                'upload_date': '20201027',
                'timestamp': 1603820940,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
        }, {  # not the first tweet but main-tweet
            'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
            'info_dict': {
                'id': '1354848277481414657',
                'ext': 'mp4',
                'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
                'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': f'https://{current_instance}/firefox',
                'upload_date': '20210128',
                'timestamp': 1611855960,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            }
        }
    ]

    def _real_extract(self, url):
        """Scrape a status page and build the info dict for its video.

        The page is downloaded once. Tweet-specific fields (video URL, uploader,
        counters, date, poster) are searched from the ``class="main-tweet"``
        marker onwards when that marker exists, otherwise in the whole page.
        The Open Graph description and image are always read from the whole
        page.

        Returns a dict with ``id``, ``title``, ``description``, ``uploader``,
        ``uploader_id``, ``uploader_url``, ``timestamp``, ``formats``,
        ``thumbnail``, ``thumbnails`` and the ``view``/``like``/``repost``/
        ``comment`` ``*_count`` fields. Only the video URL lookup is fatal;
        every other field may end up ``None`` (or ``[]`` for ``thumbnails``).
        """
        video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
        parsed_url = compat_urlparse.urlparse(url)
        base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'

        # Set before the download so the request carries the preference;
        # presumably this makes the instance expose HLS playback URLs.
        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
        full_webpage = webpage = self._download_webpage(url, video_id)

        # str.find() signals "not found" with -1, so test against that sentinel
        main_tweet_start = full_webpage.find('class="main-tweet"')
        if main_tweet_start != -1:
            webpage = full_webpage[main_tweet_start:]

        # The page references the media with a path, so prefix the instance origin
        video_path = self._html_search_regex(
            r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
        video_url = f'{base_url}{video_path}'
        ext = determine_ext(video_url)

        if ext == 'unknown_video':
            # No recognisable file extension: handle the URL as an HLS manifest
            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
        else:
            formats = [{
                'url': video_url,
                'ext': ext
            }]

        title = description = self._og_search_description(full_webpage) or self._html_search_regex(
            r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)

        # Prefer the handle printed on the page; fall back to the one from the URL
        uploader_id = self._html_search_regex(
            r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id

        uploader = self._html_search_regex(
            r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
        # Only prefix an existing title; formatting a missing one would
        # produce the literal text "<uploader> - None"
        if uploader and title:
            title = f'{uploader} - {title}'

        # (info-dict field prefix, icon class suffix used in the page markup)
        counters = (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment'))
        counts = {}
        for field, icon in counters:
            raw_count = self._html_search_regex(
                fr'<span[^>]+class="icon-{icon}[^>]*></span>([^<]*)</div>',
                webpage, f'{field} count', fatal=False)
            # A counter that is present but blank is reported as zero
            counts[f'{field}_count'] = 0 if raw_count == '' else parse_count(raw_count)

        thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url')
        if not thumbnail:
            poster = self._html_search_regex(
                r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
            # The lookup is non-fatal, so only build a URL when it matched;
            # otherwise the thumbnail would become "<base_url>None"
            if poster:
                thumbnail = remove_end(f'{base_url}{poster}', '%3Asmall')

        # One entry per size suffix; skipped entirely when no thumbnail is
        # known so that no "None%3A<size>" URLs are emitted
        thumbnails = [
            {'id': size, 'url': f'{thumbnail}%3A{size}'}
            for size in ('thumb', 'small', 'large', 'medium', 'orig')
        ] if thumbnail else []

        date = self._html_search_regex(
            r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
            webpage, 'upload date', default='').replace('·', '')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'timestamp': unified_timestamp(date),
            'uploader_id': uploader_id,
            'uploader_url': f'{base_url}/{uploader_id}',
            'formats': formats,
            'thumbnails': thumbnails,
            'thumbnail': thumbnail,
            **counts,
        }
Commit	Line	Data
bb8a73a0	1	from .common import InfoExtractor
	2	from ..compat import compat_urlparse
	3	from ..utils import (
	4	parse_count,
bb8a73a0	5	unified_timestamp,
	6	remove_end,
	7	determine_ext,
	8	)
	9	import re
a4ddaf23	10	import random
bb8a73a0	11
	12
	13	class NitterIE(InfoExtractor):
	14	# Taken from https://github.com/zedeus/nitter/wiki/Instances
a4ddaf23	15
	16	NON_HTTP_INSTANCES = (
	17	'3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
	18	'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
	19	'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
	20	'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
	21	'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
	22	'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
	23	'26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
510809f1	24	'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion',
	25	'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion',
	26	'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion',
	27	'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion',
	28	'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion',
	29	'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion',
	30	'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion',
	31	'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion',
	32	'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion',
	33	'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion',
a4ddaf23	34
	35	'nitter.i2p',
	36	'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',
	37
	38	'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
	39	)
	40
	41	HTTP_INSTANCES = (
a9189510	42	'nitter.lacontrevoie.fr',
a4ddaf23	43	'nitter.fdn.fr',
	44	'nitter.1d4.us',
	45	'nitter.kavin.rocks',
a4ddaf23	46	'nitter.unixfox.eu',
a4ddaf23	47	'nitter.domain.glass',
a4ddaf23	48	'nitter.namazso.eu',
a4ddaf23	49	'birdsite.xanny.family',
510809f1	50	'nitter.moomoo.me',
a9189510	51	'bird.trom.tf',
510809f1	52	'nitter.it',
510809f1	53	'twitter.censors.us',
a9189510	54	'nitter.grimneko.de',
510809f1	55	'twitter.076.ne.jp',
510809f1	56	'nitter.fly.dev',
	57	'notabird.site',
	58	'nitter.weiler.rocks',
510809f1	59	'nitter.sethforprivacy.com',
510809f1	60	'nitter.cutelab.space',
	61	'nitter.nl',
	62	'nitter.mint.lgbt',
	63	'nitter.bus-hit.me',
510809f1	64	'nitter.esmailelbob.xyz',
510809f1	65	'tw.artemislena.eu',
510809f1	66	'nitter.winscloud.net',
	67	'nitter.tiekoetter.com',
	68	'nitter.spaceint.fr',
a9189510 O	69	'nitter.privacy.com.de',
	70	'nitter.poast.org',
	71	'nitter.bird.froth.zone',
	72	'nitter.dcs0.hu',
	73	'twitter.dr460nf1r3.org',
	74	'nitter.garudalinux.org',
	75	'twitter.femboy.hu',
	76	'nitter.cz',
	77	'nitter.privacydev.net',
	78	'nitter.evil.site',
	79	'tweet.lambda.dance',
	80	'nitter.kylrth.com',
	81	'nitter.foss.wtf',
	82	'nitter.priv.pw',
	83	'nitter.tokhmi.xyz',
	84	'nitter.catalyst.sx',
	85	'unofficialbird.com',
	86	'nitter.projectsegfau.lt',
	87	'nitter.eu.projectsegfau.lt',
	88	'singapore.unofficialbird.com',
	89	'canada.unofficialbird.com',
	90	'india.unofficialbird.com',
	91	'nederland.unofficialbird.com',
	92	'uk.unofficialbird.com',
	93	'n.l5.ca',
	94	'nitter.slipfox.xyz',
	95	'nitter.soopy.moe',
	96	'nitter.qwik.space',
	97	'read.whatever.social',
	98	'nitter.rawbit.ninja',
	99	'nt.vern.cc',
	100	'ntr.odyssey346.dev',
	101	'nitter.ir',
	102	'nitter.privacytools.io',
	103	'nitter.sneed.network',
	104	'n.sneed.network',
	105	'nitter.manasiwibi.com',
	106	'nitter.smnz.de',
	107	'nitter.twei.space',
	108	'nitter.inpt.fr',
	109	'nitter.d420.de',
	110	'nitter.caioalonso.com',
	111	'nitter.at',
	112	'nitter.drivet.xyz',
	113	'nitter.pw',
	114	'nitter.nicfab.eu',
	115	'bird.habedieeh.re',
	116	'nitter.hostux.net',
	117	'nitter.adminforge.de',
	118	'nitter.platypush.tech',
	119	'nitter.mask.sh',
	120	'nitter.pufe.org',
	121	'nitter.us.projectsegfau.lt',
	122	'nitter.arcticfoxes.net',
	123	't.com.sb',
	124	'nitter.kling.gg',
	125	'nitter.ktachibana.party',
	126	'nitter.riverside.rocks',
	127	'nitter.girlboss.ceo',
	128	'nitter.lunar.icu',
	129	'twitter.moe.ngo',
	130	'nitter.freedit.eu',
	131	'ntr.frail.duckdns.org',
	132	'nitter.librenode.org',
133	'n.opnxng.com',
134	'nitter.plus.st',
a4ddaf23	135	)
	136
	137	DEAD_INSTANCES = (
	138	# maintenance
	139	'nitter.ethibox.fr',
	140
	141	# official, rate limited
	142	'nitter.net',
	143	# offline
510809f1	144	'is-nitter.resolv.ee',
510809f1	145	'lu-nitter.resolv.ee',
a4ddaf23	146	'nitter.13ad.de',
510809f1	147	'nitter.40two.app',
	148	'nitter.cattube.org',
	149	'nitter.cc',
	150	'nitter.dark.fail',
	151	'nitter.himiko.cloud',
	152	'nitter.koyu.space',
	153	'nitter.mailstation.de',
	154	'nitter.mastodont.cat',
	155	'nitter.tedomum.net',
	156	'nitter.tokhmi.xyz',
a4ddaf23	157	'nitter.weaponizedhumiliation.com',
510809f1	158	'nitter.vxempire.xyz',
510809f1	159	'tweet.lambda.dance',
a9189510 O	160	'nitter.ca',
	161	'nitter.42l.fr',
	162	'nitter.pussthecat.org',
	163	'nitter.nixnet.services',
	164	'nitter.eu',
	165	'nitter.actionsack.com',
	166	'nitter.hu',
	167	'twitr.gq',
	168	'nittereu.moomoo.me',
	169	'bird.from.tf',
	170	'twitter.grimneko.de',
	171	'nitter.alefvanoon.xyz',
	172	'n.hyperborea.cloud',
	173	'twitter.mstdn.social',
	174	'nitter.silkky.cloud',
	175	'nttr.stream',
	176	'fuckthesacklers.network',
	177	'nitter.govt.land',
	178	'nitter.datatunnel.xyz',
	179	'de.nttr.stream',
	180	'twtr.bch.bar',
	181	'nitter.exonip.de',
	182	'nitter.mastodon.pro',
	183	'nitter.notraxx.ch',
	184	'nitter.skrep.in',
	185	'nitter.snopyta.org',
a4ddaf23	186	)
	187
	188	INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES
bb8a73a0	189
510809f1	190	_INSTANCES_RE = f'(?:{"\|".join(map(re.escape, INSTANCES))})'
510809f1	191	_VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
a4ddaf23	192	current_instance = random.choice(HTTP_INSTANCES)
a4ddaf23	193
bb8a73a0	194	_TESTS = [
	195	{
	196	# GIF (wrapped in mp4)
510809f1	197	'url': f'https://{current_instance}/firefox/status/1314279897502629888#m',
bb8a73a0	198	'info_dict': {
	199	'id': '1314279897502629888',
	200	'ext': 'mp4',
510809f1	201	'title': 'md5:7890a9277da4639ab624dd899424c5d8',
510809f1	202	'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe',
bb8a73a0	203	'thumbnail': r're:^https?://.*\.jpg$',
	204	'uploader': 'Firefox 🔥',
	205	'uploader_id': 'firefox',
510809f1	206	'uploader_url': f'https://{current_instance}/firefox',
bb8a73a0	207	'upload_date': '20201008',
bb8a73a0	208	'timestamp': 1602183720,
510809f1	209	'like_count': int,
	210	'repost_count': int,
	211	'comment_count': int,
bb8a73a0	212	},
bb8a73a0	213	}, { # normal video
510809f1	214	'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m',
bb8a73a0	215	'info_dict': {
	216	'id': '1299715685392756737',
	217	'ext': 'mp4',
510809f1	218	'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
a4ddaf23	219	'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
bb8a73a0	220	'thumbnail': r're:^https?://.*\.jpg$',
510809f1	221	'uploader': 're:^Le *Doc',
bb8a73a0	222	'uploader_id': 'Le___Doc',
510809f1	223	'uploader_url': f'https://{current_instance}/Le___Doc',
bb8a73a0	224	'upload_date': '20200829',
510809f1	225	'timestamp': 1598711340,
bb8a73a0	226	'view_count': int,
	227	'like_count': int,
	228	'repost_count': int,
	229	'comment_count': int,
	230	},
	231	}, { # video embed in a "Streaming Political Ads" box
510809f1	232	'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m',
bb8a73a0	233	'info_dict': {
	234	'id': '1321147074491092994',
	235	'ext': 'mp4',
510809f1	236	'title': 'md5:8290664aabb43b9189145c008386bf12',
510809f1	237	'description': 'md5:9cf2762d49674bc416a191a689fb2aaa',
bb8a73a0	238	'thumbnail': r're:^https?://.*\.jpg$',
	239	'uploader': 'Mozilla',
	240	'uploader_id': 'mozilla',
510809f1	241	'uploader_url': f'https://{current_instance}/mozilla',
bb8a73a0	242	'upload_date': '20201027',
510809f1	243	'timestamp': 1603820940,
	244	'view_count': int,
	245	'like_count': int,
	246	'repost_count': int,
	247	'comment_count': int,
bb8a73a0	248	},
510809f1	249	'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
a4ddaf23	250	}, { # not the first tweet but main-tweet
510809f1	251	'url': f'https://{current_instance}/firefox/status/1354848277481414657#m',
a4ddaf23	252	'info_dict': {
510809f1	253	'id': '1354848277481414657',
a4ddaf23	254	'ext': 'mp4',
510809f1	255	'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700',
510809f1	256	'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f',
a4ddaf23	257	'thumbnail': r're:^https?://.*\.jpg$',
510809f1	258	'uploader': 'Firefox 🔥',
	259	'uploader_id': 'firefox',
	260	'uploader_url': f'https://{current_instance}/firefox',
	261	'upload_date': '20210128',
	262	'timestamp': 1611855960,
	263	'view_count': int,
	264	'like_count': int,
	265	'repost_count': int,
	266	'comment_count': int,
a4ddaf23	267	}
a4ddaf23	268	}
bb8a73a0	269	]
	270
	271	def _real_extract(self, url):
510809f1	272	video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id')
bb8a73a0	273	parsed_url = compat_urlparse.urlparse(url)
510809f1	274	base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'
bb8a73a0	275
bb8a73a0	276	self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
510809f1	277	full_webpage = webpage = self._download_webpage(url, video_id)
a4ddaf23	278
	279	main_tweet_start = full_webpage.find('class="main-tweet"')
	280	if main_tweet_start > 0:
	281	webpage = full_webpage[main_tweet_start:]
bb8a73a0	282
510809f1	283	video_url = '%s%s' % (base_url, self._html_search_regex(
510809f1	284	r'(?:<video[^>]+data-url\|<source[^>]+src)="([^"]+)"', webpage, 'video url'))
bb8a73a0	285	ext = determine_ext(video_url)
	286
	287	if ext == 'unknown_video':
	288	formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
	289	else:
	290	formats = [{
	291	'url': video_url,
	292	'ext': ext
	293	}]
	294
510809f1	295	title = description = self._og_search_description(full_webpage) or self._html_search_regex(
510809f1	296	r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False)
bb8a73a0	297
510809f1	298	uploader_id = self._html_search_regex(
510809f1	299	r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id
bb8a73a0	300
510809f1	301	uploader = self._html_search_regex(
	302	r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
	303	if uploader:
	304	title = f'{uploader} - {title}'
bb8a73a0	305
510809f1	306	counts = {
	307	f'{x[0]}_count': self._html_search_regex(
	308	fr'<span[^>]+class="icon-{x[1]}[^>]></span>([^<])</div>',
	309	webpage, f'{x[0]} count', fatal=False)
	310	for x in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment'))
	311	}
	312	counts = {field: 0 if count == '' else parse_count(count) for field, count in counts.items()}
bb8a73a0	313
510809f1	314	thumbnail = (
	315	self._html_search_meta('og:image', full_webpage, 'thumbnail url')
	316	or remove_end('%s%s' % (base_url, self._html_search_regex(
	317	r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)), '%3Asmall'))
	318
	319	thumbnails = [
	320	{'id': id, 'url': f'{thumbnail}%3A{id}'}
	321	for id in ('thumb', 'small', 'large', 'medium', 'orig')
	322	]
	323
	324	date = self._html_search_regex(
	325	r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
	326	webpage, 'upload date', default='').replace('·', '')
bb8a73a0	327
	328	return {
	329	'id': video_id,
	330	'title': title,
	331	'description': description,
	332	'uploader': uploader,
510809f1	333	'timestamp': unified_timestamp(date),
bb8a73a0	334	'uploader_id': uploader_id,
510809f1	335	'uploader_url': f'{base_url}/{uploader_id}',
bb8a73a0	336	'formats': formats,
	337	'thumbnails': thumbnails,
	338	'thumbnail': thumbnail,
510809f1	339	**counts,
bb8a73a0	340	}