[yt-dlp.git] / youtube_dlc / extractor / nitter.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
    parse_count,
    unified_strdate,
    unified_timestamp,
    remove_end,
    determine_ext,
)
import re


class NitterIE(InfoExtractor):
    """Extract the video attached to a tweet viewed through a Nitter instance.

    Only URLs whose host is one of ``INSTANCES`` and whose path has the form
    ``/<uploader_id>/status/<numeric id>`` are matched (see ``_VALID_URL``).
    """

    # Taken from https://github.com/zedeus/nitter/wiki/Instances
    INSTANCES = ('nitter.net',
                 'nitter.snopyta.org',
                 'nitter.42l.fr',
                 'nitter.nixnet.services',
                 'nitter.13ad.de',
                 'nitter.pussthecat.org',
                 'nitter.mastodont.cat',
                 'nitter.dark.fail',
                 'nitter.tedomum.net',
                 'nitter.cattube.org',
                 'nitter.fdn.fr',
                 'nitter.1d4.us',
                 'nitter.kavin.rocks',
                 'tweet.lambda.dance',
                 'nitter.cc',
                 'nitter.weaponizedhumiliation.com',
                 '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
                 'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
                 'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion')

    # Non-capturing alternation of all (regex-escaped) instance host names.
    _INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')'
    _VALID_URL = r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' % {'instance': _INSTANCES_RE}
    current_instance = INSTANCES[0]  # the test and official instance
    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': 'https://' + current_instance + '/firefox/status/1314279897502629888#m',
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension.   Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg  #UnfckTheInternet',
                'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension.   Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg  #UnfckTheInternet',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': 'https://' + current_instance + '/firefox',
                'upload_date': '20201008',
                'timestamp': 1602183720,
            },
        }, {  # normal video
            'url': 'https://' + current_instance + '/Le___Doc/status/1299715685392756737#m',
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 'Le Doc - "Je ne prédis jamais rien" D Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien" D Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Le Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': 'https://' + current_instance + '/Le___Doc',
                'upload_date': '20200829',
                'timestamp': 1598711341,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': 'https://' + current_instance + '/mozilla/status/1321147074491092994#m',
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?  This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few.   Learn more ➡️ https://mzl.la/StreamingAds",
                'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?  This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few.   Learn more ➡️ https://mzl.la/StreamingAds",
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': 'https://' + current_instance + '/mozilla',
                'upload_date': '20201027',
                'timestamp': 1603820982
            },
        },
    ]

    def _search_count(self, webpage, icon, name):
        """Return the counter shown next to the ``icon-<icon>`` span, or None.

        The lookup is non-fatal; ``name`` is only used as the label passed to
        ``_html_search_regex``.
        """
        return parse_count(self._html_search_regex(
            r'<span[^>]+class="icon-%s[^>]*></span>\s([^<]+)</div>' % icon,
            webpage, name, fatal=False))

    def _real_extract(self, url):
        """Download the tweet page at ``url`` and build its info dict.

        The video URL and the title are mandatory (their lookups are fatal);
        every other field is best-effort and is None (or an empty list for
        ``thumbnails``) when it cannot be found on the page.
        """
        video_id = self._match_id(url)
        parsed_url = compat_urlparse.urlparse(url)
        base_url = parsed_url.scheme + '://' + parsed_url.netloc

        # NOTE(review): presumably makes the instance expose the HLS stream
        # in the page markup -- confirm against Nitter's preferences handling.
        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
        webpage = self._download_webpage(url, video_id)

        # The page references media relative to the instance root.
        video_url = base_url + self._html_search_regex(r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
        ext = determine_ext(video_url)

        if ext == 'unknown_video':
            # No recognisable file extension: treat the URL as an HLS manifest.
            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
        else:
            formats = [{
                'url': video_url,
                'ext': ext
            }]

        # The og:description lookup may yield None; guard before calling
        # .replace() so the tweet-content fallback below is actually reachable.
        description = (self._og_search_description(webpage) or '').replace('\n', ' ')
        if not description:
            description = self._html_search_regex(r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title')
        title = description

        uploader = self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)

        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the fallback is the display name taken from the
        # "fullname" link, not necessarily a handle -- kept from the original.
        uploader_id = mobj.group('uploader_id') or uploader
        uploader_url = base_url + '/' + uploader_id if uploader_id else None

        if uploader:
            title = uploader + ' - ' + title

        view_count = self._search_count(webpage, 'play', 'view count')
        like_count = self._search_count(webpage, 'heart', 'like count')
        repost_count = self._search_count(webpage, 'retweet', 'repost count')
        comment_count = self._search_count(webpage, 'comment', 'comment count')

        thumbnail_path = (
            self._html_search_meta('og:image', webpage, 'thumbnail url')
            or self._html_search_regex(r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False))

        thumbnail = None
        thumbnails = []
        if thumbnail_path:
            # if parsed with regex, it should contain this
            thumbnail = remove_end(base_url + thumbnail_path, '%3Asmall')
            # One entry per size suffix appended to the base thumbnail URL.
            thumbnails = [{
                'id': thumbnail_id,
                'url': thumbnail + '%3A' + thumbnail_id,
            } for thumbnail_id in ('thumb', 'small', 'large', 'medium', 'orig')]

        date = self._html_search_regex(r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"', webpage, 'upload date', fatal=False)
        upload_date = unified_strdate(date)
        timestamp = unified_timestamp(date)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'timestamp': timestamp,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
            'view_count': view_count,
            'like_count': like_count,
            'repost_count': repost_count,
            'comment_count': comment_count,
            'formats': formats,
            'thumbnails': thumbnails,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
        }
Commit	Line	Data
bb8a73a0	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
	4	from .common import InfoExtractor
	5	from ..compat import compat_urlparse
	6	from ..utils import (
	7	parse_count,
	8	unified_strdate,
	9	unified_timestamp,
	10	remove_end,
	11	determine_ext,
	12	)
	13	import re
	14
	15
	16	class NitterIE(InfoExtractor):
	17	# Taken from https://github.com/zedeus/nitter/wiki/Instances
	18	INSTANCES = ('nitter.net',
	19	'nitter.snopyta.org',
	20	'nitter.42l.fr',
	21	'nitter.nixnet.services',
	22	'nitter.13ad.de',
	23	'nitter.pussthecat.org',
	24	'nitter.mastodont.cat',
	25	'nitter.dark.fail',
	26	'nitter.tedomum.net',
	27	'nitter.cattube.org',
	28	'nitter.fdn.fr',
	29	'nitter.1d4.us',
	30	'nitter.kavin.rocks',
	31	'tweet.lambda.dance',
	32	'nitter.cc',
	33	'nitter.weaponizedhumiliation.com',
	34	'3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
	35	'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
	36	'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion')
	37
	38	_INSTANCES_RE = '(?:' + '\|'.join([re.escape(instance) for instance in INSTANCES]) + ')'
	39	_VALID_URL = r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' % {'instance': _INSTANCES_RE}
	40	current_instance = INSTANCES[0] # the test and official instance
	41	_TESTS = [
	42	{
	43	# GIF (wrapped in mp4)
	44	'url': 'https://' + current_instance + '/firefox/status/1314279897502629888#m',
	45	'info_dict': {
	46	'id': '1314279897502629888',
	47	'ext': 'mp4',
	48	'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet',
	49	'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet',
	50	'thumbnail': r're:^https?://.*\.jpg$',
	51	'uploader': 'Firefox 🔥',
	52	'uploader_id': 'firefox',
	53	'uploader_url': 'https://' + current_instance + '/firefox',
	54	'upload_date': '20201008',
	55	'timestamp': 1602183720,
	56	},
	57	}, { # normal video
	58	'url': 'https://' + current_instance + '/Le___Doc/status/1299715685392756737#m',
	59	'info_dict': {
	60	'id': '1299715685392756737',
	61	'ext': 'mp4',
	62	'title': 'Le Doc - "Je ne prédis jamais rien" D Raoult, Août 2020...',
	63	'description': '"Je ne prédis jamais rien" D Raoult, Août 2020...',
	64	'thumbnail': r're:^https?://.*\.jpg$',
65	'uploader': 'Le Doc',
66	'uploader_id': 'Le___Doc',
67	'uploader_url': 'https://' + current_instance + '/Le___Doc',
68	'upload_date': '20200829',
69	'timestamp': 1598711341,
70	'view_count': int,
71	'like_count': int,
72	'repost_count': int,
73	'comment_count': int,
74	},
75	}, { # video embed in a "Streaming Political Ads" box
76	'url': 'https://' + current_instance + '/mozilla/status/1321147074491092994#m',
77	'info_dict': {
78	'id': '1321147074491092994',
79	'ext': 'mp4',
80	'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds",
81	'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds",
82	'thumbnail': r're:^https?://.*\.jpg$',
83	'uploader': 'Mozilla',
84	'uploader_id': 'mozilla',
85	'uploader_url': 'https://' + current_instance + '/mozilla',
86	'upload_date': '20201027',
87	'timestamp': 1603820982
88	},
89	},
90	]
91
92	def _real_extract(self, url):
93	video_id = self._match_id(url)
94	parsed_url = compat_urlparse.urlparse(url)
95	base_url = parsed_url.scheme + '://' + parsed_url.netloc
96
97	self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
98	webpage = self._download_webpage(url, video_id)
99
100	video_url = base_url + self._html_search_regex(r'(?:<video[^>]+data-url\|<source[^>]+src)="([^"]+)"', webpage, 'video url')
101	ext = determine_ext(video_url)
102
103	if ext == 'unknown_video':
104	formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
105	else:
106	formats = [{
107	'url': video_url,
108	'ext': ext
109	}]
110
111	title = (
112	self._og_search_description(webpage).replace('\n', ' ')
113	or self._html_search_regex(r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title'))
114	description = title
115
116	mobj = re.match(self._VALID_URL, url)
117	uploader_id = (
118	mobj.group('uploader_id')
119	or self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False))
120
121	if uploader_id:
122	uploader_url = base_url + '/' + uploader_id
123
124	uploader = self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
125
126	if uploader:
127	title = uploader + ' - ' + title
128
129	view_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-play[^>]*></span>\s([^<]+)</div>', webpage, 'view count', fatal=False))
130	like_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-heart[^>]*></span>\s([^<]+)</div>', webpage, 'like count', fatal=False))
131	repost_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-retweet[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False))
132	comment_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-comment[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False))
133
134	thumbnail = base_url + (self._html_search_meta('og:image', webpage, 'thumbnail url')
135	or self._html_search_regex(r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False))
136
137	thumbnail = remove_end(thumbnail, '%3Asmall') # if parsed with regex, it should contain this
138
139	thumbnails = []
140	thumbnail_ids = ('thumb', 'small', 'large', 'medium', 'orig')
141	for id in thumbnail_ids:
142	thumbnails.append({
143	'id': id,
144	'url': thumbnail + '%3A' + id,
145	})
146
147	date = self._html_search_regex(r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"', webpage, 'upload date', fatal=False)
148	upload_date = unified_strdate(date)
149	timestamp = unified_timestamp(date)
150
151	return {
152	'id': video_id,
153	'title': title,
154	'description': description,
155	'uploader': uploader,
156	'timestamp': timestamp,
157	'uploader_id': uploader_id,
158	'uploader_url': uploader_url,
159	'view_count': view_count,
160	'like_count': like_count,
161	'repost_count': repost_count,
162	'comment_count': comment_count,
163	'formats': formats,
164	'thumbnails': thumbnails,
165	'thumbnail': thumbnail,
166	'upload_date': upload_date,
167	}