]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/nitter.py
[bitchute] Fix test (#758)
[yt-dlp.git] / yt_dlp / extractor / nitter.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 from .common import InfoExtractor
5 from ..compat import compat_urlparse
6 from ..utils import (
7 parse_count,
8 unified_strdate,
9 unified_timestamp,
10 remove_end,
11 determine_ext,
12 )
13 import re
14 import random
15
16
class NitterIE(InfoExtractor):
    """Extract the video attached to a single tweet viewed through a Nitter instance.

    A URL is accepted when its host is one of the instances listed below and
    its path has the form ``/<uploader_id>/status/<numeric id>``.
    """

    # Taken from https://github.com/zedeus/nitter/wiki/Instances

    NON_HTTP_INSTANCES = (
        '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
        'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
        'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
        'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
        'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
        'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
        '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',

        'nitter.i2p',
        'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',

        'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
    )

    HTTP_INSTANCES = (
        'nitter.42l.fr',
        'nitter.pussthecat.org',
        'nitter.nixnet.services',
        'nitter.mastodont.cat',
        'nitter.tedomum.net',
        'nitter.fdn.fr',
        'nitter.1d4.us',
        'nitter.kavin.rocks',
        'tweet.lambda.dance',
        'nitter.cc',
        'nitter.vxempire.xyz',
        'nitter.unixfox.eu',
        'nitter.domain.glass',
        'nitter.himiko.cloud',
        'nitter.eu',
        'nitter.namazso.eu',
        'nitter.mailstation.de',
        'nitter.actionsack.com',
        'nitter.cattube.org',
        'nitter.dark.fail',
        'birdsite.xanny.family',
        'nitter.40two.app',
        'nitter.skrep.in',

        # not in the list anymore
        'nitter.snopyta.org',
    )

    DEAD_INSTANCES = (
        # maintenance
        'nitter.ethibox.fr',

        # official, rate limited
        'nitter.net',
        # offline
        'nitter.13ad.de',
        'nitter.weaponizedhumiliation.com',
    )

    # Every known host, including dead ones, is still recognised by _VALID_URL.
    INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES

    _INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')'
    _VALID_URL = r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' % {'instance': _INSTANCES_RE}
    # The tests below run against one HTTP instance picked at import time.
    current_instance = random.choice(HTTP_INSTANCES)

    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': 'https://%s/firefox/status/1314279897502629888#m' % current_instance,
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. \n\nReport harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg\n\n#UnfckTheInternet',
                'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. \n\nReport harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg\n\n#UnfckTheInternet',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': 'https://%s/firefox' % current_instance,
                'upload_date': '20201008',
                'timestamp': 1602183720,
            },
        }, {  # normal video
            'url': 'https://%s/Le___Doc/status/1299715685392756737#m' % current_instance,
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 'Le Doc - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Le Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': 'https://%s/Le___Doc' % current_instance,
                'upload_date': '20200829',
                'timestamp': 1598711341,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': 'https://%s/mozilla/status/1321147074491092994#m' % current_instance,
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?\n\nThis isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. \n\nLearn more ➡️ https://mzl.la/StreamingAds",
                'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?\n\nThis isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. \n\nLearn more ➡️ https://mzl.la/StreamingAds",
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': 'https://%s/mozilla' % current_instance,
                'upload_date': '20201027',
                'timestamp': 1603820982,
            },
        }, {  # not the first tweet but main-tweet
            'url': 'https://%s/TheNaturalNu/status/1379050895539724290#m' % current_instance,
            'info_dict': {
                'id': '1379050895539724290',
                'ext': 'mp4',
                'title': 'Dorothy Zbornak - This had me hollering!!',
                'description': 'This had me hollering!!',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Dorothy Zbornak',
                'uploader_id': 'TheNaturalNu',
                'uploader_url': 'https://%s/TheNaturalNu' % current_instance,
                'timestamp': 1617626329,
                'upload_date': '20210405',
            },
        },
    ]

    def _real_extract(self, url):
        """Download the tweet page at *url* and build its info dict.

        Returns a dict with ``id``, ``title``, ``description``, ``formats``
        and the optional uploader, counter, thumbnail and date fields.
        The video URL lookup (and the title lookup when the page has no
        ``og:description``) use the extractor's default fatal behaviour;
        every other lookup is non-fatal and yields ``None`` when missing.
        """
        mobj = self._match_valid_url(url)
        video_id, url_uploader_id = mobj.group('id', 'uploader_id')
        parsed_url = compat_urlparse.urlparse(url)
        base_url = '%s://%s' % (parsed_url.scheme, parsed_url.netloc)

        # NOTE(review): presumably this cookie makes the instance expose the
        # HLS stream URL in the page -- confirm against Nitter's preferences.
        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
        full_webpage = self._download_webpage(url, video_id)

        # Restrict scraping to the main tweet so that replies or the thread
        # above it are not picked up; fall back to the whole page when the
        # marker is absent (previously `webpage` was left unbound here).
        main_tweet_start = full_webpage.find('class="main-tweet"')
        webpage = full_webpage[main_tweet_start:] if main_tweet_start > 0 else full_webpage

        video_url = '%s%s' % (base_url, self._html_search_regex(
            r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url'))
        ext = determine_ext(video_url)

        if ext == 'unknown_video':
            # No recognisable extension: treat the URL as an HLS playlist.
            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
        else:
            formats = [{
                'url': video_url,
                'ext': ext,
            }]

        title = self._og_search_description(full_webpage)
        if not title:
            title = self._html_search_regex(
                r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title')
        # The description is the bare tweet text, without the uploader prefix.
        description = title

        uploader = self._html_search_regex(
            r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
        # The URL always carries the handle; the display name is only kept as
        # the historical fallback.
        uploader_id = url_uploader_id or uploader
        uploader_url = '%s/%s' % (base_url, uploader_id) if uploader_id else None

        if uploader:
            title = '%s - %s' % (uploader, title)

        count_re = r'<span[^>]+class="icon-%s[^>]*></span>\s([^<]+)</div>'
        view_count = parse_count(self._html_search_regex(
            count_re % 'play', webpage, 'view count', fatal=False))
        like_count = parse_count(self._html_search_regex(
            count_re % 'heart', webpage, 'like count', fatal=False))
        repost_count = parse_count(self._html_search_regex(
            count_re % 'retweet', webpage, 'repost count', fatal=False))
        comment_count = parse_count(self._html_search_regex(
            count_re % 'comment', webpage, 'comment count', fatal=False))

        thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url')
        if not thumbnail:
            poster = self._html_search_regex(
                r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
            # Only build a URL when a poster was actually found; formatting
            # None would yield a bogus "<base_url>None" thumbnail.
            if poster:
                thumbnail = remove_end('%s%s' % (base_url, poster), '%3Asmall')

        # One entry per size suffix, appended to the thumbnail URL as "%3A<size>".
        thumbnails = [
            {'id': size, 'url': thumbnail + '%3A' + size}
            for size in ('thumb', 'small', 'large', 'medium', 'orig')
        ] if thumbnail else []

        date = self._html_search_regex(
            r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
            webpage, 'upload date', fatal=False)
        upload_date = unified_strdate(date)
        timestamp = unified_timestamp(date)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'timestamp': timestamp,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
            'view_count': view_count,
            'like_count': like_count,
            'repost_count': repost_count,
            'comment_count': comment_count,
            'formats': formats,
            'thumbnails': thumbnails,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
        }