2 from __future__
import unicode_literals
4 from .common
import InfoExtractor
5 from ..compat
import compat_urlparse
17 class NitterIE(InfoExtractor
):
18 # Taken from https://github.com/zedeus/nitter/wiki/Instances
20 NON_HTTP_INSTANCES
= (
21 '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
22 'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
23 'nitter7bryz3jv7e3uekphigvmoyoem4al3fynerxkj22dmoxoq553qd.onion',
24 'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion',
25 'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion',
26 'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion',
27 '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion',
30 'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p',
32 'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
37 'nitter.pussthecat.org',
38 'nitter.nixnet.services',
39 'nitter.mastodont.cat',
46 'nitter.vxempire.xyz',
48 'nitter.domain.glass',
49 'nitter.himiko.cloud',
52 'nitter.mailstation.de',
53 'nitter.actionsack.com',
56 'birdsite.xanny.family',
60 # not in the list anymore
68 # official, rate limited
72 'nitter.weaponizedhumiliation.com',
# Every known host is accepted when matching URLs, whatever tuple it is listed in.
INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES

# Alternation of all hosts, each escaped so that dots match literally.
_INSTANCES_RE = '(?:' + '|'.join(map(re.escape, INSTANCES)) + ')'
_VALID_URL = (
    r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?'
    % {'instance': _INSTANCES_RE})
# One host from HTTP_INSTANCES, picked at random when the class body is executed.
current_instance = random.choice(HTTP_INSTANCES)
83 # GIF (wrapped in mp4)
84 'url': 'https://%s/firefox/status/1314279897502629888#m' % current_instance
,
86 'id': '1314279897502629888',
88 'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. \n\nReport harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg\n\n#UnfckTheInternet',
89 'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. \n\nReport harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg\n\n#UnfckTheInternet',
90 'thumbnail': r
're:^https?://.*\.jpg$',
91 'uploader': 'Firefox 🔥',
92 'uploader_id': 'firefox',
93 'uploader_url': 'https://%s/firefox' % current_instance
,
94 'upload_date': '20201008',
95 'timestamp': 1602183720,
98 'url': 'https://%s/Le___Doc/status/1299715685392756737#m' % current_instance
,
100 'id': '1299715685392756737',
102 'title': 'Le Doc - "Je ne prédis jamais rien"\nD Raoult, Août 2020...',
103 'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...',
104 'thumbnail': r
're:^https?://.*\.jpg$',
105 'uploader': 'Le Doc',
106 'uploader_id': 'Le___Doc',
107 'uploader_url': 'https://%s/Le___Doc' % current_instance
,
108 'upload_date': '20200829',
109 'timestamp': 1598711341,
113 'comment_count': int,
115 }, { # video embed in a "Streaming Political Ads" box
116 'url': 'https://%s/mozilla/status/1321147074491092994#m' % current_instance
,
118 'id': '1321147074491092994',
120 'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?\n\nThis isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. \n\nLearn more ➡️ https://mzl.la/StreamingAds",
121 'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?\n\nThis isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. \n\nLearn more ➡️ https://mzl.la/StreamingAds",
122 'thumbnail': r
're:^https?://.*\.jpg$',
123 'uploader': 'Mozilla',
124 'uploader_id': 'mozilla',
125 'uploader_url': 'https://%s/mozilla' % current_instance
,
126 'upload_date': '20201027',
127 'timestamp': 1603820982
129 }, { # not the first tweet but main-tweet
130 'url': 'https://%s/TheNaturalNu/status/1379050895539724290#m' % current_instance
,
132 'id': '1379050895539724290',
134 'title': 'Dorothy Zbornak - This had me hollering!!',
135 'description': 'This had me hollering!!',
136 'thumbnail': r
're:^https?://.*\.jpg$',
137 'uploader': 'Dorothy Zbornak',
138 'uploader_id': 'TheNaturalNu',
139 'uploader_url': 'https://%s/TheNaturalNu' % current_instance
,
140 'timestamp': 1617626329,
141 'upload_date': '20210405'
def _real_extract(self, url):
    """Scrape a Nitter status page and return the info dict for its video.

    ``url`` must match ``_VALID_URL``. The page is downloaded once, after
    the ``hlsPlayback`` cookie has been set to ``on`` for the instance host.
    Tweet-specific fields are searched from the ``class="main-tweet"``
    marker onwards when the page contains it, otherwise in the whole page;
    Open Graph metadata is always read from the whole page.

    Returns a dict with ``id``, ``title``, ``description``, ``formats``,
    the uploader fields, the four engagement counters, ``thumbnails`` /
    ``thumbnail`` and ``upload_date`` / ``timestamp``. Fields searched with
    ``fatal=False`` are ``None`` when the page does not provide them. The
    video URL search (and the tweet-text fallback) use the search helper's
    default mode rather than ``fatal=False``.
    """
    video_id = self._match_id(url)
    parsed_url = compat_urlparse.urlparse(url)
    base_url = '%s://%s' % (parsed_url.scheme, parsed_url.netloc)

    # NOTE(review): presumably makes the instance expose HLS playlists - confirm.
    self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
    full_webpage = self._download_webpage(url, video_id)

    # str.find() returns -1 when the marker is absent; parse the whole page then.
    main_tweet_start = full_webpage.find('class="main-tweet"')
    if main_tweet_start != -1:
        webpage = full_webpage[main_tweet_start:]
    else:
        webpage = full_webpage

    video_path = self._html_search_regex(
        r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
    video_url = '%s%s' % (base_url, video_path)
    ext = determine_ext(video_url)

    if ext == 'unknown_video':
        # No usable file extension on the URL: handled as an HLS playlist.
        formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
    else:
        formats = [{
            'url': video_url,
            'ext': ext,
        }]

    # Prefer the Open Graph description; only scrape the tweet body without it.
    title = self._og_search_description(full_webpage)
    if not title:
        title = self._html_search_regex(
            r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title')
    # The description is the bare tweet text, captured before the uploader
    # name is prefixed to the title below.
    description = title

    # Searched once and reused for both the display name and the id fallback.
    uploader = self._html_search_regex(
        r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)

    mobj = self._match_valid_url(url)
    uploader_id = mobj.group('uploader_id') or uploader
    uploader_url = '%s/%s' % (base_url, uploader_id) if uploader_id else None

    if uploader:
        title = '%s - %s' % (uploader, title)

    def icon_count(icon, label):
        # Number rendered next to the given icon in the tweet stats, or None.
        return parse_count(self._html_search_regex(
            r'<span[^>]+class="icon-%s[^>]*></span>\s([^<]+)</div>' % icon,
            webpage, label, fatal=False))

    view_count = icon_count('play', 'view count')
    like_count = icon_count('heart', 'like count')
    repost_count = icon_count('retweet', 'repost count')
    comment_count = icon_count('comment', 'comment count')

    thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url')
    if not thumbnail:
        poster = self._html_search_regex(
            r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
        # Only build a URL when a poster was found, so None is never
        # formatted into the string.
        if poster:
            thumbnail = remove_end('%s%s' % (base_url, poster), '%3Asmall')

    # One entry per size suffix, appended to the base thumbnail URL as '%3A<size>'.
    thumbnails = []
    if thumbnail:
        thumbnails = [{
            'id': size,
            'url': thumbnail + '%3A' + size,
        } for size in ('thumb', 'small', 'large', 'medium', 'orig')]

    date = self._html_search_regex(
        r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"',
        webpage, 'upload date', fatal=False)
    upload_date = unified_strdate(date)
    timestamp = unified_timestamp(date)

    return {
        'id': video_id,
        'title': title,
        'description': description,
        'uploader': uploader,
        'timestamp': timestamp,
        'uploader_id': uploader_id,
        'uploader_url': uploader_url,
        'view_count': view_count,
        'like_count': like_count,
        'repost_count': repost_count,
        'comment_count': comment_count,
        'formats': formats,
        'thumbnails': thumbnails,
        'thumbnail': thumbnail,
        'upload_date': upload_date,
    }