2 from __future__
import unicode_literals
4 from .common
import InfoExtractor
5 from ..compat
import compat_urlparse
16 class NitterIE(InfoExtractor
):
# Host names of the Nitter front-ends this extractor recognises.
# Taken from https://github.com/zedeus/nitter/wiki/Instances
INSTANCES = (
    'nitter.net',
    'nitter.nixnet.services',
    'nitter.pussthecat.org',
    'nitter.mastodont.cat',
    'nitter.weaponizedhumiliation.com',
    '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
    'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
    'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
)

# Non-capturing alternation of every host above. Each name goes through
# re.escape so the dots match literally instead of "any character".
_INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')'

# Matches a status URL on any listed host and captures the path segment
# before "/status/" as `uploader_id` and the numeric status as `id`.
_VALID_URL = r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' % {'instance': _INSTANCES_RE}

current_instance = INSTANCES[0]  # the test and official instance
43 # GIF (wrapped in mp4)
44 'url': 'https://' + current_instance
+ '/firefox/status/1314279897502629888#m',
46 'id': '1314279897502629888',
48 'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet',
49 'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet',
50 'thumbnail': r
're:^https?://.*\.jpg$',
51 'uploader': 'Firefox 🔥',
52 'uploader_id': 'firefox',
53 'uploader_url': 'https://' + current_instance
+ '/firefox',
54 'upload_date': '20201008',
55 'timestamp': 1602183720,
58 'url': 'https://' + current_instance
+ '/Le___Doc/status/1299715685392756737#m',
60 'id': '1299715685392756737',
62 'title': 'Le Doc - "Je ne prédis jamais rien" D Raoult, Août 2020...',
63 'description': '"Je ne prédis jamais rien" D Raoult, Août 2020...',
64 'thumbnail': r
're:^https?://.*\.jpg$',
66 'uploader_id': 'Le___Doc',
67 'uploader_url': 'https://' + current_instance
+ '/Le___Doc',
68 'upload_date': '20200829',
69 'timestamp': 1598711341,
75 }, { # video embed in a "Streaming Political Ads" box
76 'url': 'https://' + current_instance
+ '/mozilla/status/1321147074491092994#m',
78 'id': '1321147074491092994',
80 'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds",
81 'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds",
82 'thumbnail': r
're:^https?://.*\.jpg$',
83 'uploader': 'Mozilla',
84 'uploader_id': 'mozilla',
85 'uploader_url': 'https://' + current_instance
+ '/mozilla',
86 'upload_date': '20201027',
87 'timestamp': 1603820982
92 def _real_extract(self
, url
):
93 video_id
= self
._match
_id
(url
)
94 parsed_url
= compat_urlparse
.urlparse(url
)
95 base_url
= parsed_url
.scheme
+ '://' + parsed_url
.netloc
97 self
._set
_cookie
(parsed_url
.netloc
, 'hlsPlayback', 'on')
98 webpage
= self
._download
_webpage
(url
, video_id
)
100 video_url
= base_url
+ self
._html
_search
_regex
(r
'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage
, 'video url')
101 ext
= determine_ext(video_url
)
103 if ext
== 'unknown_video':
104 formats
= self
._extract
_m
3u8_formats
(video_url
, video_id
, ext
='mp4')
112 self
._og
_search
_description
(webpage
).replace('\n', ' ')
113 or self
._html
_search
_regex
(r
'<div class="tweet-content[^>]+>([^<]+)</div>', webpage
, 'title'))
116 mobj
= re
.match(self
._VALID
_URL
, url
)
118 mobj
.group('uploader_id')
119 or self
._html
_search
_regex
(r
'<a class="fullname"[^>]+title="([^"]+)"', webpage
, 'uploader name', fatal
=False))
122 uploader_url
= base_url
+ '/' + uploader_id
124 uploader
= self
._html
_search
_regex
(r
'<a class="fullname"[^>]+title="([^"]+)"', webpage
, 'uploader name', fatal
=False)
127 title
= uploader
+ ' - ' + title
129 view_count
= parse_count(self
._html
_search
_regex
(r
'<span[^>]+class="icon-play[^>]*></span>\s([^<]+)</div>', webpage
, 'view count', fatal
=False))
130 like_count
= parse_count(self
._html
_search
_regex
(r
'<span[^>]+class="icon-heart[^>]*></span>\s([^<]+)</div>', webpage
, 'like count', fatal
=False))
131 repost_count
= parse_count(self
._html
_search
_regex
(r
'<span[^>]+class="icon-retweet[^>]*></span>\s([^<]+)</div>', webpage
, 'repost count', fatal
=False))
132 comment_count
= parse_count(self
._html
_search
_regex
(r
'<span[^>]+class="icon-comment[^>]*></span>\s([^<]+)</div>', webpage
, 'repost count', fatal
=False))
134 thumbnail
= base_url
+ (self
._html
_search
_meta
('og:image', webpage
, 'thumbnail url')
135 or self
._html
_search
_regex
(r
'<video[^>]+poster="([^"]+)"', webpage
, 'thumbnail url', fatal
=False))
137 thumbnail
= remove_end(thumbnail
, '%3Asmall') # if parsed with regex, it should contain this
140 thumbnail_ids
= ('thumb', 'small', 'large', 'medium', 'orig')
141 for id in thumbnail_ids
:
144 'url': thumbnail
+ '%3A' + id,
147 date
= self
._html
_search
_regex
(r
'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"', webpage
, 'upload date', fatal
=False)
148 upload_date
= unified_strdate(date
)
149 timestamp
= unified_timestamp(date
)
154 'description': description
,
155 'uploader': uploader
,
156 'timestamp': timestamp
,
157 'uploader_id': uploader_id
,
158 'uploader_url': uploader_url
,
159 'view_count': view_count
,
160 'like_count': like_count
,
161 'repost_count': repost_count
,
162 'comment_count': comment_count
,
164 'thumbnails': thumbnails
,
165 'thumbnail': thumbnail
,
166 'upload_date': upload_date
,