]>
Commit | Line | Data |
---|---|---|
bb8a73a0 | 1 | # coding: utf-8 |
2 | from __future__ import unicode_literals | |
3 | ||
4 | from .common import InfoExtractor | |
5 | from ..compat import compat_urlparse | |
6 | from ..utils import ( | |
7 | parse_count, | |
8 | unified_strdate, | |
9 | unified_timestamp, | |
10 | remove_end, | |
11 | determine_ext, | |
12 | ) | |
13 | import re | |
14 | ||
15 | ||
class NitterIE(InfoExtractor):
    """Extractor for videos and GIFs embedded in tweets served by Nitter instances."""

    # Taken from https://github.com/zedeus/nitter/wiki/Instances
    INSTANCES = (
        'nitter.net',
        'nitter.snopyta.org',
        'nitter.42l.fr',
        'nitter.nixnet.services',
        'nitter.13ad.de',
        'nitter.pussthecat.org',
        'nitter.mastodont.cat',
        'nitter.dark.fail',
        'nitter.tedomum.net',
        'nitter.cattube.org',
        'nitter.fdn.fr',
        'nitter.1d4.us',
        'nitter.kavin.rocks',
        'tweet.lambda.dance',
        'nitter.cc',
        'nitter.weaponizedhumiliation.com',
        '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
        'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
        'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion',
    )

    # Alternation of all known instance host names, escaped so that the dots
    # are matched literally.
    _INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')'
    _VALID_URL = r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' % {'instance': _INSTANCES_RE}
    current_instance = INSTANCES[0]  # the test and official instance
    _TESTS = [
        {
            # GIF (wrapped in mp4)
            'url': 'https://' + current_instance + '/firefox/status/1314279897502629888#m',
            'info_dict': {
                'id': '1314279897502629888',
                'ext': 'mp4',
                'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet',
                'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Firefox 🔥',
                'uploader_id': 'firefox',
                'uploader_url': 'https://' + current_instance + '/firefox',
                'upload_date': '20201008',
                'timestamp': 1602183720,
            },
        }, {  # normal video
            'url': 'https://' + current_instance + '/Le___Doc/status/1299715685392756737#m',
            'info_dict': {
                'id': '1299715685392756737',
                'ext': 'mp4',
                'title': 'Le Doc - "Je ne prédis jamais rien" D Raoult, Août 2020...',
                'description': '"Je ne prédis jamais rien" D Raoult, Août 2020...',
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Le Doc',
                'uploader_id': 'Le___Doc',
                'uploader_url': 'https://' + current_instance + '/Le___Doc',
                'upload_date': '20200829',
                'timestamp': 1598711341,
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
        }, {  # video embed in a "Streaming Political Ads" box
            'url': 'https://' + current_instance + '/mozilla/status/1321147074491092994#m',
            'info_dict': {
                'id': '1321147074491092994',
                'ext': 'mp4',
                'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds",
                'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds",
                'thumbnail': r're:^https?://.*\.jpg$',
                'uploader': 'Mozilla',
                'uploader_id': 'mozilla',
                'uploader_url': 'https://' + current_instance + '/mozilla',
                'upload_date': '20201027',
                'timestamp': 1603820982
            },
        },
    ]

    def _search_count(self, webpage, icon, name):
        """Look up the counter rendered after the ``icon-<icon>`` span.

        The lookup is non-fatal; whatever the search yields is handed to
        ``parse_count`` unchanged, exactly as the inline lookups did before.
        """
        return parse_count(self._html_search_regex(
            r'<span[^>]+class="icon-%s[^>]*></span>\s([^<]+)</div>' % icon,
            webpage, name, fatal=False))

    def _real_extract(self, url):
        """Build the info dict for the tweet at *url*.

        The video URL and the title are mandatory (their lookups are fatal);
        every other field is best-effort and is left as None when the page
        does not provide it.
        """
        video_id = self._match_id(url)
        mobj = re.match(self._VALID_URL, url)
        parsed_url = compat_urlparse.urlparse(url)
        base_url = parsed_url.scheme + '://' + parsed_url.netloc

        # Set the instance's HLS playback preference before fetching the page.
        self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
        webpage = self._download_webpage(url, video_id)

        # The media path found in the page is appended to the instance origin.
        video_url = base_url + self._html_search_regex(
            r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
        ext = determine_ext(video_url)

        if ext == 'unknown_video':
            # No recognisable file extension: treat the URL as an HLS playlist.
            formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
        else:
            formats = [{
                'url': video_url,
                'ext': ext
            }]

        # Guard against a missing og:description so the tweet-content fallback
        # is actually reachable instead of failing on `.replace`.
        # NOTE(review): assumes _og_search_description may return None - confirm.
        og_description = self._og_search_description(webpage) or ''
        title = (
            og_description.replace('\n', ' ')
            or self._html_search_regex(r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title'))
        description = title

        # _VALID_URL requires a non-empty uploader_id group, so no page-based
        # fallback is needed; uploader_url is still initialised defensively.
        uploader_id = mobj.group('uploader_id')
        uploader_url = base_url + '/' + uploader_id if uploader_id else None

        uploader = self._html_search_regex(
            r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)

        if uploader:
            title = uploader + ' - ' + title

        view_count = self._search_count(webpage, 'play', 'view count')
        like_count = self._search_count(webpage, 'heart', 'like count')
        repost_count = self._search_count(webpage, 'retweet', 'repost count')
        comment_count = self._search_count(webpage, 'comment', 'comment count')

        # Thumbnails are optional: only build URLs when a path was found.
        thumbnail_path = (
            self._html_search_meta('og:image', webpage, 'thumbnail url')
            or self._html_search_regex(r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False))

        thumbnail = None
        thumbnails = []
        if thumbnail_path:
            # if parsed with regex, it should contain this
            thumbnail = remove_end(base_url + thumbnail_path, '%3Asmall')
            # One entry per size variant, selected by a '%3A<size>' suffix.
            thumbnails = [{
                'id': size,
                'url': thumbnail + '%3A' + size,
            } for size in ('thumb', 'small', 'large', 'medium', 'orig')]

        date = self._html_search_regex(
            r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"', webpage, 'upload date', fatal=False)
        upload_date = unified_strdate(date)
        timestamp = unified_timestamp(date)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'timestamp': timestamp,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
            'view_count': view_count,
            'like_count': like_count,
            'repost_count': repost_count,
            'comment_count': comment_count,
            'formats': formats,
            'thumbnails': thumbnails,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
        }