]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/iwara.py
[cleanup] Consistent style for file heads
[yt-dlp.git] / yt_dlp / extractor / iwara.py
CommitLineData
661e7253 1import itertools
8eb7ba82 2import re
ac668111 3import urllib.parse
001a5fd3
YCH
4
5from .common import InfoExtractor
caf0f5f8
YCH
6from ..utils import (
7 int_or_none,
8 mimetype2ext,
9 remove_end,
54007a45 10 strip_or_none,
11 unified_strdate,
4ecf300d 12 url_or_none,
ff4d7860 13 urljoin,
caf0f5f8 14)
001a5fd3
YCH
15
16
class IwaraBaseIE(InfoExtractor):
    # Common URL prefix for all Iwara extractors; captures the site root
    # (optionally on the "www." or "ecchi." subdomain) as ``base_url``.
    _BASE_REGEX = r'(?P<base_url>https?://(?:www\.|ecchi\.)?iwara\.tv)'

    def _extract_playlist(self, base_url, webpage):
        """Yield a url_result for each title link found in ``webpage``.

        Each captured href is resolved against ``base_url`` before being
        wrapped with ``self.url_result``.
        """
        title_link_re = r'class="title">\s*<a[^<]+href="([^"]+)'
        for link in re.finditer(title_link_re, webpage):
            yield self.url_result(urljoin(base_url, link.group(1)))
23
24
class IwaraIE(IwaraBaseIE):
    # Extractor for single video pages (``/videos/<id>``).
    _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/videos/(?P<id>[a-zA-Z0-9]+)'
    _TESTS = [{
        'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD',
        # md5 is unstable
        'info_dict': {
            'id': 'amVwUl1EHpAD9RD',
            'ext': 'mp4',
            'title': '【MMD R-18】ガールフレンド carry_me_off',
            'age_limit': 18,
            'thumbnail': 'https://i.iwara.tv/sites/default/files/videos/thumbnails/7951/thumbnail-7951_0001.png',
            'uploader': 'Reimu丨Action',
            'upload_date': '20150828',
            'description': 'md5:1d4905ce48c66c9299c617f08e106e0f',
        },
    }, {
        'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO',
        'md5': '7e5f1f359cd51a027ba4a7b7710a50f0',
        'info_dict': {
            'id': '0B1LvuHnL-sRFNXB1WHNqbGw4SXc',
            'ext': 'mp4',
            'title': '[3D Hentai] Kyonyu × Genkai × Emaki Shinobi Girls.mp4',
            'age_limit': 18,
        },
        'add_ie': ['GoogleDrive'],
    }, {
        'url': 'http://www.iwara.tv/videos/nawkaumd6ilezzgq',
        # md5 is unstable
        'info_dict': {
            'id': '6liAP9s2Ojc',
            'ext': 'mp4',
            'age_limit': 18,
            'title': '[MMD] Do It Again Ver.2 [1080p 60FPS] (Motion,Camera,Wav+DL)',
            'description': 'md5:590c12c0df1443d833fbebe05da8c47a',
            'upload_date': '20160910',
            'uploader': 'aMMDsork',
            'uploader_id': 'UCVOFyOSCyFkXTYYHITtqB7A',
        },
        'add_ie': ['Youtube'],
    }]

    def _parse_api_formats(self, video_data):
        """Build the list of format dicts from the video API response.

        Entries that are not dicts, or whose 'uri' is not a valid URL, are
        skipped. The height is parsed from the 'resolution' label only when
        that label is a string; the width is derived assuming 16:9.
        """
        formats = []
        for a_format in video_data:
            # The API payload is external data: ignore anything that is not
            # a mapping instead of failing on ``.get``.
            if not isinstance(a_format, dict):
                continue
            format_uri = url_or_none(a_format.get('uri'))
            if not format_uri:
                continue
            format_id = a_format.get('resolution')
            height = None
            # Only run the regex on real strings; a missing or non-string
            # resolution label simply yields no height.
            if isinstance(format_id, str):
                height = int_or_none(self._search_regex(
                    r'(\d+)p', format_id, 'height', default=None))
            formats.append({
                'url': self._proto_relative_url(format_uri, 'https:'),
                'format_id': format_id,
                'ext': mimetype2ext(a_format.get('mime')) or 'mp4',
                'height': height,
                'width': int_or_none(height / 9.0 * 16.0 if height else None),
                # Prefer the original upload over transcoded renditions
                'quality': 1 if format_id == 'Source' else 0,
            })
        return formats

    def _real_extract(self, url):
        """Extract a single video.

        Returns a ``url_transparent`` result pointing at the embedded iframe
        when the video API returns nothing, otherwise a full info dict with
        the formats listed by the API.
        """
        video_id = self._match_id(url)

        webpage, urlh = self._download_webpage_handle(url, video_id)

        # Use the final (post-redirect) URL to decide which subdomain served
        # the page.
        hostname = urllib.parse.urlparse(urlh.geturl()).hostname
        # ecchi is 'sexy' in Japanese
        age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0

        video_data = self._download_json(
            f'http://www.iwara.tv/api/video/{video_id}', video_id)

        if not video_data:
            # No API formats: delegate to whatever player the page embeds
            iframe_url = self._html_search_regex(
                r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1',
                webpage, 'iframe URL', group='url')
            return {
                '_type': 'url_transparent',
                'url': iframe_url,
                'age_limit': age_limit,
            }

        title = remove_end(self._html_extract_title(webpage), ' | Iwara')

        thumbnail = self._html_search_regex(
            r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None)

        uploader = self._html_search_regex(
            r'class="username">([^<]+)', webpage, 'uploader', fatal=False)

        upload_date = unified_strdate(self._html_search_regex(
            r'作成日:([^\s]+)', webpage, 'upload_date', fatal=False))

        description = strip_or_none(self._search_regex(
            r'<p>(.+?(?=</div))', webpage, 'description', fatal=False,
            flags=re.DOTALL))

        formats = self._parse_api_formats(video_data)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'age_limit': age_limit,
            'formats': formats,
            'thumbnail': self._proto_relative_url(thumbnail, 'https:'),
            'uploader': uploader,
            'upload_date': upload_date,
            'description': description,
        }
ff4d7860 131
132
class IwaraPlaylistIE(IwaraBaseIE):
    # Extractor for playlist pages (``/playlist/<id>``).
    _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/playlist/(?P<id>[^/?#&]+)'
    IE_NAME = 'iwara:playlist'

    _TESTS = [{
        'url': 'https://ecchi.iwara.tv/playlist/best-enf',
        'info_dict': {
            'title': 'Best enf',
            'uploader': 'Jared98112',
            'id': 'best-enf',
        },
        'playlist_mincount': 1097,
    }, {
        # urlencoded
        'url': 'https://ecchi.iwara.tv/playlist/%E3%83%97%E3%83%AC%E3%82%A4%E3%83%AA%E3%82%B9%E3%83%88-2',
        'info_dict': {
            'id': 'プレイリスト-2',
            'title': 'プレイリスト',
            'uploader': 'mainyu',
        },
        'playlist_mincount': 91,
    }]

    def _real_extract(self, url):
        """Return a playlist result for the playlist page at ``url``.

        The playlist id is the percent-decoded last path component; title
        and uploader are scraped from the page and are both optional.
        """
        mobj = self._match_valid_url(url)
        base_url = mobj.group('base_url')
        playlist_id = urllib.parse.unquote(mobj.group('id'))

        webpage = self._download_webpage(url, playlist_id)

        playlist_title = self._html_search_regex(
            r'class="title"[^>]*>([^<]+)', webpage, 'title', fatal=False)
        playlist_uploader = self._html_search_regex(
            r'<h2>([^<]+)', webpage, 'uploader', fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': playlist_title,
            'uploader': playlist_uploader,
            'entries': self._extract_playlist(base_url, webpage),
        }
168
169
class IwaraUserIE(IwaraBaseIE):
    # Extractor for user profile pages (``/users/<id>``).
    _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/users/(?P<id>[^/?#&]+)'
    IE_NAME = 'iwara:user'

    _TESTS = [{
        'note': 'number of all videos page is just 1 page. less than 40 videos',
        'url': 'https://ecchi.iwara.tv/users/infinityyukarip',
        'info_dict': {
            'title': 'Uploaded videos from Infinity_YukariP',
            'id': 'infinityyukarip',
            'uploader': 'Infinity_YukariP',
            'uploader_id': 'infinityyukarip',
        },
        'playlist_mincount': 39,
    }, {
        'note': 'no even all videos page. probably less than 10 videos',
        'url': 'https://ecchi.iwara.tv/users/mmd-quintet',
        'info_dict': {
            'title': 'Uploaded videos from mmd quintet',
            'id': 'mmd-quintet',
            'uploader': 'mmd quintet',
            'uploader_id': 'mmd-quintet',
        },
        'playlist_mincount': 6,
    }, {
        'note': 'has paging. more than 40 videos',
        'url': 'https://ecchi.iwara.tv/users/theblackbirdcalls',
        'info_dict': {
            'title': 'Uploaded videos from TheBlackbirdCalls',
            'id': 'theblackbirdcalls',
            'uploader': 'TheBlackbirdCalls',
            'uploader_id': 'theblackbirdcalls',
        },
        'playlist_mincount': 420,
    }, {
        'note': 'foreign chars in URL. there must be foreign characters in URL',
        'url': 'https://ecchi.iwara.tv/users/ぶた丼',
        'info_dict': {
            'title': 'Uploaded videos from ぶた丼',
            'id': 'ぶた丼',
            'uploader': 'ぶた丼',
            'uploader_id': 'ぶた丼',
        },
        'playlist_mincount': 170,
    }]

    def _entries(self, playlist_id, base_url):
        """Yield url_results for the videos linked from a user's pages.

        If the profile page has no link to a ".../videos" listing, only the
        videos found on the profile page itself are yielded. Otherwise the
        listing is fetched page by page until a page no longer contains a
        ``page=<next>`` reference.
        """
        profile_page = self._download_webpage(
            f'{base_url}/users/{playlist_id}', playlist_id)
        listing_path = self._search_regex(
            r'<a href="(/users/[^/]+/videos)(?:\?[^"]+)?">',
            profile_page, 'all videos url', default=None)

        if not listing_path:
            yield from self._extract_playlist(base_url, profile_page)
            return

        listing_url = urljoin(base_url, listing_path)

        for page_num in itertools.count(1):
            # The first page is requested without a query; later pages use a
            # zero-based ``page`` parameter.
            page_query = {'page': str(page_num - 1)} if page_num > 1 else {}
            listing_page = self._download_webpage(
                listing_url, playlist_id,
                note=f'Downloading playlist page {page_num}',
                query=page_query)
            yield from self._extract_playlist(base_url, listing_page)

            # Stop once the page no longer references the next page index
            has_next_page = f'page={page_num}' in listing_page
            if not has_next_page:
                break

    def _real_extract(self, url):
        """Return a playlist result wrapping the user's uploaded videos."""
        mobj = self._match_valid_url(url)
        base_url = mobj.group('base_url')
        playlist_id = urllib.parse.unquote(mobj.group('id'))

        entries = self._entries(playlist_id, base_url)
        return self.playlist_result(entries, playlist_id)