]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/newgrounds.py
Completely change project name to yt-dlp (#85)
[yt-dlp.git] / yt_dlp / extractor / newgrounds.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7 ExtractorError,
8 extract_attributes,
9 int_or_none,
10 parse_duration,
11 parse_filesize,
12 unified_timestamp,
13 )
14
15
class NewgroundsIE(InfoExtractor):
    """Extract a single Newgrounds audio or portal submission.

    Handles ``/audio/listen/<id>`` and ``/portal/view/<id>`` pages. The media
    URL is taken from the page itself when it is embedded there; otherwise
    the ``/portal/video/<movie id>`` JSON endpoint is queried.
    """

    _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'https://www.newgrounds.com/audio/listen/549479',
        'md5': 'fe6033d297591288fa1c1f780386f07a',
        'info_dict': {
            'id': '549479',
            'ext': 'mp3',
            'title': 'Burn7 - B7 - BusMode',
            'uploader': 'Burn7',
            'timestamp': 1378878540,
            'upload_date': '20130911',
            'duration': 143,
        },
    }, {
        'url': 'https://www.newgrounds.com/portal/view/1',
        'md5': 'fbfb40e2dc765a7e830cb251d370d981',
        'info_dict': {
            'id': '1',
            'ext': 'mp4',
            'title': 'Brian-Beaton - Scrotum 1',
            'uploader': 'Brian-Beaton',
            'timestamp': 955064100,
            'upload_date': '20000406',
        },
    }, {
        # source format unavailable, additional mp4 formats
        'url': 'http://www.newgrounds.com/portal/view/689400',
        'info_dict': {
            'id': '689400',
            'ext': 'mp4',
            'title': 'Bennettthesage - ZTV News Episode 8',
            'uploader': 'BennettTheSage',
            'timestamp': 1487965140,
            'upload_date': '20170224',
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Return the info dict for the submission at *url*.

        Raises ExtractorError when neither an embedded media URL nor the
        portal video JSON can be obtained.
        """
        media_id = self._match_id(url)
        uploader = None
        webpage = self._download_webpage(url, media_id)

        title = self._html_search_regex(
            r'<title>([^>]+)</title>', webpage, 'title')

        # default=None already makes this lookup non-fatal.
        media_url_string = self._search_regex(
            r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None)

        if media_url_string:
            formats = self._extract_page_formats(
                webpage, media_url_string, media_id)
        else:
            json_video = self._fetch_portal_video_json(url, webpage)
            uploader = json_video.get('author')
            # Keep the <title> value when the JSON carries no title, so the
            # title never becomes None.
            title = json_video.get('title') or title
            formats = self._extract_portal_formats(json_video)

        self._check_formats(formats, media_id)
        self._sort_formats(formats)

        if not uploader:
            uploader = self._html_search_regex(
                (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*(?:Author|Artist)\s*</em>',
                 r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader',
                fatal=False)

        timestamp = unified_timestamp(self._html_search_regex(
            (r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)',
             r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+)'), webpage, 'timestamp',
            default=None))
        duration = parse_duration(self._search_regex(
            r'(?s)<dd>\s*Song\s*</dd>\s*<dd>.+?</dd>\s*<dd>([^<]+)', webpage,
            'duration', default=None))

        filesize_approx = parse_filesize(self._html_search_regex(
            r'(?s)<dd>\s*Song\s*</dd>\s*<dd>(.+?)</dd>', webpage, 'filesize',
            default=None))
        # The page lists a single size, so it is only attached when there is
        # exactly one format it can describe.
        if len(formats) == 1:
            formats[0]['filesize_approx'] = filesize_approx

        # Guarded so an empty format list cannot raise IndexError here.
        if formats and '<dd>Song' in webpage:
            formats[0]['vcodec'] = 'none'

        if uploader:
            title = "%s - %s" % (uploader, title)

        return {
            'id': media_id,
            'title': title,
            'uploader': uploader,
            'timestamp': timestamp,
            'duration': duration,
            'formats': formats,
        }

    def _extract_page_formats(self, webpage, media_url_string, media_id):
        """Build the format list from the media URL embedded in *webpage*.

        *media_url_string* is the JSON string literal captured from the page.
        Besides the ``source`` format, one ``<N>p`` mp4 format is added for
        each of 360/720/1080 that does not exceed the page's
        ``max_resolution`` value (when that value is present).
        """
        media_url = self._parse_json(media_url_string, media_id)
        formats = [{
            'url': media_url,
            'format_id': 'source',
            'quality': 1,
        }]

        max_resolution = int_or_none(self._search_regex(
            r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution',
            default=None))
        if max_resolution:
            # Strip the extension; variants are "<base>.<height>p.mp4".
            url_base = media_url.rpartition('.')[0]
            for resolution in (360, 720, 1080):
                if resolution > max_resolution:
                    break
                formats.append({
                    'url': '%s.%dp.mp4' % (url_base, resolution),
                    'format_id': '%dp' % resolution,
                    'height': resolution,
                })
        return formats

    def _fetch_portal_video_json(self, url, webpage):
        """Download the portal video JSON for the movie id found in *webpage*.

        Raises ExtractorError if the movie id is missing from the page or the
        JSON request yields nothing.
        """
        # default=None so that a missing id reaches the explicit error below
        # rather than a regex failure with an empty field name.
        video_id = int_or_none(self._search_regex(
            r'data-movie-id=\\"([0-9]+)\\"', webpage, 'movie id',
            default=None))
        if not video_id:
            raise ExtractorError('Could not extract media data')

        url_video_data = 'https://www.newgrounds.com/portal/video/%s' % video_id
        headers = {
            'Accept': 'application/json',
            'Referer': url,
            'X-Requested-With': 'XMLHttpRequest'
        }
        json_video = self._download_json(
            url_video_data, video_id, headers=headers, fatal=False)
        if not json_video:
            raise ExtractorError('Could not fetch media data')
        return json_video

    @staticmethod
    def _extract_portal_formats(json_video):
        """Turn the ``sources`` mapping of *json_video* into a format list.

        Each key of ``sources`` is used as the format id and, minus its last
        character, as the numeric quality. Entries without a ``src`` and a
        missing or non-mapping ``sources`` value are ignored.
        """
        sources = json_video.get('sources')
        if not isinstance(sources, dict):
            return []

        formats = []
        for media_format, media_sources in sources.items():
            for source in media_sources or []:
                src = source.get('src')
                if not src:
                    # A format without a URL is unusable downstream.
                    continue
                formats.append({
                    'format_id': media_format,
                    'quality': int_or_none(media_format[:-1]),
                    'url': src,
                })
        return formats
155
156
class NewgroundsPlaylistIE(InfoExtractor):
    """Extract Newgrounds collection and search result pages as playlists.

    Every matching submission link on the page becomes a URL entry that is
    delegated to NewgroundsIE.
    """

    _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:collection|[^/]+/search/[^/]+)/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.newgrounds.com/collection/cats',
        'info_dict': {
            'id': 'cats',
            'title': 'Cats',
        },
        'playlist_mincount': 46,
    }, {
        'url': 'http://www.newgrounds.com/portal/search/author/ZONE-SAMA',
        'info_dict': {
            'id': 'ZONE-SAMA',
            'title': 'Portal Search: ZONE-SAMA',
        },
        'playlist_mincount': 47,
    }, {
        'url': 'http://www.newgrounds.com/audio/search/title/cats',
        'only_matching': True,
    }]

    # Anchor classes that mark a link as a submission entry.
    _ITEM_CLASSES = ('item-portalsubmission', 'item-audiosubmission')

    def _real_extract(self, url):
        """Return a playlist result for the collection/search page at *url*."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        # _html_search_regex, as NewgroundsIE uses for its title, so HTML
        # entities in <title> are decoded instead of leaking into the name.
        title = self._html_search_regex(
            r'<title>([^>]+)</title>', webpage, 'title', default=None)

        # cut left menu
        webpage = self._search_regex(
            r'(?s)<div[^>]+\bclass=["\']column wide(.+)',
            webpage, 'wide column', default=webpage)

        entries = []
        for a, path, media_id in re.findall(
                r'(<a[^>]+\bhref=["\']/?((?:portal/view|audio/listen)/(\d+))[^>]+>)',
                webpage):
            # Only anchors carrying one of the submission classes are items.
            if extract_attributes(a).get('class') not in self._ITEM_CLASSES:
                continue
            entries.append(
                self.url_result(
                    'https://www.newgrounds.com/%s' % path,
                    ie=NewgroundsIE.ie_key(), video_id=media_id))

        return self.playlist_result(entries, playlist_id, title)
203 return self.playlist_result(entries, playlist_id, title)