yt_dlp/extractor/newgrounds.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     ExtractorError,
   8     extract_attributes,
   9     int_or_none,
  10     parse_duration,
  11     parse_filesize,
  12     unified_timestamp,
  13 )
  14
  15
  16 class NewgroundsIE(InfoExtractor):
  17     _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P<id>[0-9]+)'
  18     _TESTS = [{
  19         'url': 'https://www.newgrounds.com/audio/listen/549479',
  20         'md5': 'fe6033d297591288fa1c1f780386f07a',
  21         'info_dict': {
  22             'id': '549479',
  23             'ext': 'mp3',
  24             'title': 'Burn7 - B7 - BusMode',
  25             'uploader': 'Burn7',
  26             'timestamp': 1378878540,
  27             'upload_date': '20130911',
  28             'duration': 143,
  29         },
  30     }, {
  31         'url': 'https://www.newgrounds.com/portal/view/1',
  32         'md5': 'fbfb40e2dc765a7e830cb251d370d981',
  33         'info_dict': {
  34             'id': '1',
  35             'ext': 'mp4',
  36             'title': 'Brian-Beaton - Scrotum 1',
  37             'uploader': 'Brian-Beaton',
  38             'timestamp': 955064100,
  39             'upload_date': '20000406',
  40         },
  41     }, {
  42         # source format unavailable, additional mp4 formats
  43         'url': 'http://www.newgrounds.com/portal/view/689400',
  44         'info_dict': {
  45             'id': '689400',
  46             'ext': 'mp4',
  47             'title': 'Bennettthesage - ZTV News Episode 8',
  48             'uploader': 'BennettTheSage',
  49             'timestamp': 1487965140,
  50             'upload_date': '20170224',
  51         },
  52         'params': {
  53             'skip_download': True,
  54         },
  55     }]
  56
  57     def _real_extract(self, url):
  58         media_id = self._match_id(url)
  59         formats = []
  60         uploader = None
  61         webpage = self._download_webpage(url, media_id)
  62
  63         title = self._html_search_regex(
  64             r'<title>([^>]+)</title>', webpage, 'title')
  65
  66         media_url_string = self._search_regex(
  67             r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None, fatal=False)
  68
  69         if media_url_string:
  70             media_url = self._parse_json(media_url_string, media_id)
  71             formats = [{
  72                 'url': media_url,
  73                 'format_id': 'source',
  74                 'quality': 1,
  75             }]
  76
  77             max_resolution = int_or_none(self._search_regex(
  78                 r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution',
  79                 default=None))
  80             if max_resolution:
  81                 url_base = media_url.rpartition('.')[0]
  82                 for resolution in (360, 720, 1080):
  83                     if resolution > max_resolution:
  84                         break
  85                     formats.append({
  86                         'url': '%s.%dp.mp4' % (url_base, resolution),
  87                         'format_id': '%dp' % resolution,
  88                         'height': resolution,
  89                     })
  90         else:
  91             video_id = int_or_none(self._search_regex(
  92                 r'data-movie-id=\\"([0-9]+)\\"', webpage, ''))
  93             if not video_id:
  94                 raise ExtractorError('Could not extract media data')
  95
  96             url_video_data = 'https://www.newgrounds.com/portal/video/%s' % video_id
  97             headers = {
  98                 'Accept': 'application/json',
  99                 'Referer': url,
 100                 'X-Requested-With': 'XMLHttpRequest'
 101             }
 102             json_video = self._download_json(url_video_data, video_id, headers=headers, fatal=False)
 103             if not json_video:
 104                 raise ExtractorError('Could not fetch media data')
 105
 106             uploader = json_video.get('author')
 107             title = json_video.get('title')
 108             media_formats = json_video.get('sources', [])
 109             for media_format in media_formats:
 110                 media_sources = media_formats[media_format]
 111                 for source in media_sources:
 112                     formats.append({
 113                         'format_id': media_format,
 114                         'quality': int_or_none(media_format[:-1]),
 115                         'url': source.get('src')
 116                     })
 117
 118         self._check_formats(formats, media_id)
 119         self._sort_formats(formats)
 120
 121         if not uploader:
 122             uploader = self._html_search_regex(
 123                 (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*(?:Author|Artist)\s*</em>',
 124                  r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader',
 125                 fatal=False)
 126
 127         timestamp = unified_timestamp(self._html_search_regex(
 128             (r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)',
 129              r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+)'), webpage, 'timestamp',
 130             default=None))
 131         duration = parse_duration(self._search_regex(
 132             r'(?s)<dd>\s*Song\s*</dd>\s*<dd>.+?</dd>\s*<dd>([^<]+)', webpage,
 133             'duration', default=None))
 134
 135         filesize_approx = parse_filesize(self._html_search_regex(
 136             r'(?s)<dd>\s*Song\s*</dd>\s*<dd>(.+?)</dd>', webpage, 'filesize',
 137             default=None))
 138         if len(formats) == 1:
 139             formats[0]['filesize_approx'] = filesize_approx
 140
 141         if '<dd>Song' in webpage:
 142             formats[0]['vcodec'] = 'none'
 143
 144         if uploader:
 145             title = "%s - %s" % (uploader, title)
 146
 147         return {
 148             'id': media_id,
 149             'title': title,
 150             'uploader': uploader,
 151             'timestamp': timestamp,
 152             'duration': duration,
 153             'formats': formats,
 154         }
 155
 156
 157 class NewgroundsPlaylistIE(InfoExtractor):
 158     _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:collection|[^/]+/search/[^/]+)/(?P<id>[^/?#&]+)'
 159     _TESTS = [{
 160         'url': 'https://www.newgrounds.com/collection/cats',
 161         'info_dict': {
 162             'id': 'cats',
 163             'title': 'Cats',
 164         },
 165         'playlist_mincount': 46,
 166     }, {
 167         'url': 'http://www.newgrounds.com/portal/search/author/ZONE-SAMA',
 168         'info_dict': {
 169             'id': 'ZONE-SAMA',
 170             'title': 'Portal Search: ZONE-SAMA',
 171         },
 172         'playlist_mincount': 47,
 173     }, {
 174         'url': 'http://www.newgrounds.com/audio/search/title/cats',
 175         'only_matching': True,
 176     }]
 177
 178     def _real_extract(self, url):
 179         playlist_id = self._match_id(url)
 180
 181         webpage = self._download_webpage(url, playlist_id)
 182
 183         title = self._search_regex(
 184             r'<title>([^>]+)</title>', webpage, 'title', default=None)
 185
 186         # cut left menu
 187         webpage = self._search_regex(
 188             r'(?s)<div[^>]+\bclass=["\']column wide(.+)',
 189             webpage, 'wide column', default=webpage)
 190
 191         entries = []
 192         for a, path, media_id in re.findall(
 193                 r'(<a[^>]+\bhref=["\']/?((?:portal/view|audio/listen)/(\d+))[^>]+>)',
 194                 webpage):
 195             a_class = extract_attributes(a).get('class')
 196             if a_class not in ('item-portalsubmission', 'item-audiosubmission'):
 197                 continue
 198             entries.append(
 199                 self.url_result(
 200                     'https://www.newgrounds.com/%s' % path,
 201                     ie=NewgroundsIE.ie_key(), video_id=media_id))
 202
 203         return self.playlist_result(entries, playlist_id, title)