yt_dlp/extractor/newgrounds.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import functools
   5 import re
   6
   7 from .common import InfoExtractor
   8 from ..utils import (
   9     extract_attributes,
  10     int_or_none,
  11     parse_count,
  12     parse_duration,
  13     unified_timestamp,
  14     OnDemandPagedList,
  15     try_get,
  16 )
  17
  18
  19 class NewgroundsIE(InfoExtractor):
  20     _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P<id>\d+)(?:/format/flash)?'
  21     _TESTS = [{
  22         'url': 'https://www.newgrounds.com/audio/listen/549479',
  23         'md5': 'fe6033d297591288fa1c1f780386f07a',
  24         'info_dict': {
  25             'id': '549479',
  26             'ext': 'mp3',
  27             'title': 'B7 - BusMode',
  28             'uploader': 'Burn7',
  29             'timestamp': 1378878540,
  30             'upload_date': '20130911',
  31             'duration': 143,
  32             'description': 'md5:6d885138814015dfd656c2ddb00dacfc',
  33         },
  34     }, {
  35         'url': 'https://www.newgrounds.com/portal/view/1',
  36         'md5': 'fbfb40e2dc765a7e830cb251d370d981',
  37         'info_dict': {
  38             'id': '1',
  39             'ext': 'mp4',
  40             'title': 'Scrotum 1',
  41             'uploader': 'Brian-Beaton',
  42             'timestamp': 955064100,
  43             'upload_date': '20000406',
  44             'description': 'Scrotum plays "catch."',
  45         },
  46     }, {
  47         # source format unavailable, additional mp4 formats
  48         'url': 'http://www.newgrounds.com/portal/view/689400',
  49         'info_dict': {
  50             'id': '689400',
  51             'ext': 'mp4',
  52             'title': 'ZTV News Episode 8',
  53             'uploader': 'ZONE-SAMA',
  54             'timestamp': 1487965140,
  55             'upload_date': '20170224',
  56             'description': 'ZTV News Episode 8 (February 2017)',
  57         },
  58         'params': {
  59             'skip_download': True,
  60         },
  61     }, {
  62         'url': 'https://www.newgrounds.com/portal/view/297383',
  63         'md5': '2c11f5fd8cb6b433a63c89ba3141436c',
  64         'info_dict': {
  65             'id': '297383',
  66             'ext': 'mp4',
  67             'title': 'Metal Gear Awesome',
  68             'uploader': 'Egoraptor',
  69             'timestamp': 1140663240,
  70             'upload_date': '20060223',
  71             'description': 'Metal Gear is awesome is so is this movie.',
  72         }
  73     }, {
  74         'url': 'https://www.newgrounds.com/portal/view/297383/format/flash',
  75         'md5': '5d05585a9a0caca059f5abfbd3865524',
  76         'info_dict': {
  77             'id': '297383',
  78             'ext': 'swf',
  79             'title': 'Metal Gear Awesome',
  80             'description': 'Metal Gear is awesome is so is this movie.',
  81             'uploader': 'Egoraptor',
  82             'upload_date': '20060223',
  83             'timestamp': 1140663240,
  84         }
  85     }]
  86
  87     def _real_extract(self, url):
  88         media_id = self._match_id(url)
  89         formats = []
  90         uploader = None
  91         webpage = self._download_webpage(url, media_id)
  92
  93         title = self._html_search_regex(
  94             r'<title>(.+?)</title>', webpage, 'title')
  95
  96         media_url_string = self._search_regex(
  97             r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None)
  98
  99         if media_url_string:
 100             media_url = self._parse_json(media_url_string, media_id)
 101             formats = [{
 102                 'url': media_url,
 103                 'format_id': 'source',
 104                 'quality': 1,
 105             }]
 106         else:
 107             json_video = self._download_json('https://www.newgrounds.com/portal/video/' + media_id, media_id, headers={
 108                 'Accept': 'application/json',
 109                 'Referer': url,
 110                 'X-Requested-With': 'XMLHttpRequest'
 111             })
 112
 113             uploader = json_video.get('author')
 114             media_formats = json_video.get('sources', [])
 115             for media_format in media_formats:
 116                 media_sources = media_formats[media_format]
 117                 for source in media_sources:
 118                     formats.append({
 119                         'format_id': media_format,
 120                         'quality': int_or_none(media_format[:-1]),
 121                         'url': source.get('src')
 122                     })
 123
 124         if not uploader:
 125             uploader = self._html_search_regex(
 126                 (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*(?:Author|Artist)\s*</em>',
 127                  r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader',
 128                 fatal=False)
 129
 130         timestamp = unified_timestamp(self._html_search_regex(
 131             (r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)',
 132              r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+)'), webpage, 'timestamp',
 133             default=None))
 134         duration = parse_duration(self._html_search_regex(
 135             r'"duration"\s*:\s*["\']?([\d]+)["\']?,', webpage,
 136             'duration', default=None))
 137
 138         view_count = parse_count(self._html_search_regex(
 139             r'(?s)<dt>\s*(?:Views|Listens)\s*</dt>\s*<dd>([\d\.,]+)</dd>', webpage,
 140             'view count', default=None))
 141
 142         filesize = int_or_none(self._html_search_regex(
 143             r'"filesize"\s*:\s*["\']?([\d]+)["\']?,', webpage, 'filesize',
 144             default=None))
 145
 146         video_type_description = self._html_search_regex(
 147             r'"description"\s*:\s*["\']?([^"\']+)["\']?,', webpage, 'filesize',
 148             default=None)
 149
 150         if len(formats) == 1:
 151             formats[0]['filesize'] = filesize
 152
 153         if video_type_description == 'Audio File':
 154             formats[0]['vcodec'] = 'none'
 155         self._check_formats(formats, media_id)
 156         self._sort_formats(formats)
 157
 158         return {
 159             'id': media_id,
 160             'title': title,
 161             'uploader': uploader,
 162             'timestamp': timestamp,
 163             'duration': duration,
 164             'formats': formats,
 165             'thumbnail': self._og_search_thumbnail(webpage),
 166             'description': self._og_search_description(webpage),
 167             'view_count': view_count,
 168         }
 169
 170
 171 class NewgroundsPlaylistIE(InfoExtractor):
 172     IE_NAME = 'Newgrounds:playlist'
 173     _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:collection|[^/]+/search/[^/]+)/(?P<id>[^/?#&]+)'
 174     _TESTS = [{
 175         'url': 'https://www.newgrounds.com/collection/cats',
 176         'info_dict': {
 177             'id': 'cats',
 178             'title': 'Cats',
 179         },
 180         'playlist_mincount': 45,
 181     }, {
 182         'url': 'https://www.newgrounds.com/collection/dogs',
 183         'info_dict': {
 184             'id': 'dogs',
 185             'title': 'Dogs',
 186         },
 187         'playlist_mincount': 26,
 188     }, {
 189         'url': 'http://www.newgrounds.com/audio/search/title/cats',
 190         'only_matching': True,
 191     }]
 192
 193     def _real_extract(self, url):
 194         playlist_id = self._match_id(url)
 195
 196         webpage = self._download_webpage(url, playlist_id)
 197
 198         title = self._search_regex(
 199             r'<title>([^>]+)</title>', webpage, 'title', default=None)
 200
 201         # cut left menu
 202         webpage = self._search_regex(
 203             r'(?s)<div[^>]+\bclass=["\']column wide(.+)',
 204             webpage, 'wide column', default=webpage)
 205
 206         entries = []
 207         for a, path, media_id in re.findall(
 208                 r'(<a[^>]+\bhref=["\'][^"\']+((?:portal/view|audio/listen)/(\d+))[^>]+>)',
 209                 webpage):
 210             a_class = extract_attributes(a).get('class')
 211             if a_class not in ('item-portalsubmission', 'item-audiosubmission'):
 212                 continue
 213             entries.append(
 214                 self.url_result(
 215                     f'https://www.newgrounds.com/{path}',
 216                     ie=NewgroundsIE.ie_key(), video_id=media_id))
 217
 218         return self.playlist_result(entries, playlist_id, title)
 219
 220
 221 class NewgroundsUserIE(InfoExtractor):
 222     IE_NAME = 'Newgrounds:user'
 223     _VALID_URL = r'https?://(?P<id>[^\.]+)\.newgrounds\.com/(?:movies|audio)/?(?:[#?]|$)'
 224     _TESTS = [{
 225         'url': 'https://burn7.newgrounds.com/audio',
 226         'info_dict': {
 227             'id': 'burn7',
 228         },
 229         'playlist_mincount': 150,
 230     }, {
 231         'url': 'https://burn7.newgrounds.com/movies',
 232         'info_dict': {
 233             'id': 'burn7',
 234         },
 235         'playlist_mincount': 2,
 236     }, {
 237         'url': 'https://brian-beaton.newgrounds.com/movies',
 238         'info_dict': {
 239             'id': 'brian-beaton',
 240         },
 241         'playlist_mincount': 10,
 242     }]
 243     _PAGE_SIZE = 30
 244
 245     def _fetch_page(self, channel_id, url, page):
 246         page += 1
 247         posts_info = self._download_json(
 248             f'{url}/page/{page}', channel_id,
 249             note=f'Downloading page {page}', headers={
 250                 'Accept': 'application/json, text/javascript, */*; q = 0.01',
 251                 'X-Requested-With': 'XMLHttpRequest',
 252             })
 253         sequence = posts_info.get('sequence', [])
 254         for year in sequence:
 255             posts = try_get(posts_info, lambda x: x['years'][str(year)]['items'])
 256             for post in posts:
 257                 path, media_id = self._search_regex(
 258                     r'<a[^>]+\bhref=["\'][^"\']+((?:portal/view|audio/listen)/(\d+))[^>]+>',
 259                     post, 'url', group=(1, 2))
 260                 yield self.url_result(f'https://www.newgrounds.com/{path}', NewgroundsIE.ie_key(), media_id)
 261
 262     def _real_extract(self, url):
 263         channel_id = self._match_id(url)
 264
 265         entries = OnDemandPagedList(functools.partial(
 266             self._fetch_page, channel_id, url), self._PAGE_SIZE)
 267
 268         return self.playlist_result(entries, channel_id)