yt_dlp/extractor/appletrailers.py

   1 import json
   2 import re
   3 import urllib.parse
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     int_or_none,
   8     parse_duration,
   9     unified_strdate,
  10 )
  11
  12
  13 class AppleTrailersIE(InfoExtractor):
  14     IE_NAME = 'appletrailers'
  15     _VALID_URL = r'https?://(?:www\.|movie)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
  16     _TESTS = [{
  17         'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
  18         'info_dict': {
  19             'id': '5111',
  20             'title': 'Man of Steel',
  21         },
  22         'playlist': [
  23             {
  24                 'md5': 'd97a8e575432dbcb81b7c3acb741f8a8',
  25                 'info_dict': {
  26                     'id': 'manofsteel-trailer4',
  27                     'ext': 'mov',
  28                     'duration': 111,
  29                     'title': 'Trailer 4',
  30                     'upload_date': '20130523',
  31                     'uploader_id': 'wb',
  32                 },
  33             },
  34             {
  35                 'md5': 'b8017b7131b721fb4e8d6f49e1df908c',
  36                 'info_dict': {
  37                     'id': 'manofsteel-trailer3',
  38                     'ext': 'mov',
  39                     'duration': 182,
  40                     'title': 'Trailer 3',
  41                     'upload_date': '20130417',
  42                     'uploader_id': 'wb',
  43                 },
  44             },
  45             {
  46                 'md5': 'd0f1e1150989b9924679b441f3404d48',
  47                 'info_dict': {
  48                     'id': 'manofsteel-trailer',
  49                     'ext': 'mov',
  50                     'duration': 148,
  51                     'title': 'Trailer',
  52                     'upload_date': '20121212',
  53                     'uploader_id': 'wb',
  54                 },
  55             },
  56             {
  57                 'md5': '5fe08795b943eb2e757fa95cb6def1cb',
  58                 'info_dict': {
  59                     'id': 'manofsteel-teaser',
  60                     'ext': 'mov',
  61                     'duration': 93,
  62                     'title': 'Teaser',
  63                     'upload_date': '20120721',
  64                     'uploader_id': 'wb',
  65                 },
  66             },
  67         ],
  68     }, {
  69         'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/',
  70         'info_dict': {
  71             'id': '4489',
  72             'title': 'Blackthorn',
  73         },
  74         'playlist_mincount': 2,
  75         'expected_warnings': ['Unable to download JSON metadata'],
  76     }, {
  77         # json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json
  78         'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/',
  79         'info_dict': {
  80             'id': '15881',
  81             'title': 'Kung Fu Panda 3',
  82         },
  83         'playlist_mincount': 4,
  84     }, {
  85         'url': 'http://trailers.apple.com/ca/metropole/autrui/',
  86         'only_matching': True,
  87     }, {
  88         'url': 'http://movietrailers.apple.com/trailers/focus_features/kuboandthetwostrings/',
  89         'only_matching': True,
  90     }]
  91
  92     _JSON_RE = r'iTunes.playURL\((.*?)\);'
  93
  94     def _real_extract(self, url):
  95         mobj = self._match_valid_url(url)
  96         movie = mobj.group('movie')
  97         uploader_id = mobj.group('company')
  98
  99         webpage = self._download_webpage(url, movie)
 100         film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id')
 101         film_data = self._download_json(
 102             f'http://trailers.apple.com/trailers/feeds/data/{film_id}.json',
 103             film_id, fatal=False)
 104
 105         if film_data:
 106             entries = []
 107             for clip in film_data.get('clips', []):
 108                 clip_title = clip['title']
 109
 110                 formats = []
 111                 for version, version_data in clip.get('versions', {}).items():
 112                     for size, size_data in version_data.get('sizes', {}).items():
 113                         src = size_data.get('src')
 114                         if not src:
 115                             continue
 116                         formats.append({
 117                             'format_id': f'{version}-{size}',
 118                             'url': re.sub(r'_(\d+p\.mov)', r'_h\1', src),
 119                             'width': int_or_none(size_data.get('width')),
 120                             'height': int_or_none(size_data.get('height')),
 121                             'language': version[:2],
 122                         })
 123
 124                 entries.append({
 125                     'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(),
 126                     'formats': formats,
 127                     'title': clip_title,
 128                     'thumbnail': clip.get('screen') or clip.get('thumb'),
 129                     'duration': parse_duration(clip.get('runtime') or clip.get('faded')),
 130                     'upload_date': unified_strdate(clip.get('posted')),
 131                     'uploader_id': uploader_id,
 132                 })
 133
 134             page_data = film_data.get('page', {})
 135             return self.playlist_result(entries, film_id, page_data.get('movie_title'))
 136
 137         playlist_url = urllib.parse.urljoin(url, 'includes/playlists/itunes.inc')
 138
 139         def fix_html(s):
 140             s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s)
 141             s = re.sub(r'<img ([^<]*?)/?>', r'<img \1/>', s)
 142             # The ' in the onClick attributes are not escaped, it couldn't be parsed
 143             # like: http://trailers.apple.com/trailers/wb/gravity/
 144
 145             def _clean_json(m):
 146                 return 'iTunes.playURL({});'.format(m.group(1).replace('\'', '&#39;'))
 147             s = re.sub(self._JSON_RE, _clean_json, s)
 148             return f'<html>{s}</html>'
 149         doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
 150
 151         playlist = []
 152         for li in doc.findall('./div/ul/li'):
 153             on_click = li.find('.//a').attrib['onClick']
 154             trailer_info_json = self._search_regex(self._JSON_RE,
 155                                                    on_click, 'trailer info')
 156             trailer_info = json.loads(trailer_info_json)
 157             first_url = trailer_info.get('url')
 158             if not first_url:
 159                 continue
 160             title = trailer_info['title']
 161             video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
 162             thumbnail = li.find('.//img').attrib['src']
 163             upload_date = trailer_info['posted'].replace('-', '')
 164
 165             runtime = trailer_info['runtime']
 166             m = re.search(r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime)
 167             duration = None
 168             if m:
 169                 duration = 60 * int(m.group('minutes')) + int(m.group('seconds'))
 170
 171             trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
 172             settings_json_url = urllib.parse.urljoin(url, f'includes/settings/{trailer_id}.json')
 173             settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json')
 174
 175             formats = []
 176             for fmt in settings['metadata']['sizes']:
 177                 # The src is a file pointing to the real video file
 178                 format_url = re.sub(r'_(\d*p\.mov)', r'_h\1', fmt['src'])
 179                 formats.append({
 180                     'url': format_url,
 181                     'format': fmt['type'],
 182                     'width': int_or_none(fmt['width']),
 183                     'height': int_or_none(fmt['height']),
 184                 })
 185
 186             playlist.append({
 187                 '_type': 'video',
 188                 'id': video_id,
 189                 'formats': formats,
 190                 'title': title,
 191                 'duration': duration,
 192                 'thumbnail': thumbnail,
 193                 'upload_date': upload_date,
 194                 'uploader_id': uploader_id,
 195                 'http_headers': {
 196                     'User-Agent': 'QuickTime compatible (yt-dlp)',
 197                 },
 198             })
 199
 200         return {
 201             '_type': 'playlist',
 202             'id': movie,
 203             'entries': playlist,
 204         }
 205
 206
 207 class AppleTrailersSectionIE(InfoExtractor):
 208     IE_NAME = 'appletrailers:section'
 209     _SECTIONS = {
 210         'justadded': {
 211             'feed_path': 'just_added',
 212             'title': 'Just Added',
 213         },
 214         'exclusive': {
 215             'feed_path': 'exclusive',
 216             'title': 'Exclusive',
 217         },
 218         'justhd': {
 219             'feed_path': 'just_hd',
 220             'title': 'Just HD',
 221         },
 222         'mostpopular': {
 223             'feed_path': 'most_pop',
 224             'title': 'Most Popular',
 225         },
 226         'moviestudios': {
 227             'feed_path': 'studios',
 228             'title': 'Movie Studios',
 229         },
 230     }
 231     _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P<id>{})'.format('|'.join(_SECTIONS))
 232     _TESTS = [{
 233         'url': 'http://trailers.apple.com/#section=justadded',
 234         'info_dict': {
 235             'title': 'Just Added',
 236             'id': 'justadded',
 237         },
 238         'playlist_mincount': 80,
 239     }, {
 240         'url': 'http://trailers.apple.com/#section=exclusive',
 241         'info_dict': {
 242             'title': 'Exclusive',
 243             'id': 'exclusive',
 244         },
 245         'playlist_mincount': 80,
 246     }, {
 247         'url': 'http://trailers.apple.com/#section=justhd',
 248         'info_dict': {
 249             'title': 'Just HD',
 250             'id': 'justhd',
 251         },
 252         'playlist_mincount': 80,
 253     }, {
 254         'url': 'http://trailers.apple.com/#section=mostpopular',
 255         'info_dict': {
 256             'title': 'Most Popular',
 257             'id': 'mostpopular',
 258         },
 259         'playlist_mincount': 30,
 260     }, {
 261         'url': 'http://trailers.apple.com/#section=moviestudios',
 262         'info_dict': {
 263             'title': 'Movie Studios',
 264             'id': 'moviestudios',
 265         },
 266         'playlist_mincount': 80,
 267     }]
 268
 269     def _real_extract(self, url):
 270         section = self._match_id(url)
 271         section_data = self._download_json(
 272             'http://trailers.apple.com/trailers/home/feeds/{}.json'.format(self._SECTIONS[section]['feed_path']),
 273             section)
 274         entries = [
 275             self.url_result('http://trailers.apple.com' + e['location'])
 276             for e in section_data]
 277         return self.playlist_result(entries, section, self._SECTIONS[section]['title'])