yt_dlp/extractor/zype.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..compat import compat_HTTPError
   5 from ..utils import (
   6     dict_get,
   7     ExtractorError,
   8     int_or_none,
   9     js_to_json,
  10     parse_iso8601,
  11 )
  12
  13
  14 class ZypeIE(InfoExtractor):
  15     _ID_RE = r'[\da-fA-F]+'
  16     _COMMON_RE = r'//player\.zype\.com/embed/%s\.(?:js|json|html)\?.*?(?:access_token|(?:ap[ip]|player)_key)='
  17     _VALID_URL = r'https?:%s[^&]+' % (_COMMON_RE % ('(?P<id>%s)' % _ID_RE))
  18     _TEST = {
  19         'url': 'https://player.zype.com/embed/5b400b834b32992a310622b9.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ&autoplay=false&controls=true&da=false',
  20         'md5': 'eaee31d474c76a955bdaba02a505c595',
  21         'info_dict': {
  22             'id': '5b400b834b32992a310622b9',
  23             'ext': 'mp4',
  24             'title': 'Smoky Barbecue Favorites',
  25             'thumbnail': r're:^https?://.*\.jpe?g',
  26             'description': 'md5:5ff01e76316bd8d46508af26dc86023b',
  27             'timestamp': 1504915200,
  28             'upload_date': '20170909',
  29         },
  30     }
  31
  32     @staticmethod
  33     def _extract_urls(webpage):
  34         return [
  35             mobj.group('url')
  36             for mobj in re.finditer(
  37                 r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?%s.+?)\1' % (ZypeIE._COMMON_RE % ZypeIE._ID_RE),
  38                 webpage)]
  39
  40     def _real_extract(self, url):
  41         video_id = self._match_id(url)
  42
  43         try:
  44             response = self._download_json(re.sub(
  45                 r'\.(?:js|html)\?', '.json?', url), video_id)['response']
  46         except ExtractorError as e:
  47             if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 403):
  48                 raise ExtractorError(self._parse_json(
  49                     e.cause.read().decode(), video_id)['message'], expected=True)
  50             raise
  51
  52         body = response['body']
  53         video = response['video']
  54         title = video['title']
  55
  56         subtitles = {}
  57
  58         if isinstance(body, dict):
  59             formats = []
  60             for output in body.get('outputs', []):
  61                 output_url = output.get('url')
  62                 if not output_url:
  63                     continue
  64                 name = output.get('name')
  65                 if name == 'm3u8':
  66                     formats, subtitles = self._extract_m3u8_formats_and_subtitles(
  67                         output_url, video_id, 'mp4',
  68                         'm3u8_native', m3u8_id='hls', fatal=False)
  69                 else:
  70                     f = {
  71                         'format_id': name,
  72                         'tbr': int_or_none(output.get('bitrate')),
  73                         'url': output_url,
  74                     }
  75                     if name in ('m4a', 'mp3'):
  76                         f['vcodec'] = 'none'
  77                     else:
  78                         f.update({
  79                             'height': int_or_none(output.get('height')),
  80                             'width': int_or_none(output.get('width')),
  81                         })
  82                     formats.append(f)
  83             text_tracks = body.get('subtitles') or []
  84         else:
  85             m3u8_url = self._search_regex(
  86                 r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1',
  87                 body, 'm3u8 url', group='url', default=None)
  88             if not m3u8_url:
  89                 source = self._search_regex(
  90                     r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, 'source')
  91
  92                 def get_attr(key):
  93                     return self._search_regex(
  94                         r'\b%s\s*:\s*([\'"])(?P<val>(?:(?!\1).)+)\1' % key,
  95                         source, key, group='val')
  96
  97                 if get_attr('integration') == 'verizon-media':
  98                     m3u8_url = 'https://content.uplynk.com/%s.m3u8' % get_attr('id')
  99             formats, subtitles = self._extract_m3u8_formats_and_subtitles(
 100                 m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
 101             text_tracks = self._search_regex(
 102                 r'textTracks\s*:\s*(\[[^]]+\])',
 103                 body, 'text tracks', default=None)
 104             if text_tracks:
 105                 text_tracks = self._parse_json(
 106                     text_tracks, video_id, js_to_json, False)
 107         self._sort_formats(formats)
 108
 109         if text_tracks:
 110             for text_track in text_tracks:
 111                 tt_url = dict_get(text_track, ('file', 'src'))
 112                 if not tt_url:
 113                     continue
 114                 subtitles.setdefault(text_track.get('label') or 'English', []).append({
 115                     'url': tt_url,
 116                 })
 117
 118         thumbnails = []
 119         for thumbnail in video.get('thumbnails', []):
 120             thumbnail_url = thumbnail.get('url')
 121             if not thumbnail_url:
 122                 continue
 123             thumbnails.append({
 124                 'url': thumbnail_url,
 125                 'width': int_or_none(thumbnail.get('width')),
 126                 'height': int_or_none(thumbnail.get('height')),
 127             })
 128
 129         return {
 130             'id': video_id,
 131             'display_id': video.get('friendly_title'),
 132             'title': title,
 133             'thumbnails': thumbnails,
 134             'description': dict_get(video, ('description', 'ott_description', 'short_description')),
 135             'timestamp': parse_iso8601(video.get('published_at')),
 136             'duration': int_or_none(video.get('duration')),
 137             'view_count': int_or_none(video.get('request_count')),
 138             'average_rating': int_or_none(video.get('rating')),
 139             'season_number': int_or_none(video.get('season')),
 140             'episode_number': int_or_none(video.get('episode')),
 141             'formats': formats,
 142             'subtitles': subtitles,
 143         }