yt_dlp/downloader/hls.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4 import io
   5 import binascii
   6
   7 from ..downloader import get_suitable_downloader
   8 from .fragment import FragmentFD, can_decrypt_frag
   9 from .external import FFmpegFD
  10
  11 from ..compat import (
  12     compat_urlparse,
  13 )
  14 from ..utils import (
  15     parse_m3u8_attributes,
  16     update_url_query,
  17     bug_reports_message,
  18 )
  19 from .. import webvtt
  20
  21
  22 class HlsFD(FragmentFD):
  23     """
  24     Download segments in a m3u8 manifest. External downloaders can take over
  25     the fragment downloads by supporting the 'm3u8_frag_urls' protocol and
  26     re-defining 'supports_manifest' function
  27     """
  28
  29     FD_NAME = 'hlsnative'
  30
  31     @staticmethod
  32     def can_download(manifest, info_dict, allow_unplayable_formats=False, with_crypto=can_decrypt_frag):
  33         UNSUPPORTED_FEATURES = [
  34             # r'#EXT-X-BYTERANGE',  # playlists composed of byte ranges of media files [2]
  35
  36             # Live streams heuristic does not always work (e.g. geo restricted to Germany
  37             # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0)
  38             # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)',  # live streams [3]
  39
  40             # This heuristic also is not correct since segments may not be appended as well.
  41             # Twitch vods of finished streams have EXT-X-PLAYLIST-TYPE:EVENT despite
  42             # no segments will definitely be appended to the end of the playlist.
  43             # r'#EXT-X-PLAYLIST-TYPE:EVENT',  # media segments may be appended to the end of
  44             #                                 # event media playlists [4]
  45             # r'#EXT-X-MAP:',  # media initialization [5]
  46             # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4
  47             # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2
  48             # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2
  49             # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5
  50             # 5. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.5
  51         ]
  52         if not allow_unplayable_formats:
  53             UNSUPPORTED_FEATURES += [
  54                 r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)',  # encrypted streams [1]
  55             ]
  56
  57         def check_results():
  58             yield not info_dict.get('is_live')
  59             is_aes128_enc = '#EXT-X-KEY:METHOD=AES-128' in manifest
  60             yield with_crypto or not is_aes128_enc
  61             yield not (is_aes128_enc and r'#EXT-X-BYTERANGE' in manifest)
  62             for feature in UNSUPPORTED_FEATURES:
  63                 yield not re.search(feature, manifest)
  64         return all(check_results())
  65
  66     def real_download(self, filename, info_dict):
  67         man_url = info_dict['url']
  68         self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
  69
  70         urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
  71         man_url = urlh.geturl()
  72         s = urlh.read().decode('utf-8', 'ignore')
  73
  74         if not self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')):
  75             if info_dict.get('extra_param_to_segment_url') or info_dict.get('_decryption_key_url'):
  76                 self.report_error('pycryptodome not found. Please install')
  77                 return False
  78             if self.can_download(s, info_dict, with_crypto=True):
  79                 self.report_warning('pycryptodome is needed to download this file natively')
  80             fd = FFmpegFD(self.ydl, self.params)
  81             self.report_warning(
  82                 '%s detected unsupported features; extraction will be delegated to %s' % (self.FD_NAME, fd.get_basename()))
  83             return fd.real_download(filename, info_dict)
  84
  85         is_webvtt = info_dict['ext'] == 'vtt'
  86         if is_webvtt:
  87             real_downloader = None  # Packing the fragments is not currently supported for external downloader
  88         else:
  89             real_downloader = get_suitable_downloader(
  90                 info_dict, self.params, None, protocol='m3u8_frag_urls', to_stdout=(filename== '-'))
  91         if real_downloader and not real_downloader.supports_manifest(s):
  92             real_downloader = None
  93         if real_downloader:
  94             self.to_screen(
  95                 '[%s] Fragment downloads will be delegated to %s' % (self.FD_NAME, real_downloader.get_basename()))
  96
  97         def is_ad_fragment_start(s):
  98             return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s
  99                     or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad'))
 100
 101         def is_ad_fragment_end(s):
 102             return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s
 103                     or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment'))
 104
 105         fragments = []
 106
 107         media_frags = 0
 108         ad_frags = 0
 109         ad_frag_next = False
 110         for line in s.splitlines():
 111             line = line.strip()
 112             if not line:
 113                 continue
 114             if line.startswith('#'):
 115                 if is_ad_fragment_start(line):
 116                     ad_frag_next = True
 117                 elif is_ad_fragment_end(line):
 118                     ad_frag_next = False
 119                 continue
 120             if ad_frag_next:
 121                 ad_frags += 1
 122                 continue
 123             media_frags += 1
 124
 125         ctx = {
 126             'filename': filename,
 127             'total_frags': media_frags,
 128             'ad_frags': ad_frags,
 129         }
 130
 131         if real_downloader:
 132             self._prepare_external_frag_download(ctx)
 133         else:
 134             self._prepare_and_start_frag_download(ctx, info_dict)
 135
 136         extra_state = ctx.setdefault('extra_state', {})
 137
 138         format_index = info_dict.get('format_index')
 139         extra_query = None
 140         extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
 141         if extra_param_to_segment_url:
 142             extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url)
 143         i = 0
 144         media_sequence = 0
 145         decrypt_info = {'METHOD': 'NONE'}
 146         byte_range = {}
 147         discontinuity_count = 0
 148         frag_index = 0
 149         ad_frag_next = False
 150         for line in s.splitlines():
 151             line = line.strip()
 152             if line:
 153                 if not line.startswith('#'):
 154                     if format_index and discontinuity_count != format_index:
 155                         continue
 156                     if ad_frag_next:
 157                         continue
 158                     frag_index += 1
 159                     if frag_index <= ctx['fragment_index']:
 160                         continue
 161                     frag_url = (
 162                         line
 163                         if re.match(r'^https?://', line)
 164                         else compat_urlparse.urljoin(man_url, line))
 165                     if extra_query:
 166                         frag_url = update_url_query(frag_url, extra_query)
 167
 168                     fragments.append({
 169                         'frag_index': frag_index,
 170                         'url': frag_url,
 171                         'decrypt_info': decrypt_info,
 172                         'byte_range': byte_range,
 173                         'media_sequence': media_sequence,
 174                     })
 175
 176                 elif line.startswith('#EXT-X-MAP'):
 177                     if format_index and discontinuity_count != format_index:
 178                         continue
 179                     if frag_index > 0:
 180                         self.report_error(
 181                             'Initialization fragment found after media fragments, unable to download')
 182                         return False
 183                     frag_index += 1
 184                     map_info = parse_m3u8_attributes(line[11:])
 185                     frag_url = (
 186                         map_info.get('URI')
 187                         if re.match(r'^https?://', map_info.get('URI'))
 188                         else compat_urlparse.urljoin(man_url, map_info.get('URI')))
 189                     if extra_query:
 190                         frag_url = update_url_query(frag_url, extra_query)
 191
 192                     fragments.append({
 193                         'frag_index': frag_index,
 194                         'url': frag_url,
 195                         'decrypt_info': decrypt_info,
 196                         'byte_range': byte_range,
 197                         'media_sequence': media_sequence
 198                     })
 199
 200                     if map_info.get('BYTERANGE'):
 201                         splitted_byte_range = map_info.get('BYTERANGE').split('@')
 202                         sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end']
 203                         byte_range = {
 204                             'start': sub_range_start,
 205                             'end': sub_range_start + int(splitted_byte_range[0]),
 206                         }
 207
 208                 elif line.startswith('#EXT-X-KEY'):
 209                     decrypt_url = decrypt_info.get('URI')
 210                     decrypt_info = parse_m3u8_attributes(line[11:])
 211                     if decrypt_info['METHOD'] == 'AES-128':
 212                         if 'IV' in decrypt_info:
 213                             decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32))
 214                         if not re.match(r'^https?://', decrypt_info['URI']):
 215                             decrypt_info['URI'] = compat_urlparse.urljoin(
 216                                 man_url, decrypt_info['URI'])
 217                         if extra_query:
 218                             decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
 219                         if decrypt_url != decrypt_info['URI']:
 220                             decrypt_info['KEY'] = None
 221
 222                 elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
 223                     media_sequence = int(line[22:])
 224                 elif line.startswith('#EXT-X-BYTERANGE'):
 225                     splitted_byte_range = line[17:].split('@')
 226                     sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end']
 227                     byte_range = {
 228                         'start': sub_range_start,
 229                         'end': sub_range_start + int(splitted_byte_range[0]),
 230                     }
 231                 elif is_ad_fragment_start(line):
 232                     ad_frag_next = True
 233                 elif is_ad_fragment_end(line):
 234                     ad_frag_next = False
 235                 elif line.startswith('#EXT-X-DISCONTINUITY'):
 236                     discontinuity_count += 1
 237                 i += 1
 238                 media_sequence += 1
 239
 240         # We only download the first fragment during the test
 241         if self.params.get('test', False):
 242             fragments = [fragments[0] if fragments else None]
 243
 244         if real_downloader:
 245             info_copy = info_dict.copy()
 246             info_copy['fragments'] = fragments
 247             fd = real_downloader(self.ydl, self.params)
 248             # TODO: Make progress updates work without hooking twice
 249             # for ph in self._progress_hooks:
 250             #     fd.add_progress_hook(ph)
 251             return fd.real_download(filename, info_copy)
 252
 253         if is_webvtt:
 254             def pack_fragment(frag_content, frag_index):
 255                 output = io.StringIO()
 256                 adjust = 0
 257                 for block in webvtt.parse_fragment(frag_content):
 258                     if isinstance(block, webvtt.CueBlock):
 259                         block.start += adjust
 260                         block.end += adjust
 261
 262                         dedup_window = extra_state.setdefault('webvtt_dedup_window', [])
 263                         cue = block.as_json
 264
 265                         # skip the cue if an identical one appears
 266                         # in the window of potential duplicates
 267                         # and prune the window of unviable candidates
 268                         i = 0
 269                         skip = True
 270                         while i < len(dedup_window):
 271                             window_cue = dedup_window[i]
 272                             if window_cue == cue:
 273                                 break
 274                             if window_cue['end'] >= cue['start']:
 275                                 i += 1
 276                                 continue
 277                             del dedup_window[i]
 278                         else:
 279                             skip = False
 280
 281                         if skip:
 282                             continue
 283
 284                         # add the cue to the window
 285                         dedup_window.append(cue)
 286                     elif isinstance(block, webvtt.Magic):
 287                         # take care of MPEG PES timestamp overflow
 288                         if block.mpegts is None:
 289                             block.mpegts = 0
 290                         extra_state.setdefault('webvtt_mpegts_adjust', 0)
 291                         block.mpegts += extra_state['webvtt_mpegts_adjust'] << 33
 292                         if block.mpegts < extra_state.get('webvtt_mpegts_last', 0):
 293                             extra_state['webvtt_mpegts_adjust'] += 1
 294                             block.mpegts += 1 << 33
 295                         extra_state['webvtt_mpegts_last'] = block.mpegts
 296
 297                         if frag_index == 1:
 298                             extra_state['webvtt_mpegts'] = block.mpegts or 0
 299                             extra_state['webvtt_local'] = block.local or 0
 300                             # XXX: block.local = block.mpegts = None ?
 301                         else:
 302                             if block.mpegts is not None and block.local is not None:
 303                                 adjust = (
 304                                     (block.mpegts - extra_state.get('webvtt_mpegts', 0))
 305                                     - (block.local - extra_state.get('webvtt_local', 0))
 306                                 )
 307                             continue
 308                     elif isinstance(block, webvtt.HeaderBlock):
 309                         if frag_index != 1:
 310                             # XXX: this should probably be silent as well
 311                             # or verify that all segments contain the same data
 312                             self.report_warning(bug_reports_message(
 313                                 'Discarding a %s block found in the middle of the stream; '
 314                                 'if the subtitles display incorrectly,'
 315                                 % (type(block).__name__)))
 316                             continue
 317                     block.write_into(output)
 318
 319                 return output.getvalue().encode('utf-8')
 320         else:
 321             pack_fragment = None
 322         return self.download_and_append_fragments(ctx, fragments, info_dict, pack_fragment)