yt_dlp/downloader/hls.py

   1 from __future__ import unicode_literals
   2
   3 import errno
   4 import re
   5 import binascii
   6 try:
   7     from Crypto.Cipher import AES
   8     can_decrypt_frag = True
   9 except ImportError:
  10     can_decrypt_frag = False
  11 try:
  12     import concurrent.futures
  13     can_threaded_download = True
  14 except ImportError:
  15     can_threaded_download = False
  16
  17 from ..downloader import _get_real_downloader
  18 from .fragment import FragmentFD
  19 from .external import FFmpegFD
  20
  21 from ..compat import (
  22     compat_urllib_error,
  23     compat_urlparse,
  24     compat_struct_pack,
  25 )
  26 from ..utils import (
  27     parse_m3u8_attributes,
  28     sanitize_open,
  29     update_url_query,
  30 )
  31
  32
  33 class HlsFD(FragmentFD):
  34     """
  35     Download segments in a m3u8 manifest. External downloaders can take over
  36     the fragment downloads by supporting the 'm3u8_frag_urls' protocol and
  37     re-defining 'supports_manifest' function
  38     """
  39
  40     FD_NAME = 'hlsnative'
  41
  42     @staticmethod
  43     def can_download(manifest, info_dict, allow_unplayable_formats=False, with_crypto=can_decrypt_frag):
  44         UNSUPPORTED_FEATURES = [
  45             # r'#EXT-X-BYTERANGE',  # playlists composed of byte ranges of media files [2]
  46
  47             # Live streams heuristic does not always work (e.g. geo restricted to Germany
  48             # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0)
  49             # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)',  # live streams [3]
  50
  51             # This heuristic also is not correct since segments may not be appended as well.
  52             # Twitch vods of finished streams have EXT-X-PLAYLIST-TYPE:EVENT despite
  53             # no segments will definitely be appended to the end of the playlist.
  54             # r'#EXT-X-PLAYLIST-TYPE:EVENT',  # media segments may be appended to the end of
  55             #                                 # event media playlists [4]
  56             # r'#EXT-X-MAP:',  # media initialization [5]
  57             # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4
  58             # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2
  59             # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2
  60             # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5
  61             # 5. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.5
  62         ]
  63         if not allow_unplayable_formats:
  64             UNSUPPORTED_FEATURES += [
  65                 r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)',  # encrypted streams [1]
  66             ]
  67
  68         def check_results():
  69             yield not info_dict.get('is_live')
  70             is_aes128_enc = '#EXT-X-KEY:METHOD=AES-128' in manifest
  71             yield with_crypto or not is_aes128_enc
  72             yield not (is_aes128_enc and r'#EXT-X-BYTERANGE' in manifest)
  73             for feature in UNSUPPORTED_FEATURES:
  74                 yield not re.search(feature, manifest)
  75         return all(check_results())
  76
  77     def real_download(self, filename, info_dict):
  78         man_url = info_dict['url']
  79         self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
  80
  81         urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
  82         man_url = urlh.geturl()
  83         s = urlh.read().decode('utf-8', 'ignore')
  84
  85         if not self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')):
  86             if info_dict.get('extra_param_to_segment_url') or info_dict.get('_decryption_key_url'):
  87                 self.report_error('pycryptodome not found. Please install')
  88                 return False
  89             if self.can_download(s, info_dict, with_crypto=True):
  90                 self.report_warning('pycryptodome is needed to download this file natively')
  91             fd = FFmpegFD(self.ydl, self.params)
  92             self.report_warning(
  93                 '%s detected unsupported features; extraction will be delegated to %s' % (self.FD_NAME, fd.get_basename()))
  94             # TODO: Make progress updates work without hooking twice
  95             # for ph in self._progress_hooks:
  96             #     fd.add_progress_hook(ph)
  97             return fd.real_download(filename, info_dict)
  98
  99         real_downloader = _get_real_downloader(info_dict, 'm3u8_frag_urls', self.params, None)
 100         if real_downloader and not real_downloader.supports_manifest(s):
 101             real_downloader = None
 102         if real_downloader:
 103             self.to_screen(
 104                 '[%s] Fragment downloads will be delegated to %s' % (self.FD_NAME, real_downloader.get_basename()))
 105
 106         def is_ad_fragment_start(s):
 107             return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s
 108                     or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad'))
 109
 110         def is_ad_fragment_end(s):
 111             return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s
 112                     or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment'))
 113
 114         fragments = []
 115
 116         media_frags = 0
 117         ad_frags = 0
 118         ad_frag_next = False
 119         for line in s.splitlines():
 120             line = line.strip()
 121             if not line:
 122                 continue
 123             if line.startswith('#'):
 124                 if is_ad_fragment_start(line):
 125                     ad_frag_next = True
 126                 elif is_ad_fragment_end(line):
 127                     ad_frag_next = False
 128                 continue
 129             if ad_frag_next:
 130                 ad_frags += 1
 131                 continue
 132             media_frags += 1
 133
 134         ctx = {
 135             'filename': filename,
 136             'total_frags': media_frags,
 137             'ad_frags': ad_frags,
 138         }
 139
 140         if real_downloader:
 141             self._prepare_external_frag_download(ctx)
 142         else:
 143             self._prepare_and_start_frag_download(ctx)
 144
 145         fragment_retries = self.params.get('fragment_retries', 0)
 146         skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
 147         test = self.params.get('test', False)
 148
 149         format_index = info_dict.get('format_index')
 150         extra_query = None
 151         extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
 152         if extra_param_to_segment_url:
 153             extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url)
 154         i = 0
 155         media_sequence = 0
 156         decrypt_info = {'METHOD': 'NONE'}
 157         byte_range = {}
 158         discontinuity_count = 0
 159         frag_index = 0
 160         ad_frag_next = False
 161         for line in s.splitlines():
 162             line = line.strip()
 163             if line:
 164                 if not line.startswith('#'):
 165                     if format_index and discontinuity_count != format_index:
 166                         continue
 167                     if ad_frag_next:
 168                         continue
 169                     frag_index += 1
 170                     if frag_index <= ctx['fragment_index']:
 171                         continue
 172                     frag_url = (
 173                         line
 174                         if re.match(r'^https?://', line)
 175                         else compat_urlparse.urljoin(man_url, line))
 176                     if extra_query:
 177                         frag_url = update_url_query(frag_url, extra_query)
 178
 179                     fragments.append({
 180                         'frag_index': frag_index,
 181                         'url': frag_url,
 182                         'decrypt_info': decrypt_info,
 183                         'byte_range': byte_range,
 184                         'media_sequence': media_sequence,
 185                     })
 186
 187                 elif line.startswith('#EXT-X-MAP'):
 188                     if format_index and discontinuity_count != format_index:
 189                         continue
 190                     if frag_index > 0:
 191                         self.report_error(
 192                             'Initialization fragment found after media fragments, unable to download')
 193                         return False
 194                     frag_index += 1
 195                     map_info = parse_m3u8_attributes(line[11:])
 196                     frag_url = (
 197                         map_info.get('URI')
 198                         if re.match(r'^https?://', map_info.get('URI'))
 199                         else compat_urlparse.urljoin(man_url, map_info.get('URI')))
 200                     if extra_query:
 201                         frag_url = update_url_query(frag_url, extra_query)
 202
 203                     fragments.append({
 204                         'frag_index': frag_index,
 205                         'url': frag_url,
 206                         'decrypt_info': decrypt_info,
 207                         'byte_range': byte_range,
 208                         'media_sequence': media_sequence
 209                     })
 210
 211                     if map_info.get('BYTERANGE'):
 212                         splitted_byte_range = map_info.get('BYTERANGE').split('@')
 213                         sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end']
 214                         byte_range = {
 215                             'start': sub_range_start,
 216                             'end': sub_range_start + int(splitted_byte_range[0]),
 217                         }
 218
 219                 elif line.startswith('#EXT-X-KEY'):
 220                     decrypt_url = decrypt_info.get('URI')
 221                     decrypt_info = parse_m3u8_attributes(line[11:])
 222                     if decrypt_info['METHOD'] == 'AES-128':
 223                         if 'IV' in decrypt_info:
 224                             decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32))
 225                         if not re.match(r'^https?://', decrypt_info['URI']):
 226                             decrypt_info['URI'] = compat_urlparse.urljoin(
 227                                 man_url, decrypt_info['URI'])
 228                         if extra_query:
 229                             decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
 230                         if decrypt_url != decrypt_info['URI']:
 231                             decrypt_info['KEY'] = None
 232
 233                 elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
 234                     media_sequence = int(line[22:])
 235                 elif line.startswith('#EXT-X-BYTERANGE'):
 236                     splitted_byte_range = line[17:].split('@')
 237                     sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end']
 238                     byte_range = {
 239                         'start': sub_range_start,
 240                         'end': sub_range_start + int(splitted_byte_range[0]),
 241                     }
 242                 elif is_ad_fragment_start(line):
 243                     ad_frag_next = True
 244                 elif is_ad_fragment_end(line):
 245                     ad_frag_next = False
 246                 elif line.startswith('#EXT-X-DISCONTINUITY'):
 247                     discontinuity_count += 1
 248                 i += 1
 249                 media_sequence += 1
 250
 251         # We only download the first fragment during the test
 252         if test:
 253             fragments = [fragments[0] if fragments else None]
 254
 255         if real_downloader:
 256             info_copy = info_dict.copy()
 257             info_copy['fragments'] = fragments
 258             fd = real_downloader(self.ydl, self.params)
 259             # TODO: Make progress updates work without hooking twice
 260             # for ph in self._progress_hooks:
 261             #     fd.add_progress_hook(ph)
 262             success = fd.real_download(filename, info_copy)
 263             if not success:
 264                 return False
 265         else:
 266             def download_fragment(fragment):
 267                 frag_index = fragment['frag_index']
 268                 frag_url = fragment['url']
 269                 decrypt_info = fragment['decrypt_info']
 270                 byte_range = fragment['byte_range']
 271                 media_sequence = fragment['media_sequence']
 272
 273                 ctx['fragment_index'] = frag_index
 274
 275                 count = 0
 276                 headers = info_dict.get('http_headers', {})
 277                 if byte_range:
 278                     headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1)
 279                 while count <= fragment_retries:
 280                     try:
 281                         success, frag_content = self._download_fragment(
 282                             ctx, frag_url, info_dict, headers)
 283                         if not success:
 284                             return False, frag_index
 285                         break
 286                     except compat_urllib_error.HTTPError as err:
 287                         # Unavailable (possibly temporary) fragments may be served.
 288                         # First we try to retry then either skip or abort.
 289                         # See https://github.com/ytdl-org/youtube-dl/issues/10165,
 290                         # https://github.com/ytdl-org/youtube-dl/issues/10448).
 291                         count += 1
 292                         if count <= fragment_retries:
 293                             self.report_retry_fragment(err, frag_index, count, fragment_retries)
 294                 if count > fragment_retries:
 295                     self.report_error('Giving up after %s fragment retries' % fragment_retries)
 296                     return False, frag_index
 297
 298                 if decrypt_info['METHOD'] == 'AES-128':
 299                     iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence)
 300                     decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen(
 301                         self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read()
 302                     # Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block
 303                     # size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded,
 304                     # not what it decrypts to.
 305                     if not test:
 306                         frag_content = AES.new(
 307                             decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
 308
 309                 return frag_content, frag_index
 310
 311             def append_fragment(frag_content, frag_index):
 312                 if frag_content:
 313                     fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], frag_index)
 314                     try:
 315                         file, frag_sanitized = sanitize_open(fragment_filename, 'rb')
 316                         ctx['fragment_filename_sanitized'] = frag_sanitized
 317                         file.close()
 318                         self._append_fragment(ctx, frag_content)
 319                         return True
 320                     except EnvironmentError as ose:
 321                         if ose.errno != errno.ENOENT:
 322                             raise
 323                         # FileNotFoundError
 324                         if skip_unavailable_fragments:
 325                             self.report_skip_fragment(frag_index)
 326                             return True
 327                         else:
 328                             self.report_error(
 329                                 'fragment %s not found, unable to continue' % frag_index)
 330                             return False
 331                 else:
 332                     if skip_unavailable_fragments:
 333                         self.report_skip_fragment(frag_index)
 334                         return True
 335                     else:
 336                         self.report_error(
 337                             'fragment %s not found, unable to continue' % frag_index)
 338                         return False
 339
 340             max_workers = self.params.get('concurrent_fragment_downloads', 1)
 341             if can_threaded_download and max_workers > 1:
 342                 self.report_warning('The download speed shown is only of one thread. This is a known issue')
 343                 with concurrent.futures.ThreadPoolExecutor(max_workers) as pool:
 344                     futures = [pool.submit(download_fragment, fragment) for fragment in fragments]
 345                     # timeout must be 0 to return instantly
 346                     done, not_done = concurrent.futures.wait(futures, timeout=0)
 347                     try:
 348                         while not_done:
 349                             # Check every 1 second for KeyboardInterrupt
 350                             freshly_done, not_done = concurrent.futures.wait(not_done, timeout=1)
 351                             done |= freshly_done
 352                     except KeyboardInterrupt:
 353                         for future in not_done:
 354                             future.cancel()
 355                         # timeout must be none to cancel
 356                         concurrent.futures.wait(not_done, timeout=None)
 357                         raise KeyboardInterrupt
 358                 results = [future.result() for future in futures]
 359
 360                 for frag_content, frag_index in results:
 361                     result = append_fragment(frag_content, frag_index)
 362                     if not result:
 363                         return False
 364             else:
 365                 for fragment in fragments:
 366                     frag_content, frag_index = download_fragment(fragment)
 367                     result = append_fragment(frag_content, frag_index)
 368                     if not result:
 369                         return False
 370
 371             self._finish_frag_download(ctx)
 372         return True