Option `--compat-options` to revert some of yt-dlp's changes

[yt-dlp.git] / yt_dlp / extractor / common.py
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py

index 0c56a901503f5767757346fe4290d2882caf8b67..0112585af39d310c158f9c73596bdf26756c0573 100644 (file)
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -9,8 +9,6 @@
  import os
  import random
  import re
-import socket
-import ssl
  import sys
  import time
  import math
@@ -58,6 +56,7 @@
      js_to_json,
      JSON_LD_RE,
      mimetype2ext,
+    network_exceptions,
      orderedSet,
      parse_bitrate,
      parse_codecs,
@@ -157,7 +156,7 @@ class InfoExtractor(object):
                      * player_url SWF Player URL (used for rtmpdump).
                      * protocol   The protocol that will be used for the actual
                                   download, lower-case.
-                                 "http", "https", "rtsp", "rtmp", "rtmpe",
+                                 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
                                   "m3u8", "m3u8_native" or "http_dash_segments".
                      * fragment_base_url
                                   Base URL for fragments. Each fragment's path
@@ -558,6 +557,10 @@ def extract(self, url):
                      ie_result = self._real_extract(url)
                      if self._x_forwarded_for_ip:
                          ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
+                    subtitles = ie_result.get('subtitles')
+                    if (subtitles and 'live_chat' in subtitles
+                            and 'no-live-chat' in self._downloader.params.get('compat_opts', [])):
+                        del subtitles['live_chat']  # hidden when the 'no-live-chat' compat option is set
                      return ie_result
                  except GeoRestrictedError as e:
                      if self.__maybe_fake_ip_and_retry(e.countries):
@@ -659,12 +662,9 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa
                  url_or_request = update_url_query(url_or_request, query)
              if data is not None or headers:
                  url_or_request = sanitized_Request(url_or_request, data, headers)
-        exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
-        if hasattr(ssl, 'CertificateError'):
-            exceptions.append(ssl.CertificateError)
          try:
              return self._downloader.urlopen(url_or_request)
-        except tuple(exceptions) as err:
+        except network_exceptions as err:
              if isinstance(err, compat_urllib_error.HTTPError):
                  if self.__can_accept_status_code(err, expected_status):
                      # Retain reference to error to prevent file object from
@@ -1419,7 +1419,10 @@ class FormatSort:
  
          default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
                     'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
-                   'proto', 'ext', 'has_audio', 'source', 'format_id')  # These must not be aliases
+                   'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
+        ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
+                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
+                        'fps', 'fs_approx', 'source', 'format_id')
  
          settings = {
              'vcodec': {'type': 'ordered', 'regex': True,
@@ -2035,6 +2038,12 @@ def extract_media(x_media_line):
                      'url': url,
                      'ext': determine_ext(url),
                  }
+                if sub_info['ext'] == 'm3u8':
+                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
+                    # files may contain is WebVTT:
+                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
+                    sub_info['ext'] = 'vtt'
+                    sub_info['protocol'] = 'm3u8_native'
                  subtitles.setdefault(lang, []).append(sub_info)
              if media_type not in ('VIDEO', 'AUDIO'):
                  return
@@ -2750,26 +2759,38 @@ def add_segment_url():
                          else:
                              # Assuming direct URL to unfragmented media.
                              f['url'] = base_url
-                        formats.append(f)
+                        if content_type in ('video', 'audio'):
+                            formats.append(f)
+                        elif content_type == 'text':
+                            subtitles.setdefault(lang or 'und', []).append(f)
                      else:
                          self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
          return formats, subtitles
  
-    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
+    def _extract_ism_formats(self, *args, **kwargs):  # formats-only wrapper around the *_and_subtitles variant
+        fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
+        if subs:  # subtitle tracks are dropped by this wrapper, so warn about it
+            self.report_warning(bug_reports_message(
+                "Ignoring subtitle tracks found in the ISM manifest; "
+                "if any subtitle tracks are missing,"
+            ))
+        return fmts
+
+    def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
          res = self._download_xml_handle(
              ism_url, video_id,
              note=note or 'Downloading ISM manifest',
              errnote=errnote or 'Failed to download ISM manifest',
              fatal=fatal, data=data, headers=headers, query=query)
          if res is False:
-            return []
+            return [], {}
          ism_doc, urlh = res
          if ism_doc is None:
-            return []
+            return [], {}
  
-        return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
+        return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
  
-    def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
+    def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
          """
          Parse formats from ISM manifest.
          References:
@@ -2777,26 +2798,28 @@ def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
              https://msdn.microsoft.com/en-us/library/ff469518.aspx
          """
          if ism_doc.get('IsLive') == 'TRUE':
-            return []
+            return [], {}
          if (not self._downloader.params.get('allow_unplayable_formats')
                  and ism_doc.find('Protection') is not None):
-            return []
+            return [], {}
  
          duration = int(ism_doc.attrib['Duration'])
          timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
  
          formats = []
+        subtitles = {}
          for stream in ism_doc.findall('StreamIndex'):
              stream_type = stream.get('Type')
-            if stream_type not in ('video', 'audio'):
+            if stream_type not in ('video', 'audio', 'text'):
                  continue
              url_pattern = stream.attrib['Url']
              stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
              stream_name = stream.get('Name')
+            stream_language = stream.get('Language', 'und')
              for track in stream.findall('QualityLevel'):
                  fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
                  # TODO: add support for WVC1 and WMAP
-                if fourcc not in ('H264', 'AVC1', 'AACL'):
+                if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
                      self.report_warning('%s is not a supported codec' % fourcc)
                      continue
                  tbr = int(track.attrib['Bitrate']) // 1000
@@ -2839,33 +2862,52 @@ def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
                      format_id.append(stream_name)
                  format_id.append(compat_str(tbr))
  
-                formats.append({
-                    'format_id': '-'.join(format_id),
-                    'url': ism_url,
-                    'manifest_url': ism_url,
-                    'ext': 'ismv' if stream_type == 'video' else 'isma',
-                    'width': width,
-                    'height': height,
-                    'tbr': tbr,
-                    'asr': sampling_rate,
-                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
-                    'acodec': 'none' if stream_type == 'video' else fourcc,
-                    'protocol': 'ism',
-                    'fragments': fragments,
-                    '_download_params': {
-                        'duration': duration,
-                        'timescale': stream_timescale,
-                        'width': width or 0,
-                        'height': height or 0,
-                        'fourcc': fourcc,
-                        'codec_private_data': track.get('CodecPrivateData'),
-                        'sampling_rate': sampling_rate,
-                        'channels': int_or_none(track.get('Channels', 2)),
-                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
-                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
-                    },
-                })
-        return formats
+                if stream_type == 'text':
+                    subtitles.setdefault(stream_language, []).append({
+                        'ext': 'ismt',
+                        'protocol': 'ism',
+                        'url': ism_url,
+                        'manifest_url': ism_url,
+                        'fragments': fragments,
+                        '_download_params': {
+                            'stream_type': stream_type,
+                            'duration': duration,
+                            'timescale': stream_timescale,
+                            'fourcc': fourcc,
+                            'language': stream_language,
+                            'codec_private_data': track.get('CodecPrivateData'),
+                        }
+                    })
+                elif stream_type in ('video', 'audio'):
+                    formats.append({
+                        'format_id': '-'.join(format_id),
+                        'url': ism_url,
+                        'manifest_url': ism_url,
+                        'ext': 'ismv' if stream_type == 'video' else 'isma',
+                        'width': width,
+                        'height': height,
+                        'tbr': tbr,
+                        'asr': sampling_rate,
+                        'vcodec': 'none' if stream_type == 'audio' else fourcc,
+                        'acodec': 'none' if stream_type == 'video' else fourcc,
+                        'protocol': 'ism',
+                        'fragments': fragments,
+                        '_download_params': {
+                            'stream_type': stream_type,
+                            'duration': duration,
+                            'timescale': stream_timescale,
+                            'width': width or 0,
+                            'height': height or 0,
+                            'fourcc': fourcc,
+                            'language': stream_language,
+                            'codec_private_data': track.get('CodecPrivateData'),
+                            'sampling_rate': sampling_rate,
+                            'channels': int_or_none(track.get('Channels', 2)),
+                            'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
+                            'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
+                        },
+                    })
+        return formats, subtitles
  
      def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
          def absolute_url(item_url):
@@ -2990,7 +3032,16 @@ def _media_formats(src, cur_media_type, type_info={}):
                  entries.append(media_info)
          return entries
  
-    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
+    def _extract_akamai_formats(self, *args, **kwargs):  # formats-only wrapper around the *_and_subtitles variant
+        fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
+        if subs:  # subtitle tracks are dropped by this wrapper, so warn about it
+            self.report_warning(bug_reports_message(
+                "Ignoring subtitle tracks found in the manifests; "
+                "if any subtitle tracks are missing,"
+            ))
+        return fmts
+
+    def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
          signed = 'hdnea=' in manifest_url
          if not signed:
              # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
@@ -2999,6 +3050,7 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
                  '', manifest_url).strip('?')
  
          formats = []
+        subtitles = {}
  
          hdcore_sign = 'hdcore=3.7.0'
          f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
@@ -3017,10 +3069,11 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
          hls_host = hosts.get('hls')
          if hls_host:
              m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
-        m3u8_formats = self._extract_m3u8_formats(
+        m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
              m3u8_url, video_id, 'mp4', 'm3u8_native',
              m3u8_id='hls', fatal=False)
          formats.extend(m3u8_formats)
+        subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
  
          http_host = hosts.get('http')
          if http_host and m3u8_formats and not signed:
@@ -3044,7 +3097,7 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
                              formats.append(http_f)
                          i += 1
  
-        return formats
+        return formats, subtitles
  
      def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
          query = compat_urlparse.urlparse(url).query