[dash,youtube] Download live from start to end (#888)

[yt-dlp.git] / yt_dlp / extractor / common.py
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py

index 1565ba5c373c3253d814f73a688f664c2ed6ea6f..9abbaf04f50b45864d2f63ec6943469da32e83e0 100644 (file)
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -3,7 +3,6 @@
  
  import base64
  import collections
-import datetime
  import hashlib
  import itertools
  import json
@@ -164,9 +163,8 @@ class InfoExtractor(object):
                      * filesize_approx  An estimate for the number of bytes
                      * player_url SWF Player URL (used for rtmpdump).
                      * protocol   The protocol that will be used for the actual
-                                 download, lower-case.
-                                 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
-                                 "m3u8", "m3u8_native" or "http_dash_segments".
+                                 download, lower-case. One of "http", "https" or
+                                 one of the protocols defined in downloader.PROTOCOL_MAP
                      * fragment_base_url
                                   Base URL for fragments. Each fragment's path
                                   value (if present) will be relative to
@@ -182,6 +180,8 @@ class InfoExtractor(object):
                                              fragment_base_url
                                   * "duration" (optional, int or float)
                                   * "filesize" (optional, int)
+                    * is_from_start  Whether this is a live format that can be
+                                 downloaded from the start. Boolean
                      * preference Order number of this format. If this field is
                                   present and not None, the formats get sorted
                                   by this field, regardless of all other values.
@@ -466,6 +466,8 @@ def _match_valid_url(cls, url):
          # we have cached the regexp for *this* class, whereas getattr would also
          # match the superclass
          if '_VALID_URL_RE' not in cls.__dict__:
+            if '_VALID_URL' not in cls.__dict__:
+                cls._VALID_URL = cls._make_valid_url()
              cls._VALID_URL_RE = re.compile(cls._VALID_URL)
          return cls._VALID_URL_RE.match(url)
  
@@ -1079,7 +1081,8 @@ def report_login(self):
      def raise_login_required(
              self, msg='This video is only available for registered users',
              metadata_available=False, method='any'):
-        if metadata_available and self.get_param('ignore_no_formats_error'):
+        if metadata_available and (
+                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
              self.report_warning(msg)
          if method is not None:
              msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
@@ -1088,13 +1091,15 @@ def raise_login_required(
      def raise_geo_restricted(
              self, msg='This video is not available from your location due to geo restriction',
              countries=None, metadata_available=False):
-        if metadata_available and self.get_param('ignore_no_formats_error'):
+        if metadata_available and (
+                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
              self.report_warning(msg)
          else:
              raise GeoRestrictedError(msg, countries=countries)
  
      def raise_no_formats(self, msg, expected=False, video_id=None):
-        if expected and self.get_param('ignore_no_formats_error'):
+        if expected and (
+                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
              self.report_warning(msg, video_id)
          elif isinstance(msg, ExtractorError):
              raise msg
@@ -1447,11 +1452,19 @@ def extract_video_object(e):
              })
              extract_interaction_statistic(e)
  
-        for e in json_ld:
-            if '@context' in e:
+        def traverse_json_ld(json_ld, at_top_level=True):
+            for e in json_ld:
+                if at_top_level and '@context' not in e:
+                    continue
+                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
+                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
+                    break
                  item_type = e.get('@type')
                  if expected_type is not None and expected_type != item_type:
                      continue
+                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
+                if rating is not None:
+                    info['average_rating'] = rating
                  if item_type in ('TVEpisode', 'Episode'):
                      episode_name = unescapeHTML(e.get('name'))
                      info.update({
@@ -1481,7 +1494,7 @@ def extract_video_object(e):
                      info.update({
                          'timestamp': parse_iso8601(e.get('datePublished')),
                          'title': unescapeHTML(e.get('headline')),
-                        'description': unescapeHTML(e.get('articleBody')),
+                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                      })
                  elif item_type == 'VideoObject':
                      extract_video_object(e)
@@ -1496,6 +1509,8 @@ def extract_video_object(e):
                      continue
                  else:
                      break
+        traverse_json_ld(json_ld)
+
          return dict((k, v) for k, v in info.items() if v is not None)
  
      def _search_nextjs_data(self, webpage, video_id, **kw):
@@ -1505,6 +1520,24 @@ def _search_nextjs_data(self, webpage, video_id, **kw):
                  webpage, 'next.js data', **kw),
              video_id, **kw)
  
+    def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
+        ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
+        # Not all websites do this, but the context name can be changed:
+        # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
+        rectx = re.escape(context_name)
+        js, arg_keys, arg_vals = self._search_regex(
+            (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
+             r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
+            webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
+
+        args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
+
+        for key, val in args.items():
+            if val in ('undefined', 'void 0'):
+                args[key] = 'null'
+
+        return self._parse_json(js_to_json(js, args), video_id)['data'][0]
+
      @staticmethod
      def _hidden_inputs(html):
          html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
@@ -1532,10 +1565,10 @@ class FormatSort:
  
          default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                     'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
-                   'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
+                   'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
          ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                          'height', 'width', 'proto', 'vext', 'abr', 'aext',
-                        'fps', 'fs_approx', 'source', 'format_id')
+                        'fps', 'fs_approx', 'source', 'id')
  
          settings = {
              'vcodec': {'type': 'ordered', 'regex': True,
@@ -1545,7 +1578,7 @@ class FormatSort:
              'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                      'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
              'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
-                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
+                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
              'vext': {'type': 'ordered', 'field': 'video_ext',
                       'order': ('mp4', 'webm', 'flv', '', 'none'),
                       'order_free': ('webm', 'mp4', 'flv', '', 'none')},
@@ -1580,7 +1613,12 @@ class FormatSort:
              'res': {'type': 'multiple', 'field': ('height', 'width'),
                      'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
  
-            # Most of these exist only for compatibility reasons
+            # For compatibility with youtube-dl
+            'format_id': {'type': 'alias', 'field': 'id'},
+            'preference': {'type': 'alias', 'field': 'ie_pref'},
+            'language_preference': {'type': 'alias', 'field': 'lang'},
+
+            # Deprecated
              'dimension': {'type': 'alias', 'field': 'res'},
              'resolution': {'type': 'alias', 'field': 'res'},
              'extension': {'type': 'alias', 'field': 'ext'},
@@ -1589,7 +1627,6 @@ class FormatSort:
              'video_bitrate': {'type': 'alias', 'field': 'vbr'},
              'audio_bitrate': {'type': 'alias', 'field': 'abr'},
              'framerate': {'type': 'alias', 'field': 'fps'},
-            'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
              'protocol': {'type': 'alias', 'field': 'proto'},
              'source_preference': {'type': 'alias', 'field': 'source'},
              'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
@@ -1604,15 +1641,23 @@ class FormatSort:
              'audio': {'type': 'alias', 'field': 'hasaud'},
              'has_audio': {'type': 'alias', 'field': 'hasaud'},
              'extractor': {'type': 'alias', 'field': 'ie_pref'},
-            'preference': {'type': 'alias', 'field': 'ie_pref'},
              'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
-            'format_id': {'type': 'alias', 'field': 'id'},
          }
  
-        _order = []
+        def __init__(self, ie, field_preference):
+            self._order = []
+            self.ydl = ie._downloader
+            self.evaluate_params(self.ydl.params, field_preference)
+            if ie.get_param('verbose'):
+                self.print_verbose_info(self.ydl.write_debug)
  
          def _get_field_setting(self, field, key):
              if field not in self.settings:
+                if key in ('forced', 'priority'):
+                    return False
+                self.ydl.deprecation_warning(
+                    f'Using arbitrary fields ({field}) for format sorting is deprecated '
+                    'and may be removed in a future version')
                  self.settings[field] = {}
              propObj = self.settings[field]
              if key not in propObj:
@@ -1695,7 +1740,11 @@ def add_item(field, reverse, closest, limit_text):
                  if field is None:
                      continue
                  if self._get_field_setting(field, 'type') == 'alias':
-                    field = self._get_field_setting(field, 'field')
+                    alias, field = field, self._get_field_setting(field, 'field')
+                    if alias not in ('format_id', 'preference', 'language_preference'):
+                        self.ydl.deprecation_warning(
+                            f'Format sorting alias {alias} is deprecated '
+                            f'and may be removed in a future version. Please use {field} instead')
                  reverse = match.group('reverse') is not None
                  closest = match.group('separator') == '~'
                  limit_text = match.group('limit')
@@ -1799,10 +1848,7 @@ def calculate_preference(self, format):
      def _sort_formats(self, formats, field_preference=[]):
          if not formats:
              return
-        format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
-        format_sort.evaluate_params(self._downloader.params, field_preference)
-        if self.get_param('verbose', False):
-            format_sort.print_verbose_info(self._downloader.write_debug)
+        format_sort = self.FormatSort(self, field_preference)
          formats.sort(key=lambda f: format_sort.calculate_preference(f))
  
      def _check_formats(self, formats, video_id):
@@ -3417,10 +3463,8 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
          return formats
  
      def _live_title(self, name):
-        """ Generate the title for a live video """
-        now = datetime.datetime.now()
-        now_str = now.strftime('%Y-%m-%d %H:%M')
-        return name + ' ' + now_str
+        self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
+        return name
  
      def _int(self, v, name, fatal=False, **kwargs):
          res = int_or_none(v, **kwargs)
@@ -3530,14 +3574,18 @@ def extract_comments(self, *args, **kwargs):
  
          def extractor():
              comments = []
+            interrupted = True
              try:
                  while True:
                      comments.append(next(generator))
-            except KeyboardInterrupt:
-                interrupted = True
-                self.to_screen('Interrupted by user')
              except StopIteration:
                  interrupted = False
+            except KeyboardInterrupt:
+                self.to_screen('Interrupted by user')
+            except Exception as e:
+                if self.get_param('ignoreerrors') is not True:
+                    raise
+                self._downloader.report_error(e)
              comment_count = len(comments)
              self.to_screen(f'Extracted {comment_count} comments')
              return {
@@ -3642,17 +3690,8 @@ class SearchInfoExtractor(InfoExtractor):
      def _make_valid_url(cls):
          return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
  
-    @classmethod
-    def suitable(cls, url):
-        return re.match(cls._make_valid_url(), url) is not None
-
      def _real_extract(self, query):
-        mobj = re.match(self._make_valid_url(), query)
-        if mobj is None:
-            raise ExtractorError('Invalid search query "%s"' % query)
-
-        prefix = mobj.group('prefix')
-        query = mobj.group('query')
+        prefix, query = self._match_valid_url(query).group('prefix', 'query')
          if prefix == '':
              return self._get_n_results(query, 1)
          elif prefix == 'all':