[extractor] Do not warn for invalid chapter data in description

[yt-dlp.git] / yt_dlp / extractor / common.py
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py

index b18d2e73eba66362503faaae17ff35d698a2b0ba..78288f8091c95458479d8241d2b35a4afea6a657 100644 (file)
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -32,6 +32,7 @@
      FormatSorter,
      GeoRestrictedError,
      GeoUtils,
+    HEADRequest,
      LenientJSONDecoder,
      RegexNotFoundError,
      RetryManager,
@@ -81,6 +82,7 @@
      update_url_query,
      url_basename,
      url_or_none,
+    urlhandle_detect_ext,
      urljoin,
      variadic,
      xpath_element,
@@ -130,6 +132,7 @@ class InfoExtractor:
                                         is parsed from a string (in case of
                                         fragmented media)
                                     for MSS - URL of the ISM manifest.
+                    * request_data  Data to send in POST request to the URL
                      * manifest_url
                                   The URL of the manifest file in case of
                                   fragmented media:
@@ -218,6 +221,17 @@ class InfoExtractor:
                      * no_resume  The server does not support resuming the
                                   (HTTP or RTMP) download. Boolean.
                      * has_drm    The format has DRM and cannot be downloaded. Boolean
+                    * extra_param_to_segment_url  A query string to append to each
+                                 fragment's URL, or to update each existing query string
+                                 with. Only applied by the native HLS/DASH downloaders.
+                    * hls_aes    A dictionary of HLS AES-128 decryption information
+                                 used by the native HLS downloader to override the
+                                 values in the media playlist when an '#EXT-X-KEY' tag
+                                 is present in the playlist:
+                                 * uri  The URI from which the key will be downloaded
+                                 * key  The key (as hex) used to decrypt fragments.
+                                        If `key` is given, any key URI will be ignored
+                                 * iv   The IV (as hex) used to decrypt fragments
                      * downloader_options  A dictionary of downloader options
                                   (For internal use only)
                                   * http_chunk_size Chunk size for HTTP downloads
@@ -1263,11 +1277,8 @@ def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=Tr
          """
          res = self._search_regex(pattern, string, name, default, fatal, flags, group)
          if isinstance(res, tuple):
-            return [clean_html(r).strip() for r in res]
-        elif res:
-            return clean_html(res).strip()
-        else:
-            return res
+            return tuple(map(clean_html, res))
+        return clean_html(res)
  
      def _get_netrc_login_info(self, netrc_machine=None):
          username = None
@@ -1328,7 +1339,7 @@ def _get_tfa_info(self, note='two-factor verification code'):
      # Helper functions for extracting OpenGraph info
      @staticmethod
      def _og_regexes(prop):
-        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
          property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
                         % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
          template = r'<meta[^>]+?%s[^>]+?%s'
@@ -1660,11 +1671,8 @@ def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal
          if js is None:
              return {}
  
-        args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
-
-        for key, val in args.items():
-            if val in ('undefined', 'void 0'):
-                args[key] = 'null'
+        args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
+            f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
  
          ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
          return traverse_obj(ret, traverse) or {}
@@ -2056,6 +2064,7 @@ def extract_media(x_media_line):
                      'protocol': entry_protocol,
                      'preference': preference,
                      'quality': quality,
+                    'has_drm': has_drm,
                      'vcodec': 'none' if media_type == 'AUDIO' else None,
                  } for idx in _extract_m3u8_playlist_indices(manifest_url))
  
@@ -2115,6 +2124,7 @@ def build_stream_name():
                          'protocol': entry_protocol,
                          'preference': preference,
                          'quality': quality,
+                        'has_drm': has_drm,
                      }
                      resolution = last_stream_inf.get('RESOLUTION')
                      if resolution:
@@ -2181,13 +2191,23 @@ def _extract_m3u8_vod_duration(
          return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
  
      def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
-        if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
+        if '#EXT-X-ENDLIST' not in m3u8_vod:
              return None
  
          return int(sum(
              float(line[len('#EXTINF:'):].split(',')[0])
              for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
  
+    def _extract_mpd_vod_duration(
+            self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
+        """
+        Download the manifest at `mpd_url` and return the duration taken from
+        the `mediaPresentationDuration` value of the downloaded document.
+
+        The manifest is fetched with self._download_xml(..., fatal=False).
+        `note` and `errnote` replace the default progress/error messages when
+        they are not None; `data`, `headers` and `query` are forwarded as-is.
+        NOTE(review): `headers={}` / `query={}` are shared default dicts; this
+        method only forwards them and does not modify them itself.
+
+        Returns int_or_none(parse_duration(<value>)) - presumably whole seconds,
+        or None when the download fails or the value is missing/unparsable
+        (confirm against those helpers).
+        """
+
+        # A falsy download result is replaced by an empty dict so that the
+        # .get() lookup below is still callable
+        mpd_doc = self._download_xml(
+            mpd_url, video_id,
+            note='Downloading MPD VOD manifest' if note is None else note,
+            errnote='Failed to download VOD manifest' if errnote is None else errnote,
+            fatal=False, data=data, headers=headers, query=query) or {}
+        return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
+
      @staticmethod
      def _xpath_ns(path, namespace=None):
          if not namespace:
@@ -2314,7 +2334,8 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para
              height = int_or_none(medium.get('height'))
              proto = medium.get('proto')
              ext = medium.get('ext')
-            src_ext = determine_ext(src)
+            src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
+                self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
              streamer = medium.get('streamer') or base
  
              if proto == 'rtmp' or streamer.startswith('rtmp'):
@@ -2962,6 +2983,8 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
                          'protocol': 'ism',
                          'fragments': fragments,
                          'has_drm': ism_doc.find('Protection') is not None,
+                        'language': stream_language,
+                        'audio_channels': int_or_none(track.get('Channels')),
                          '_download_params': {
                              'stream_type': stream_type,
                              'duration': duration,
@@ -3492,8 +3515,8 @@ def _RETURN_TYPE(cls):
      @classmethod
      def is_single_video(cls, url):
          """Returns whether the URL is of a single video, None if unknown"""
-        assert cls.suitable(url), 'The URL must be suitable for the extractor'
-        return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
+        if cls.suitable(url):
+            return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
  
      @classmethod
      def is_suitable(cls, age_limit):
@@ -3506,7 +3529,7 @@ def description(cls, *, markdown=True, search_examples=None):
          desc = ''
          if cls._NETRC_MACHINE:
              if markdown:
-                desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
+                desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
              else:
                  desc += f' [{cls._NETRC_MACHINE}]'
          if cls.IE_DESC is False:
@@ -3628,6 +3651,42 @@ def _generic_title(self, url='', webpage='', *, default=None):
                  or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
                  or default)
  
+    def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
+        """
+        Build a list of chapter dicts ('start_time', 'title') from raw chapter data.
+
+        chapter_list    -- iterable of raw chapter items; None is treated as empty
+        start_function  -- called with each item, returns its start time
+                           (a None result marks the chapter as incomplete)
+        title_function  -- called with each item, returns its title
+        duration        -- upper bound for start times; when falsy, nothing
+                           is extracted and None is returned
+        strict          -- if True, problems are reported with
+                           self.report_warning and the input order is kept;
+                           otherwise they are reported with self.write_debug
+                           and the chapters are first sorted by start time
+
+        Returns the accepted chapters as a list (possibly empty). A chapter is
+        accepted when its start time is not None, is not smaller than the start
+        time of the previously accepted chapter (0 for the first one) and is
+        not greater than `duration`.
+        """
+        if not duration:
+            return
+        chapter_list = [{
+            'start_time': start_function(chapter),
+            'title': title_function(chapter),
+        } for chapter in chapter_list or []]
+        if strict:
+            warn = self.report_warning
+        else:
+            warn = self.write_debug
+            # Chapters without a start time are ordered as if they started at 0
+            chapter_list.sort(key=lambda c: c['start_time'] or 0)
+
+        # Sentinel entry so that the first real chapter has a predecessor to be
+        # compared against; it is dropped again on return
+        chapters = [{'start_time': 0}]
+        for idx, chapter in enumerate(chapter_list):
+            if chapter['start_time'] is None:
+                warn(f'Incomplete chapter {idx}')
+            elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
+                chapters.append(chapter)
+            # A rejected chapter that equals an already accepted one (same
+            # start time and title) is skipped without any message
+            elif chapter not in chapters:
+                issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
+                         else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
+                warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
+        return chapters[1:]
+
+    def _extract_chapters_from_description(self, description, duration):
+        """
+        Extract chapters from lines of `description` that pair a timestamp
+        with a title.
+
+        Lines of the form "<timestamp> <title>" are tried first; only when that
+        attempt gives a falsy result are lines of the form
+        "<title> <timestamp>" tried. A None description is treated as an empty
+        string. Both attempts call self._extract_chapters_helper with
+        strict=False and the given `duration`.
+
+        Returns the result of the helper call that was used: a list of chapter
+        dicts, or a falsy value (empty list or None) when neither form yields
+        any chapter.
+        """
+        # Timestamp: optional leading "<digits>:" followed by 1-2 digits, ":" and 2 digits
+        duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
+        # Whole-line pattern (multiline mode) whose two capture groups are
+        # filled in below, once in each order
+        sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
+        return self._extract_chapters_helper(
+            re.findall(sep_re % (duration_re, r'.+?'), description or ''),
+            start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
+            duration=duration, strict=False) or self._extract_chapters_helper(
+            re.findall(sep_re % (r'.+?', duration_re), description or ''),
+            start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
+            duration=duration, strict=False)
+
      @staticmethod
      def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
          all_known = all(map(