[cleanup] Misc

[yt-dlp.git] / yt_dlp / extractor / common.py
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py

index ef975997404f321195f705841b2f520264272726..8ad63b4118fddd25a35f59764d0f659b4fa494c2 100644 (file)
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -32,6 +32,7 @@
      FormatSorter,
      GeoRestrictedError,
      GeoUtils,
+    HEADRequest,
      LenientJSONDecoder,
      RegexNotFoundError,
      RetryManager,
@@ -81,6 +82,7 @@
      update_url_query,
      url_basename,
      url_or_none,
+    urlhandle_detect_ext,
      urljoin,
      variadic,
      xpath_element,
@@ -130,6 +132,7 @@ class InfoExtractor:
                                         is parsed from a string (in case of
                                         fragmented media)
                                     for MSS - URL of the ISM manifest.
+                    * request_data  Data to send in POST request to the URL
                      * manifest_url
                                   The URL of the manifest file in case of
                                   fragmented media:
@@ -218,6 +221,17 @@ class InfoExtractor:
                      * no_resume  The server does not support resuming the
                                   (HTTP or RTMP) download. Boolean.
                      * has_drm    The format has DRM and cannot be downloaded. Boolean
+                    * extra_param_to_segment_url  A query string to append to each
+                                 fragment's URL, or to update each existing query string
+                                 with. Only applied by the native HLS/DASH downloaders.
+                    * hls_aes    A dictionary of HLS AES-128 decryption information
+                                 used by the native HLS downloader to override the
+                                 values in the media playlist when an '#EXT-X-KEY' tag
+                                 is present in the playlist:
+                                 * uri  The URI from which the key will be downloaded
+                                 * key  The key (as hex) used to decrypt fragments.
+                                        If `key` is given, any key URI will be ignored
+                                 * iv   The IV (as hex) used to decrypt fragments
                      * downloader_options  A dictionary of downloader options
                                   (For internal use only)
                                   * http_chunk_size Chunk size for HTTP downloads
@@ -1325,7 +1339,7 @@ def _get_tfa_info(self, note='two-factor verification code'):
      # Helper functions for extracting OpenGraph info
      @staticmethod
      def _og_regexes(prop):
-        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
          property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
                         % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
          template = r'<meta[^>]+?%s[^>]+?%s'
@@ -1657,11 +1671,8 @@ def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal
          if js is None:
              return {}
  
-        args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
-
-        for key, val in args.items():
-            if val in ('undefined', 'void 0'):
-                args[key] = 'null'
+        args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
+            f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
  
          ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
          return traverse_obj(ret, traverse) or {}
@@ -2053,6 +2064,7 @@ def extract_media(x_media_line):
                      'protocol': entry_protocol,
                      'preference': preference,
                      'quality': quality,
+                    'has_drm': has_drm,
                      'vcodec': 'none' if media_type == 'AUDIO' else None,
                  } for idx in _extract_m3u8_playlist_indices(manifest_url))
  
@@ -2112,6 +2124,7 @@ def build_stream_name():
                          'protocol': entry_protocol,
                          'preference': preference,
                          'quality': quality,
+                        'has_drm': has_drm,
                      }
                      resolution = last_stream_inf.get('RESOLUTION')
                      if resolution:
@@ -2178,13 +2191,23 @@ def _extract_m3u8_vod_duration(
          return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
  
      def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
-        if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
+        if '#EXT-X-ENDLIST' not in m3u8_vod:
              return None
  
          return int(sum(
              float(line[len('#EXTINF:'):].split(',')[0])
              for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
  
+    def _extract_mpd_vod_duration(
+            self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
+
+        mpd_doc = self._download_xml(
+            mpd_url, video_id,
+            note='Downloading MPD VOD manifest' if note is None else note,
+            errnote='Failed to download VOD manifest' if errnote is None else errnote,
+            fatal=False, data=data, headers=headers, query=query) or {}
+        return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
+
      @staticmethod
      def _xpath_ns(path, namespace=None):
          if not namespace:
@@ -2311,7 +2334,8 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para
              height = int_or_none(medium.get('height'))
              proto = medium.get('proto')
              ext = medium.get('ext')
-            src_ext = determine_ext(src)
+            src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
+                self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
              streamer = medium.get('streamer') or base
  
              if proto == 'rtmp' or streamer.startswith('rtmp'):
@@ -3503,7 +3527,7 @@ def description(cls, *, markdown=True, search_examples=None):
          desc = ''
          if cls._NETRC_MACHINE:
              if markdown:
-                desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
+                desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
              else:
                  desc += f' [{cls._NETRC_MACHINE}]'
          if cls.IE_DESC is False: