[core] Fix `filesize_approx` calculation (#9560)
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 78288f8091c95458479d8241d2b35a4afea6a657..57bbf9bdf1e6efef0fb45230cbefcbe130454b25 100644
@@ -13,6 +13,7 @@
 import os
 import random
 import re
+import subprocess
 import sys
 import time
 import types
 import xml.etree.ElementTree
 
 from ..compat import functools  # isort: split
-from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
+from ..compat import (
+    compat_etree_fromstring,
+    compat_expanduser,
+    compat_os_name,
+    urllib_req_to_req,
+)
 from ..cookies import LenientSimpleCookie
 from ..downloader.f4m import get_base_url, remove_encrypted_media
+from ..downloader.hls import HlsFD
+from ..networking import HEADRequest, Request
+from ..networking.exceptions import (
+    HTTPError,
+    IncompleteRead,
+    network_exceptions,
+)
+from ..networking.impersonate import ImpersonateTarget
 from ..utils import (
     IDENTITY,
     JSON_LD_RE,
@@ -32,8 +46,8 @@
     FormatSorter,
     GeoRestrictedError,
     GeoUtils,
-    HEADRequest,
     LenientJSONDecoder,
+    Popen,
     RegexNotFoundError,
     RetryManager,
     UnsupportedError,
@@ -56,7 +70,7 @@
     join_nonempty,
     js_to_json,
     mimetype2ext,
-    network_exceptions,
+    netrc_from_content,
     orderedSet,
     parse_bitrate,
     parse_codecs,
@@ -66,7 +80,6 @@
     parse_resolution,
     sanitize_filename,
     sanitize_url,
-    sanitized_Request,
     smuggle_url,
     str_or_none,
     str_to_int,
@@ -78,8 +91,6 @@
     unescapeHTML,
     unified_strdate,
     unified_timestamp,
-    update_Request,
-    update_url_query,
     url_basename,
     url_or_none,
     urlhandle_detect_ext,
@@ -160,12 +171,12 @@ class InfoExtractor:
                                  Automatically calculated from width and height
                     * dynamic_range The dynamic range of the video. One of:
                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
-                    * tbr        Average bitrate of audio and video in KBit/s
-                    * abr        Average audio bitrate in KBit/s
+                    * tbr        Average bitrate of audio and video in kbps (1000 bits/sec)
+                    * abr        Average audio bitrate in kbps (1000 bits/sec)
                     * acodec     Name of the audio codec in use
                     * asr        Audio sampling rate in Hertz
                     * audio_channels  Number of audio channels
-                    * vbr        Average video bitrate in KBit/s
+                    * vbr        Average video bitrate in kbps (1000 bits/sec)
                     * fps        Frame rate
                     * vcodec     Name of the video codec in use
                     * container  Name of the container format
@@ -220,7 +231,8 @@ class InfoExtractor:
                                  width : height ratio as float.
                     * no_resume  The server does not support resuming the
                                  (HTTP or RTMP) download. Boolean.
-                    * has_drm    The format has DRM and cannot be downloaded. Boolean
+                    * has_drm    True if the format has DRM and cannot be downloaded.
+                                 'maybe' if the format may have DRM and has to be tested before download.
                     * extra_param_to_segment_url  A query string to append to each
                                  fragment's URL, or to update each existing query string
                                  with. Only applied by the native HLS/DASH downloaders.
@@ -235,7 +247,10 @@ class InfoExtractor:
                     * downloader_options  A dictionary of downloader options
                                  (For internal use only)
                                  * http_chunk_size Chunk size for HTTP downloads
-                                 * ffmpeg_args     Extra arguments for ffmpeg downloader
+                                 * ffmpeg_args     Extra arguments for ffmpeg downloader (input)
+                                 * ffmpeg_args_out Extra arguments for ffmpeg downloader (output)
+                    * is_dash_periods  Whether the format is a result of merging
+                                 multiple DASH periods.
                     RTMP formats can also have the additional fields: page_url,
                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                     rtmp_protocol, rtmp_real_time
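
As a rough illustration of the fields documented above, a format dict using the new `has_drm` and `downloader_options` semantics might look like this (every value invented):

```python
# Illustrative format dict; all values here are made up.
fmt = {
    'format_id': 'hls-1080p',
    'url': 'https://example.com/video/master.m3u8',
    'tbr': 4500,             # kbps, i.e. 1000 bits/sec
    'has_drm': 'maybe',      # must be tested before download
    'is_dash_periods': False,
    'downloader_options': {
        'http_chunk_size': 10485760,
        'ffmpeg_args': ['-headers', 'Referer: https://example.com/'],  # input side
        'ffmpeg_args_out': ['-movflags', '+faststart'],                # output side
    },
}
```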
@@ -249,7 +264,7 @@ class InfoExtractor:
 
     direct:         True if a direct video file was given (must only be set by GenericIE)
     alt_title:      A secondary title of the video.
-    display_id      An alternative identifier for the video, not necessarily
+    display_id:     An alternative identifier for the video, not necessarily
                     unique, but available before title. Typically, id is
                     something like "4234987", title "Dancing naked mole rats",
                     and display_id "dancing-naked-mole-rats"
@@ -267,7 +282,7 @@ class InfoExtractor:
     description:    Full video description.
     uploader:       Full name of the video uploader.
     license:        License name the video is licensed under.
-    creator:        The creator of the video.
+    creators:       List of creators of the video.
     timestamp:      UNIX timestamp of the moment the video was uploaded
     upload_date:    Video upload date in UTC (YYYYMMDD).
                     If not explicitly set, calculated from timestamp
@@ -275,6 +290,9 @@ class InfoExtractor:
                     If it is not clear whether to use timestamp or this, use the former
     release_date:   The date (YYYYMMDD) when the video was released in UTC.
                     If not explicitly set, calculated from release_timestamp
+    release_year:   Year (YYYY) as integer when the video or album was released.
+                    To be used if no exact release date is known.
+                    If not explicitly set, calculated from release_date.
     modified_timestamp: UNIX timestamp of the moment the video was last modified.
     modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
                     If not explicitly set, calculated from modified_timestamp
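
A minimal sketch of the derivation chain documented above (the timestamp is invented):

```python
import datetime as dt

release_timestamp = 1696118400  # UNIX timestamp reported by the site
release_date = dt.datetime.fromtimestamp(
    release_timestamp, dt.timezone.utc).strftime('%Y%m%d')
release_year = int(release_date[:4])  # only set when no exact date is known
print(release_date, release_year)  # 20231001 2023
```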
@@ -286,6 +304,7 @@ class InfoExtractor:
     channel_id:     Id of the channel.
     channel_url:    Full URL to a channel webpage.
     channel_follower_count: Number of followers of the channel.
+    channel_is_verified: Whether the channel is verified on the platform.
     location:       Physical location where the video was filmed.
     subtitles:      The available subtitles as a dictionary in the format
                     {tag: subformats}. "tag" is usually a language code, and
@@ -314,6 +333,11 @@ class InfoExtractor:
                         * "author" - human-readable name of the comment author
                         * "author_id" - user ID of the comment author
                         * "author_thumbnail" - The thumbnail of the comment author
+                        * "author_url" - The url to the comment author's page
+                        * "author_is_verified" - Whether the author is verified
+                                                 on the platform
+                        * "author_is_uploader" - Whether the comment is made by
+                                                 the video uploader
                         * "id" - Comment ID
                         * "html" - Comment as HTML
                         * "text" - Plain text of the comment
@@ -325,8 +349,8 @@ class InfoExtractor:
                         * "dislike_count" - Number of negative ratings of the comment
                         * "is_favorited" - Whether the comment is marked as
                                            favorite by the video uploader
-                        * "author_is_uploader" - Whether the comment is made by
-                                                 the video uploader
+                        * "is_pinned" - Whether the comment is pinned to
+                                        the top of the comments
     age_limit:      Age restriction for the video, as an integer (years)
     webpage_url:    The URL to the video webpage, if given to yt-dlp it
                     should allow to get the same result again. (It will be set
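
An illustrative comment entry combining the new author fields with the relocated `is_pinned` (all values invented):

```python
comment = {
    'id': 'c123',
    'text': 'Great video!',
    'author': 'Some User',
    'author_id': 'u456',
    'author_url': 'https://example.com/user/u456',
    'author_is_verified': True,
    'author_is_uploader': False,
    'is_pinned': True,
}
```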
@@ -350,6 +374,10 @@ class InfoExtractor:
                         * "start_time" - The start time of the chapter in seconds
                         * "end_time" - The end time of the chapter in seconds
                         * "title" (optional, string)
+    heatmap:        A list of dictionaries, with the following entries:
+                        * "start_time" - The start time of the data point in seconds
+                        * "end_time" - The end time of the data point in seconds
+                        * "value" - The normalized value of the data point (float between 0 and 1)
     playable_in_embed: Whether this video is allowed to play in embedded
                     players on other sites. Can be True (=always allowed),
                     False (=never allowed), None (=unknown), or a string
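
A sketch of the new `heatmap` field, e.g. replay-frequency data normalized to the 0..1 range (values invented):

```python
heatmap = [
    {'start_time': 0.0, 'end_time': 5.0, 'value': 1.0},   # most-replayed span
    {'start_time': 5.0, 'end_time': 10.0, 'value': 0.37},
]
```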
@@ -358,6 +386,7 @@ class InfoExtractor:
                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
                     to set it
+    media_type:     The type of media as classified by the site, e.g. "episode", "clip", "trailer"
     _old_archive_ids: A list of old archive ids needed for backward compatibility
     _format_sort_fields: A list of fields to use for sorting formats
     __post_extractor: A function to be called just before the metadata is
@@ -397,17 +426,16 @@ class InfoExtractor:
     track_number:   Number of the track within an album or a disc, as an integer.
     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                     as a unicode string.
-    artist:         Artist(s) of the track.
-    genre:          Genre(s) of the track.
+    artists:        List of artists of the track.
+    composers:      List of composers of the piece.
+    genres:         List of genres of the track.
     album:          Title of the album the track belongs to.
     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
-    album_artist:   List of all artists appeared on the album (e.g.
-                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
-                    and compilations).
+    album_artists:  List of all artists appearing on the album.
+                    E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
+                    Useful for splits and compilations.
     disc_number:    Number of the disc or other physical medium the track belongs to,
                     as an integer.
-    release_year:   Year (YYYY) when the album was released.
-    composer:       Composer of the piece
 
     The following fields should only be set for clips that should be cut from the original video:
 
@@ -418,6 +446,18 @@ class InfoExtractor:
     rows:           Number of rows in each storyboard fragment, as an integer
     columns:        Number of columns in each storyboard fragment, as an integer
 
+    The following fields are deprecated and should not be set by new code:
+    composer:       Use "composers" instead.
+                    Composer(s) of the piece, comma-separated.
+    artist:         Use "artists" instead.
+                    Artist(s) of the track, comma-separated.
+    genre:          Use "genres" instead.
+                    Genre(s) of the track, comma-separated.
+    album_artist:   Use "album_artists" instead.
+                    All artists appearing on the album, comma-separated.
+    creator:        Use "creators" instead.
+                    The creator of the video.
+
     Unless mentioned otherwise, the fields should be Unicode strings.
 
     Unless mentioned otherwise, None is equivalent to absence of information.
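
The deprecated singular fields are simply the comma-separated form of the new list fields, e.g.:

```python
artists = ['Ash Borer', 'Fell Voices']  # new-style list field
artist = ', '.join(artists)             # deprecated comma-separated equivalent
assert artist == 'Ash Borer, Fell Voices'
```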
@@ -461,8 +501,8 @@ class InfoExtractor:
 
 
     Subclasses of this should also be added to the list of extractors and
-    should define a _VALID_URL regexp and, re-define the _real_extract() and
-    (optionally) _real_initialize() methods.
+    should define _VALID_URL as a regexp or a Sequence of regexps, and
+    re-define the _real_extract() and (optionally) _real_initialize() methods.
 
     Subclasses may also override suitable() if necessary, but ensure the function
     signature is preserved and that this function imports everything it needs
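
A hypothetical subclass using the newly allowed Sequence form (extractor name and patterns invented):

```python
from yt_dlp.extractor.common import InfoExtractor


class ExampleIE(InfoExtractor):
    # Either a single pattern or a sequence of patterns is now accepted;
    # _match_valid_url() tries each compiled pattern in order.
    _VALID_URL = (
        r'https?://(?:www\.)?example\.com/watch/(?P<id>\d+)',
        r'https?://e\.xmpl/(?P<id>\d+)',
    )

    def _real_extract(self, url):
        video_id = self._match_id(url)
        return {'id': video_id, 'title': video_id}
```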
@@ -525,7 +565,7 @@ class InfoExtractor:
     _EMBED_REGEX = []
 
     def _login_hint(self, method=NO_DEFAULT, netrc=None):
-        password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
+        password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
         return {
             None: '',
             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
@@ -552,8 +592,8 @@ def _match_valid_url(cls, url):
         # we have cached the regexp for *this* class, whereas getattr would also
         # match the superclass
         if '_VALID_URL_RE' not in cls.__dict__:
-            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
-        return cls._VALID_URL_RE.match(url)
+            cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
+        return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
 
     @classmethod
     def suitable(cls, url):
@@ -708,11 +748,11 @@ def extract(self, url):
         except UnsupportedError:
             raise
         except ExtractorError as e:
-            e.video_id = e.video_id or self.get_temp_id(url),
-            e.ie = e.ie or self.IE_NAME,
+            e.video_id = e.video_id or self.get_temp_id(url)
+            e.ie = e.ie or self.IE_NAME
             e.traceback = e.traceback or sys.exc_info()[2]
             raise
-        except http.client.IncompleteRead as e:
+        except IncompleteRead as e:
             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
         except (KeyError, StopIteration) as e:
             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
@@ -771,22 +811,28 @@ def IE_NAME(cls):
 
     @staticmethod
     def __can_accept_status_code(err, expected_status):
-        assert isinstance(err, urllib.error.HTTPError)
+        assert isinstance(err, HTTPError)
         if expected_status is None:
             return False
         elif callable(expected_status):
-            return expected_status(err.code) is True
+            return expected_status(err.status) is True
         else:
-            return err.code in variadic(expected_status)
+            return err.status in variadic(expected_status)
 
-    def _create_request(self, url_or_request, data=None, headers=None, query=None):
+    def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None):
         if isinstance(url_or_request, urllib.request.Request):
-            return update_Request(url_or_request, data=data, headers=headers, query=query)
-        if query:
-            url_or_request = update_url_query(url_or_request, query)
-        return sanitized_Request(url_or_request, data, headers or {})
+            self._downloader.deprecation_warning(
+                'Passing a urllib.request.Request to _create_request() is deprecated. '
+                'Use yt_dlp.networking.common.Request instead.')
+            url_or_request = urllib_req_to_req(url_or_request)
+        elif not isinstance(url_or_request, Request):
+            url_or_request = Request(url_or_request)
+
+        url_or_request.update(data=data, headers=headers, query=query, extensions=extensions)
+        return url_or_request
 
-    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
+    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None,
+                         headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False):
         """
         Return the response handle.
 
@@ -817,17 +863,35 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa
             headers = (headers or {}).copy()
             headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
 
+        extensions = {}
+
+        if impersonate in (True, ''):
+            impersonate = ImpersonateTarget()
+        requested_targets = [
+            t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t)
+            for t in variadic(impersonate)
+        ] if impersonate else []
+
+        available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None)
+        if available_target:
+            extensions['impersonate'] = available_target
+        elif requested_targets:
+            message = 'The extractor is attempting impersonation, but '
+            message += (
+                'no impersonate target is available' if not str(impersonate)
+                else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"')
+            info_msg = ('see  https://github.com/yt-dlp/yt-dlp#impersonation  '
+                        'for information on installing the required dependencies')
+            if require_impersonation:
+                raise ExtractorError(f'{message}; {info_msg}', expected=True)
+            self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True)
+
         try:
-            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
+            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions))
         except network_exceptions as err:
-            if isinstance(err, urllib.error.HTTPError):
+            if isinstance(err, HTTPError):
                 if self.__can_accept_status_code(err, expected_status):
-                    # Retain reference to error to prevent file object from
-                    # being closed before it can be read. Works around the
-                    # effects of <https://bugs.python.org/issue15002>
-                    # introduced in Python 3.4.1.
-                    err.fp._error = err
-                    return err.fp
+                    return err.response
 
             if errnote is False:
                 return False
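
For context on the target resolution above: `ImpersonateTarget.from_str` parses the CLIENT[-VERSION][:OS[-OS_VERSION]] form, and the bare target built for `impersonate=True` matches any concrete target. A sketch, assuming the dataclass attribute names:

```python
from yt_dlp.networking.impersonate import ImpersonateTarget

t = ImpersonateTarget.from_str('chrome-110:windows-10')
print(t.client, t.version, t.os, t.os_version)  # chrome 110 windows 10
assert t in ImpersonateTarget()  # the empty target accepts anything
```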
@@ -842,13 +906,14 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa
                 return False
 
     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
-                                 encoding=None, data=None, headers={}, query={}, expected_status=None):
+                                 encoding=None, data=None, headers={}, query={}, expected_status=None,
+                                 impersonate=None, require_impersonation=False):
         """
         Return a tuple (page content as string, URL handle).
 
         Arguments:
         url_or_request -- plain text URL as a string or
-            a urllib.request.Request object
+            a yt_dlp.networking.Request object
         video_id -- Video/playlist/item identifier (string)
 
         Keyword arguments:
@@ -873,13 +938,22 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
                   returning True if it should be accepted
             Note that this argument does not affect success status codes (2xx)
             which are always accepted.
+        impersonate -- the impersonate target. Can be any of the following entities:
+                - an instance of yt_dlp.networking.impersonate.ImpersonateTarget
+                - a string in the format of CLIENT[:OS]
+                - a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances
+                - a boolean value; True means any impersonate target is sufficient
+        require_impersonation -- flag to toggle whether the request should raise an error
+            if impersonation is not possible (bool, default: False)
         """
 
         # Strip hashes from the URL (#1038)
         if isinstance(url_or_request, str):
             url_or_request = url_or_request.partition('#')[0]
 
-        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
+        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data,
+                                     headers=headers, query=query, expected_status=expected_status,
+                                     impersonate=impersonate, require_impersonation=require_impersonation)
         if urlh is False:
             assert not fatal
             return False
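
A hypothetical call site following the parameter documentation above (URL and targets invented):

```python
def _real_extract(self, url):  # sketch of an extractor method
    video_id = self._match_id(url)
    webpage = self._download_webpage(
        url, video_id,
        impersonate=('chrome:windows-10', 'safari'),  # first available target is used
        require_impersonation=False)  # warn instead of raising when unavailable
    ...
```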
@@ -959,11 +1033,11 @@ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errno
         if prefix is not None:
             webpage_bytes = prefix + webpage_bytes
         if self.get_param('dump_intermediate_pages', False):
-            self.to_screen('Dumping request to ' + urlh.geturl())
+            self.to_screen('Dumping request to ' + urlh.url)
             dump = base64.b64encode(webpage_bytes).decode('ascii')
             self._downloader.to_screen(dump)
         if self.get_param('write_pages'):
-            filename = self._request_dump_filename(urlh.geturl(), video_id)
+            filename = self._request_dump_filename(urlh.url, video_id)
             self.to_screen(f'Saving request to {filename}')
             with open(filename, 'wb') as outf:
                 outf.write(webpage_bytes)
@@ -1008,20 +1082,23 @@ def parse(ie, content, *args, errnote=errnote, **kwargs):
             return getattr(ie, parser)(content, *args, **kwargs)
 
         def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
-                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
+                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
+                            impersonate=None, require_impersonation=False):
             res = self._download_webpage_handle(
                 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
-                data=data, headers=headers, query=query, expected_status=expected_status)
+                data=data, headers=headers, query=query, expected_status=expected_status,
+                impersonate=impersonate, require_impersonation=require_impersonation)
             if res is False:
                 return res
             content, urlh = res
             return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
 
         def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
-                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
+                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
+                             impersonate=None, require_impersonation=False):
             if self.get_param('load_pages'):
                 url_or_request = self._create_request(url_or_request, data, headers, query)
-                filename = self._request_dump_filename(url_or_request.full_url, video_id)
+                filename = self._request_dump_filename(url_or_request.url, video_id)
                 self.to_screen(f'Loading request from {filename}')
                 try:
                     with open(filename, 'rb') as dumpf:
@@ -1041,6 +1118,8 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote,
                 'headers': headers,
                 'query': query,
                 'expected_status': expected_status,
+                'impersonate': impersonate,
+                'require_impersonation': require_impersonation,
             }
             if parser is None:
                 kwargs.pop('transform_source')
@@ -1095,7 +1174,7 @@ def _download_webpage(
         while True:
             try:
                 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
-            except http.client.IncompleteRead as e:
+            except IncompleteRead as e:
                 try_count += 1
                 if try_count >= tries:
                     raise e
@@ -1281,45 +1360,51 @@ def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=Tr
         return clean_html(res)
 
     def _get_netrc_login_info(self, netrc_machine=None):
-        username = None
-        password = None
         netrc_machine = netrc_machine or self._NETRC_MACHINE
 
-        if self.get_param('usenetrc', False):
-            try:
-                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
-                if os.path.isdir(netrc_file):
-                    netrc_file = os.path.join(netrc_file, '.netrc')
-                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
-                if info is not None:
-                    username = info[0]
-                    password = info[2]
-                else:
-                    raise netrc.NetrcParseError(
-                        'No authenticators for %s' % netrc_machine)
-            except (OSError, netrc.NetrcParseError) as err:
-                self.report_warning(
-                    'parsing .netrc: %s' % error_to_compat_str(err))
+        cmd = self.get_param('netrc_cmd')
+        if cmd:
+            cmd = cmd.replace('{}', netrc_machine)
+            self.to_screen(f'Executing command: {cmd}')
+            stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
+            if ret != 0:
+                raise OSError(f'Command returned error code {ret}')
+            info = netrc_from_content(stdout).authenticators(netrc_machine)
+
+        elif self.get_param('usenetrc', False):
+            netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
+            if os.path.isdir(netrc_file):
+                netrc_file = os.path.join(netrc_file, '.netrc')
+            info = netrc.netrc(netrc_file).authenticators(netrc_machine)
 
-        return username, password
+        else:
+            return None, None
+        if not info:
+            self.to_screen(f'No authenticators for {netrc_machine}')
+            return None, None
+
+        self.write_debug(f'Using netrc for {netrc_machine} authentication')
+        return info[0], info[2]
 
     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
         """
         Get the login info as (username, password)
         First look for the manually specified credentials using username_option
         and password_option as keys in params dictionary. If no such credentials
-        available look in the netrc file using the netrc_machine or _NETRC_MACHINE
-        value.
+        are available try the netrc_cmd if it is defined or look in the
+        netrc file using the netrc_machine or _NETRC_MACHINE value.
         If there's no info available, return (None, None)
         """
 
-        # Attempt to use provided username and password or .netrc data
         username = self.get_param(username_option)
         if username is not None:
             password = self.get_param(password_option)
         else:
-            username, password = self._get_netrc_login_info(netrc_machine)
-
+            try:
+                username, password = self._get_netrc_login_info(netrc_machine)
+            except (OSError, netrc.NetrcParseError) as err:
+                self.report_warning(f'Failed to parse .netrc: {err}')
+                return None, None
         return username, password
 
     def _get_tfa_info(self, note='two-factor verification code'):
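
The command configured via --netrc-cmd has `{}` replaced by the machine name and must print netrc-formatted content on stdout; a sketch of how that output is consumed (credentials invented):

```python
from yt_dlp.utils import netrc_from_content

# Example stdout a --netrc-cmd command might produce:
content = 'machine example login myuser password hunter2'
info = netrc_from_content(content).authenticators('example')
username, password = info[0], info[2]  # mirrors the info[0]/info[2] access above
```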
@@ -1663,7 +1748,7 @@ def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal
     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
         rectx = re.escape(context_name)
-        FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
+        FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
         js, arg_keys, arg_vals = self._search_regex(
             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
@@ -1789,7 +1874,7 @@ def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=
             return []
 
         manifest, urlh = res
-        manifest_url = urlh.geturl()
+        manifest_url = urlh.url
 
         return self._parse_f4m_formats(
             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
@@ -1948,7 +2033,7 @@ def _extract_m3u8_formats_and_subtitles(
             return [], {}
 
         m3u8_doc, urlh = res
-        m3u8_url = urlh.geturl()
+        m3u8_url = urlh.url
 
         return self._parse_m3u8_formats_and_subtitles(
             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
@@ -1962,11 +2047,7 @@ def _parse_m3u8_formats_and_subtitles(
             errnote=None, fatal=True, data=None, headers={}, query={},
             video_id=None):
         formats, subtitles = [], {}
-
-        has_drm = re.search('|'.join([
-            r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
-            r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
-        ]), m3u8_doc)
+        has_drm = HlsFD._has_drm(m3u8_doc)
 
         def format_url(url):
             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
@@ -2205,7 +2286,9 @@ def _extract_mpd_vod_duration(
             mpd_url, video_id,
             note='Downloading MPD VOD manifest' if note is None else note,
             errnote='Failed to download VOD manifest' if errnote is None else errnote,
-            fatal=False, data=data, headers=headers, query=query) or {}
+            fatal=False, data=data, headers=headers, query=query)
+        if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
+            return None
         return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
 
     @staticmethod
@@ -2228,18 +2311,10 @@ def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4
         if res is False:
             assert not fatal
             return [], {}
-
         smil, urlh = res
-        smil_url = urlh.geturl()
 
-        namespace = self._parse_smil_namespace(smil)
-
-        fmts = self._parse_smil_formats(
-            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
-        subs = self._parse_smil_subtitles(
-            smil, namespace=namespace)
-
-        return fmts, subs
+        return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
+                                                      namespace=self._parse_smil_namespace(smil))
 
     def _extract_smil_formats(self, *args, **kwargs):
         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
@@ -2253,7 +2328,7 @@ def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
             return {}
 
         smil, urlh = res
-        smil_url = urlh.geturl()
+        smil_url = urlh.url
 
         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
 
@@ -2265,9 +2340,8 @@ def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
         namespace = self._parse_smil_namespace(smil)
 
-        formats = self._parse_smil_formats(
+        formats, subtitles = self._parse_smil_formats_and_subtitles(
             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
-        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
 
         video_id = os.path.splitext(url_basename(smil_url))[0]
         title = None
@@ -2306,7 +2380,14 @@ def _parse_smil_namespace(self, smil):
         return self._search_regex(
             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
 
-    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+    def _parse_smil_formats(self, *args, **kwargs):
+        fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
+        if subs:
+            self._report_ignoring_subs('SMIL')
+        return fmts
+
+    def _parse_smil_formats_and_subtitles(
+            self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
         base = smil_url
         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
             b = meta.get('base') or meta.get('httpBase')
@@ -2314,14 +2395,16 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para
                 base = b
                 break
 
-        formats = []
+        formats, subtitles = [], {}
         rtmp_count = 0
         http_count = 0
         m3u8_count = 0
         imgs_count = 0
 
         srcs = set()
-        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
+        media = itertools.chain.from_iterable(
+            smil.findall(self._xpath_ns(arg, namespace))
+            for arg in ['.//video', './/audio', './/media'])
         for medium in media:
             src = medium.get('src')
             if not src or src in srcs:
@@ -2362,8 +2445,9 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para
             src_url = src_url.strip()
 
             if proto == 'm3u8' or src_ext == 'm3u8':
-                m3u8_formats = self._extract_m3u8_formats(
+                m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+                self._merge_subtitles(m3u8_subs, target=subtitles)
                 if len(m3u8_formats) == 1:
                     m3u8_count += 1
                     m3u8_formats[0].update({
@@ -2384,11 +2468,15 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para
                 f4m_url += urllib.parse.urlencode(f4m_params)
                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
             elif src_ext == 'mpd':
-                formats.extend(self._extract_mpd_formats(
-                    src_url, video_id, mpd_id='dash', fatal=False))
+                mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
+                    src_url, video_id, mpd_id='dash', fatal=False)
+                formats.extend(mpd_formats)
+                self._merge_subtitles(mpd_subs, target=subtitles)
             elif re.search(r'\.ism/[Mm]anifest', src_url):
-                formats.extend(self._extract_ism_formats(
-                    src_url, video_id, ism_id='mss', fatal=False))
+                ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
+                    src_url, video_id, ism_id='mss', fatal=False)
+                formats.extend(ism_formats)
+                self._merge_subtitles(ism_subs, target=subtitles)
             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                 http_count += 1
                 formats.append({
@@ -2419,7 +2507,10 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para
                 'format_note': 'SMIL storyboards',
             })
 
-        return formats
+        smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
+        self._merge_subtitles(smil_subs, target=subtitles)
+
+        return formats, subtitles
 
     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
         urls = []
@@ -2445,7 +2536,7 @@ def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
             return []
 
         xspf, urlh = res
-        xspf_url = urlh.geturl()
+        xspf_url = urlh.url
 
         return self._parse_xspf(
             xspf, playlist_id, xspf_url=xspf_url,
@@ -2497,7 +2588,11 @@ def _extract_mpd_formats(self, *args, **kwargs):
             self._report_ignoring_subs('DASH')
         return fmts
 
-    def _extract_mpd_formats_and_subtitles(
+    def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
+        periods = self._extract_mpd_periods(*args, **kwargs)
+        return self._merge_mpd_periods(periods)
+
+    def _extract_mpd_periods(
             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
             fatal=True, data=None, headers={}, query={}):
 
@@ -2510,17 +2605,16 @@ def _extract_mpd_formats_and_subtitles(
             errnote='Failed to download MPD manifest' if errnote is None else errnote,
             fatal=fatal, data=data, headers=headers, query=query)
         if res is False:
-            return [], {}
+            return []
         mpd_doc, urlh = res
         if mpd_doc is None:
-            return [], {}
+            return []
 
         # We could have been redirected to a new url when we retrieved our mpd file.
-        mpd_url = urlh.geturl()
+        mpd_url = urlh.url
         mpd_base_url = base_url(mpd_url)
 
-        return self._parse_mpd_formats_and_subtitles(
-            mpd_doc, mpd_id, mpd_base_url, mpd_url)
+        return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
 
     def _parse_mpd_formats(self, *args, **kwargs):
         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
@@ -2528,8 +2622,39 @@ def _parse_mpd_formats(self, *args, **kwargs):
             self._report_ignoring_subs('DASH')
         return fmts
 
-    def _parse_mpd_formats_and_subtitles(
-            self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
+    def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
+        periods = self._parse_mpd_periods(*args, **kwargs)
+        return self._merge_mpd_periods(periods)
+
+    def _merge_mpd_periods(self, periods):
+        """
+        Combine all formats and subtitles from an MPD manifest into a single list,
+        by concatenating streams with similar formats.
+        """
+        formats, subtitles = {}, {}
+        for period in periods:
+            for f in period['formats']:
+                assert 'is_dash_periods' not in f, 'format already processed'
+                f['is_dash_periods'] = True
+                format_key = tuple(v for k, v in f.items() if k not in (
+                    ('format_id', 'fragments', 'manifest_stream_number')))
+                if format_key not in formats:
+                    formats[format_key] = f
+                elif 'fragments' in f:
+                    formats[format_key].setdefault('fragments', []).extend(f['fragments'])
+
+            if subtitles and period['subtitles']:
+                self.report_warning(bug_reports_message(
+                    'Found subtitles in multiple periods in the DASH manifest; '
+                    'if part of the subtitles are missing,'
+                ), only_once=True)
+
+            for sub_lang, sub_info in period['subtitles'].items():
+                subtitles.setdefault(sub_lang, []).extend(sub_info)
+
+        return list(formats.values()), subtitles
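
A sketch of what the merge achieves, with two invented periods carrying the same stream:

```python
# Illustrative input for _merge_mpd_periods; everything here is made up.
periods = [
    {'formats': [{'format_id': 'p0-video', 'url': 'https://example.com/v.mp4',
                  'fragments': [{'url': 'seg-0.m4s'}]}],
     'subtitles': {}},
    {'formats': [{'format_id': 'p1-video', 'url': 'https://example.com/v.mp4',
                  'fragments': [{'url': 'seg-1.m4s'}]}],
     'subtitles': {}},
]
# Both formats produce the same key once 'format_id', 'fragments' and
# 'manifest_stream_number' are dropped, so they collapse into one format
# whose 'fragments' list holds both segments, tagged is_dash_periods=True.
```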
+
+    def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
         """
         Parse formats from MPD manifest.
         References:
@@ -2608,9 +2733,13 @@ def extract_Initialization(source):
             return ms_info
 
         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
-        formats, subtitles = [], {}
         stream_numbers = collections.defaultdict(int)
-        for period in mpd_doc.findall(_add_ns('Period')):
+        for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
+            period_entry = {
+                'id': period.get('id', f'period-{period_idx}'),
+                'formats': [],
+                'subtitles': collections.defaultdict(list),
+            }
             period_duration = parse_duration(period.get('duration')) or mpd_duration
             period_ms_info = extract_multisegment_info(period, {
                 'start_number': 1,
@@ -2860,11 +2989,10 @@ def add_segment_url():
                     if content_type in ('video', 'audio', 'image/jpeg'):
                         f['manifest_stream_number'] = stream_numbers[f['url']]
                         stream_numbers[f['url']] += 1
-                        formats.append(f)
+                        period_entry['formats'].append(f)
                     elif content_type == 'text':
-                        subtitles.setdefault(lang or 'und', []).append(f)
-
-        return formats, subtitles
+                        period_entry['subtitles'][lang or 'und'].append(f)
+            yield period_entry
 
     def _extract_ism_formats(self, *args, **kwargs):
         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
@@ -2887,7 +3015,7 @@ def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, not
         if ism_doc is None:
             return [], {}
 
-        return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
+        return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
 
     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
         """
@@ -3440,7 +3568,7 @@ def _set_cookie(self, domain, name, value, expire_time=None, port=None,
 
     def _get_cookies(self, url):
         """ Return a http.cookies.SimpleCookie with the cookies for the url """
-        return LenientSimpleCookie(self._downloader._calc_cookies(url))
+        return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
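
`get_cookie_header` renders the jar as a Cookie header string, which LenientSimpleCookie parses back into morsels; a minimal sketch using the stdlib base class:

```python
from http.cookies import SimpleCookie  # LenientSimpleCookie is a tolerant subclass

cookies = SimpleCookie()
cookies.load('session=abc123; theme=dark')  # shape of a Cookie header value
assert cookies['session'].value == 'abc123'
```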
 
     def _apply_first_set_cookie_header(self, url_handle, cookie):
         """