yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import collections
   6 import hashlib
   7 import itertools
   8 import json
   9 import netrc
  10 import os
  11 import random
  12 import re
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar_Cookie,
  19     compat_cookies_SimpleCookie,
  20     compat_etree_Element,
  21     compat_etree_fromstring,
  22     compat_expanduser,
  23     compat_getpass,
  24     compat_http_client,
  25     compat_os_name,
  26     compat_str,
  27     compat_urllib_error,
  28     compat_urllib_parse_unquote,
  29     compat_urllib_parse_urlencode,
  30     compat_urllib_request,
  31     compat_urlparse,
  32     compat_xml_parse_error,
  33 )
  34 from ..downloader import FileDownloader
  35 from ..downloader.f4m import (
  36     get_base_url,
  37     remove_encrypted_media,
  38 )
  39 from ..utils import (
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     compiled_regex_type,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     encode_data_uri,
  49     error_to_compat_str,
  50     extract_attributes,
  51     ExtractorError,
  52     fix_xml_ampersands,
  53     float_or_none,
  54     format_field,
  55     GeoRestrictedError,
  56     GeoUtils,
  57     int_or_none,
  58     join_nonempty,
  59     js_to_json,
  60     JSON_LD_RE,
  61     mimetype2ext,
  62     network_exceptions,
  63     NO_DEFAULT,
  64     orderedSet,
  65     parse_bitrate,
  66     parse_codecs,
  67     parse_duration,
  68     parse_iso8601,
  69     parse_m3u8_attributes,
  70     parse_resolution,
  71     RegexNotFoundError,
  72     sanitize_filename,
  73     sanitized_Request,
  74     str_or_none,
  75     str_to_int,
  76     strip_or_none,
  77     traverse_obj,
  78     unescapeHTML,
  79     UnsupportedError,
  80     unified_strdate,
  81     unified_timestamp,
  82     update_Request,
  83     update_url_query,
  84     url_basename,
  85     url_or_none,
  86     urljoin,
  87     variadic,
  88     xpath_element,
  89     xpath_text,
  90     xpath_with_ns,
  91 )
  92
  93
  94 class InfoExtractor(object):
  95     """Information Extractor class.
  96
  97     Information extractors are the classes that, given a URL, extract
  98     information about the video (or videos) the URL refers to. This
  99     information includes the real video URL, the video title, author and
 100     others. The information is stored in a dictionary which is then
 101     passed to the YoutubeDL. The YoutubeDL processes this
 102     information possibly downloading the video to the file system, among
 103     other possible outcomes.
 104
 105     The type field determines the type of the result.
 106     By far the most common value (and the default if _type is missing) is
 107     "video", which indicates a single video.
 108
 109     For a video, the dictionaries must include the following fields:
 110
 111     id:             Video identifier.
 112     title:          Video title, unescaped.
 113
 114     Additionally, it must contain either a formats entry or a url one:
 115
 116     formats:        A list of dictionaries for each format available, ordered
 117                     from worst to best quality.
 118
 119                     Potential fields:
 120                     * url        The mandatory URL representing the media:
 121                                    for plain file media - HTTP URL of this file,
 122                                    for RTMP - RTMP URL,
 123                                    for HLS - URL of the M3U8 media playlist,
 124                                    for HDS - URL of the F4M manifest,
 125                                    for DASH
 126                                      - HTTP URL to plain file media (in case of
 127                                        unfragmented media)
 128                                      - URL of the MPD manifest or base URL
 129                                        representing the media if MPD manifest
 130                                        is parsed from a string (in case of
 131                                        fragmented media)
 132                                    for MSS - URL of the ISM manifest.
 133                     * manifest_url
 134                                  The URL of the manifest file in case of
 135                                  fragmented media:
 136                                    for HLS - URL of the M3U8 master playlist,
 137                                    for HDS - URL of the F4M manifest,
 138                                    for DASH - URL of the MPD manifest,
 139                                    for MSS - URL of the ISM manifest.
 140                     * ext        Will be calculated from URL if missing
 141                     * format     A human-readable description of the format
 142                                  ("mp4 container with h264/opus").
 143                                  Calculated from the format_id, width, height.
 144                                  and format_note fields if missing.
 145                     * format_id  A short description of the format
 146                                  ("mp4_h264_opus" or "19").
 147                                 Technically optional, but strongly recommended.
 148                     * format_note Additional info about the format
 149                                  ("3D" or "DASH video")
 150                     * width      Width of the video, if known
 151                     * height     Height of the video, if known
 152                     * resolution Textual description of width and height
 153                     * dynamic_range The dynamic range of the video. One of:
 154                                  "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
 155                     * tbr        Average bitrate of audio and video in KBit/s
 156                     * abr        Average audio bitrate in KBit/s
 157                     * acodec     Name of the audio codec in use
 158                     * asr        Audio sampling rate in Hertz
 159                     * vbr        Average video bitrate in KBit/s
 160                     * fps        Frame rate
 161                     * vcodec     Name of the video codec in use
 162                     * container  Name of the container format
 163                     * filesize   The number of bytes, if known in advance
 164                     * filesize_approx  An estimate for the number of bytes
 165                     * player_url SWF Player URL (used for rtmpdump).
 166                     * protocol   The protocol that will be used for the actual
 167                                  download, lower-case. One of "http", "https" or
 168                                  one of the protocols defined in downloader.PROTOCOL_MAP
 169                     * fragment_base_url
 170                                  Base URL for fragments. Each fragment's path
 171                                  value (if present) will be relative to
 172                                  this URL.
 173                     * fragments  A list of fragments of a fragmented media.
 174                                  Each fragment entry must contain either an url
 175                                  or a path. If an url is present it should be
 176                                  considered by a client. Otherwise both path and
 177                                  fragment_base_url must be present. Here is
 178                                  the list of all potential fields:
 179                                  * "url" - fragment's URL
 180                                  * "path" - fragment's path relative to
 181                                             fragment_base_url
 182                                  * "duration" (optional, int or float)
 183                                  * "filesize" (optional, int)
 184                     * is_from_start  Is a live format that can be downloaded
 185                                 from the start. Boolean
 186                     * preference Order number of this format. If this field is
 187                                  present and not None, the formats get sorted
 188                                  by this field, regardless of all other values.
 189                                  -1 for default (order by other properties),
 190                                  -2 or smaller for less than default.
 191                                  < -1000 to hide the format (if there is
 192                                     another one which is strictly better)
 193                     * language   Language code, e.g. "de" or "en-US".
 194                     * language_preference  Is this in the language mentioned in
 195                                  the URL?
 196                                  10 if it's what the URL is about,
 197                                  -1 for default (don't know),
 198                                  -10 otherwise, other values reserved for now.
 199                     * quality    Order number of the video quality of this
 200                                  format, irrespective of the file format.
 201                                  -1 for default (order by other properties),
 202                                  -2 or smaller for less than default.
 203                     * source_preference  Order number for this video source
 204                                   (quality takes higher priority)
 205                                  -1 for default (order by other properties),
 206                                  -2 or smaller for less than default.
 207                     * http_headers  A dictionary of additional HTTP headers
 208                                  to add to the request.
 209                     * stretched_ratio  If given and not 1, indicates that the
 210                                  video's pixels are not square.
 211                                  width : height ratio as float.
 212                     * no_resume  The server does not support resuming the
 213                                  (HTTP or RTMP) download. Boolean.
 214                     * has_drm    The format has DRM and cannot be downloaded. Boolean
 215                     * downloader_options  A dictionary of downloader options as
 216                                  described in FileDownloader
 217                     RTMP formats can also have the additional fields: page_url,
 218                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 219                     rtmp_protocol, rtmp_real_time
 220
 221     url:            Final video URL.
 222     ext:            Video filename extension.
 223     format:         The video format, defaults to ext (used for --get-format)
 224     player_url:     SWF Player URL (used for rtmpdump).
 225
 226     The following fields are optional:
 227
 228     alt_title:      A secondary title of the video.
 229     display_id      An alternative identifier for the video, not necessarily
 230                     unique, but available before title. Typically, id is
 231                     something like "4234987", title "Dancing naked mole rats",
 232                     and display_id "dancing-naked-mole-rats"
 233     thumbnails:     A list of dictionaries, with the following entries:
 234                         * "id" (optional, string) - Thumbnail format ID
 235                         * "url"
 236                         * "preference" (optional, int) - quality of the image
 237                         * "width" (optional, int)
 238                         * "height" (optional, int)
 239                         * "resolution" (optional, string "{width}x{height}",
 240                                         deprecated)
 241                         * "filesize" (optional, int)
 242     thumbnail:      Full URL to a video thumbnail image.
 243     description:    Full video description.
 244     uploader:       Full name of the video uploader.
 245     license:        License name the video is licensed under.
 246     creator:        The creator of the video.
 247     timestamp:      UNIX timestamp of the moment the video was uploaded
 248     upload_date:    Video upload date (YYYYMMDD).
 249                     If not explicitly set, calculated from timestamp
 250     release_timestamp: UNIX timestamp of the moment the video was released.
 251                     If it is not clear whether to use timestamp or this, use the former
 252     release_date:   The date (YYYYMMDD) when the video was released.
 253                     If not explicitly set, calculated from release_timestamp
 254     modified_timestamp: UNIX timestamp of the moment the video was last modified.
 255     modified_date:   The date (YYYYMMDD) when the video was last modified.
 256                     If not explicitly set, calculated from modified_timestamp
 257     uploader_id:    Nickname or id of the video uploader.
 258     uploader_url:   Full URL to a personal webpage of the video uploader.
 259     channel:        Full name of the channel the video is uploaded on.
 260                     Note that channel fields may or may not repeat uploader
 261                     fields. This depends on a particular extractor.
 262     channel_id:     Id of the channel.
 263     channel_url:    Full URL to a channel webpage.
 264     location:       Physical location where the video was filmed.
 265     subtitles:      The available subtitles as a dictionary in the format
 266                     {tag: subformats}. "tag" is usually a language code, and
 267                     "subformats" is a list sorted from lower to higher
 268                     preference, each element is a dictionary with the "ext"
 269                     entry and one of:
 270                         * "data": The subtitles file contents
 271                         * "url": A URL pointing to the subtitles file
 272                     It can optionally also have:
 273                         * "name": Name or description of the subtitles
 274                     "ext" will be calculated from URL if missing
 275     automatic_captions: Like 'subtitles'; contains automatically generated
 276                     captions instead of normal subtitles
 277     duration:       Length of the video in seconds, as an integer or float.
 278     view_count:     How many users have watched the video on the platform.
 279     like_count:     Number of positive ratings of the video
 280     dislike_count:  Number of negative ratings of the video
 281     repost_count:   Number of reposts of the video
 282     average_rating: Average rating give by users, the scale used depends on the webpage
 283     comment_count:  Number of comments on the video
 284     comments:       A list of comments, each with one or more of the following
 285                     properties (all but one of text or html optional):
 286                         * "author" - human-readable name of the comment author
 287                         * "author_id" - user ID of the comment author
 288                         * "author_thumbnail" - The thumbnail of the comment author
 289                         * "id" - Comment ID
 290                         * "html" - Comment as HTML
 291                         * "text" - Plain text of the comment
 292                         * "timestamp" - UNIX timestamp of comment
 293                         * "parent" - ID of the comment this one is replying to.
 294                                      Set to "root" to indicate that this is a
 295                                      comment to the original video.
 296                         * "like_count" - Number of positive ratings of the comment
 297                         * "dislike_count" - Number of negative ratings of the comment
 298                         * "is_favorited" - Whether the comment is marked as
 299                                            favorite by the video uploader
 300                         * "author_is_uploader" - Whether the comment is made by
 301                                                  the video uploader
 302     age_limit:      Age restriction for the video, as an integer (years)
 303     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 304                     should allow to get the same result again. (It will be set
 305                     by YoutubeDL if it's missing)
 306     categories:     A list of categories that the video falls in, for example
 307                     ["Sports", "Berlin"]
 308     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 309     cast:           A list of the video cast
 310     is_live:        True, False, or None (=unknown). Whether this video is a
 311                     live stream that goes on instead of a fixed-length video.
 312     was_live:       True, False, or None (=unknown). Whether this video was
 313                     originally a live stream.
 314     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 315                     If absent, automatically set from is_live, was_live
 316     start_time:     Time in seconds where the reproduction should start, as
 317                     specified in the URL.
 318     end_time:       Time in seconds where the reproduction should end, as
 319                     specified in the URL.
 320     chapters:       A list of dictionaries, with the following entries:
 321                         * "start_time" - The start time of the chapter in seconds
 322                         * "end_time" - The end time of the chapter in seconds
 323                         * "title" (optional, string)
 324     playable_in_embed: Whether this video is allowed to play in embedded
 325                     players on other sites. Can be True (=always allowed),
 326                     False (=never allowed), None (=unknown), or a string
 327                     specifying the criteria for embedability (Eg: 'whitelist')
 328     availability:   Under what condition the video is available. One of
 329                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 330                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 331                     to set it
 332     __post_extractor: A function to be called just before the metadata is
 333                     written to either disk, logger or console. The function
 334                     must return a dict which will be added to the info_dict.
 335                     This is usefull for additional information that is
 336                     time-consuming to extract. Note that the fields thus
 337                     extracted will not be available to output template and
 338                     match_filter. So, only "comments" and "comment_count" are
 339                     currently allowed to be extracted via this method.
 340
 341     The following fields should only be used when the video belongs to some logical
 342     chapter or section:
 343
 344     chapter:        Name or title of the chapter the video belongs to.
 345     chapter_number: Number of the chapter the video belongs to, as an integer.
 346     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 347
 348     The following fields should only be used when the video is an episode of some
 349     series, programme or podcast:
 350
 351     series:         Title of the series or programme the video episode belongs to.
 352     series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
 353     season:         Title of the season the video episode belongs to.
 354     season_number:  Number of the season the video episode belongs to, as an integer.
 355     season_id:      Id of the season the video episode belongs to, as a unicode string.
 356     episode:        Title of the video episode. Unlike mandatory video title field,
 357                     this field should denote the exact title of the video episode
 358                     without any kind of decoration.
 359     episode_number: Number of the video episode within a season, as an integer.
 360     episode_id:     Id of the video episode, as a unicode string.
 361
 362     The following fields should only be used when the media is a track or a part of
 363     a music album:
 364
 365     track:          Title of the track.
 366     track_number:   Number of the track within an album or a disc, as an integer.
 367     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 368                     as a unicode string.
 369     artist:         Artist(s) of the track.
 370     genre:          Genre(s) of the track.
 371     album:          Title of the album the track belongs to.
 372     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 373     album_artist:   List of all artists appeared on the album (e.g.
 374                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 375                     and compilations).
 376     disc_number:    Number of the disc or other physical medium the track belongs to,
 377                     as an integer.
 378     release_year:   Year (YYYY) when the album was released.
 379     composer:       Composer of the piece
 380
 381     Unless mentioned otherwise, the fields should be Unicode strings.
 382
 383     Unless mentioned otherwise, None is equivalent to absence of information.
 384
 385
 386     _type "playlist" indicates multiple videos.
 387     There must be a key "entries", which is a list, an iterable, or a PagedList
 388     object, each element of which is a valid dictionary by this specification.
 389
 390     Additionally, playlists can have "id", "title", and any other relevent
 391     attributes with the same semantics as videos (see above).
 392
 393     It can also have the following optional fields:
 394
 395     playlist_count: The total number of videos in a playlist. If not given,
 396                     YoutubeDL tries to calculate it from "entries"
 397
 398
 399     _type "multi_video" indicates that there are multiple videos that
 400     form a single show, for examples multiple acts of an opera or TV episode.
 401     It must have an entries key like a playlist and contain all the keys
 402     required for a video at the same time.
 403
 404
 405     _type "url" indicates that the video must be extracted from another
 406     location, possibly by a different extractor. Its only required key is:
 407     "url" - the next URL to extract.
 408     The key "ie_key" can be set to the class name (minus the trailing "IE",
 409     e.g. "Youtube") if the extractor class is known in advance.
 410     Additionally, the dictionary may have any properties of the resolved entity
 411     known in advance, for example "title" if the title of the referred video is
 412     known ahead of time.
 413
 414
 415     _type "url_transparent" entities have the same specification as "url", but
 416     indicate that the given additional information is more precise than the one
 417     associated with the resolved URL.
 418     This is useful when a site employs a video service that hosts the video and
 419     its technical metadata, but that video service does not embed a useful
 420     title, description etc.
 421
 422
 423     Subclasses of this one should re-define the _real_initialize() and
 424     _real_extract() methods and define a _VALID_URL regexp.
 425     Probably, they should also be added to the list of extractors.
 426
 427     Subclasses may also override suitable() if necessary, but ensure the function
 428     signature is preserved and that this function imports everything it needs
 429     (except other extractors), so that lazy_extractors works correctly
 430
 431     _GEO_BYPASS attribute may be set to False in order to disable
 432     geo restriction bypass mechanisms for a particular extractor.
 433     Though it won't disable explicit geo restriction bypass based on
 434     country code provided with geo_bypass_country.
 435
 436     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 437     countries for this extractor. One of these countries will be used by
 438     geo restriction bypass mechanism right away in order to bypass
 439     geo restriction, of course, if the mechanism is not disabled.
 440
 441     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 442     IP blocks in CIDR notation for this extractor. One of these IP blocks
 443     will be used by geo restriction bypass mechanism similarly
 444     to _GEO_COUNTRIES.
 445
 446     The _WORKING attribute should be set to False for broken IEs
 447     in order to warn the users and skip the tests.
 448     """
 449
 450     _ready = False
 451     _downloader = None
 452     _x_forwarded_for_ip = None
 453     _GEO_BYPASS = True
 454     _GEO_COUNTRIES = None
 455     _GEO_IP_BLOCKS = None
 456     _WORKING = True
 457
 458     _LOGIN_HINTS = {
 459         'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
 460         'cookies': (
 461             'Use --cookies-from-browser or --cookies for the authentication. '
 462             'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to manually pass cookies'),
 463         'password': 'Use --username and --password, or --netrc to provide account credentials',
 464     }
 465
 466     def __init__(self, downloader=None):
 467         """Constructor. Receives an optional downloader (a YoutubeDL instance).
 468         If a downloader is not passed during initialization,
 469         it must be set using "set_downloader()" before "extract()" is called"""
 470         self._ready = False
 471         self._x_forwarded_for_ip = None
 472         self._printed_messages = set()
 473         self.set_downloader(downloader)
 474
 475     @classmethod
 476     def _match_valid_url(cls, url):
 477         # This does not use has/getattr intentionally - we want to know whether
 478         # we have cached the regexp for *this* class, whereas getattr would also
 479         # match the superclass
 480         if '_VALID_URL_RE' not in cls.__dict__:
 481             if '_VALID_URL' not in cls.__dict__:
 482                 cls._VALID_URL = cls._make_valid_url()
 483             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 484         return cls._VALID_URL_RE.match(url)
 485
 486     @classmethod
 487     def suitable(cls, url):
 488         """Receives a URL and returns True if suitable for this IE."""
 489         # This function must import everything it needs (except other extractors),
 490         # so that lazy_extractors works correctly
 491         return cls._match_valid_url(url) is not None
 492
 493     @classmethod
 494     def _match_id(cls, url):
 495         return cls._match_valid_url(url).group('id')
 496
 497     @classmethod
 498     def get_temp_id(cls, url):
 499         try:
 500             return cls._match_id(url)
 501         except (IndexError, AttributeError):
 502             return None
 503
 504     @classmethod
 505     def working(cls):
 506         """Getter method for _WORKING."""
 507         return cls._WORKING
 508
 509     def initialize(self):
 510         """Initializes an instance (authentication, etc)."""
 511         self._printed_messages = set()
 512         self._initialize_geo_bypass({
 513             'countries': self._GEO_COUNTRIES,
 514             'ip_blocks': self._GEO_IP_BLOCKS,
 515         })
 516         if not self._ready:
 517             self._real_initialize()
 518             self._ready = True
 519
 520     def _initialize_geo_bypass(self, geo_bypass_context):
 521         """
 522         Initialize geo restriction bypass mechanism.
 523
 524         This method is used to initialize geo bypass mechanism based on faking
 525         X-Forwarded-For HTTP header. A random country from provided country list
 526         is selected and a random IP belonging to this country is generated. This
 527         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 528         HTTP requests.
 529
 530         This method will be used for initial geo bypass mechanism initialization
 531         during the instance initialization with _GEO_COUNTRIES and
 532         _GEO_IP_BLOCKS.
 533
 534         You may also manually call it from extractor's code if geo bypass
 535         information is not available beforehand (e.g. obtained during
 536         extraction) or due to some other reason. In this case you should pass
 537         this information in geo bypass context passed as first argument. It may
 538         contain following fields:
 539
 540         countries:  List of geo unrestricted countries (similar
 541                     to _GEO_COUNTRIES)
 542         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 543                     (similar to _GEO_IP_BLOCKS)
 544
 545         """
 546         if not self._x_forwarded_for_ip:
 547
 548             # Geo bypass mechanism is explicitly disabled by user
 549             if not self.get_param('geo_bypass', True):
 550                 return
 551
 552             if not geo_bypass_context:
 553                 geo_bypass_context = {}
 554
 555             # Backward compatibility: previously _initialize_geo_bypass
 556             # expected a list of countries, some 3rd party code may still use
 557             # it this way
 558             if isinstance(geo_bypass_context, (list, tuple)):
 559                 geo_bypass_context = {
 560                     'countries': geo_bypass_context,
 561                 }
 562
 563             # The whole point of geo bypass mechanism is to fake IP
 564             # as X-Forwarded-For HTTP header based on some IP block or
 565             # country code.
 566
 567             # Path 1: bypassing based on IP block in CIDR notation
 568
 569             # Explicit IP block specified by user, use it right away
 570             # regardless of whether extractor is geo bypassable or not
 571             ip_block = self.get_param('geo_bypass_ip_block', None)
 572
 573             # Otherwise use random IP block from geo bypass context but only
 574             # if extractor is known as geo bypassable
 575             if not ip_block:
 576                 ip_blocks = geo_bypass_context.get('ip_blocks')
 577                 if self._GEO_BYPASS and ip_blocks:
 578                     ip_block = random.choice(ip_blocks)
 579
 580             if ip_block:
 581                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 582                 self._downloader.write_debug(
 583                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 584                 return
 585
 586             # Path 2: bypassing based on country code
 587
 588             # Explicit country code specified by user, use it right away
 589             # regardless of whether extractor is geo bypassable or not
 590             country = self.get_param('geo_bypass_country', None)
 591
 592             # Otherwise use random country code from geo bypass context but
 593             # only if extractor is known as geo bypassable
 594             if not country:
 595                 countries = geo_bypass_context.get('countries')
 596                 if self._GEO_BYPASS and countries:
 597                     country = random.choice(countries)
 598
 599             if country:
 600                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 601                 self._downloader.write_debug(
 602                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 603
 604     def extract(self, url):
 605         """Extracts URL information and returns it in list of dicts."""
 606         try:
 607             for _ in range(2):
 608                 try:
 609                     self.initialize()
 610                     self.write_debug('Extracting URL: %s' % url)
 611                     ie_result = self._real_extract(url)
 612                     if ie_result is None:
 613                         return None
 614                     if self._x_forwarded_for_ip:
 615                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 616                     subtitles = ie_result.get('subtitles')
 617                     if (subtitles and 'live_chat' in subtitles
 618                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 619                         del subtitles['live_chat']
 620                     return ie_result
 621                 except GeoRestrictedError as e:
 622                     if self.__maybe_fake_ip_and_retry(e.countries):
 623                         continue
 624                     raise
 625         except UnsupportedError:
 626             raise
 627         except ExtractorError as e:
 628             kwargs = {
 629                 'video_id': e.video_id or self.get_temp_id(url),
 630                 'ie': self.IE_NAME,
 631                 'tb': e.traceback or sys.exc_info()[2],
 632                 'expected': e.expected,
 633                 'cause': e.cause
 634             }
 635             if hasattr(e, 'countries'):
 636                 kwargs['countries'] = e.countries
 637             raise type(e)(e.msg, **kwargs)
 638         except compat_http_client.IncompleteRead as e:
 639             raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
 640         except (KeyError, StopIteration) as e:
 641             raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
 642
 643     def __maybe_fake_ip_and_retry(self, countries):
 644         if (not self.get_param('geo_bypass_country', None)
 645                 and self._GEO_BYPASS
 646                 and self.get_param('geo_bypass', True)
 647                 and not self._x_forwarded_for_ip
 648                 and countries):
 649             country_code = random.choice(countries)
 650             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 651             if self._x_forwarded_for_ip:
 652                 self.report_warning(
 653                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 654                     % (self._x_forwarded_for_ip, country_code.upper()))
 655                 return True
 656         return False
 657
 658     def set_downloader(self, downloader):
 659         """Sets the downloader for this IE."""
 660         self._downloader = downloader
 661
 662     def _real_initialize(self):
 663         """Real initialization process. Redefine in subclasses."""
 664         pass
 665
 666     def _real_extract(self, url):
 667         """Real extraction process. Redefine in subclasses."""
 668         pass
 669
 670     @classmethod
 671     def ie_key(cls):
 672         """A string for getting the InfoExtractor with get_info_extractor"""
 673         return cls.__name__[:-2]
 674
 675     @property
 676     def IE_NAME(self):
 677         return compat_str(type(self).__name__[:-2])
 678
 679     @staticmethod
 680     def __can_accept_status_code(err, expected_status):
 681         assert isinstance(err, compat_urllib_error.HTTPError)
 682         if expected_status is None:
 683             return False
 684         elif callable(expected_status):
 685             return expected_status(err.code) is True
 686         else:
 687             return err.code in variadic(expected_status)
 688
 689     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 690         """
 691         Return the response handle.
 692
 693         See _download_webpage docstring for arguments specification.
 694         """
 695         if not self._downloader._first_webpage_request:
 696             sleep_interval = self.get_param('sleep_interval_requests') or 0
 697             if sleep_interval > 0:
 698                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 699                 time.sleep(sleep_interval)
 700         else:
 701             self._downloader._first_webpage_request = False
 702
 703         if note is None:
 704             self.report_download_webpage(video_id)
 705         elif note is not False:
 706             if video_id is None:
 707                 self.to_screen('%s' % (note,))
 708             else:
 709                 self.to_screen('%s: %s' % (video_id, note))
 710
 711         # Some sites check X-Forwarded-For HTTP header in order to figure out
 712         # the origin of the client behind proxy. This allows bypassing geo
 713         # restriction by faking this header's value to IP that belongs to some
 714         # geo unrestricted country. We will do so once we encounter any
 715         # geo restriction error.
 716         if self._x_forwarded_for_ip:
 717             if 'X-Forwarded-For' not in headers:
 718                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 719
 720         if isinstance(url_or_request, compat_urllib_request.Request):
 721             url_or_request = update_Request(
 722                 url_or_request, data=data, headers=headers, query=query)
 723         else:
 724             if query:
 725                 url_or_request = update_url_query(url_or_request, query)
 726             if data is not None or headers:
 727                 url_or_request = sanitized_Request(url_or_request, data, headers)
 728         try:
 729             return self._downloader.urlopen(url_or_request)
 730         except network_exceptions as err:
 731             if isinstance(err, compat_urllib_error.HTTPError):
 732                 if self.__can_accept_status_code(err, expected_status):
 733                     # Retain reference to error to prevent file object from
 734                     # being closed before it can be read. Works around the
 735                     # effects of <https://bugs.python.org/issue15002>
 736                     # introduced in Python 3.4.1.
 737                     err.fp._error = err
 738                     return err.fp
 739
 740             if errnote is False:
 741                 return False
 742             if errnote is None:
 743                 errnote = 'Unable to download webpage'
 744
 745             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 746             if fatal:
 747                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 748             else:
 749                 self.report_warning(errmsg)
 750                 return False
 751
 752     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 753         """
 754         Return a tuple (page content as string, URL handle).
 755
 756         See _download_webpage docstring for arguments specification.
 757         """
 758         # Strip hashes from the URL (#1038)
 759         if isinstance(url_or_request, (compat_str, str)):
 760             url_or_request = url_or_request.partition('#')[0]
 761
 762         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 763         if urlh is False:
 764             assert not fatal
 765             return False
 766         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 767         return (content, urlh)
 768
 769     @staticmethod
 770     def _guess_encoding_from_content(content_type, webpage_bytes):
 771         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 772         if m:
 773             encoding = m.group(1)
 774         else:
 775             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 776                           webpage_bytes[:1024])
 777             if m:
 778                 encoding = m.group(1).decode('ascii')
 779             elif webpage_bytes.startswith(b'\xff\xfe'):
 780                 encoding = 'utf-16'
 781             else:
 782                 encoding = 'utf-8'
 783
 784         return encoding
 785
 786     def __check_blocked(self, content):
 787         first_block = content[:512]
 788         if ('<title>Access to this site is blocked</title>' in content
 789                 and 'Websense' in first_block):
 790             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 791             blocked_iframe = self._html_search_regex(
 792                 r'<iframe src="([^"]+)"', content,
 793                 'Websense information URL', default=None)
 794             if blocked_iframe:
 795                 msg += ' Visit %s for more details' % blocked_iframe
 796             raise ExtractorError(msg, expected=True)
 797         if '<title>The URL you requested has been blocked</title>' in first_block:
 798             msg = (
 799                 'Access to this webpage has been blocked by Indian censorship. '
 800                 'Use a VPN or proxy server (with --proxy) to route around it.')
 801             block_msg = self._html_search_regex(
 802                 r'</h1><p>(.*?)</p>',
 803                 content, 'block message', default=None)
 804             if block_msg:
 805                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 806             raise ExtractorError(msg, expected=True)
 807         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 808                 and 'blocklist.rkn.gov.ru' in content):
 809             raise ExtractorError(
 810                 'Access to this webpage has been blocked by decision of the Russian government. '
 811                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 812                 expected=True)
 813
 814     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 815         content_type = urlh.headers.get('Content-Type', '')
 816         webpage_bytes = urlh.read()
 817         if prefix is not None:
 818             webpage_bytes = prefix + webpage_bytes
 819         if not encoding:
 820             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 821         if self.get_param('dump_intermediate_pages', False):
 822             self.to_screen('Dumping request to ' + urlh.geturl())
 823             dump = base64.b64encode(webpage_bytes).decode('ascii')
 824             self._downloader.to_screen(dump)
 825         if self.get_param('write_pages', False):
 826             basen = '%s_%s' % (video_id, urlh.geturl())
 827             trim_length = self.get_param('trim_file_name') or 240
 828             if len(basen) > trim_length:
 829                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 830                 basen = basen[:trim_length - len(h)] + h
 831             raw_filename = basen + '.dump'
 832             filename = sanitize_filename(raw_filename, restricted=True)
 833             self.to_screen('Saving request to ' + filename)
 834             # Working around MAX_PATH limitation on Windows (see
 835             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 836             if compat_os_name == 'nt':
 837                 absfilepath = os.path.abspath(filename)
 838                 if len(absfilepath) > 259:
 839                     filename = '\\\\?\\' + absfilepath
 840             with open(filename, 'wb') as outf:
 841                 outf.write(webpage_bytes)
 842
 843         try:
 844             content = webpage_bytes.decode(encoding, 'replace')
 845         except LookupError:
 846             content = webpage_bytes.decode('utf-8', 'replace')
 847
 848         self.__check_blocked(content)
 849
 850         return content
 851
 852     def _download_webpage(
 853             self, url_or_request, video_id, note=None, errnote=None,
 854             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 855             headers={}, query={}, expected_status=None):
 856         """
 857         Return the data of the page as a string.
 858
 859         Arguments:
 860         url_or_request -- plain text URL as a string or
 861             a compat_urllib_request.Requestobject
 862         video_id -- Video/playlist/item identifier (string)
 863
 864         Keyword arguments:
 865         note -- note printed before downloading (string)
 866         errnote -- note printed in case of an error (string)
 867         fatal -- flag denoting whether error should be considered fatal,
 868             i.e. whether it should cause ExtractionError to be raised,
 869             otherwise a warning will be reported and extraction continued
 870         tries -- number of tries
 871         timeout -- sleep interval between tries
 872         encoding -- encoding for a page content decoding, guessed automatically
 873             when not explicitly specified
 874         data -- POST data (bytes)
 875         headers -- HTTP headers (dict)
 876         query -- URL query (dict)
 877         expected_status -- allows to accept failed HTTP requests (non 2xx
 878             status code) by explicitly specifying a set of accepted status
 879             codes. Can be any of the following entities:
 880                 - an integer type specifying an exact failed status code to
 881                   accept
 882                 - a list or a tuple of integer types specifying a list of
 883                   failed status codes to accept
 884                 - a callable accepting an actual failed status code and
 885                   returning True if it should be accepted
 886             Note that this argument does not affect success status codes (2xx)
 887             which are always accepted.
 888         """
 889
 890         success = False
 891         try_count = 0
 892         while success is False:
 893             try:
 894                 res = self._download_webpage_handle(
 895                     url_or_request, video_id, note, errnote, fatal,
 896                     encoding=encoding, data=data, headers=headers, query=query,
 897                     expected_status=expected_status)
 898                 success = True
 899             except compat_http_client.IncompleteRead as e:
 900                 try_count += 1
 901                 if try_count >= tries:
 902                     raise e
 903                 self._sleep(timeout, video_id)
 904         if res is False:
 905             return res
 906         else:
 907             content, _ = res
 908             return content
 909
 910     def _download_xml_handle(
 911             self, url_or_request, video_id, note='Downloading XML',
 912             errnote='Unable to download XML', transform_source=None,
 913             fatal=True, encoding=None, data=None, headers={}, query={},
 914             expected_status=None):
 915         """
 916         Return a tuple (xml as an compat_etree_Element, URL handle).
 917
 918         See _download_webpage docstring for arguments specification.
 919         """
 920         res = self._download_webpage_handle(
 921             url_or_request, video_id, note, errnote, fatal=fatal,
 922             encoding=encoding, data=data, headers=headers, query=query,
 923             expected_status=expected_status)
 924         if res is False:
 925             return res
 926         xml_string, urlh = res
 927         return self._parse_xml(
 928             xml_string, video_id, transform_source=transform_source,
 929             fatal=fatal), urlh
 930
 931     def _download_xml(
 932             self, url_or_request, video_id,
 933             note='Downloading XML', errnote='Unable to download XML',
 934             transform_source=None, fatal=True, encoding=None,
 935             data=None, headers={}, query={}, expected_status=None):
 936         """
 937         Return the xml as an compat_etree_Element.
 938
 939         See _download_webpage docstring for arguments specification.
 940         """
 941         res = self._download_xml_handle(
 942             url_or_request, video_id, note=note, errnote=errnote,
 943             transform_source=transform_source, fatal=fatal, encoding=encoding,
 944             data=data, headers=headers, query=query,
 945             expected_status=expected_status)
 946         return res if res is False else res[0]
 947
 948     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 949         if transform_source:
 950             xml_string = transform_source(xml_string)
 951         try:
 952             return compat_etree_fromstring(xml_string.encode('utf-8'))
 953         except compat_xml_parse_error as ve:
 954             errmsg = '%s: Failed to parse XML ' % video_id
 955             if fatal:
 956                 raise ExtractorError(errmsg, cause=ve)
 957             else:
 958                 self.report_warning(errmsg + str(ve))
 959
 960     def _download_json_handle(
 961             self, url_or_request, video_id, note='Downloading JSON metadata',
 962             errnote='Unable to download JSON metadata', transform_source=None,
 963             fatal=True, encoding=None, data=None, headers={}, query={},
 964             expected_status=None):
 965         """
 966         Return a tuple (JSON object, URL handle).
 967
 968         See _download_webpage docstring for arguments specification.
 969         """
 970         res = self._download_webpage_handle(
 971             url_or_request, video_id, note, errnote, fatal=fatal,
 972             encoding=encoding, data=data, headers=headers, query=query,
 973             expected_status=expected_status)
 974         if res is False:
 975             return res
 976         json_string, urlh = res
 977         return self._parse_json(
 978             json_string, video_id, transform_source=transform_source,
 979             fatal=fatal), urlh
 980
 981     def _download_json(
 982             self, url_or_request, video_id, note='Downloading JSON metadata',
 983             errnote='Unable to download JSON metadata', transform_source=None,
 984             fatal=True, encoding=None, data=None, headers={}, query={},
 985             expected_status=None):
 986         """
 987         Return the JSON object as a dict.
 988
 989         See _download_webpage docstring for arguments specification.
 990         """
 991         res = self._download_json_handle(
 992             url_or_request, video_id, note=note, errnote=errnote,
 993             transform_source=transform_source, fatal=fatal, encoding=encoding,
 994             data=data, headers=headers, query=query,
 995             expected_status=expected_status)
 996         return res if res is False else res[0]
 997
 998     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 999         if transform_source:
1000             json_string = transform_source(json_string)
1001         try:
1002             return json.loads(json_string)
1003         except ValueError as ve:
1004             errmsg = '%s: Failed to parse JSON ' % video_id
1005             if fatal:
1006                 raise ExtractorError(errmsg, cause=ve)
1007             else:
1008                 self.report_warning(errmsg + str(ve))
1009
1010     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
1011         return self._parse_json(
1012             data[data.find('{'):data.rfind('}') + 1],
1013             video_id, transform_source, fatal)
1014
1015     def _download_socket_json_handle(
1016             self, url_or_request, video_id, note='Polling socket',
1017             errnote='Unable to poll socket', transform_source=None,
1018             fatal=True, encoding=None, data=None, headers={}, query={},
1019             expected_status=None):
1020         """
1021         Return a tuple (JSON object, URL handle).
1022
1023         See _download_webpage docstring for arguments specification.
1024         """
1025         res = self._download_webpage_handle(
1026             url_or_request, video_id, note, errnote, fatal=fatal,
1027             encoding=encoding, data=data, headers=headers, query=query,
1028             expected_status=expected_status)
1029         if res is False:
1030             return res
1031         webpage, urlh = res
1032         return self._parse_socket_response_as_json(
1033             webpage, video_id, transform_source=transform_source,
1034             fatal=fatal), urlh
1035
1036     def _download_socket_json(
1037             self, url_or_request, video_id, note='Polling socket',
1038             errnote='Unable to poll socket', transform_source=None,
1039             fatal=True, encoding=None, data=None, headers={}, query={},
1040             expected_status=None):
1041         """
1042         Return the JSON object as a dict.
1043
1044         See _download_webpage docstring for arguments specification.
1045         """
1046         res = self._download_socket_json_handle(
1047             url_or_request, video_id, note=note, errnote=errnote,
1048             transform_source=transform_source, fatal=fatal, encoding=encoding,
1049             data=data, headers=headers, query=query,
1050             expected_status=expected_status)
1051         return res if res is False else res[0]
1052
1053     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1054         idstr = format_field(video_id, template='%s: ')
1055         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1056         if only_once:
1057             if f'WARNING: {msg}' in self._printed_messages:
1058                 return
1059             self._printed_messages.add(f'WARNING: {msg}')
1060         self._downloader.report_warning(msg, *args, **kwargs)
1061
1062     def to_screen(self, msg, *args, **kwargs):
1063         """Print msg to screen, prefixing it with '[ie_name]'"""
1064         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1065
1066     def write_debug(self, msg, *args, **kwargs):
1067         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1068
1069     def get_param(self, name, default=None, *args, **kwargs):
1070         if self._downloader:
1071             return self._downloader.params.get(name, default, *args, **kwargs)
1072         return default
1073
1074     def report_drm(self, video_id, partial=False):
1075         self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1076
1077     def report_extraction(self, id_or_name):
1078         """Report information extraction."""
1079         self.to_screen('%s: Extracting information' % id_or_name)
1080
1081     def report_download_webpage(self, video_id):
1082         """Report webpage download."""
1083         self.to_screen('%s: Downloading webpage' % video_id)
1084
1085     def report_age_confirmation(self):
1086         """Report attempt to confirm age."""
1087         self.to_screen('Confirming age')
1088
1089     def report_login(self):
1090         """Report attempt to log in."""
1091         self.to_screen('Logging in')
1092
1093     def raise_login_required(
1094             self, msg='This video is only available for registered users',
1095             metadata_available=False, method='any'):
1096         if metadata_available and (
1097                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1098             self.report_warning(msg)
1099         if method is not None:
1100             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1101         raise ExtractorError(msg, expected=True)
1102
1103     def raise_geo_restricted(
1104             self, msg='This video is not available from your location due to geo restriction',
1105             countries=None, metadata_available=False):
1106         if metadata_available and (
1107                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1108             self.report_warning(msg)
1109         else:
1110             raise GeoRestrictedError(msg, countries=countries)
1111
1112     def raise_no_formats(self, msg, expected=False, video_id=None):
1113         if expected and (
1114                 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1115             self.report_warning(msg, video_id)
1116         elif isinstance(msg, ExtractorError):
1117             raise msg
1118         else:
1119             raise ExtractorError(msg, expected=expected, video_id=video_id)
1120
1121     # Methods for following #608
1122     @staticmethod
1123     def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
1124         """Returns a URL that points to a page that should be processed"""
1125         # TODO: ie should be the class used for getting the info
1126         video_info = {'_type': 'url',
1127                       'url': url,
1128                       'ie_key': ie}
1129         video_info.update(kwargs)
1130         if video_id is not None:
1131             video_info['id'] = video_id
1132         if video_title is not None:
1133             video_info['title'] = video_title
1134         return video_info
1135
1136     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1137         urls = orderedSet(
1138             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1139             for m in matches)
1140         return self.playlist_result(
1141             urls, playlist_id=playlist_id, playlist_title=playlist_title)
1142
1143     @staticmethod
1144     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1145         """Returns a playlist"""
1146         video_info = {'_type': 'playlist',
1147                       'entries': entries}
1148         video_info.update(kwargs)
1149         if playlist_id:
1150             video_info['id'] = playlist_id
1151         if playlist_title:
1152             video_info['title'] = playlist_title
1153         if playlist_description is not None:
1154             video_info['description'] = playlist_description
1155         return video_info
1156
1157     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1158         """
1159         Perform a regex search on the given string, using a single or a list of
1160         patterns returning the first matching group.
1161         In case of failure return a default value or raise a WARNING or a
1162         RegexNotFoundError, depending on fatal, specifying the field name.
1163         """
1164         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1165             mobj = re.search(pattern, string, flags)
1166         else:
1167             for p in pattern:
1168                 mobj = re.search(p, string, flags)
1169                 if mobj:
1170                     break
1171
1172         _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1173
1174         if mobj:
1175             if group is None:
1176                 # return the first matching group
1177                 return next(g for g in mobj.groups() if g is not None)
1178             elif isinstance(group, (list, tuple)):
1179                 return tuple(mobj.group(g) for g in group)
1180             else:
1181                 return mobj.group(group)
1182         elif default is not NO_DEFAULT:
1183             return default
1184         elif fatal:
1185             raise RegexNotFoundError('Unable to extract %s' % _name)
1186         else:
1187             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1188             return None
1189
1190     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1191         """
1192         Like _search_regex, but strips HTML tags and unescapes entities.
1193         """
1194         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1195         if res:
1196             return clean_html(res).strip()
1197         else:
1198             return res
1199
1200     def _get_netrc_login_info(self, netrc_machine=None):
1201         username = None
1202         password = None
1203         netrc_machine = netrc_machine or self._NETRC_MACHINE
1204
1205         if self.get_param('usenetrc', False):
1206             try:
1207                 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1208                 if os.path.isdir(netrc_file):
1209                     netrc_file = os.path.join(netrc_file, '.netrc')
1210                 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1211                 if info is not None:
1212                     username = info[0]
1213                     password = info[2]
1214                 else:
1215                     raise netrc.NetrcParseError(
1216                         'No authenticators for %s' % netrc_machine)
1217             except (IOError, netrc.NetrcParseError) as err:
1218                 self.report_warning(
1219                     'parsing .netrc: %s' % error_to_compat_str(err))
1220
1221         return username, password
1222
1223     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1224         """
1225         Get the login info as (username, password)
1226         First look for the manually specified credentials using username_option
1227         and password_option as keys in params dictionary. If no such credentials
1228         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1229         value.
1230         If there's no info available, return (None, None)
1231         """
1232
1233         # Attempt to use provided username and password or .netrc data
1234         username = self.get_param(username_option)
1235         if username is not None:
1236             password = self.get_param(password_option)
1237         else:
1238             username, password = self._get_netrc_login_info(netrc_machine)
1239
1240         return username, password
1241
1242     def _get_tfa_info(self, note='two-factor verification code'):
1243         """
1244         Get the two-factor authentication info
1245         TODO - asking the user will be required for sms/phone verify
1246         currently just uses the command line option
1247         If there's no info available, return None
1248         """
1249
1250         tfa = self.get_param('twofactor')
1251         if tfa is not None:
1252             return tfa
1253
1254         return compat_getpass('Type %s and press [Return]: ' % note)
1255
1256     # Helper functions for extracting OpenGraph info
1257     @staticmethod
1258     def _og_regexes(prop):
1259         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1260         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1261                        % {'prop': re.escape(prop)})
1262         template = r'<meta[^>]+?%s[^>]+?%s'
1263         return [
1264             template % (property_re, content_re),
1265             template % (content_re, property_re),
1266         ]
1267
1268     @staticmethod
1269     def _meta_regex(prop):
1270         return r'''(?isx)<meta
1271                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1272                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1273
1274     def _og_search_property(self, prop, html, name=None, **kargs):
1275         prop = variadic(prop)
1276         if name is None:
1277             name = 'OpenGraph %s' % prop[0]
1278         og_regexes = []
1279         for p in prop:
1280             og_regexes.extend(self._og_regexes(p))
1281         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1282         if escaped is None:
1283             return None
1284         return unescapeHTML(escaped)
1285
1286     def _og_search_thumbnail(self, html, **kargs):
1287         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1288
1289     def _og_search_description(self, html, **kargs):
1290         return self._og_search_property('description', html, fatal=False, **kargs)
1291
1292     def _og_search_title(self, html, **kargs):
1293         return self._og_search_property('title', html, **kargs)
1294
1295     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1296         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1297         if secure:
1298             regexes = self._og_regexes('video:secure_url') + regexes
1299         return self._html_search_regex(regexes, html, name, **kargs)
1300
1301     def _og_search_url(self, html, **kargs):
1302         return self._og_search_property('url', html, **kargs)
1303
1304     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1305         name = variadic(name)
1306         if display_name is None:
1307             display_name = name[0]
1308         return self._html_search_regex(
1309             [self._meta_regex(n) for n in name],
1310             html, display_name, fatal=fatal, group='content', **kwargs)
1311
1312     def _dc_search_uploader(self, html):
1313         return self._html_search_meta('dc.creator', html, 'uploader')
1314
1315     def _rta_search(self, html):
1316         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1317         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1318                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1319                      html):
1320             return 18
1321         return 0
1322
1323     def _media_rating_search(self, html):
1324         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1325         rating = self._html_search_meta('rating', html)
1326
1327         if not rating:
1328             return None
1329
1330         RATING_TABLE = {
1331             'safe for kids': 0,
1332             'general': 8,
1333             '14 years': 14,
1334             'mature': 17,
1335             'restricted': 19,
1336         }
1337         return RATING_TABLE.get(rating.lower())
1338
1339     def _family_friendly_search(self, html):
1340         # See http://schema.org/VideoObject
1341         family_friendly = self._html_search_meta(
1342             'isFamilyFriendly', html, default=None)
1343
1344         if not family_friendly:
1345             return None
1346
1347         RATING_TABLE = {
1348             '1': 0,
1349             'true': 0,
1350             '0': 18,
1351             'false': 18,
1352         }
1353         return RATING_TABLE.get(family_friendly.lower())
1354
1355     def _twitter_search_player(self, html):
1356         return self._html_search_meta('twitter:player', html,
1357                                       'twitter card player')
1358
1359     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1360         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1361         default = kwargs.get('default', NO_DEFAULT)
1362         # JSON-LD may be malformed and thus `fatal` should be respected.
1363         # At the same time `default` may be passed that assumes `fatal=False`
1364         # for _search_regex. Let's simulate the same behavior here as well.
1365         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1366         json_ld = []
1367         for mobj in json_ld_list:
1368             json_ld_item = self._parse_json(
1369                 mobj.group('json_ld'), video_id, fatal=fatal)
1370             if not json_ld_item:
1371                 continue
1372             if isinstance(json_ld_item, dict):
1373                 json_ld.append(json_ld_item)
1374             elif isinstance(json_ld_item, (list, tuple)):
1375                 json_ld.extend(json_ld_item)
1376         if json_ld:
1377             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1378         if json_ld:
1379             return json_ld
1380         if default is not NO_DEFAULT:
1381             return default
1382         elif fatal:
1383             raise RegexNotFoundError('Unable to extract JSON-LD')
1384         else:
1385             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1386             return {}
1387
1388     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1389         if isinstance(json_ld, compat_str):
1390             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1391         if not json_ld:
1392             return {}
1393         info = {}
1394         if not isinstance(json_ld, (list, tuple, dict)):
1395             return info
1396         if isinstance(json_ld, dict):
1397             json_ld = [json_ld]
1398
1399         INTERACTION_TYPE_MAP = {
1400             'CommentAction': 'comment',
1401             'AgreeAction': 'like',
1402             'DisagreeAction': 'dislike',
1403             'LikeAction': 'like',
1404             'DislikeAction': 'dislike',
1405             'ListenAction': 'view',
1406             'WatchAction': 'view',
1407             'ViewAction': 'view',
1408         }
1409
1410         def extract_interaction_type(e):
1411             interaction_type = e.get('interactionType')
1412             if isinstance(interaction_type, dict):
1413                 interaction_type = interaction_type.get('@type')
1414             return str_or_none(interaction_type)
1415
1416         def extract_interaction_statistic(e):
1417             interaction_statistic = e.get('interactionStatistic')
1418             if isinstance(interaction_statistic, dict):
1419                 interaction_statistic = [interaction_statistic]
1420             if not isinstance(interaction_statistic, list):
1421                 return
1422             for is_e in interaction_statistic:
1423                 if not isinstance(is_e, dict):
1424                     continue
1425                 if is_e.get('@type') != 'InteractionCounter':
1426                     continue
1427                 interaction_type = extract_interaction_type(is_e)
1428                 if not interaction_type:
1429                     continue
1430                 # For interaction count some sites provide string instead of
1431                 # an integer (as per spec) with non digit characters (e.g. ",")
1432                 # so extracting count with more relaxed str_to_int
1433                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1434                 if interaction_count is None:
1435                     continue
1436                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1437                 if not count_kind:
1438                     continue
1439                 count_key = '%s_count' % count_kind
1440                 if info.get(count_key) is not None:
1441                     continue
1442                 info[count_key] = interaction_count
1443
1444         def extract_chapter_information(e):
1445             chapters = [{
1446                 'title': part.get('name'),
1447                 'start_time': part.get('startOffset'),
1448                 'end_time': part.get('endOffset'),
1449             } for part in e.get('hasPart', []) if part.get('@type') == 'Clip']
1450             for idx, (last_c, current_c, next_c) in enumerate(zip(
1451                     [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1452                 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1453                 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1454                 if None in current_c.values():
1455                     self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1456                     return
1457             if chapters:
1458                 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1459                 info['chapters'] = chapters
1460
1461         def extract_video_object(e):
1462             assert e['@type'] == 'VideoObject'
1463             author = e.get('author')
1464             info.update({
1465                 'url': url_or_none(e.get('contentUrl')),
1466                 'title': unescapeHTML(e.get('name')),
1467                 'description': unescapeHTML(e.get('description')),
1468                 'thumbnails': [{'url': url_or_none(url)}
1469                                for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
1470                 'duration': parse_duration(e.get('duration')),
1471                 'timestamp': unified_timestamp(e.get('uploadDate')),
1472                 # author can be an instance of 'Organization' or 'Person' types.
1473                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1474                 # however some websites are using 'Text' type instead.
1475                 # 1. https://schema.org/VideoObject
1476                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1477                 'filesize': float_or_none(e.get('contentSize')),
1478                 'tbr': int_or_none(e.get('bitrate')),
1479                 'width': int_or_none(e.get('width')),
1480                 'height': int_or_none(e.get('height')),
1481                 'view_count': int_or_none(e.get('interactionCount')),
1482             })
1483             extract_interaction_statistic(e)
1484             extract_chapter_information(e)
1485
1486         def traverse_json_ld(json_ld, at_top_level=True):
1487             for e in json_ld:
1488                 if at_top_level and '@context' not in e:
1489                     continue
1490                 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1491                     traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1492                     break
1493                 item_type = e.get('@type')
1494                 if expected_type is not None and expected_type != item_type:
1495                     continue
1496                 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1497                 if rating is not None:
1498                     info['average_rating'] = rating
1499                 if item_type in ('TVEpisode', 'Episode'):
1500                     episode_name = unescapeHTML(e.get('name'))
1501                     info.update({
1502                         'episode': episode_name,
1503                         'episode_number': int_or_none(e.get('episodeNumber')),
1504                         'description': unescapeHTML(e.get('description')),
1505                     })
1506                     if not info.get('title') and episode_name:
1507                         info['title'] = episode_name
1508                     part_of_season = e.get('partOfSeason')
1509                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1510                         info.update({
1511                             'season': unescapeHTML(part_of_season.get('name')),
1512                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1513                         })
1514                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1515                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1516                         info['series'] = unescapeHTML(part_of_series.get('name'))
1517                 elif item_type == 'Movie':
1518                     info.update({
1519                         'title': unescapeHTML(e.get('name')),
1520                         'description': unescapeHTML(e.get('description')),
1521                         'duration': parse_duration(e.get('duration')),
1522                         'timestamp': unified_timestamp(e.get('dateCreated')),
1523                     })
1524                 elif item_type in ('Article', 'NewsArticle'):
1525                     info.update({
1526                         'timestamp': parse_iso8601(e.get('datePublished')),
1527                         'title': unescapeHTML(e.get('headline')),
1528                         'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1529                     })
1530                 elif item_type == 'VideoObject':
1531                     extract_video_object(e)
1532                     if expected_type is None:
1533                         continue
1534                     else:
1535                         break
1536                 video = e.get('video')
1537                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1538                     extract_video_object(video)
1539                 if expected_type is None:
1540                     continue
1541                 else:
1542                     break
1543         traverse_json_ld(json_ld)
1544
1545         return dict((k, v) for k, v in info.items() if v is not None)
1546
1547     def _search_nextjs_data(self, webpage, video_id, **kw):
1548         return self._parse_json(
1549             self._search_regex(
1550                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1551                 webpage, 'next.js data', **kw),
1552             video_id, **kw)
1553
1554     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1555         ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
1556         # not all website do this, but it can be changed
1557         # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1558         rectx = re.escape(context_name)
1559         js, arg_keys, arg_vals = self._search_regex(
1560             (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1561              r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1562             webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1563
1564         args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1565
1566         for key, val in args.items():
1567             if val in ('undefined', 'void 0'):
1568                 args[key] = 'null'
1569
1570         return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1571
1572     @staticmethod
1573     def _hidden_inputs(html):
1574         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1575         hidden_inputs = {}
1576         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1577             attrs = extract_attributes(input)
1578             if not input:
1579                 continue
1580             if attrs.get('type') not in ('hidden', 'submit'):
1581                 continue
1582             name = attrs.get('name') or attrs.get('id')
1583             value = attrs.get('value')
1584             if name and value is not None:
1585                 hidden_inputs[name] = value
1586         return hidden_inputs
1587
1588     def _form_hidden_inputs(self, form_id, html):
1589         form = self._search_regex(
1590             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1591             html, '%s form' % form_id, group='form')
1592         return self._hidden_inputs(form)
1593
1594     class FormatSort:
1595         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1596
1597         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1598                    'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
1599                    'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
1600         ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1601                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1602                         'fps', 'fs_approx', 'source', 'id')
1603
1604         settings = {
1605             'vcodec': {'type': 'ordered', 'regex': True,
1606                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1607             'acodec': {'type': 'ordered', 'regex': True,
1608                        'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1609             'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1610                     'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1611             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1612                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1613             'vext': {'type': 'ordered', 'field': 'video_ext',
1614                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1615                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1616             'aext': {'type': 'ordered', 'field': 'audio_ext',
1617                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1618                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1619             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1620             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1621                            'field': ('vcodec', 'acodec'),
1622                            'function': lambda it: int(any(v != 'none' for v in it))},
1623             'ie_pref': {'priority': True, 'type': 'extractor'},
1624             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1625             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1626             'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1627             'quality': {'convert': 'float', 'default': -1},
1628             'filesize': {'convert': 'bytes'},
1629             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1630             'id': {'convert': 'string', 'field': 'format_id'},
1631             'height': {'convert': 'float_none'},
1632             'width': {'convert': 'float_none'},
1633             'fps': {'convert': 'float_none'},
1634             'tbr': {'convert': 'float_none'},
1635             'vbr': {'convert': 'float_none'},
1636             'abr': {'convert': 'float_none'},
1637             'asr': {'convert': 'float_none'},
1638             'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1639
1640             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1641             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1642             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1643             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1644             'res': {'type': 'multiple', 'field': ('height', 'width'),
1645                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1646
1647             # For compatibility with youtube-dl
1648             'format_id': {'type': 'alias', 'field': 'id'},
1649             'preference': {'type': 'alias', 'field': 'ie_pref'},
1650             'language_preference': {'type': 'alias', 'field': 'lang'},
1651
1652             # Deprecated
1653             'dimension': {'type': 'alias', 'field': 'res'},
1654             'resolution': {'type': 'alias', 'field': 'res'},
1655             'extension': {'type': 'alias', 'field': 'ext'},
1656             'bitrate': {'type': 'alias', 'field': 'br'},
1657             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1658             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1659             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1660             'framerate': {'type': 'alias', 'field': 'fps'},
1661             'protocol': {'type': 'alias', 'field': 'proto'},
1662             'source_preference': {'type': 'alias', 'field': 'source'},
1663             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1664             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1665             'samplerate': {'type': 'alias', 'field': 'asr'},
1666             'video_ext': {'type': 'alias', 'field': 'vext'},
1667             'audio_ext': {'type': 'alias', 'field': 'aext'},
1668             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1669             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1670             'video': {'type': 'alias', 'field': 'hasvid'},
1671             'has_video': {'type': 'alias', 'field': 'hasvid'},
1672             'audio': {'type': 'alias', 'field': 'hasaud'},
1673             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1674             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1675             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1676         }
1677
1678         def __init__(self, ie, field_preference):
1679             self._order = []
1680             self.ydl = ie._downloader
1681             self.evaluate_params(self.ydl.params, field_preference)
1682             if ie.get_param('verbose'):
1683                 self.print_verbose_info(self.ydl.write_debug)
1684
1685         def _get_field_setting(self, field, key):
1686             if field not in self.settings:
1687                 if key in ('forced', 'priority'):
1688                     return False
1689                 self.ydl.deprecation_warning(
1690                     f'Using arbitrary fields ({field}) for format sorting is deprecated '
1691                     'and may be removed in a future version')
1692                 self.settings[field] = {}
1693             propObj = self.settings[field]
1694             if key not in propObj:
1695                 type = propObj.get('type')
1696                 if key == 'field':
1697                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1698                 elif key == 'convert':
1699                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1700                 else:
1701                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1702                 propObj[key] = default
1703             return propObj[key]
1704
1705         def _resolve_field_value(self, field, value, convertNone=False):
1706             if value is None:
1707                 if not convertNone:
1708                     return None
1709             else:
1710                 value = value.lower()
1711             conversion = self._get_field_setting(field, 'convert')
1712             if conversion == 'ignore':
1713                 return None
1714             if conversion == 'string':
1715                 return value
1716             elif conversion == 'float_none':
1717                 return float_or_none(value)
1718             elif conversion == 'bytes':
1719                 return FileDownloader.parse_bytes(value)
1720             elif conversion == 'order':
1721                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1722                 use_regex = self._get_field_setting(field, 'regex')
1723                 list_length = len(order_list)
1724                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1725                 if use_regex and value is not None:
1726                     for i, regex in enumerate(order_list):
1727                         if regex and re.match(regex, value):
1728                             return list_length - i
1729                     return list_length - empty_pos  # not in list
1730                 else:  # not regex or  value = None
1731                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1732             else:
1733                 if value.isnumeric():
1734                     return float(value)
1735                 else:
1736                     self.settings[field]['convert'] = 'string'
1737                     return value
1738
1739         def evaluate_params(self, params, sort_extractor):
1740             self._use_free_order = params.get('prefer_free_formats', False)
1741             self._sort_user = params.get('format_sort', [])
1742             self._sort_extractor = sort_extractor
1743
1744             def add_item(field, reverse, closest, limit_text):
1745                 field = field.lower()
1746                 if field in self._order:
1747                     return
1748                 self._order.append(field)
1749                 limit = self._resolve_field_value(field, limit_text)
1750                 data = {
1751                     'reverse': reverse,
1752                     'closest': False if limit is None else closest,
1753                     'limit_text': limit_text,
1754                     'limit': limit}
1755                 if field in self.settings:
1756                     self.settings[field].update(data)
1757                 else:
1758                     self.settings[field] = data
1759
1760             sort_list = (
1761                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1762                 + (tuple() if params.get('format_sort_force', False)
1763                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1764                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1765
1766             for item in sort_list:
1767                 match = re.match(self.regex, item)
1768                 if match is None:
1769                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1770                 field = match.group('field')
1771                 if field is None:
1772                     continue
1773                 if self._get_field_setting(field, 'type') == 'alias':
1774                     alias, field = field, self._get_field_setting(field, 'field')
1775                     if alias not in ('format_id', 'preference', 'language_preference'):
1776                         self.ydl.deprecation_warning(
1777                             f'Format sorting alias {alias} is deprecated '
1778                             f'and may be removed in a future version. Please use {field} instead')
1779                 reverse = match.group('reverse') is not None
1780                 closest = match.group('separator') == '~'
1781                 limit_text = match.group('limit')
1782
1783                 has_limit = limit_text is not None
1784                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1785                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1786
1787                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1788                 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
1789                 limit_count = len(limits)
1790                 for (i, f) in enumerate(fields):
1791                     add_item(f, reverse, closest,
1792                              limits[i] if i < limit_count
1793                              else limits[0] if has_limit and not has_multiple_limits
1794                              else None)
1795
1796         def print_verbose_info(self, write_debug):
1797             if self._sort_user:
1798                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1799             if self._sort_extractor:
1800                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1801             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1802                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1803                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1804                               self._get_field_setting(field, 'limit_text'),
1805                               self._get_field_setting(field, 'limit'))
1806                 if self._get_field_setting(field, 'limit_text') is not None else '')
1807                 for field in self._order if self._get_field_setting(field, 'visible')]))
1808
1809         def _calculate_field_preference_from_value(self, format, field, type, value):
1810             reverse = self._get_field_setting(field, 'reverse')
1811             closest = self._get_field_setting(field, 'closest')
1812             limit = self._get_field_setting(field, 'limit')
1813
1814             if type == 'extractor':
1815                 maximum = self._get_field_setting(field, 'max')
1816                 if value is None or (maximum is not None and value >= maximum):
1817                     value = -1
1818             elif type == 'boolean':
1819                 in_list = self._get_field_setting(field, 'in_list')
1820                 not_in_list = self._get_field_setting(field, 'not_in_list')
1821                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1822             elif type == 'ordered':
1823                 value = self._resolve_field_value(field, value, True)
1824
1825             # try to convert to number
1826             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1827             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1828             if is_num:
1829                 value = val_num
1830
1831             return ((-10, 0) if value is None
1832                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1833                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1834                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1835                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1836                     else (-1, value, 0))
1837
1838         def _calculate_field_preference(self, format, field):
1839             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1840             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1841             if type == 'multiple':
1842                 type = 'field'  # Only 'field' is allowed in multiple for now
1843                 actual_fields = self._get_field_setting(field, 'field')
1844
1845                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1846             else:
1847                 value = get_value(field)
1848             return self._calculate_field_preference_from_value(format, field, type, value)
1849
1850         def calculate_preference(self, format):
1851             # Determine missing protocol
1852             if not format.get('protocol'):
1853                 format['protocol'] = determine_protocol(format)
1854
1855             # Determine missing ext
1856             if not format.get('ext') and 'url' in format:
1857                 format['ext'] = determine_ext(format['url'])
1858             if format.get('vcodec') == 'none':
1859                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1860                 format['video_ext'] = 'none'
1861             else:
1862                 format['video_ext'] = format['ext']
1863                 format['audio_ext'] = 'none'
1864             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1865             #    format['preference'] = -1000
1866
1867             # Determine missing bitrates
1868             if format.get('tbr') is None:
1869                 if format.get('vbr') is not None and format.get('abr') is not None:
1870                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1871             else:
1872                 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1873                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1874                 if format.get('acodec') != 'none' and format.get('abr') is None:
1875                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1876
1877             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1878
1879     def _sort_formats(self, formats, field_preference=[]):
1880         if not formats:
1881             return
1882         format_sort = self.FormatSort(self, field_preference)
1883         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1884
1885     def _check_formats(self, formats, video_id):
1886         if formats:
1887             formats[:] = filter(
1888                 lambda f: self._is_valid_url(
1889                     f['url'], video_id,
1890                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1891                 formats)
1892
1893     @staticmethod
1894     def _remove_duplicate_formats(formats):
1895         format_urls = set()
1896         unique_formats = []
1897         for f in formats:
1898             if f['url'] not in format_urls:
1899                 format_urls.add(f['url'])
1900                 unique_formats.append(f)
1901         formats[:] = unique_formats
1902
1903     def _is_valid_url(self, url, video_id, item='video', headers={}):
1904         url = self._proto_relative_url(url, scheme='http:')
1905         # For now assume non HTTP(S) URLs always valid
1906         if not (url.startswith('http://') or url.startswith('https://')):
1907             return True
1908         try:
1909             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1910             return True
1911         except ExtractorError as e:
1912             self.to_screen(
1913                 '%s: %s URL is invalid, skipping: %s'
1914                 % (video_id, item, error_to_compat_str(e.cause)))
1915             return False
1916
1917     def http_scheme(self):
1918         """ Either "http:" or "https:", depending on the user's preferences """
1919         return (
1920             'http:'
1921             if self.get_param('prefer_insecure', False)
1922             else 'https:')
1923
1924     def _proto_relative_url(self, url, scheme=None):
1925         if url is None:
1926             return url
1927         if url.startswith('//'):
1928             if scheme is None:
1929                 scheme = self.http_scheme()
1930             return scheme + url
1931         else:
1932             return url
1933
1934     def _sleep(self, timeout, video_id, msg_template=None):
1935         if msg_template is None:
1936             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1937         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1938         self.to_screen(msg)
1939         time.sleep(timeout)
1940
1941     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1942                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1943                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1944         manifest = self._download_xml(
1945             manifest_url, video_id, 'Downloading f4m manifest',
1946             'Unable to download f4m manifest',
1947             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1948             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1949             transform_source=transform_source,
1950             fatal=fatal, data=data, headers=headers, query=query)
1951
1952         if manifest is False:
1953             return []
1954
1955         return self._parse_f4m_formats(
1956             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1957             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1958
1959     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1960                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1961                            fatal=True, m3u8_id=None):
1962         if not isinstance(manifest, compat_etree_Element) and not fatal:
1963             return []
1964
1965         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1966         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1967         if akamai_pv is not None and ';' in akamai_pv.text:
1968             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1969             if playerVerificationChallenge.strip() != '':
1970                 return []
1971
1972         formats = []
1973         manifest_version = '1.0'
1974         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1975         if not media_nodes:
1976             manifest_version = '2.0'
1977             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1978         # Remove unsupported DRM protected media from final formats
1979         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1980         media_nodes = remove_encrypted_media(media_nodes)
1981         if not media_nodes:
1982             return formats
1983
1984         manifest_base_url = get_base_url(manifest)
1985
1986         bootstrap_info = xpath_element(
1987             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1988             'bootstrap info', default=None)
1989
1990         vcodec = None
1991         mime_type = xpath_text(
1992             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1993             'base URL', default=None)
1994         if mime_type and mime_type.startswith('audio/'):
1995             vcodec = 'none'
1996
1997         for i, media_el in enumerate(media_nodes):
1998             tbr = int_or_none(media_el.attrib.get('bitrate'))
1999             width = int_or_none(media_el.attrib.get('width'))
2000             height = int_or_none(media_el.attrib.get('height'))
2001             format_id = join_nonempty(f4m_id, tbr or i)
2002             # If <bootstrapInfo> is present, the specified f4m is a
2003             # stream-level manifest, and only set-level manifests may refer to
2004             # external resources.  See section 11.4 and section 4 of F4M spec
2005             if bootstrap_info is None:
2006                 media_url = None
2007                 # @href is introduced in 2.0, see section 11.6 of F4M spec
2008                 if manifest_version == '2.0':
2009                     media_url = media_el.attrib.get('href')
2010                 if media_url is None:
2011                     media_url = media_el.attrib.get('url')
2012                 if not media_url:
2013                     continue
2014                 manifest_url = (
2015                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
2016                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2017                 # If media_url is itself a f4m manifest do the recursive extraction
2018                 # since bitrates in parent manifest (this one) and media_url manifest
2019                 # may differ leading to inability to resolve the format by requested
2020                 # bitrate in f4m downloader
2021                 ext = determine_ext(manifest_url)
2022                 if ext == 'f4m':
2023                     f4m_formats = self._extract_f4m_formats(
2024                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2025                         transform_source=transform_source, fatal=fatal)
2026                     # Sometimes stream-level manifest contains single media entry that
2027                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2028                     # At the same time parent's media entry in set-level manifest may
2029                     # contain it. We will copy it from parent in such cases.
2030                     if len(f4m_formats) == 1:
2031                         f = f4m_formats[0]
2032                         f.update({
2033                             'tbr': f.get('tbr') or tbr,
2034                             'width': f.get('width') or width,
2035                             'height': f.get('height') or height,
2036                             'format_id': f.get('format_id') if not tbr else format_id,
2037                             'vcodec': vcodec,
2038                         })
2039                     formats.extend(f4m_formats)
2040                     continue
2041                 elif ext == 'm3u8':
2042                     formats.extend(self._extract_m3u8_formats(
2043                         manifest_url, video_id, 'mp4', preference=preference,
2044                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2045                     continue
2046             formats.append({
2047                 'format_id': format_id,
2048                 'url': manifest_url,
2049                 'manifest_url': manifest_url,
2050                 'ext': 'flv' if bootstrap_info is not None else None,
2051                 'protocol': 'f4m',
2052                 'tbr': tbr,
2053                 'width': width,
2054                 'height': height,
2055                 'vcodec': vcodec,
2056                 'preference': preference,
2057                 'quality': quality,
2058             })
2059         return formats
2060
2061     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2062         return {
2063             'format_id': join_nonempty(m3u8_id, 'meta'),
2064             'url': m3u8_url,
2065             'ext': ext,
2066             'protocol': 'm3u8',
2067             'preference': preference - 100 if preference else -100,
2068             'quality': quality,
2069             'resolution': 'multiple',
2070             'format_note': 'Quality selection URL',
2071         }
2072
2073     def _report_ignoring_subs(self, name):
2074         self.report_warning(bug_reports_message(
2075             f'Ignoring subtitle tracks found in the {name} manifest; '
2076             'if any subtitle tracks are missing,'
2077         ), only_once=True)
2078
2079     def _extract_m3u8_formats(self, *args, **kwargs):
2080         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2081         if subs:
2082             self._report_ignoring_subs('HLS')
2083         return fmts
2084
2085     def _extract_m3u8_formats_and_subtitles(
2086             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2087             preference=None, quality=None, m3u8_id=None, note=None,
2088             errnote=None, fatal=True, live=False, data=None, headers={},
2089             query={}):
2090
2091         res = self._download_webpage_handle(
2092             m3u8_url, video_id,
2093             note='Downloading m3u8 information' if note is None else note,
2094             errnote='Failed to download m3u8 information' if errnote is None else errnote,
2095             fatal=fatal, data=data, headers=headers, query=query)
2096
2097         if res is False:
2098             return [], {}
2099
2100         m3u8_doc, urlh = res
2101         m3u8_url = urlh.geturl()
2102
2103         return self._parse_m3u8_formats_and_subtitles(
2104             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2105             preference=preference, quality=quality, m3u8_id=m3u8_id,
2106             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2107             headers=headers, query=query, video_id=video_id)
2108
2109     def _parse_m3u8_formats_and_subtitles(
2110             self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2111             preference=None, quality=None, m3u8_id=None, live=False, note=None,
2112             errnote=None, fatal=True, data=None, headers={}, query={},
2113             video_id=None):
2114         formats, subtitles = [], {}
2115
2116         has_drm = re.search('|'.join([
2117             r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
2118             r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
2119         ]), m3u8_doc)
2120
2121         def format_url(url):
2122             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2123
2124         if self.get_param('hls_split_discontinuity', False):
2125             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2126                 if not m3u8_doc:
2127                     if not manifest_url:
2128                         return []
2129                     m3u8_doc = self._download_webpage(
2130                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2131                         note=False, errnote='Failed to download m3u8 playlist information')
2132                     if m3u8_doc is False:
2133                         return []
2134                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2135
2136         else:
2137             def _extract_m3u8_playlist_indices(*args, **kwargs):
2138                 return [None]
2139
2140         # References:
2141         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2142         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2143         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2144
2145         # We should try extracting formats only from master playlists [1, 4.3.4],
2146         # i.e. playlists that describe available qualities. On the other hand
2147         # media playlists [1, 4.3.3] should be returned as is since they contain
2148         # just the media without qualities renditions.
2149         # Fortunately, master playlist can be easily distinguished from media
2150         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2151         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2152         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2153         # media playlist and MUST NOT appear in master playlist thus we can
2154         # clearly detect media playlist with this criterion.
2155
2156         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2157             formats = [{
2158                 'format_id': join_nonempty(m3u8_id, idx),
2159                 'format_index': idx,
2160                 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2161                 'ext': ext,
2162                 'protocol': entry_protocol,
2163                 'preference': preference,
2164                 'quality': quality,
2165                 'has_drm': has_drm,
2166             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2167
2168             return formats, subtitles
2169
2170         groups = {}
2171         last_stream_inf = {}
2172
2173         def extract_media(x_media_line):
2174             media = parse_m3u8_attributes(x_media_line)
2175             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2176             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2177             if not (media_type and group_id and name):
2178                 return
2179             groups.setdefault(group_id, []).append(media)
2180             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2181             if media_type == 'SUBTITLES':
2182                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2183                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2184                 # However, lack of URI has been spotted in the wild.
2185                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2186                 if not media.get('URI'):
2187                     return
2188                 url = format_url(media['URI'])
2189                 sub_info = {
2190                     'url': url,
2191                     'ext': determine_ext(url),
2192                 }
2193                 if sub_info['ext'] == 'm3u8':
2194                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2195                     # files may contain is WebVTT:
2196                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2197                     sub_info['ext'] = 'vtt'
2198                     sub_info['protocol'] = 'm3u8_native'
2199                 lang = media.get('LANGUAGE') or 'und'
2200                 subtitles.setdefault(lang, []).append(sub_info)
2201             if media_type not in ('VIDEO', 'AUDIO'):
2202                 return
2203             media_url = media.get('URI')
2204             if media_url:
2205                 manifest_url = format_url(media_url)
2206                 formats.extend({
2207                     'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2208                     'format_note': name,
2209                     'format_index': idx,
2210                     'url': manifest_url,
2211                     'manifest_url': m3u8_url,
2212                     'language': media.get('LANGUAGE'),
2213                     'ext': ext,
2214                     'protocol': entry_protocol,
2215                     'preference': preference,
2216                     'quality': quality,
2217                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2218                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2219
2220         def build_stream_name():
2221             # Despite specification does not mention NAME attribute for
2222             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2223             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2224             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2225             stream_name = last_stream_inf.get('NAME')
2226             if stream_name:
2227                 return stream_name
2228             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2229             # from corresponding rendition group
2230             stream_group_id = last_stream_inf.get('VIDEO')
2231             if not stream_group_id:
2232                 return
2233             stream_group = groups.get(stream_group_id)
2234             if not stream_group:
2235                 return stream_group_id
2236             rendition = stream_group[0]
2237             return rendition.get('NAME') or stream_group_id
2238
2239         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2240         # chance to detect video only formats when EXT-X-STREAM-INF tags
2241         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2242         for line in m3u8_doc.splitlines():
2243             if line.startswith('#EXT-X-MEDIA:'):
2244                 extract_media(line)
2245
2246         for line in m3u8_doc.splitlines():
2247             if line.startswith('#EXT-X-STREAM-INF:'):
2248                 last_stream_inf = parse_m3u8_attributes(line)
2249             elif line.startswith('#') or not line.strip():
2250                 continue
2251             else:
2252                 tbr = float_or_none(
2253                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2254                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2255                 manifest_url = format_url(line.strip())
2256
2257                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2258                     format_id = [m3u8_id, None, idx]
2259                     # Bandwidth of live streams may differ over time thus making
2260                     # format_id unpredictable. So it's better to keep provided
2261                     # format_id intact.
2262                     if not live:
2263                         stream_name = build_stream_name()
2264                         format_id[1] = stream_name or '%d' % (tbr or len(formats))
2265                     f = {
2266                         'format_id': join_nonempty(*format_id),
2267                         'format_index': idx,
2268                         'url': manifest_url,
2269                         'manifest_url': m3u8_url,
2270                         'tbr': tbr,
2271                         'ext': ext,
2272                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2273                         'protocol': entry_protocol,
2274                         'preference': preference,
2275                         'quality': quality,
2276                     }
2277                     resolution = last_stream_inf.get('RESOLUTION')
2278                     if resolution:
2279                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2280                         if mobj:
2281                             f['width'] = int(mobj.group('width'))
2282                             f['height'] = int(mobj.group('height'))
2283                     # Unified Streaming Platform
2284                     mobj = re.search(
2285                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2286                     if mobj:
2287                         abr, vbr = mobj.groups()
2288                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2289                         f.update({
2290                             'vbr': vbr,
2291                             'abr': abr,
2292                         })
2293                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2294                     f.update(codecs)
2295                     audio_group_id = last_stream_inf.get('AUDIO')
2296                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2297                     # references a rendition group MUST have a CODECS attribute.
2298                     # However, this is not always respected, for example, [2]
2299                     # contains EXT-X-STREAM-INF tag which references AUDIO
2300                     # rendition group but does not have CODECS and despite
2301                     # referencing an audio group it represents a complete
2302                     # (with audio and video) format. So, for such cases we will
2303                     # ignore references to rendition groups and treat them
2304                     # as complete formats.
2305                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2306                         audio_group = groups.get(audio_group_id)
2307                         if audio_group and audio_group[0].get('URI'):
2308                             # TODO: update acodec for audio only formats with
2309                             # the same GROUP-ID
2310                             f['acodec'] = 'none'
2311                     if not f.get('ext'):
2312                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2313                     formats.append(f)
2314
2315                     # for DailyMotion
2316                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2317                     if progressive_uri:
2318                         http_f = f.copy()
2319                         del http_f['manifest_url']
2320                         http_f.update({
2321                             'format_id': f['format_id'].replace('hls-', 'http-'),
2322                             'protocol': 'http',
2323                             'url': progressive_uri,
2324                         })
2325                         formats.append(http_f)
2326
2327                 last_stream_inf = {}
2328         return formats, subtitles
2329
2330     def _extract_m3u8_vod_duration(
2331             self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2332
2333         m3u8_vod = self._download_webpage(
2334             m3u8_vod_url, video_id,
2335             note='Downloading m3u8 VOD manifest' if note is None else note,
2336             errnote='Failed to download VOD manifest' if errnote is None else errnote,
2337             fatal=False, data=data, headers=headers, query=query)
2338
2339         return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2340
2341     def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2342         if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2343             return None
2344
2345         return int(sum(
2346             float(line[len('#EXTINF:'):].split(',')[0])
2347             for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2348
2349     @staticmethod
2350     def _xpath_ns(path, namespace=None):
2351         if not namespace:
2352             return path
2353         out = []
2354         for c in path.split('/'):
2355             if not c or c == '.':
2356                 out.append(c)
2357             else:
2358                 out.append('{%s}%s' % (namespace, c))
2359         return '/'.join(out)
2360
2361     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2362         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2363
2364         if smil is False:
2365             assert not fatal
2366             return [], {}
2367
2368         namespace = self._parse_smil_namespace(smil)
2369
2370         fmts = self._parse_smil_formats(
2371             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2372         subs = self._parse_smil_subtitles(
2373             smil, namespace=namespace)
2374
2375         return fmts, subs
2376
2377     def _extract_smil_formats(self, *args, **kwargs):
2378         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2379         if subs:
2380             self._report_ignoring_subs('SMIL')
2381         return fmts
2382
2383     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2384         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2385         if smil is False:
2386             return {}
2387         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2388
2389     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2390         return self._download_xml(
2391             smil_url, video_id, 'Downloading SMIL file',
2392             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2393
2394     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2395         namespace = self._parse_smil_namespace(smil)
2396
2397         formats = self._parse_smil_formats(
2398             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2399         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2400
2401         video_id = os.path.splitext(url_basename(smil_url))[0]
2402         title = None
2403         description = None
2404         upload_date = None
2405         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2406             name = meta.attrib.get('name')
2407             content = meta.attrib.get('content')
2408             if not name or not content:
2409                 continue
2410             if not title and name == 'title':
2411                 title = content
2412             elif not description and name in ('description', 'abstract'):
2413                 description = content
2414             elif not upload_date and name == 'date':
2415                 upload_date = unified_strdate(content)
2416
2417         thumbnails = [{
2418             'id': image.get('type'),
2419             'url': image.get('src'),
2420             'width': int_or_none(image.get('width')),
2421             'height': int_or_none(image.get('height')),
2422         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2423
2424         return {
2425             'id': video_id,
2426             'title': title or video_id,
2427             'description': description,
2428             'upload_date': upload_date,
2429             'thumbnails': thumbnails,
2430             'formats': formats,
2431             'subtitles': subtitles,
2432         }
2433
2434     def _parse_smil_namespace(self, smil):
2435         return self._search_regex(
2436             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2437
2438     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2439         base = smil_url
2440         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2441             b = meta.get('base') or meta.get('httpBase')
2442             if b:
2443                 base = b
2444                 break
2445
2446         formats = []
2447         rtmp_count = 0
2448         http_count = 0
2449         m3u8_count = 0
2450         imgs_count = 0
2451
2452         srcs = set()
2453         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2454         for medium in media:
2455             src = medium.get('src')
2456             if not src or src in srcs:
2457                 continue
2458             srcs.add(src)
2459
2460             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2461             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2462             width = int_or_none(medium.get('width'))
2463             height = int_or_none(medium.get('height'))
2464             proto = medium.get('proto')
2465             ext = medium.get('ext')
2466             src_ext = determine_ext(src)
2467             streamer = medium.get('streamer') or base
2468
2469             if proto == 'rtmp' or streamer.startswith('rtmp'):
2470                 rtmp_count += 1
2471                 formats.append({
2472                     'url': streamer,
2473                     'play_path': src,
2474                     'ext': 'flv',
2475                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2476                     'tbr': bitrate,
2477                     'filesize': filesize,
2478                     'width': width,
2479                     'height': height,
2480                 })
2481                 if transform_rtmp_url:
2482                     streamer, src = transform_rtmp_url(streamer, src)
2483                     formats[-1].update({
2484                         'url': streamer,
2485                         'play_path': src,
2486                     })
2487                 continue
2488
2489             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2490             src_url = src_url.strip()
2491
2492             if proto == 'm3u8' or src_ext == 'm3u8':
2493                 m3u8_formats = self._extract_m3u8_formats(
2494                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2495                 if len(m3u8_formats) == 1:
2496                     m3u8_count += 1
2497                     m3u8_formats[0].update({
2498                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2499                         'tbr': bitrate,
2500                         'width': width,
2501                         'height': height,
2502                     })
2503                 formats.extend(m3u8_formats)
2504             elif src_ext == 'f4m':
2505                 f4m_url = src_url
2506                 if not f4m_params:
2507                     f4m_params = {
2508                         'hdcore': '3.2.0',
2509                         'plugin': 'flowplayer-3.2.0.1',
2510                     }
2511                 f4m_url += '&' if '?' in f4m_url else '?'
2512                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2513                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2514             elif src_ext == 'mpd':
2515                 formats.extend(self._extract_mpd_formats(
2516                     src_url, video_id, mpd_id='dash', fatal=False))
2517             elif re.search(r'\.ism/[Mm]anifest', src_url):
2518                 formats.extend(self._extract_ism_formats(
2519                     src_url, video_id, ism_id='mss', fatal=False))
2520             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2521                 http_count += 1
2522                 formats.append({
2523                     'url': src_url,
2524                     'ext': ext or src_ext or 'flv',
2525                     'format_id': 'http-%d' % (bitrate or http_count),
2526                     'tbr': bitrate,
2527                     'filesize': filesize,
2528                     'width': width,
2529                     'height': height,
2530                 })
2531
2532         for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2533             src = medium.get('src')
2534             if not src or src in srcs:
2535                 continue
2536             srcs.add(src)
2537
2538             imgs_count += 1
2539             formats.append({
2540                 'format_id': 'imagestream-%d' % (imgs_count),
2541                 'url': src,
2542                 'ext': mimetype2ext(medium.get('type')),
2543                 'acodec': 'none',
2544                 'vcodec': 'none',
2545                 'width': int_or_none(medium.get('width')),
2546                 'height': int_or_none(medium.get('height')),
2547                 'format_note': 'SMIL storyboards',
2548             })
2549
2550         return formats
2551
2552     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2553         urls = []
2554         subtitles = {}
2555         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2556             src = textstream.get('src')
2557             if not src or src in urls:
2558                 continue
2559             urls.append(src)
2560             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2561             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2562             subtitles.setdefault(lang, []).append({
2563                 'url': src,
2564                 'ext': ext,
2565             })
2566         return subtitles
2567
2568     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2569         xspf = self._download_xml(
2570             xspf_url, playlist_id, 'Downloading xpsf playlist',
2571             'Unable to download xspf manifest', fatal=fatal)
2572         if xspf is False:
2573             return []
2574         return self._parse_xspf(
2575             xspf, playlist_id, xspf_url=xspf_url,
2576             xspf_base_url=base_url(xspf_url))
2577
2578     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2579         NS_MAP = {
2580             'xspf': 'http://xspf.org/ns/0/',
2581             's1': 'http://static.streamone.nl/player/ns/0',
2582         }
2583
2584         entries = []
2585         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2586             title = xpath_text(
2587                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2588             description = xpath_text(
2589                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2590             thumbnail = xpath_text(
2591                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2592             duration = float_or_none(
2593                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2594
2595             formats = []
2596             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2597                 format_url = urljoin(xspf_base_url, location.text)
2598                 if not format_url:
2599                     continue
2600                 formats.append({
2601                     'url': format_url,
2602                     'manifest_url': xspf_url,
2603                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2604                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2605                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2606                 })
2607             self._sort_formats(formats)
2608
2609             entries.append({
2610                 'id': playlist_id,
2611                 'title': title,
2612                 'description': description,
2613                 'thumbnail': thumbnail,
2614                 'duration': duration,
2615                 'formats': formats,
2616             })
2617         return entries
2618
2619     def _extract_mpd_formats(self, *args, **kwargs):
2620         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2621         if subs:
2622             self._report_ignoring_subs('DASH')
2623         return fmts
2624
2625     def _extract_mpd_formats_and_subtitles(
2626             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2627             fatal=True, data=None, headers={}, query={}):
2628         res = self._download_xml_handle(
2629             mpd_url, video_id,
2630             note='Downloading MPD manifest' if note is None else note,
2631             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2632             fatal=fatal, data=data, headers=headers, query=query)
2633         if res is False:
2634             return [], {}
2635         mpd_doc, urlh = res
2636         if mpd_doc is None:
2637             return [], {}
2638         mpd_base_url = base_url(urlh.geturl())
2639
2640         return self._parse_mpd_formats_and_subtitles(
2641             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2642
2643     def _parse_mpd_formats(self, *args, **kwargs):
2644         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2645         if subs:
2646             self._report_ignoring_subs('DASH')
2647         return fmts
2648
2649     def _parse_mpd_formats_and_subtitles(
2650             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2651         """
2652         Parse formats from MPD manifest.
2653         References:
2654          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2655             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2656          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2657         """
2658         if not self.get_param('dynamic_mpd', True):
2659             if mpd_doc.get('type') == 'dynamic':
2660                 return [], {}
2661
2662         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2663
2664         def _add_ns(path):
2665             return self._xpath_ns(path, namespace)
2666
2667         def is_drm_protected(element):
2668             return element.find(_add_ns('ContentProtection')) is not None
2669
2670         def extract_multisegment_info(element, ms_parent_info):
2671             ms_info = ms_parent_info.copy()
2672
2673             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2674             # common attributes and elements.  We will only extract relevant
2675             # for us.
2676             def extract_common(source):
2677                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2678                 if segment_timeline is not None:
2679                     s_e = segment_timeline.findall(_add_ns('S'))
2680                     if s_e:
2681                         ms_info['total_number'] = 0
2682                         ms_info['s'] = []
2683                         for s in s_e:
2684                             r = int(s.get('r', 0))
2685                             ms_info['total_number'] += 1 + r
2686                             ms_info['s'].append({
2687                                 't': int(s.get('t', 0)),
2688                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2689                                 'd': int(s.attrib['d']),
2690                                 'r': r,
2691                             })
2692                 start_number = source.get('startNumber')
2693                 if start_number:
2694                     ms_info['start_number'] = int(start_number)
2695                 timescale = source.get('timescale')
2696                 if timescale:
2697                     ms_info['timescale'] = int(timescale)
2698                 segment_duration = source.get('duration')
2699                 if segment_duration:
2700                     ms_info['segment_duration'] = float(segment_duration)
2701
2702             def extract_Initialization(source):
2703                 initialization = source.find(_add_ns('Initialization'))
2704                 if initialization is not None:
2705                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2706
2707             segment_list = element.find(_add_ns('SegmentList'))
2708             if segment_list is not None:
2709                 extract_common(segment_list)
2710                 extract_Initialization(segment_list)
2711                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2712                 if segment_urls_e:
2713                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2714             else:
2715                 segment_template = element.find(_add_ns('SegmentTemplate'))
2716                 if segment_template is not None:
2717                     extract_common(segment_template)
2718                     media = segment_template.get('media')
2719                     if media:
2720                         ms_info['media'] = media
2721                     initialization = segment_template.get('initialization')
2722                     if initialization:
2723                         ms_info['initialization'] = initialization
2724                     else:
2725                         extract_Initialization(segment_template)
2726             return ms_info
2727
2728         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2729         formats, subtitles = [], {}
2730         stream_numbers = collections.defaultdict(int)
2731         for period in mpd_doc.findall(_add_ns('Period')):
2732             period_duration = parse_duration(period.get('duration')) or mpd_duration
2733             period_ms_info = extract_multisegment_info(period, {
2734                 'start_number': 1,
2735                 'timescale': 1,
2736             })
2737             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2738                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2739                 for representation in adaptation_set.findall(_add_ns('Representation')):
2740                     representation_attrib = adaptation_set.attrib.copy()
2741                     representation_attrib.update(representation.attrib)
2742                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2743                     mime_type = representation_attrib['mimeType']
2744                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2745
2746                     codecs = parse_codecs(representation_attrib.get('codecs', ''))
2747                     if content_type not in ('video', 'audio', 'text'):
2748                         if mime_type == 'image/jpeg':
2749                             content_type = mime_type
2750                         elif codecs['vcodec'] != 'none':
2751                             content_type = 'video'
2752                         elif codecs['acodec'] != 'none':
2753                             content_type = 'audio'
2754                         elif codecs.get('tcodec', 'none') != 'none':
2755                             content_type = 'text'
2756                         elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2757                             content_type = 'text'
2758                         else:
2759                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2760                             continue
2761
2762                     base_url = ''
2763                     for element in (representation, adaptation_set, period, mpd_doc):
2764                         base_url_e = element.find(_add_ns('BaseURL'))
2765                         if base_url_e is not None:
2766                             base_url = base_url_e.text + base_url
2767                             if re.match(r'^https?://', base_url):
2768                                 break
2769                     if mpd_base_url and base_url.startswith('/'):
2770                         base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2771                     elif mpd_base_url and not re.match(r'^https?://', base_url):
2772                         if not mpd_base_url.endswith('/'):
2773                             mpd_base_url += '/'
2774                         base_url = mpd_base_url + base_url
2775                     representation_id = representation_attrib.get('id')
2776                     lang = representation_attrib.get('lang')
2777                     url_el = representation.find(_add_ns('BaseURL'))
2778                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2779                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2780                     if representation_id is not None:
2781                         format_id = representation_id
2782                     else:
2783                         format_id = content_type
2784                     if mpd_id:
2785                         format_id = mpd_id + '-' + format_id
2786                     if content_type in ('video', 'audio'):
2787                         f = {
2788                             'format_id': format_id,
2789                             'manifest_url': mpd_url,
2790                             'ext': mimetype2ext(mime_type),
2791                             'width': int_or_none(representation_attrib.get('width')),
2792                             'height': int_or_none(representation_attrib.get('height')),
2793                             'tbr': float_or_none(bandwidth, 1000),
2794                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2795                             'fps': int_or_none(representation_attrib.get('frameRate')),
2796                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2797                             'format_note': 'DASH %s' % content_type,
2798                             'filesize': filesize,
2799                             'container': mimetype2ext(mime_type) + '_dash',
2800                             **codecs
2801                         }
2802                     elif content_type == 'text':
2803                         f = {
2804                             'ext': mimetype2ext(mime_type),
2805                             'manifest_url': mpd_url,
2806                             'filesize': filesize,
2807                         }
2808                     elif content_type == 'image/jpeg':
2809                         # See test case in VikiIE
2810                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2811                         f = {
2812                             'format_id': format_id,
2813                             'ext': 'mhtml',
2814                             'manifest_url': mpd_url,
2815                             'format_note': 'DASH storyboards (jpeg)',
2816                             'acodec': 'none',
2817                             'vcodec': 'none',
2818                         }
2819                     if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2820                         f['has_drm'] = True
2821                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2822
2823                     def prepare_template(template_name, identifiers):
2824                         tmpl = representation_ms_info[template_name]
2825                         # First of, % characters outside $...$ templates
2826                         # must be escaped by doubling for proper processing
2827                         # by % operator string formatting used further (see
2828                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2829                         t = ''
2830                         in_template = False
2831                         for c in tmpl:
2832                             t += c
2833                             if c == '$':
2834                                 in_template = not in_template
2835                             elif c == '%' and not in_template:
2836                                 t += c
2837                         # Next, $...$ templates are translated to their
2838                         # %(...) counterparts to be used with % operator
2839                         if representation_id is not None:
2840                             t = t.replace('$RepresentationID$', representation_id)
2841                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2842                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2843                         t.replace('$$', '$')
2844                         return t
2845
2846                     # @initialization is a regular template like @media one
2847                     # so it should be handled just the same way (see
2848                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2849                     if 'initialization' in representation_ms_info:
2850                         initialization_template = prepare_template(
2851                             'initialization',
2852                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2853                             # $Time$ shall not be included for @initialization thus
2854                             # only $Bandwidth$ remains
2855                             ('Bandwidth', ))
2856                         representation_ms_info['initialization_url'] = initialization_template % {
2857                             'Bandwidth': bandwidth,
2858                         }
2859
2860                     def location_key(location):
2861                         return 'url' if re.match(r'^https?://', location) else 'path'
2862
2863                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2864
2865                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2866                         media_location_key = location_key(media_template)
2867
2868                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2869                         # can't be used at the same time
2870                         if '%(Number' in media_template and 's' not in representation_ms_info:
2871                             segment_duration = None
2872                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2873                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2874                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2875                             representation_ms_info['fragments'] = [{
2876                                 media_location_key: media_template % {
2877                                     'Number': segment_number,
2878                                     'Bandwidth': bandwidth,
2879                                 },
2880                                 'duration': segment_duration,
2881                             } for segment_number in range(
2882                                 representation_ms_info['start_number'],
2883                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2884                         else:
2885                             # $Number*$ or $Time$ in media template with S list available
2886                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2887                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2888                             representation_ms_info['fragments'] = []
2889                             segment_time = 0
2890                             segment_d = None
2891                             segment_number = representation_ms_info['start_number']
2892
2893                             def add_segment_url():
2894                                 segment_url = media_template % {
2895                                     'Time': segment_time,
2896                                     'Bandwidth': bandwidth,
2897                                     'Number': segment_number,
2898                                 }
2899                                 representation_ms_info['fragments'].append({
2900                                     media_location_key: segment_url,
2901                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2902                                 })
2903
2904                             for num, s in enumerate(representation_ms_info['s']):
2905                                 segment_time = s.get('t') or segment_time
2906                                 segment_d = s['d']
2907                                 add_segment_url()
2908                                 segment_number += 1
2909                                 for r in range(s.get('r', 0)):
2910                                     segment_time += segment_d
2911                                     add_segment_url()
2912                                     segment_number += 1
2913                                 segment_time += segment_d
2914                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2915                         # No media template
2916                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2917                         # or any YouTube dashsegments video
2918                         fragments = []
2919                         segment_index = 0
2920                         timescale = representation_ms_info['timescale']
2921                         for s in representation_ms_info['s']:
2922                             duration = float_or_none(s['d'], timescale)
2923                             for r in range(s.get('r', 0) + 1):
2924                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2925                                 fragments.append({
2926                                     location_key(segment_uri): segment_uri,
2927                                     'duration': duration,
2928                                 })
2929                                 segment_index += 1
2930                         representation_ms_info['fragments'] = fragments
2931                     elif 'segment_urls' in representation_ms_info:
2932                         # Segment URLs with no SegmentTimeline
2933                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2934                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2935                         fragments = []
2936                         segment_duration = float_or_none(
2937                             representation_ms_info['segment_duration'],
2938                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2939                         for segment_url in representation_ms_info['segment_urls']:
2940                             fragment = {
2941                                 location_key(segment_url): segment_url,
2942                             }
2943                             if segment_duration:
2944                                 fragment['duration'] = segment_duration
2945                             fragments.append(fragment)
2946                         representation_ms_info['fragments'] = fragments
2947                     # If there is a fragments key available then we correctly recognized fragmented media.
2948                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2949                     # assumption is not necessarily correct since we may simply have no support for
2950                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2951                     if 'fragments' in representation_ms_info:
2952                         f.update({
2953                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2954                             'url': mpd_url or base_url,
2955                             'fragment_base_url': base_url,
2956                             'fragments': [],
2957                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2958                         })
2959                         if 'initialization_url' in representation_ms_info:
2960                             initialization_url = representation_ms_info['initialization_url']
2961                             if not f.get('url'):
2962                                 f['url'] = initialization_url
2963                             f['fragments'].append({location_key(initialization_url): initialization_url})
2964                         f['fragments'].extend(representation_ms_info['fragments'])
2965                     else:
2966                         # Assuming direct URL to unfragmented media.
2967                         f['url'] = base_url
2968                     if content_type in ('video', 'audio', 'image/jpeg'):
2969                         f['manifest_stream_number'] = stream_numbers[f['url']]
2970                         stream_numbers[f['url']] += 1
2971                         formats.append(f)
2972                     elif content_type == 'text':
2973                         subtitles.setdefault(lang or 'und', []).append(f)
2974
2975         return formats, subtitles
2976
2977     def _extract_ism_formats(self, *args, **kwargs):
2978         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2979         if subs:
2980             self._report_ignoring_subs('ISM')
2981         return fmts
2982
2983     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2984         res = self._download_xml_handle(
2985             ism_url, video_id,
2986             note='Downloading ISM manifest' if note is None else note,
2987             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2988             fatal=fatal, data=data, headers=headers, query=query)
2989         if res is False:
2990             return [], {}
2991         ism_doc, urlh = res
2992         if ism_doc is None:
2993             return [], {}
2994
2995         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2996
2997     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2998         """
2999         Parse formats from ISM manifest.
3000         References:
3001          1. [MS-SSTR]: Smooth Streaming Protocol,
3002             https://msdn.microsoft.com/en-us/library/ff469518.aspx
3003         """
3004         if ism_doc.get('IsLive') == 'TRUE':
3005             return [], {}
3006
3007         duration = int(ism_doc.attrib['Duration'])
3008         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3009
3010         formats = []
3011         subtitles = {}
3012         for stream in ism_doc.findall('StreamIndex'):
3013             stream_type = stream.get('Type')
3014             if stream_type not in ('video', 'audio', 'text'):
3015                 continue
3016             url_pattern = stream.attrib['Url']
3017             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3018             stream_name = stream.get('Name')
3019             stream_language = stream.get('Language', 'und')
3020             for track in stream.findall('QualityLevel'):
3021                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3022                 # TODO: add support for WVC1 and WMAP
3023                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3024                     self.report_warning('%s is not a supported codec' % fourcc)
3025                     continue
3026                 tbr = int(track.attrib['Bitrate']) // 1000
3027                 # [1] does not mention Width and Height attributes. However,
3028                 # they're often present while MaxWidth and MaxHeight are
3029                 # missing, so should be used as fallbacks
3030                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3031                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3032                 sampling_rate = int_or_none(track.get('SamplingRate'))
3033
3034                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3035                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3036
3037                 fragments = []
3038                 fragment_ctx = {
3039                     'time': 0,
3040                 }
3041                 stream_fragments = stream.findall('c')
3042                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3043                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3044                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3045                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3046                     if not fragment_ctx['duration']:
3047                         try:
3048                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3049                         except IndexError:
3050                             next_fragment_time = duration
3051                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3052                     for _ in range(fragment_repeat):
3053                         fragments.append({
3054                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3055                             'duration': fragment_ctx['duration'] / stream_timescale,
3056                         })
3057                         fragment_ctx['time'] += fragment_ctx['duration']
3058
3059                 if stream_type == 'text':
3060                     subtitles.setdefault(stream_language, []).append({
3061                         'ext': 'ismt',
3062                         'protocol': 'ism',
3063                         'url': ism_url,
3064                         'manifest_url': ism_url,
3065                         'fragments': fragments,
3066                         '_download_params': {
3067                             'stream_type': stream_type,
3068                             'duration': duration,
3069                             'timescale': stream_timescale,
3070                             'fourcc': fourcc,
3071                             'language': stream_language,
3072                             'codec_private_data': track.get('CodecPrivateData'),
3073                         }
3074                     })
3075                 elif stream_type in ('video', 'audio'):
3076                     formats.append({
3077                         'format_id': join_nonempty(ism_id, stream_name, tbr),
3078                         'url': ism_url,
3079                         'manifest_url': ism_url,
3080                         'ext': 'ismv' if stream_type == 'video' else 'isma',
3081                         'width': width,
3082                         'height': height,
3083                         'tbr': tbr,
3084                         'asr': sampling_rate,
3085                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
3086                         'acodec': 'none' if stream_type == 'video' else fourcc,
3087                         'protocol': 'ism',
3088                         'fragments': fragments,
3089                         'has_drm': ism_doc.find('Protection') is not None,
3090                         '_download_params': {
3091                             'stream_type': stream_type,
3092                             'duration': duration,
3093                             'timescale': stream_timescale,
3094                             'width': width or 0,
3095                             'height': height or 0,
3096                             'fourcc': fourcc,
3097                             'language': stream_language,
3098                             'codec_private_data': track.get('CodecPrivateData'),
3099                             'sampling_rate': sampling_rate,
3100                             'channels': int_or_none(track.get('Channels', 2)),
3101                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3102                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3103                         },
3104                     })
3105         return formats, subtitles
3106
3107     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
3108         def absolute_url(item_url):
3109             return urljoin(base_url, item_url)
3110
3111         def parse_content_type(content_type):
3112             if not content_type:
3113                 return {}
3114             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3115             if ctr:
3116                 mimetype, codecs = ctr.groups()
3117                 f = parse_codecs(codecs)
3118                 f['ext'] = mimetype2ext(mimetype)
3119                 return f
3120             return {}
3121
3122         def _media_formats(src, cur_media_type, type_info={}):
3123             full_url = absolute_url(src)
3124             ext = type_info.get('ext') or determine_ext(full_url)
3125             if ext == 'm3u8':
3126                 is_plain_url = False
3127                 formats = self._extract_m3u8_formats(
3128                     full_url, video_id, ext='mp4',
3129                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3130                     preference=preference, quality=quality, fatal=False)
3131             elif ext == 'mpd':
3132                 is_plain_url = False
3133                 formats = self._extract_mpd_formats(
3134                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3135             else:
3136                 is_plain_url = True
3137                 formats = [{
3138                     'url': full_url,
3139                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3140                 }]
3141             return is_plain_url, formats
3142
3143         entries = []
3144         # amp-video and amp-audio are very similar to their HTML5 counterparts
3145         # so we wll include them right here (see
3146         # https://www.ampproject.org/docs/reference/components/amp-video)
3147         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3148         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3149         media_tags = [(media_tag, media_tag_name, media_type, '')
3150                       for media_tag, media_tag_name, media_type
3151                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3152         media_tags.extend(re.findall(
3153             # We only allow video|audio followed by a whitespace or '>'.
3154             # Allowing more characters may end up in significant slow down (see
3155             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3156             # http://www.porntrex.com/maps/videositemap.xml).
3157             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3158         for media_tag, _, media_type, media_content in media_tags:
3159             media_info = {
3160                 'formats': [],
3161                 'subtitles': {},
3162             }
3163             media_attributes = extract_attributes(media_tag)
3164             src = strip_or_none(media_attributes.get('src'))
3165             if src:
3166                 _, formats = _media_formats(src, media_type)
3167                 media_info['formats'].extend(formats)
3168             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3169             if media_content:
3170                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3171                     s_attr = extract_attributes(source_tag)
3172                     # data-video-src and data-src are non standard but seen
3173                     # several times in the wild
3174                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3175                     if not src:
3176                         continue
3177                     f = parse_content_type(s_attr.get('type'))
3178                     is_plain_url, formats = _media_formats(src, media_type, f)
3179                     if is_plain_url:
3180                         # width, height, res, label and title attributes are
3181                         # all not standard but seen several times in the wild
3182                         labels = [
3183                             s_attr.get(lbl)
3184                             for lbl in ('label', 'title')
3185                             if str_or_none(s_attr.get(lbl))
3186                         ]
3187                         width = int_or_none(s_attr.get('width'))
3188                         height = (int_or_none(s_attr.get('height'))
3189                                   or int_or_none(s_attr.get('res')))
3190                         if not width or not height:
3191                             for lbl in labels:
3192                                 resolution = parse_resolution(lbl)
3193                                 if not resolution:
3194                                     continue
3195                                 width = width or resolution.get('width')
3196                                 height = height or resolution.get('height')
3197                         for lbl in labels:
3198                             tbr = parse_bitrate(lbl)
3199                             if tbr:
3200                                 break
3201                         else:
3202                             tbr = None
3203                         f.update({
3204                             'width': width,
3205                             'height': height,
3206                             'tbr': tbr,
3207                             'format_id': s_attr.get('label') or s_attr.get('title'),
3208                         })
3209                         f.update(formats[0])
3210                         media_info['formats'].append(f)
3211                     else:
3212                         media_info['formats'].extend(formats)
3213                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3214                     track_attributes = extract_attributes(track_tag)
3215                     kind = track_attributes.get('kind')
3216                     if not kind or kind in ('subtitles', 'captions'):
3217                         src = strip_or_none(track_attributes.get('src'))
3218                         if not src:
3219                             continue
3220                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3221                         media_info['subtitles'].setdefault(lang, []).append({
3222                             'url': absolute_url(src),
3223                         })
3224             for f in media_info['formats']:
3225                 f.setdefault('http_headers', {})['Referer'] = base_url
3226             if media_info['formats'] or media_info['subtitles']:
3227                 entries.append(media_info)
3228         return entries
3229
3230     def _extract_akamai_formats(self, *args, **kwargs):
3231         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3232         if subs:
3233             self._report_ignoring_subs('akamai')
3234         return fmts
3235
3236     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3237         signed = 'hdnea=' in manifest_url
3238         if not signed:
3239             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3240             manifest_url = re.sub(
3241                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3242                 '', manifest_url).strip('?')
3243
3244         formats = []
3245         subtitles = {}
3246
3247         hdcore_sign = 'hdcore=3.7.0'
3248         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3249         hds_host = hosts.get('hds')
3250         if hds_host:
3251             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3252         if 'hdcore=' not in f4m_url:
3253             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3254         f4m_formats = self._extract_f4m_formats(
3255             f4m_url, video_id, f4m_id='hds', fatal=False)
3256         for entry in f4m_formats:
3257             entry.update({'extra_param_to_segment_url': hdcore_sign})
3258         formats.extend(f4m_formats)
3259
3260         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3261         hls_host = hosts.get('hls')
3262         if hls_host:
3263             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3264         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3265             m3u8_url, video_id, 'mp4', 'm3u8_native',
3266             m3u8_id='hls', fatal=False)
3267         formats.extend(m3u8_formats)
3268         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3269
3270         http_host = hosts.get('http')
3271         if http_host and m3u8_formats and not signed:
3272             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3273             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3274             qualities_length = len(qualities)
3275             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3276                 i = 0
3277                 for f in m3u8_formats:
3278                     if f['vcodec'] != 'none':
3279                         for protocol in ('http', 'https'):
3280                             http_f = f.copy()
3281                             del http_f['manifest_url']
3282                             http_url = re.sub(
3283                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3284                             http_f.update({
3285                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3286                                 'url': http_url,
3287                                 'protocol': protocol,
3288                             })
3289                             formats.append(http_f)
3290                         i += 1
3291
3292         return formats, subtitles
3293
3294     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3295         query = compat_urlparse.urlparse(url).query
3296         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3297         mobj = re.search(
3298             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3299         url_base = mobj.group('url')
3300         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3301         formats = []
3302
3303         def manifest_url(manifest):
3304             m_url = '%s/%s' % (http_base_url, manifest)
3305             if query:
3306                 m_url += '?%s' % query
3307             return m_url
3308
3309         if 'm3u8' not in skip_protocols:
3310             formats.extend(self._extract_m3u8_formats(
3311                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3312                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3313         if 'f4m' not in skip_protocols:
3314             formats.extend(self._extract_f4m_formats(
3315                 manifest_url('manifest.f4m'),
3316                 video_id, f4m_id='hds', fatal=False))
3317         if 'dash' not in skip_protocols:
3318             formats.extend(self._extract_mpd_formats(
3319                 manifest_url('manifest.mpd'),
3320                 video_id, mpd_id='dash', fatal=False))
3321         if re.search(r'(?:/smil:|\.smil)', url_base):
3322             if 'smil' not in skip_protocols:
3323                 rtmp_formats = self._extract_smil_formats(
3324                     manifest_url('jwplayer.smil'),
3325                     video_id, fatal=False)
3326                 for rtmp_format in rtmp_formats:
3327                     rtsp_format = rtmp_format.copy()
3328                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3329                     del rtsp_format['play_path']
3330                     del rtsp_format['ext']
3331                     rtsp_format.update({
3332                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3333                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3334                         'protocol': 'rtsp',
3335                     })
3336                     formats.extend([rtmp_format, rtsp_format])
3337         else:
3338             for protocol in ('rtmp', 'rtsp'):
3339                 if protocol not in skip_protocols:
3340                     formats.append({
3341                         'url': '%s:%s' % (protocol, url_base),
3342                         'format_id': protocol,
3343                         'protocol': protocol,
3344                     })
3345         return formats
3346
3347     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3348         mobj = re.search(
3349             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3350             webpage)
3351         if mobj:
3352             try:
3353                 jwplayer_data = self._parse_json(mobj.group('options'),
3354                                                  video_id=video_id,
3355                                                  transform_source=transform_source)
3356             except ExtractorError:
3357                 pass
3358             else:
3359                 if isinstance(jwplayer_data, dict):
3360                     return jwplayer_data
3361
3362     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3363         jwplayer_data = self._find_jwplayer_data(
3364             webpage, video_id, transform_source=js_to_json)
3365         return self._parse_jwplayer_data(
3366             jwplayer_data, video_id, *args, **kwargs)
3367
3368     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3369                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3370         # JWPlayer backward compatibility: flattened playlists
3371         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3372         if 'playlist' not in jwplayer_data:
3373             jwplayer_data = {'playlist': [jwplayer_data]}
3374
3375         entries = []
3376
3377         # JWPlayer backward compatibility: single playlist item
3378         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3379         if not isinstance(jwplayer_data['playlist'], list):
3380             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3381
3382         for video_data in jwplayer_data['playlist']:
3383             # JWPlayer backward compatibility: flattened sources
3384             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3385             if 'sources' not in video_data:
3386                 video_data['sources'] = [video_data]
3387
3388             this_video_id = video_id or video_data['mediaid']
3389
3390             formats = self._parse_jwplayer_formats(
3391                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3392                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3393
3394             subtitles = {}
3395             tracks = video_data.get('tracks')
3396             if tracks and isinstance(tracks, list):
3397                 for track in tracks:
3398                     if not isinstance(track, dict):
3399                         continue
3400                     track_kind = track.get('kind')
3401                     if not track_kind or not isinstance(track_kind, compat_str):
3402                         continue
3403                     if track_kind.lower() not in ('captions', 'subtitles'):
3404                         continue
3405                     track_url = urljoin(base_url, track.get('file'))
3406                     if not track_url:
3407                         continue
3408                     subtitles.setdefault(track.get('label') or 'en', []).append({
3409                         'url': self._proto_relative_url(track_url)
3410                     })
3411
3412             entry = {
3413                 'id': this_video_id,
3414                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3415                 'description': clean_html(video_data.get('description')),
3416                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3417                 'timestamp': int_or_none(video_data.get('pubdate')),
3418                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3419                 'subtitles': subtitles,
3420             }
3421             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3422             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3423                 entry.update({
3424                     '_type': 'url_transparent',
3425                     'url': formats[0]['url'],
3426                 })
3427             else:
3428                 self._sort_formats(formats)
3429                 entry['formats'] = formats
3430             entries.append(entry)
3431         if len(entries) == 1:
3432             return entries[0]
3433         else:
3434             return self.playlist_result(entries)
3435
3436     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3437                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3438         urls = []
3439         formats = []
3440         for source in jwplayer_sources_data:
3441             if not isinstance(source, dict):
3442                 continue
3443             source_url = urljoin(
3444                 base_url, self._proto_relative_url(source.get('file')))
3445             if not source_url or source_url in urls:
3446                 continue
3447             urls.append(source_url)
3448             source_type = source.get('type') or ''
3449             ext = mimetype2ext(source_type) or determine_ext(source_url)
3450             if source_type == 'hls' or ext == 'm3u8':
3451                 formats.extend(self._extract_m3u8_formats(
3452                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3453                     m3u8_id=m3u8_id, fatal=False))
3454             elif source_type == 'dash' or ext == 'mpd':
3455                 formats.extend(self._extract_mpd_formats(
3456                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3457             elif ext == 'smil':
3458                 formats.extend(self._extract_smil_formats(
3459                     source_url, video_id, fatal=False))
3460             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3461             elif source_type.startswith('audio') or ext in (
3462                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3463                 formats.append({
3464                     'url': source_url,
3465                     'vcodec': 'none',
3466                     'ext': ext,
3467                 })
3468             else:
3469                 height = int_or_none(source.get('height'))
3470                 if height is None:
3471                     # Often no height is provided but there is a label in
3472                     # format like "1080p", "720p SD", or 1080.
3473                     height = int_or_none(self._search_regex(
3474                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3475                         'height', default=None))
3476                 a_format = {
3477                     'url': source_url,
3478                     'width': int_or_none(source.get('width')),
3479                     'height': height,
3480                     'tbr': int_or_none(source.get('bitrate')),
3481                     'ext': ext,
3482                 }
3483                 if source_url.startswith('rtmp'):
3484                     a_format['ext'] = 'flv'
3485                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3486                     # of jwplayer.flash.swf
3487                     rtmp_url_parts = re.split(
3488                         r'((?:mp4|mp3|flv):)', source_url, 1)
3489                     if len(rtmp_url_parts) == 3:
3490                         rtmp_url, prefix, play_path = rtmp_url_parts
3491                         a_format.update({
3492                             'url': rtmp_url,
3493                             'play_path': prefix + play_path,
3494                         })
3495                     if rtmp_params:
3496                         a_format.update(rtmp_params)
3497                 formats.append(a_format)
3498         return formats
3499
3500     def _live_title(self, name):
3501         self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3502         return name
3503
3504     def _int(self, v, name, fatal=False, **kwargs):
3505         res = int_or_none(v, **kwargs)
3506         if 'get_attr' in kwargs:
3507             print(getattr(v, kwargs['get_attr']))
3508         if res is None:
3509             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3510             if fatal:
3511                 raise ExtractorError(msg)
3512             else:
3513                 self.report_warning(msg)
3514         return res
3515
3516     def _float(self, v, name, fatal=False, **kwargs):
3517         res = float_or_none(v, **kwargs)
3518         if res is None:
3519             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3520             if fatal:
3521                 raise ExtractorError(msg)
3522             else:
3523                 self.report_warning(msg)
3524         return res
3525
3526     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3527                     path='/', secure=False, discard=False, rest={}, **kwargs):
3528         cookie = compat_cookiejar_Cookie(
3529             0, name, value, port, port is not None, domain, True,
3530             domain.startswith('.'), path, True, secure, expire_time,
3531             discard, None, None, rest)
3532         self._downloader.cookiejar.set_cookie(cookie)
3533
3534     def _get_cookies(self, url):
3535         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3536         req = sanitized_Request(url)
3537         self._downloader.cookiejar.add_cookie_header(req)
3538         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3539
3540     def _apply_first_set_cookie_header(self, url_handle, cookie):
3541         """
3542         Apply first Set-Cookie header instead of the last. Experimental.
3543
3544         Some sites (e.g. [1-3]) may serve two cookies under the same name
3545         in Set-Cookie header and expect the first (old) one to be set rather
3546         than second (new). However, as of RFC6265 the newer one cookie
3547         should be set into cookie store what actually happens.
3548         We will workaround this issue by resetting the cookie to
3549         the first one manually.
3550         1. https://new.vk.com/
3551         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3552         3. https://learning.oreilly.com/
3553         """
3554         for header, cookies in url_handle.headers.items():
3555             if header.lower() != 'set-cookie':
3556                 continue
3557             if sys.version_info[0] >= 3:
3558                 cookies = cookies.encode('iso-8859-1')
3559             cookies = cookies.decode('utf-8')
3560             cookie_value = re.search(
3561                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3562             if cookie_value:
3563                 value, domain = cookie_value.groups()
3564                 self._set_cookie(domain, cookie, value)
3565                 break
3566
3567     def get_testcases(self, include_onlymatching=False):
3568         t = getattr(self, '_TEST', None)
3569         if t:
3570             assert not hasattr(self, '_TESTS'), \
3571                 '%s has _TEST and _TESTS' % type(self).__name__
3572             tests = [t]
3573         else:
3574             tests = getattr(self, '_TESTS', [])
3575         for t in tests:
3576             if not include_onlymatching and t.get('only_matching', False):
3577                 continue
3578             t['name'] = type(self).__name__[:-len('IE')]
3579             yield t
3580
3581     def is_suitable(self, age_limit):
3582         """ Test whether the extractor is generally suitable for the given
3583         age limit (i.e. pornographic sites are not, all others usually are) """
3584
3585         any_restricted = False
3586         for tc in self.get_testcases(include_onlymatching=False):
3587             if tc.get('playlist', []):
3588                 tc = tc['playlist'][0]
3589             is_restricted = age_restricted(
3590                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3591             if not is_restricted:
3592                 return True
3593             any_restricted = any_restricted or is_restricted
3594         return not any_restricted
3595
3596     def extract_subtitles(self, *args, **kwargs):
3597         if (self.get_param('writesubtitles', False)
3598                 or self.get_param('listsubtitles')):
3599             return self._get_subtitles(*args, **kwargs)
3600         return {}
3601
3602     def _get_subtitles(self, *args, **kwargs):
3603         raise NotImplementedError('This method must be implemented by subclasses')
3604
3605     def extract_comments(self, *args, **kwargs):
3606         if not self.get_param('getcomments'):
3607             return None
3608         generator = self._get_comments(*args, **kwargs)
3609
3610         def extractor():
3611             comments = []
3612             interrupted = True
3613             try:
3614                 while True:
3615                     comments.append(next(generator))
3616             except StopIteration:
3617                 interrupted = False
3618             except KeyboardInterrupt:
3619                 self.to_screen('Interrupted by user')
3620             except Exception as e:
3621                 if self.get_param('ignoreerrors') is not True:
3622                     raise
3623                 self._downloader.report_error(e)
3624             comment_count = len(comments)
3625             self.to_screen(f'Extracted {comment_count} comments')
3626             return {
3627                 'comments': comments,
3628                 'comment_count': None if interrupted else comment_count
3629             }
3630         return extractor
3631
3632     def _get_comments(self, *args, **kwargs):
3633         raise NotImplementedError('This method must be implemented by subclasses')
3634
3635     @staticmethod
3636     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3637         """ Merge subtitle items for one language. Items with duplicated URLs
3638         will be dropped. """
3639         list1_urls = set([item['url'] for item in subtitle_list1])
3640         ret = list(subtitle_list1)
3641         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3642         return ret
3643
3644     @classmethod
3645     def _merge_subtitles(cls, *dicts, target=None):
3646         """ Merge subtitle dictionaries, language by language. """
3647         if target is None:
3648             target = {}
3649         for d in dicts:
3650             for lang, subs in d.items():
3651                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3652         return target
3653
3654     def extract_automatic_captions(self, *args, **kwargs):
3655         if (self.get_param('writeautomaticsub', False)
3656                 or self.get_param('listsubtitles')):
3657             return self._get_automatic_captions(*args, **kwargs)
3658         return {}
3659
3660     def _get_automatic_captions(self, *args, **kwargs):
3661         raise NotImplementedError('This method must be implemented by subclasses')
3662
3663     def mark_watched(self, *args, **kwargs):
3664         if not self.get_param('mark_watched', False):
3665             return
3666         if (self._get_login_info()[0] is not None
3667                 or self.get_param('cookiefile')
3668                 or self.get_param('cookiesfrombrowser')):
3669             self._mark_watched(*args, **kwargs)
3670
3671     def _mark_watched(self, *args, **kwargs):
3672         raise NotImplementedError('This method must be implemented by subclasses')
3673
3674     def geo_verification_headers(self):
3675         headers = {}
3676         geo_verification_proxy = self.get_param('geo_verification_proxy')
3677         if geo_verification_proxy:
3678             headers['Ytdl-request-proxy'] = geo_verification_proxy
3679         return headers
3680
3681     def _generic_id(self, url):
3682         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3683
3684     def _generic_title(self, url):
3685         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3686
3687     @staticmethod
3688     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3689         all_known = all(map(
3690             lambda x: x is not None,
3691             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3692         return (
3693             'private' if is_private
3694             else 'premium_only' if needs_premium
3695             else 'subscriber_only' if needs_subscription
3696             else 'needs_auth' if needs_auth
3697             else 'unlisted' if is_unlisted
3698             else 'public' if all_known
3699             else None)
3700
3701     def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3702         '''
3703         @returns            A list of values for the extractor argument given by "key"
3704                             or "default" if no such key is present
3705         @param default      The default value to return when the key is not present (default: [])
3706         @param casesense    When false, the values are converted to lower case
3707         '''
3708         val = traverse_obj(
3709             self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3710         if val is None:
3711             return [] if default is NO_DEFAULT else default
3712         return list(val) if casesense else [x.lower() for x in val]
3713
3714
3715 class SearchInfoExtractor(InfoExtractor):
3716     """
3717     Base class for paged search queries extractors.
3718     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3719     Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3720     """
3721
3722     _MAX_RESULTS = float('inf')
3723
3724     @classmethod
3725     def _make_valid_url(cls):
3726         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3727
3728     def _real_extract(self, query):
3729         prefix, query = self._match_valid_url(query).group('prefix', 'query')
3730         if prefix == '':
3731             return self._get_n_results(query, 1)
3732         elif prefix == 'all':
3733             return self._get_n_results(query, self._MAX_RESULTS)
3734         else:
3735             n = int(prefix)
3736             if n <= 0:
3737                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3738             elif n > self._MAX_RESULTS:
3739                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3740                 n = self._MAX_RESULTS
3741             return self._get_n_results(query, n)
3742
3743     def _get_n_results(self, query, n):
3744         """Get a specified number of results for a query.
3745         Either this function or _search_results must be overridden by subclasses """
3746         return self.playlist_result(
3747             itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3748             query, query)
3749
3750     def _search_results(self, query):
3751         """Returns an iterator of search results"""
3752         raise NotImplementedError('This method must be implemented by subclasses')
3753
3754     @property
3755     def SEARCH_KEY(self):
3756         return self._SEARCH_KEY