]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/common.py
[digitalconcerthall] Add extractor (#1931)
[yt-dlp.git] / yt_dlp / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import collections
6 import hashlib
7 import itertools
8 import json
9 import netrc
10 import os
11 import random
12 import re
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18 compat_cookiejar_Cookie,
19 compat_cookies_SimpleCookie,
20 compat_etree_Element,
21 compat_etree_fromstring,
22 compat_expanduser,
23 compat_getpass,
24 compat_http_client,
25 compat_os_name,
26 compat_str,
27 compat_urllib_error,
28 compat_urllib_parse_unquote,
29 compat_urllib_parse_urlencode,
30 compat_urllib_request,
31 compat_urlparse,
32 compat_xml_parse_error,
33 )
34 from ..downloader import FileDownloader
35 from ..downloader.f4m import (
36 get_base_url,
37 remove_encrypted_media,
38 )
39 from ..utils import (
40 age_restricted,
41 base_url,
42 bug_reports_message,
43 clean_html,
44 compiled_regex_type,
45 determine_ext,
46 determine_protocol,
47 dict_get,
48 error_to_compat_str,
49 extract_attributes,
50 ExtractorError,
51 fix_xml_ampersands,
52 float_or_none,
53 format_field,
54 GeoRestrictedError,
55 GeoUtils,
56 int_or_none,
57 join_nonempty,
58 js_to_json,
59 JSON_LD_RE,
60 mimetype2ext,
61 network_exceptions,
62 NO_DEFAULT,
63 orderedSet,
64 parse_bitrate,
65 parse_codecs,
66 parse_duration,
67 parse_iso8601,
68 parse_m3u8_attributes,
69 parse_resolution,
70 RegexNotFoundError,
71 sanitize_filename,
72 sanitized_Request,
73 str_or_none,
74 str_to_int,
75 strip_or_none,
76 traverse_obj,
77 unescapeHTML,
78 UnsupportedError,
79 unified_strdate,
80 unified_timestamp,
81 update_Request,
82 update_url_query,
83 url_basename,
84 url_or_none,
85 urljoin,
86 variadic,
87 xpath_element,
88 xpath_text,
89 xpath_with_ns,
90 )
91
92
93 class InfoExtractor(object):
94 """Information Extractor class.
95
96 Information extractors are the classes that, given a URL, extract
97 information about the video (or videos) the URL refers to. This
98 information includes the real video URL, the video title, author and
99 others. The information is stored in a dictionary which is then
100 passed to the YoutubeDL. The YoutubeDL processes this
101 information possibly downloading the video to the file system, among
102 other possible outcomes.
103
104 The type field determines the type of the result.
105 By far the most common value (and the default if _type is missing) is
106 "video", which indicates a single video.
107
108 For a video, the dictionaries must include the following fields:
109
110 id: Video identifier.
111 title: Video title, unescaped.
112
113 Additionally, it must contain either a formats entry or a url one:
114
115 formats: A list of dictionaries for each format available, ordered
116 from worst to best quality.
117
118 Potential fields:
119 * url The mandatory URL representing the media:
120 for plain file media - HTTP URL of this file,
121 for RTMP - RTMP URL,
122 for HLS - URL of the M3U8 media playlist,
123 for HDS - URL of the F4M manifest,
124 for DASH
125 - HTTP URL to plain file media (in case of
126 unfragmented media)
127 - URL of the MPD manifest or base URL
128 representing the media if MPD manifest
129 is parsed from a string (in case of
130 fragmented media)
131 for MSS - URL of the ISM manifest.
132 * manifest_url
133 The URL of the manifest file in case of
134 fragmented media:
135 for HLS - URL of the M3U8 master playlist,
136 for HDS - URL of the F4M manifest,
137 for DASH - URL of the MPD manifest,
138 for MSS - URL of the ISM manifest.
139 * ext Will be calculated from URL if missing
140 * format A human-readable description of the format
141 ("mp4 container with h264/opus").
142 Calculated from the format_id, width, height.
143 and format_note fields if missing.
144 * format_id A short description of the format
145 ("mp4_h264_opus" or "19").
146 Technically optional, but strongly recommended.
147 * format_note Additional info about the format
148 ("3D" or "DASH video")
149 * width Width of the video, if known
150 * height Height of the video, if known
151 * resolution Textual description of width and height
152 * dynamic_range The dynamic range of the video. One of:
153 "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
154 * tbr Average bitrate of audio and video in KBit/s
155 * abr Average audio bitrate in KBit/s
156 * acodec Name of the audio codec in use
157 * asr Audio sampling rate in Hertz
158 * vbr Average video bitrate in KBit/s
159 * fps Frame rate
160 * vcodec Name of the video codec in use
161 * container Name of the container format
162 * filesize The number of bytes, if known in advance
163 * filesize_approx An estimate for the number of bytes
164 * player_url SWF Player URL (used for rtmpdump).
165 * protocol The protocol that will be used for the actual
166 download, lower-case. One of "http", "https" or
167 one of the protocols defined in downloader.PROTOCOL_MAP
168 * fragment_base_url
169 Base URL for fragments. Each fragment's path
170 value (if present) will be relative to
171 this URL.
172 * fragments A list of fragments of a fragmented media.
173 Each fragment entry must contain either an url
174 or a path. If an url is present it should be
175 considered by a client. Otherwise both path and
176 fragment_base_url must be present. Here is
177 the list of all potential fields:
178 * "url" - fragment's URL
179 * "path" - fragment's path relative to
180 fragment_base_url
181 * "duration" (optional, int or float)
182 * "filesize" (optional, int)
183 * is_from_start Is a live format that can be downloaded
184 from the start. Boolean
185 * preference Order number of this format. If this field is
186 present and not None, the formats get sorted
187 by this field, regardless of all other values.
188 -1 for default (order by other properties),
189 -2 or smaller for less than default.
190 < -1000 to hide the format (if there is
191 another one which is strictly better)
192 * language Language code, e.g. "de" or "en-US".
193 * language_preference Is this in the language mentioned in
194 the URL?
195 10 if it's what the URL is about,
196 -1 for default (don't know),
197 -10 otherwise, other values reserved for now.
198 * quality Order number of the video quality of this
199 format, irrespective of the file format.
200 -1 for default (order by other properties),
201 -2 or smaller for less than default.
202 * source_preference Order number for this video source
203 (quality takes higher priority)
204 -1 for default (order by other properties),
205 -2 or smaller for less than default.
206 * http_headers A dictionary of additional HTTP headers
207 to add to the request.
208 * stretched_ratio If given and not 1, indicates that the
209 video's pixels are not square.
210 width : height ratio as float.
211 * no_resume The server does not support resuming the
212 (HTTP or RTMP) download. Boolean.
213 * has_drm The format has DRM and cannot be downloaded. Boolean
214 * downloader_options A dictionary of downloader options as
215 described in FileDownloader
216 RTMP formats can also have the additional fields: page_url,
217 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
218 rtmp_protocol, rtmp_real_time
219
220 url: Final video URL.
221 ext: Video filename extension.
222 format: The video format, defaults to ext (used for --get-format)
223 player_url: SWF Player URL (used for rtmpdump).
224
225 The following fields are optional:
226
227 alt_title: A secondary title of the video.
228 display_id An alternative identifier for the video, not necessarily
229 unique, but available before title. Typically, id is
230 something like "4234987", title "Dancing naked mole rats",
231 and display_id "dancing-naked-mole-rats"
232 thumbnails: A list of dictionaries, with the following entries:
233 * "id" (optional, string) - Thumbnail format ID
234 * "url"
235 * "preference" (optional, int) - quality of the image
236 * "width" (optional, int)
237 * "height" (optional, int)
238 * "resolution" (optional, string "{width}x{height}",
239 deprecated)
240 * "filesize" (optional, int)
241 thumbnail: Full URL to a video thumbnail image.
242 description: Full video description.
243 uploader: Full name of the video uploader.
244 license: License name the video is licensed under.
245 creator: The creator of the video.
246 timestamp: UNIX timestamp of the moment the video was uploaded
247 upload_date: Video upload date (YYYYMMDD).
248 If not explicitly set, calculated from timestamp
249 release_timestamp: UNIX timestamp of the moment the video was released.
250 If it is not clear whether to use timestamp or this, use the former
251 release_date: The date (YYYYMMDD) when the video was released.
252 If not explicitly set, calculated from release_timestamp
253 modified_timestamp: UNIX timestamp of the moment the video was last modified.
254 modified_date: The date (YYYYMMDD) when the video was last modified.
255 If not explicitly set, calculated from modified_timestamp
256 uploader_id: Nickname or id of the video uploader.
257 uploader_url: Full URL to a personal webpage of the video uploader.
258 channel: Full name of the channel the video is uploaded on.
259 Note that channel fields may or may not repeat uploader
260 fields. This depends on a particular extractor.
261 channel_id: Id of the channel.
262 channel_url: Full URL to a channel webpage.
263 location: Physical location where the video was filmed.
264 subtitles: The available subtitles as a dictionary in the format
265 {tag: subformats}. "tag" is usually a language code, and
266 "subformats" is a list sorted from lower to higher
267 preference, each element is a dictionary with the "ext"
268 entry and one of:
269 * "data": The subtitles file contents
270 * "url": A URL pointing to the subtitles file
271 It can optionally also have:
272 * "name": Name or description of the subtitles
273 "ext" will be calculated from URL if missing
274 automatic_captions: Like 'subtitles'; contains automatically generated
275 captions instead of normal subtitles
276 duration: Length of the video in seconds, as an integer or float.
277 view_count: How many users have watched the video on the platform.
278 like_count: Number of positive ratings of the video
279 dislike_count: Number of negative ratings of the video
280 repost_count: Number of reposts of the video
281     average_rating: Average rating given by users, the scale used depends on the webpage
282 comment_count: Number of comments on the video
283 comments: A list of comments, each with one or more of the following
284 properties (all but one of text or html optional):
285 * "author" - human-readable name of the comment author
286 * "author_id" - user ID of the comment author
287 * "author_thumbnail" - The thumbnail of the comment author
288 * "id" - Comment ID
289 * "html" - Comment as HTML
290 * "text" - Plain text of the comment
291 * "timestamp" - UNIX timestamp of comment
292 * "parent" - ID of the comment this one is replying to.
293 Set to "root" to indicate that this is a
294 comment to the original video.
295 * "like_count" - Number of positive ratings of the comment
296 * "dislike_count" - Number of negative ratings of the comment
297 * "is_favorited" - Whether the comment is marked as
298 favorite by the video uploader
299 * "author_is_uploader" - Whether the comment is made by
300 the video uploader
301 age_limit: Age restriction for the video, as an integer (years)
302 webpage_url: The URL to the video webpage, if given to yt-dlp it
303 should allow to get the same result again. (It will be set
304 by YoutubeDL if it's missing)
305 categories: A list of categories that the video falls in, for example
306 ["Sports", "Berlin"]
307 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
308 cast: A list of the video cast
309 is_live: True, False, or None (=unknown). Whether this video is a
310 live stream that goes on instead of a fixed-length video.
311 was_live: True, False, or None (=unknown). Whether this video was
312 originally a live stream.
313 live_status: 'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
314 If absent, automatically set from is_live, was_live
315 start_time: Time in seconds where the reproduction should start, as
316 specified in the URL.
317 end_time: Time in seconds where the reproduction should end, as
318 specified in the URL.
319 chapters: A list of dictionaries, with the following entries:
320 * "start_time" - The start time of the chapter in seconds
321 * "end_time" - The end time of the chapter in seconds
322 * "title" (optional, string)
323 playable_in_embed: Whether this video is allowed to play in embedded
324 players on other sites. Can be True (=always allowed),
325 False (=never allowed), None (=unknown), or a string
326 specifying the criteria for embedability (Eg: 'whitelist')
327 availability: Under what condition the video is available. One of
328 'private', 'premium_only', 'subscriber_only', 'needs_auth',
329 'unlisted' or 'public'. Use 'InfoExtractor._availability'
330 to set it
331 __post_extractor: A function to be called just before the metadata is
332 written to either disk, logger or console. The function
333 must return a dict which will be added to the info_dict.
334                         This is useful for additional information that is
335 time-consuming to extract. Note that the fields thus
336 extracted will not be available to output template and
337 match_filter. So, only "comments" and "comment_count" are
338 currently allowed to be extracted via this method.
339
340 The following fields should only be used when the video belongs to some logical
341 chapter or section:
342
343 chapter: Name or title of the chapter the video belongs to.
344 chapter_number: Number of the chapter the video belongs to, as an integer.
345 chapter_id: Id of the chapter the video belongs to, as a unicode string.
346
347 The following fields should only be used when the video is an episode of some
348 series, programme or podcast:
349
350 series: Title of the series or programme the video episode belongs to.
351 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
352 season: Title of the season the video episode belongs to.
353 season_number: Number of the season the video episode belongs to, as an integer.
354 season_id: Id of the season the video episode belongs to, as a unicode string.
355 episode: Title of the video episode. Unlike mandatory video title field,
356 this field should denote the exact title of the video episode
357 without any kind of decoration.
358 episode_number: Number of the video episode within a season, as an integer.
359 episode_id: Id of the video episode, as a unicode string.
360
361 The following fields should only be used when the media is a track or a part of
362 a music album:
363
364 track: Title of the track.
365 track_number: Number of the track within an album or a disc, as an integer.
366 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
367 as a unicode string.
368 artist: Artist(s) of the track.
369 genre: Genre(s) of the track.
370 album: Title of the album the track belongs to.
371 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
372 album_artist: List of all artists appeared on the album (e.g.
373 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
374 and compilations).
375 disc_number: Number of the disc or other physical medium the track belongs to,
376 as an integer.
377 release_year: Year (YYYY) when the album was released.
378 composer: Composer of the piece
379
380 Unless mentioned otherwise, the fields should be Unicode strings.
381
382 Unless mentioned otherwise, None is equivalent to absence of information.
383
384
385 _type "playlist" indicates multiple videos.
386 There must be a key "entries", which is a list, an iterable, or a PagedList
387 object, each element of which is a valid dictionary by this specification.
388
389     Additionally, playlists can have "id", "title", and any other relevant
390 attributes with the same semantics as videos (see above).
391
392 It can also have the following optional fields:
393
394 playlist_count: The total number of videos in a playlist. If not given,
395 YoutubeDL tries to calculate it from "entries"
396
397
398 _type "multi_video" indicates that there are multiple videos that
399     form a single show, for example multiple acts of an opera or TV episode.
400 It must have an entries key like a playlist and contain all the keys
401 required for a video at the same time.
402
403
404 _type "url" indicates that the video must be extracted from another
405 location, possibly by a different extractor. Its only required key is:
406 "url" - the next URL to extract.
407 The key "ie_key" can be set to the class name (minus the trailing "IE",
408 e.g. "Youtube") if the extractor class is known in advance.
409 Additionally, the dictionary may have any properties of the resolved entity
410 known in advance, for example "title" if the title of the referred video is
411 known ahead of time.
412
413
414 _type "url_transparent" entities have the same specification as "url", but
415 indicate that the given additional information is more precise than the one
416 associated with the resolved URL.
417 This is useful when a site employs a video service that hosts the video and
418 its technical metadata, but that video service does not embed a useful
419 title, description etc.
420
421
422 Subclasses of this one should re-define the _real_initialize() and
423 _real_extract() methods and define a _VALID_URL regexp.
424 Probably, they should also be added to the list of extractors.
425
426 Subclasses may also override suitable() if necessary, but ensure the function
427 signature is preserved and that this function imports everything it needs
428 (except other extractors), so that lazy_extractors works correctly
429
430 _GEO_BYPASS attribute may be set to False in order to disable
431 geo restriction bypass mechanisms for a particular extractor.
432 Though it won't disable explicit geo restriction bypass based on
433 country code provided with geo_bypass_country.
434
435 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
436 countries for this extractor. One of these countries will be used by
437 geo restriction bypass mechanism right away in order to bypass
438 geo restriction, of course, if the mechanism is not disabled.
439
440 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
441 IP blocks in CIDR notation for this extractor. One of these IP blocks
442 will be used by geo restriction bypass mechanism similarly
443 to _GEO_COUNTRIES.
444
445 The _WORKING attribute should be set to False for broken IEs
446 in order to warn the users and skip the tests.
447 """
448
    # Per-class/per-instance state defaults:
    _ready = False                  # True once _real_initialize() has run (see initialize())
    _downloader = None              # YoutubeDL instance, attached via set_downloader()
    _x_forwarded_for_ip = None      # faked source IP for geo bypass, if one was chosen
    _GEO_BYPASS = True              # subclasses set False to disable geo-restriction bypass
    _GEO_COUNTRIES = None           # list of presumably geo-unrestricted country codes
    _GEO_IP_BLOCKS = None           # list of presumably geo-unrestricted CIDR IP blocks
    _WORKING = True                 # set False for broken extractors (see working())

    # Messages suggesting how to supply credentials, keyed by which
    # authentication mechanisms the extractor supports.
    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
        'password': 'Use --username and --password, or --netrc to provide account credentials',
    }
464
465 def __init__(self, downloader=None):
466 """Constructor. Receives an optional downloader (a YoutubeDL instance).
467 If a downloader is not passed during initialization,
468 it must be set using "set_downloader()" before "extract()" is called"""
469 self._ready = False
470 self._x_forwarded_for_ip = None
471 self._printed_messages = set()
472 self.set_downloader(downloader)
473
474 @classmethod
475 def _match_valid_url(cls, url):
476 # This does not use has/getattr intentionally - we want to know whether
477 # we have cached the regexp for *this* class, whereas getattr would also
478 # match the superclass
479 if '_VALID_URL_RE' not in cls.__dict__:
480 if '_VALID_URL' not in cls.__dict__:
481 cls._VALID_URL = cls._make_valid_url()
482 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
483 return cls._VALID_URL_RE.match(url)
484
485 @classmethod
486 def suitable(cls, url):
487 """Receives a URL and returns True if suitable for this IE."""
488 # This function must import everything it needs (except other extractors),
489 # so that lazy_extractors works correctly
490 return cls._match_valid_url(url) is not None
491
492 @classmethod
493 def _match_id(cls, url):
494 return cls._match_valid_url(url).group('id')
495
496 @classmethod
497 def get_temp_id(cls, url):
498 try:
499 return cls._match_id(url)
500 except (IndexError, AttributeError):
501 return None
502
503 @classmethod
504 def working(cls):
505 """Getter method for _WORKING."""
506 return cls._WORKING
507
508 def initialize(self):
509 """Initializes an instance (authentication, etc)."""
510 self._printed_messages = set()
511 self._initialize_geo_bypass({
512 'countries': self._GEO_COUNTRIES,
513 'ip_blocks': self._GEO_IP_BLOCKS,
514 })
515 if not self._ready:
516 self._real_initialize()
517 self._ready = True
518
519 def _initialize_geo_bypass(self, geo_bypass_context):
520 """
521 Initialize geo restriction bypass mechanism.
522
523 This method is used to initialize geo bypass mechanism based on faking
524 X-Forwarded-For HTTP header. A random country from provided country list
525 is selected and a random IP belonging to this country is generated. This
526 IP will be passed as X-Forwarded-For HTTP header in all subsequent
527 HTTP requests.
528
529 This method will be used for initial geo bypass mechanism initialization
530 during the instance initialization with _GEO_COUNTRIES and
531 _GEO_IP_BLOCKS.
532
533 You may also manually call it from extractor's code if geo bypass
534 information is not available beforehand (e.g. obtained during
535 extraction) or due to some other reason. In this case you should pass
536 this information in geo bypass context passed as first argument. It may
537 contain following fields:
538
539 countries: List of geo unrestricted countries (similar
540 to _GEO_COUNTRIES)
541 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
542 (similar to _GEO_IP_BLOCKS)
543
544 """
545 if not self._x_forwarded_for_ip:
546
547 # Geo bypass mechanism is explicitly disabled by user
548 if not self.get_param('geo_bypass', True):
549 return
550
551 if not geo_bypass_context:
552 geo_bypass_context = {}
553
554 # Backward compatibility: previously _initialize_geo_bypass
555 # expected a list of countries, some 3rd party code may still use
556 # it this way
557 if isinstance(geo_bypass_context, (list, tuple)):
558 geo_bypass_context = {
559 'countries': geo_bypass_context,
560 }
561
562 # The whole point of geo bypass mechanism is to fake IP
563 # as X-Forwarded-For HTTP header based on some IP block or
564 # country code.
565
566 # Path 1: bypassing based on IP block in CIDR notation
567
568 # Explicit IP block specified by user, use it right away
569 # regardless of whether extractor is geo bypassable or not
570 ip_block = self.get_param('geo_bypass_ip_block', None)
571
572 # Otherwise use random IP block from geo bypass context but only
573 # if extractor is known as geo bypassable
574 if not ip_block:
575 ip_blocks = geo_bypass_context.get('ip_blocks')
576 if self._GEO_BYPASS and ip_blocks:
577 ip_block = random.choice(ip_blocks)
578
579 if ip_block:
580 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
581 self._downloader.write_debug(
582 '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
583 return
584
585 # Path 2: bypassing based on country code
586
587 # Explicit country code specified by user, use it right away
588 # regardless of whether extractor is geo bypassable or not
589 country = self.get_param('geo_bypass_country', None)
590
591 # Otherwise use random country code from geo bypass context but
592 # only if extractor is known as geo bypassable
593 if not country:
594 countries = geo_bypass_context.get('countries')
595 if self._GEO_BYPASS and countries:
596 country = random.choice(countries)
597
598 if country:
599 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
600 self._downloader.write_debug(
601 'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
602
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts.

        On a GeoRestrictedError, one retry is attempted with a faked
        X-Forwarded-For IP (see __maybe_fake_ip_and_retry).  All other
        failures are re-raised as ExtractorError (or a subclass) with the
        extractor's context attached.
        """
        try:
            # At most two attempts: the second only happens after a geo
            # restriction error for which a fake IP could be set up.
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        # Propagate the faked IP so the downloader reuses it
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    # compat option: drop live-chat "subtitles" when requested
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            # Re-raise the same exception type with extractor context
            # (video id, IE name, traceback, countries) filled in.
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'ie': self.IE_NAME,
                'tb': e.traceback or sys.exc_info()[2],
                'expected': e.expected,
                'cause': e.cause
            }
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.msg, **kwargs)
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
641
642 def __maybe_fake_ip_and_retry(self, countries):
643 if (not self.get_param('geo_bypass_country', None)
644 and self._GEO_BYPASS
645 and self.get_param('geo_bypass', True)
646 and not self._x_forwarded_for_ip
647 and countries):
648 country_code = random.choice(countries)
649 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
650 if self._x_forwarded_for_ip:
651 self.report_warning(
652 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
653 % (self._x_forwarded_for_ip, country_code.upper()))
654 return True
655 return False
656
657 def set_downloader(self, downloader):
658 """Sets the downloader for this IE."""
659 self._downloader = downloader
660
661 def _real_initialize(self):
662 """Real initialization process. Redefine in subclasses."""
663 pass
664
665 def _real_extract(self, url):
666 """Real extraction process. Redefine in subclasses."""
667 pass
668
669 @classmethod
670 def ie_key(cls):
671 """A string for getting the InfoExtractor with get_info_extractor"""
672 return cls.__name__[:-2]
673
674 @property
675 def IE_NAME(self):
676 return compat_str(type(self).__name__[:-2])
677
678 @staticmethod
679 def __can_accept_status_code(err, expected_status):
680 assert isinstance(err, compat_urllib_error.HTTPError)
681 if expected_status is None:
682 return False
683 elif callable(expected_status):
684 return expected_status(err.code) is True
685 else:
686 return err.code in variadic(expected_status)
687
688 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
689 """
690 Return the response handle.
691
692 See _download_webpage docstring for arguments specification.
693 """
694 if not self._downloader._first_webpage_request:
695 sleep_interval = self.get_param('sleep_interval_requests') or 0
696 if sleep_interval > 0:
697 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
698 time.sleep(sleep_interval)
699 else:
700 self._downloader._first_webpage_request = False
701
702 if note is None:
703 self.report_download_webpage(video_id)
704 elif note is not False:
705 if video_id is None:
706 self.to_screen('%s' % (note,))
707 else:
708 self.to_screen('%s: %s' % (video_id, note))
709
710 # Some sites check X-Forwarded-For HTTP header in order to figure out
711 # the origin of the client behind proxy. This allows bypassing geo
712 # restriction by faking this header's value to IP that belongs to some
713 # geo unrestricted country. We will do so once we encounter any
714 # geo restriction error.
715 if self._x_forwarded_for_ip:
716 if 'X-Forwarded-For' not in headers:
717 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
718
719 if isinstance(url_or_request, compat_urllib_request.Request):
720 url_or_request = update_Request(
721 url_or_request, data=data, headers=headers, query=query)
722 else:
723 if query:
724 url_or_request = update_url_query(url_or_request, query)
725 if data is not None or headers:
726 url_or_request = sanitized_Request(url_or_request, data, headers)
727 try:
728 return self._downloader.urlopen(url_or_request)
729 except network_exceptions as err:
730 if isinstance(err, compat_urllib_error.HTTPError):
731 if self.__can_accept_status_code(err, expected_status):
732 # Retain reference to error to prevent file object from
733 # being closed before it can be read. Works around the
734 # effects of <https://bugs.python.org/issue15002>
735 # introduced in Python 3.4.1.
736 err.fp._error = err
737 return err.fp
738
739 if errnote is False:
740 return False
741 if errnote is None:
742 errnote = 'Unable to download webpage'
743
744 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
745 if fatal:
746 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
747 else:
748 self.report_warning(errmsg)
749 return False
750
751 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
752 """
753 Return a tuple (page content as string, URL handle).
754
755 See _download_webpage docstring for arguments specification.
756 """
757 # Strip hashes from the URL (#1038)
758 if isinstance(url_or_request, (compat_str, str)):
759 url_or_request = url_or_request.partition('#')[0]
760
761 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
762 if urlh is False:
763 assert not fatal
764 return False
765 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
766 return (content, urlh)
767
768 @staticmethod
769 def _guess_encoding_from_content(content_type, webpage_bytes):
770 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
771 if m:
772 encoding = m.group(1)
773 else:
774 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
775 webpage_bytes[:1024])
776 if m:
777 encoding = m.group(1).decode('ascii')
778 elif webpage_bytes.startswith(b'\xff\xfe'):
779 encoding = 'utf-16'
780 else:
781 encoding = 'utf-8'
782
783 return encoding
784
785 def __check_blocked(self, content):
786 first_block = content[:512]
787 if ('<title>Access to this site is blocked</title>' in content
788 and 'Websense' in first_block):
789 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
790 blocked_iframe = self._html_search_regex(
791 r'<iframe src="([^"]+)"', content,
792 'Websense information URL', default=None)
793 if blocked_iframe:
794 msg += ' Visit %s for more details' % blocked_iframe
795 raise ExtractorError(msg, expected=True)
796 if '<title>The URL you requested has been blocked</title>' in first_block:
797 msg = (
798 'Access to this webpage has been blocked by Indian censorship. '
799 'Use a VPN or proxy server (with --proxy) to route around it.')
800 block_msg = self._html_search_regex(
801 r'</h1><p>(.*?)</p>',
802 content, 'block message', default=None)
803 if block_msg:
804 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
805 raise ExtractorError(msg, expected=True)
806 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
807 and 'blocklist.rkn.gov.ru' in content):
808 raise ExtractorError(
809 'Access to this webpage has been blocked by decision of the Russian government. '
810 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
811 expected=True)
812
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read and decode the response body of `urlh` into a string.

        Honors --dump-pages (base64 dump to screen) and --write-pages (save
        the raw body to a .dump file).  `prefix` bytes, if given, are
        prepended before decoding; `encoding` overrides charset detection.
        Raises ExtractorError (via __check_blocked) on known block pages.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            trim_length = self.get_param('trim_file_name') or 240
            if len(basen) > trim_length:
                # Keep truncated names unique by appending a hash of the full name
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:trim_length - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name (bad header/meta charset) - fall back to UTF-8
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
850
851 def _download_webpage(
852 self, url_or_request, video_id, note=None, errnote=None,
853 fatal=True, tries=1, timeout=5, encoding=None, data=None,
854 headers={}, query={}, expected_status=None):
855 """
856 Return the data of the page as a string.
857
858 Arguments:
859 url_or_request -- plain text URL as a string or
860 a compat_urllib_request.Requestobject
861 video_id -- Video/playlist/item identifier (string)
862
863 Keyword arguments:
864 note -- note printed before downloading (string)
865 errnote -- note printed in case of an error (string)
866 fatal -- flag denoting whether error should be considered fatal,
867 i.e. whether it should cause ExtractionError to be raised,
868 otherwise a warning will be reported and extraction continued
869 tries -- number of tries
870 timeout -- sleep interval between tries
871 encoding -- encoding for a page content decoding, guessed automatically
872 when not explicitly specified
873 data -- POST data (bytes)
874 headers -- HTTP headers (dict)
875 query -- URL query (dict)
876 expected_status -- allows to accept failed HTTP requests (non 2xx
877 status code) by explicitly specifying a set of accepted status
878 codes. Can be any of the following entities:
879 - an integer type specifying an exact failed status code to
880 accept
881 - a list or a tuple of integer types specifying a list of
882 failed status codes to accept
883 - a callable accepting an actual failed status code and
884 returning True if it should be accepted
885 Note that this argument does not affect success status codes (2xx)
886 which are always accepted.
887 """
888
889 success = False
890 try_count = 0
891 while success is False:
892 try:
893 res = self._download_webpage_handle(
894 url_or_request, video_id, note, errnote, fatal,
895 encoding=encoding, data=data, headers=headers, query=query,
896 expected_status=expected_status)
897 success = True
898 except compat_http_client.IncompleteRead as e:
899 try_count += 1
900 if try_count >= tries:
901 raise e
902 self._sleep(timeout, video_id)
903 if res is False:
904 return res
905 else:
906 content, _ = res
907 return content
908
909 def _download_xml_handle(
910 self, url_or_request, video_id, note='Downloading XML',
911 errnote='Unable to download XML', transform_source=None,
912 fatal=True, encoding=None, data=None, headers={}, query={},
913 expected_status=None):
914 """
915 Return a tuple (xml as an compat_etree_Element, URL handle).
916
917 See _download_webpage docstring for arguments specification.
918 """
919 res = self._download_webpage_handle(
920 url_or_request, video_id, note, errnote, fatal=fatal,
921 encoding=encoding, data=data, headers=headers, query=query,
922 expected_status=expected_status)
923 if res is False:
924 return res
925 xml_string, urlh = res
926 return self._parse_xml(
927 xml_string, video_id, transform_source=transform_source,
928 fatal=fatal), urlh
929
930 def _download_xml(
931 self, url_or_request, video_id,
932 note='Downloading XML', errnote='Unable to download XML',
933 transform_source=None, fatal=True, encoding=None,
934 data=None, headers={}, query={}, expected_status=None):
935 """
936 Return the xml as an compat_etree_Element.
937
938 See _download_webpage docstring for arguments specification.
939 """
940 res = self._download_xml_handle(
941 url_or_request, video_id, note=note, errnote=errnote,
942 transform_source=transform_source, fatal=fatal, encoding=encoding,
943 data=data, headers=headers, query=query,
944 expected_status=expected_status)
945 return res if res is False else res[0]
946
947 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
948 if transform_source:
949 xml_string = transform_source(xml_string)
950 try:
951 return compat_etree_fromstring(xml_string.encode('utf-8'))
952 except compat_xml_parse_error as ve:
953 errmsg = '%s: Failed to parse XML ' % video_id
954 if fatal:
955 raise ExtractorError(errmsg, cause=ve)
956 else:
957 self.report_warning(errmsg + str(ve))
958
959 def _download_json_handle(
960 self, url_or_request, video_id, note='Downloading JSON metadata',
961 errnote='Unable to download JSON metadata', transform_source=None,
962 fatal=True, encoding=None, data=None, headers={}, query={},
963 expected_status=None):
964 """
965 Return a tuple (JSON object, URL handle).
966
967 See _download_webpage docstring for arguments specification.
968 """
969 res = self._download_webpage_handle(
970 url_or_request, video_id, note, errnote, fatal=fatal,
971 encoding=encoding, data=data, headers=headers, query=query,
972 expected_status=expected_status)
973 if res is False:
974 return res
975 json_string, urlh = res
976 return self._parse_json(
977 json_string, video_id, transform_source=transform_source,
978 fatal=fatal), urlh
979
980 def _download_json(
981 self, url_or_request, video_id, note='Downloading JSON metadata',
982 errnote='Unable to download JSON metadata', transform_source=None,
983 fatal=True, encoding=None, data=None, headers={}, query={},
984 expected_status=None):
985 """
986 Return the JSON object as a dict.
987
988 See _download_webpage docstring for arguments specification.
989 """
990 res = self._download_json_handle(
991 url_or_request, video_id, note=note, errnote=errnote,
992 transform_source=transform_source, fatal=fatal, encoding=encoding,
993 data=data, headers=headers, query=query,
994 expected_status=expected_status)
995 return res if res is False else res[0]
996
997 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
998 if transform_source:
999 json_string = transform_source(json_string)
1000 try:
1001 return json.loads(json_string)
1002 except ValueError as ve:
1003 errmsg = '%s: Failed to parse JSON ' % video_id
1004 if fatal:
1005 raise ExtractorError(errmsg, cause=ve)
1006 else:
1007 self.report_warning(errmsg + str(ve))
1008
1009 def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
1010 return self._parse_json(
1011 data[data.find('{'):data.rfind('}') + 1],
1012 video_id, transform_source, fatal)
1013
1014 def _download_socket_json_handle(
1015 self, url_or_request, video_id, note='Polling socket',
1016 errnote='Unable to poll socket', transform_source=None,
1017 fatal=True, encoding=None, data=None, headers={}, query={},
1018 expected_status=None):
1019 """
1020 Return a tuple (JSON object, URL handle).
1021
1022 See _download_webpage docstring for arguments specification.
1023 """
1024 res = self._download_webpage_handle(
1025 url_or_request, video_id, note, errnote, fatal=fatal,
1026 encoding=encoding, data=data, headers=headers, query=query,
1027 expected_status=expected_status)
1028 if res is False:
1029 return res
1030 webpage, urlh = res
1031 return self._parse_socket_response_as_json(
1032 webpage, video_id, transform_source=transform_source,
1033 fatal=fatal), urlh
1034
1035 def _download_socket_json(
1036 self, url_or_request, video_id, note='Polling socket',
1037 errnote='Unable to poll socket', transform_source=None,
1038 fatal=True, encoding=None, data=None, headers={}, query={},
1039 expected_status=None):
1040 """
1041 Return the JSON object as a dict.
1042
1043 See _download_webpage docstring for arguments specification.
1044 """
1045 res = self._download_socket_json_handle(
1046 url_or_request, video_id, note=note, errnote=errnote,
1047 transform_source=transform_source, fatal=fatal, encoding=encoding,
1048 data=data, headers=headers, query=query,
1049 expected_status=expected_status)
1050 return res if res is False else res[0]
1051
1052 def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1053 idstr = format_field(video_id, template='%s: ')
1054 msg = f'[{self.IE_NAME}] {idstr}{msg}'
1055 if only_once:
1056 if f'WARNING: {msg}' in self._printed_messages:
1057 return
1058 self._printed_messages.add(f'WARNING: {msg}')
1059 self._downloader.report_warning(msg, *args, **kwargs)
1060
1061 def to_screen(self, msg, *args, **kwargs):
1062 """Print msg to screen, prefixing it with '[ie_name]'"""
1063 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1064
1065 def write_debug(self, msg, *args, **kwargs):
1066 self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1067
1068 def get_param(self, name, default=None, *args, **kwargs):
1069 if self._downloader:
1070 return self._downloader.params.get(name, default, *args, **kwargs)
1071 return default
1072
    def report_drm(self, video_id, partial=False):
        """Signal that the video is DRM protected, via raise_no_formats
        (which only warns when metadata-only extraction is allowed).

        Note: `partial` is accepted but not used in this implementation.
        """
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1075
1076 def report_extraction(self, id_or_name):
1077 """Report information extraction."""
1078 self.to_screen('%s: Extracting information' % id_or_name)
1079
1080 def report_download_webpage(self, video_id):
1081 """Report webpage download."""
1082 self.to_screen('%s: Downloading webpage' % video_id)
1083
    def report_age_confirmation(self):
        """Report an attempt to confirm the user's age (age-gated content)."""
        self.to_screen('Confirming age')
1087
    def report_login(self):
        """Report an attempt to log in to the site."""
        self.to_screen('Logging in')
1091
1092 def raise_login_required(
1093 self, msg='This video is only available for registered users',
1094 metadata_available=False, method='any'):
1095 if metadata_available and (
1096 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1097 self.report_warning(msg)
1098 if method is not None:
1099 msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1100 raise ExtractorError(msg, expected=True)
1101
1102 def raise_geo_restricted(
1103 self, msg='This video is not available from your location due to geo restriction',
1104 countries=None, metadata_available=False):
1105 if metadata_available and (
1106 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1107 self.report_warning(msg)
1108 else:
1109 raise GeoRestrictedError(msg, countries=countries)
1110
1111 def raise_no_formats(self, msg, expected=False, video_id=None):
1112 if expected and (
1113 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1114 self.report_warning(msg, video_id)
1115 elif isinstance(msg, ExtractorError):
1116 raise msg
1117 else:
1118 raise ExtractorError(msg, expected=expected, video_id=video_id)
1119
1120 # Methods for following #608
1121 @staticmethod
1122 def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
1123 """Returns a URL that points to a page that should be processed"""
1124 # TODO: ie should be the class used for getting the info
1125 video_info = {'_type': 'url',
1126 'url': url,
1127 'ie_key': ie}
1128 video_info.update(kwargs)
1129 if video_id is not None:
1130 video_info['id'] = video_id
1131 if video_title is not None:
1132 video_info['title'] = video_title
1133 return video_info
1134
1135 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1136 urls = orderedSet(
1137 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1138 for m in matches)
1139 return self.playlist_result(
1140 urls, playlist_id=playlist_id, playlist_title=playlist_title)
1141
1142 @staticmethod
1143 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1144 """Returns a playlist"""
1145 video_info = {'_type': 'playlist',
1146 'entries': entries}
1147 video_info.update(kwargs)
1148 if playlist_id:
1149 video_info['id'] = playlist_id
1150 if playlist_title:
1151 video_info['title'] = playlist_title
1152 if playlist_description is not None:
1153 video_info['description'] = playlist_description
1154 return video_info
1155
1156 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1157 """
1158 Perform a regex search on the given string, using a single or a list of
1159 patterns returning the first matching group.
1160 In case of failure return a default value or raise a WARNING or a
1161 RegexNotFoundError, depending on fatal, specifying the field name.
1162 """
1163 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1164 mobj = re.search(pattern, string, flags)
1165 else:
1166 for p in pattern:
1167 mobj = re.search(p, string, flags)
1168 if mobj:
1169 break
1170
1171 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1172
1173 if mobj:
1174 if group is None:
1175 # return the first matching group
1176 return next(g for g in mobj.groups() if g is not None)
1177 elif isinstance(group, (list, tuple)):
1178 return tuple(mobj.group(g) for g in group)
1179 else:
1180 return mobj.group(group)
1181 elif default is not NO_DEFAULT:
1182 return default
1183 elif fatal:
1184 raise RegexNotFoundError('Unable to extract %s' % _name)
1185 else:
1186 self.report_warning('unable to extract %s' % _name + bug_reports_message())
1187 return None
1188
1189 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1190 """
1191 Like _search_regex, but strips HTML tags and unescapes entities.
1192 """
1193 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1194 if res:
1195 return clean_html(res).strip()
1196 else:
1197 return res
1198
1199 def _get_netrc_login_info(self, netrc_machine=None):
1200 username = None
1201 password = None
1202 netrc_machine = netrc_machine or self._NETRC_MACHINE
1203
1204 if self.get_param('usenetrc', False):
1205 try:
1206 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1207 if os.path.isdir(netrc_file):
1208 netrc_file = os.path.join(netrc_file, '.netrc')
1209 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1210 if info is not None:
1211 username = info[0]
1212 password = info[2]
1213 else:
1214 raise netrc.NetrcParseError(
1215 'No authenticators for %s' % netrc_machine)
1216 except (IOError, netrc.NetrcParseError) as err:
1217 self.report_warning(
1218 'parsing .netrc: %s' % error_to_compat_str(err))
1219
1220 return username, password
1221
1222 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1223 """
1224 Get the login info as (username, password)
1225 First look for the manually specified credentials using username_option
1226 and password_option as keys in params dictionary. If no such credentials
1227 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1228 value.
1229 If there's no info available, return (None, None)
1230 """
1231
1232 # Attempt to use provided username and password or .netrc data
1233 username = self.get_param(username_option)
1234 if username is not None:
1235 password = self.get_param(password_option)
1236 else:
1237 username, password = self._get_netrc_login_info(netrc_machine)
1238
1239 return username, password
1240
1241 def _get_tfa_info(self, note='two-factor verification code'):
1242 """
1243 Get the two-factor authentication info
1244 TODO - asking the user will be required for sms/phone verify
1245 currently just uses the command line option
1246 If there's no info available, return None
1247 """
1248
1249 tfa = self.get_param('twofactor')
1250 if tfa is not None:
1251 return tfa
1252
1253 return compat_getpass('Type %s and press [Return]: ' % note)
1254
1255 # Helper functions for extracting OpenGraph info
1256 @staticmethod
1257 def _og_regexes(prop):
1258 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1259 property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1260 % {'prop': re.escape(prop)})
1261 template = r'<meta[^>]+?%s[^>]+?%s'
1262 return [
1263 template % (property_re, content_re),
1264 template % (content_re, property_re),
1265 ]
1266
    @staticmethod
    def _meta_regex(prop):
        """Build a verbose (re.X) regex matching a <meta> tag whose
        itemprop/name/property/id/http-equiv attribute equals `prop`;
        the value is captured in the named group 'content'."""
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1272
1273 def _og_search_property(self, prop, html, name=None, **kargs):
1274 prop = variadic(prop)
1275 if name is None:
1276 name = 'OpenGraph %s' % prop[0]
1277 og_regexes = []
1278 for p in prop:
1279 og_regexes.extend(self._og_regexes(p))
1280 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1281 if escaped is None:
1282 return None
1283 return unescapeHTML(escaped)
1284
    def _og_search_thumbnail(self, html, **kargs):
        """Extract the og:image thumbnail URL (non-fatal; None when absent)."""
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1287
    def _og_search_description(self, html, **kargs):
        """Extract the og:description text (non-fatal; None when absent)."""
        return self._og_search_property('description', html, fatal=False, **kargs)
1290
    def _og_search_title(self, html, **kargs):
        """Extract the og:title text (fatal by default, unlike the other og helpers)."""
        return self._og_search_property('title', html, **kargs)
1293
1294 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1295 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1296 if secure:
1297 regexes = self._og_regexes('video:secure_url') + regexes
1298 return self._html_search_regex(regexes, html, name, **kargs)
1299
    def _og_search_url(self, html, **kargs):
        """Extract the canonical page URL from the og:url property."""
        return self._og_search_property('url', html, **kargs)
1302
1303 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1304 name = variadic(name)
1305 if display_name is None:
1306 display_name = name[0]
1307 return self._html_search_regex(
1308 [self._meta_regex(n) for n in name],
1309 html, display_name, fatal=fatal, group='content', **kwargs)
1310
    def _dc_search_uploader(self, html):
        """Extract the uploader from a Dublin Core <meta name="dc.creator"> tag."""
        return self._html_search_meta('dc.creator', html, 'uploader')
1313
1314 def _rta_search(self, html):
1315 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1316 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1317 r' content="RTA-5042-1996-1400-1577-RTA"',
1318 html):
1319 return 18
1320 return 0
1321
1322 def _media_rating_search(self, html):
1323 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1324 rating = self._html_search_meta('rating', html)
1325
1326 if not rating:
1327 return None
1328
1329 RATING_TABLE = {
1330 'safe for kids': 0,
1331 'general': 8,
1332 '14 years': 14,
1333 'mature': 17,
1334 'restricted': 19,
1335 }
1336 return RATING_TABLE.get(rating.lower())
1337
1338 def _family_friendly_search(self, html):
1339 # See http://schema.org/VideoObject
1340 family_friendly = self._html_search_meta(
1341 'isFamilyFriendly', html, default=None)
1342
1343 if not family_friendly:
1344 return None
1345
1346 RATING_TABLE = {
1347 '1': 0,
1348 'true': 0,
1349 '0': 18,
1350 'false': 18,
1351 }
1352 return RATING_TABLE.get(family_friendly.lower())
1353
    def _twitter_search_player(self, html):
        """Extract the player URL from a Twitter card <meta name="twitter:player"> tag."""
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')
1357
    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        """Find all JSON-LD script blocks in `html` and merge them into an
        info dict via _json_ld.  Supports `default` and `fatal` keyword
        arguments with _search_regex-like semantics."""
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            # A single script block may hold one object or a list of them
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}
1386
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Convert extracted JSON-LD data (string, dict, or list of dicts)
        into an info dict.  When `expected_type` is given, only entries of
        that @type contribute; returns {} when nothing usable is found."""
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Maps schema.org interaction @type (last URL path component) to the
        # yt-dlp *_count field name
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            # interactionType may be a plain string or a nested typed object
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Populate info's *_count fields from InteractionCounter entries
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # First counter of each kind wins
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Build chapters from hasPart Clip entries, filling missing
            # start/end times from the neighbouring chapters
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in e.get('hasPart', []) if part.get('@type') == 'Clip']
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # The last chapter's end defaults to the video duration
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': url_or_none(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            for e in json_ld:
                # Top-level entries must declare @context to count as JSON-LD
                if at_top_level and '@context' not in e:
                    continue
                # Pure @graph wrappers: recurse into the contained nodes
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                    break
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                # Other types may still embed a VideoObject under 'video'
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        traverse_json_ld(json_ld)

        return dict((k, v) for k, v in info.items() if v is not None)
1545
1546 def _search_nextjs_data(self, webpage, video_id, **kw):
1547 return self._parse_json(
1548 self._search_regex(
1549 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1550 webpage, 'next.js data', **kw),
1551 video_id, **kw)
1552
    def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
        ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
        # not all website do this, but it can be changed
        # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
        rectx = re.escape(context_name)
        # Match `window.__NUXT__=(function(a,b,...){return {...}}(v1,v2,...))`,
        # capturing the returned object literal plus the IIFE's parameter
        # names and argument values
        js, arg_keys, arg_vals = self._search_regex(
            (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
             r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
            webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])

        args = dict(zip(arg_keys.split(','), arg_vals.split(',')))

        # JS `undefined`/`void 0` have no JSON equivalent - substitute null
        for key, val in args.items():
            if val in ('undefined', 'void 0'):
                args[key] = 'null'

        return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1570
1571 @staticmethod
1572 def _hidden_inputs(html):
1573 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1574 hidden_inputs = {}
1575 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1576 attrs = extract_attributes(input)
1577 if not input:
1578 continue
1579 if attrs.get('type') not in ('hidden', 'submit'):
1580 continue
1581 name = attrs.get('name') or attrs.get('id')
1582 value = attrs.get('value')
1583 if name and value is not None:
1584 hidden_inputs[name] = value
1585 return hidden_inputs
1586
1587 def _form_hidden_inputs(self, form_id, html):
1588 form = self._search_regex(
1589 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1590 html, '%s form' % form_id, group='form')
1591 return self._hidden_inputs(form)
1592
    class FormatSort:
        """Computes per-format sort keys for _sort_formats.

        The effective order is built from forced defaults, user preferences
        (params['format_sort']), extractor preferences and finally `default`.
        Each specifier matches `regex`: an optional '+' reverses the field,
        ':limit' / '~limit' attach a limit (with '~' preferring values closest
        to the limit).
        """
        # One field-preference specifier: [+]field[~:limit]
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        # Order approximating youtube-dl's sorting, selectable for compatibility
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')

        # Per-field configuration; missing keys are filled lazily by
        # _get_field_setting. 'ordered' fields rank values by their position
        # in 'order' (optionally as regexes); 'combined'/'multiple' fields
        # expand to several underlying fields; 'alias' redirects to another.
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     # used instead of 'order' when prefer_free_formats is set
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           # 1 when the format has any stream at all, else 0
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    # the smaller non-zero dimension, or 0 when both are missing
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # For compatibility with youtube-dl
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},

            # Deprecated
            'dimension': {'type': 'alias', 'field': 'res'},
            'resolution': {'type': 'alias', 'field': 'res'},
            'extension': {'type': 'alias', 'field': 'ext'},
            'bitrate': {'type': 'alias', 'field': 'br'},
            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
            'framerate': {'type': 'alias', 'field': 'fps'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'filesize_estimate': {'type': 'alias', 'field': 'size'},
            'samplerate': {'type': 'alias', 'field': 'asr'},
            'video_ext': {'type': 'alias', 'field': 'vext'},
            'audio_ext': {'type': 'alias', 'field': 'aext'},
            'video_codec': {'type': 'alias', 'field': 'vcodec'},
            'audio_codec': {'type': 'alias', 'field': 'acodec'},
            'video': {'type': 'alias', 'field': 'hasvid'},
            'has_video': {'type': 'alias', 'field': 'hasvid'},
            'audio': {'type': 'alias', 'field': 'hasaud'},
            'has_audio': {'type': 'alias', 'field': 'hasaud'},
            'extractor': {'type': 'alias', 'field': 'ie_pref'},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
        }

        def __init__(self, ie, field_preference):
            """Build the sort order from downloader params and the extractor's field_preference."""
            self._order = []
            self.ydl = ie._downloader
            self.evaluate_params(self.ydl.params, field_preference)
            if ie.get_param('verbose'):
                self.print_verbose_info(self.ydl.write_debug)

        def _get_field_setting(self, field, key):
            """Return setting *key* of *field*, lazily computing a default.

            Unknown fields produce a deprecation warning and get an empty
            settings entry, so arbitrary format keys still sort.
            """
            if field not in self.settings:
                if key in ('forced', 'priority'):
                    return False
                self.ydl.deprecation_warning(
                    f'Using arbitrary fields ({field}) for format sorting is deprecated '
                    'and may be removed in a future version')
                self.settings[field] = {}
            propObj = self.settings[field]
            if key not in propObj:
                type = propObj.get('type')
                if key == 'field':
                    default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
                elif key == 'convert':
                    default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
                else:
                    default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
                propObj[key] = default
            return propObj[key]

        def _resolve_field_value(self, field, value, convertNone=False):
            """Normalize *value* for *field* according to its 'convert' setting.

            For 'ordered' fields returns a rank derived from the position in
            the (possibly regex-based) order list; higher means earlier in
            the list. None is kept as None unless convertNone is set.
            """
            if value is None:
                if not convertNone:
                    return None
            else:
                value = value.lower()
            conversion = self._get_field_setting(field, 'convert')
            if conversion == 'ignore':
                return None
            if conversion == 'string':
                return value
            elif conversion == 'float_none':
                return float_or_none(value)
            elif conversion == 'bytes':
                return FileDownloader.parse_bytes(value)
            elif conversion == 'order':
                order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
                use_regex = self._get_field_setting(field, 'regex')
                list_length = len(order_list)
                empty_pos = order_list.index('') if '' in order_list else list_length + 1
                if use_regex and value is not None:
                    for i, regex in enumerate(order_list):
                        if regex and re.match(regex, value):
                            return list_length - i
                    return list_length - empty_pos  # not in list
                else:  # not regex or value = None
                    return list_length - (order_list.index(value) if value in order_list else empty_pos)
            else:
                # 'float_string': numeric strings become floats; anything else
                # demotes the whole field to string comparison from now on
                if value.isnumeric():
                    return float(value)
                else:
                    self.settings[field]['convert'] = 'string'
                    return value

        def evaluate_params(self, params, sort_extractor):
            """Populate self._order and per-field reverse/limit data.

            Precedence: forced defaults, then (unless format_sort_force)
            priority defaults, then user sort, then extractor sort, then the
            remaining defaults. The first occurrence of a field wins.
            """
            self._use_free_order = params.get('prefer_free_formats', False)
            self._sort_user = params.get('format_sort', [])
            self._sort_extractor = sort_extractor

            def add_item(field, reverse, closest, limit_text):
                # Record one field (first occurrence only) with its modifiers
                field = field.lower()
                if field in self._order:
                    return
                self._order.append(field)
                limit = self._resolve_field_value(field, limit_text)
                data = {
                    'reverse': reverse,
                    'closest': False if limit is None else closest,
                    'limit_text': limit_text,
                    'limit': limit}
                if field in self.settings:
                    self.settings[field].update(data)
                else:
                    self.settings[field] = data

            sort_list = (
                tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
                + (tuple() if params.get('format_sort_force', False)
                   else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
                + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

            for item in sort_list:
                match = re.match(self.regex, item)
                if match is None:
                    raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
                field = match.group('field')
                if field is None:
                    continue
                if self._get_field_setting(field, 'type') == 'alias':
                    alias, field = field, self._get_field_setting(field, 'field')
                    if alias not in ('format_id', 'preference', 'language_preference'):
                        self.ydl.deprecation_warning(
                            f'Format sorting alias {alias} is deprecated '
                            f'and may be removed in a future version. Please use {field} instead')
                reverse = match.group('reverse') is not None
                closest = match.group('separator') == '~'
                limit_text = match.group('limit')

                has_limit = limit_text is not None
                has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
                has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

                # Expand combined fields; a single limit (or ':'-separated
                # limits) is distributed over the expanded fields
                fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
                limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
                limit_count = len(limits)
                for (i, f) in enumerate(fields):
                    add_item(f, reverse, closest,
                             limits[i] if i < limit_count
                             else limits[0] if has_limit and not has_multiple_limits
                             else None)

        def print_verbose_info(self, write_debug):
            """Write the user/extractor/effective sort orders to the debug log."""
            if self._sort_user:
                write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
            if self._sort_extractor:
                write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
            write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
                '+' if self._get_field_setting(field, 'reverse') else '', field,
                '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                              self._get_field_setting(field, 'limit_text'),
                              self._get_field_setting(field, 'limit'))
                if self._get_field_setting(field, 'limit_text') is not None else '')
                for field in self._order if self._get_field_setting(field, 'visible')]))

        def _calculate_field_preference_from_value(self, format, field, type, value):
            """Map one raw field value to a comparable tuple for sorting.

            The leading element groups values (missing < strings < numbers);
            the remaining elements order within the group, honoring the
            field's reverse/closest/limit modifiers.
            """
            reverse = self._get_field_setting(field, 'reverse')
            closest = self._get_field_setting(field, 'closest')
            limit = self._get_field_setting(field, 'limit')

            if type == 'extractor':
                maximum = self._get_field_setting(field, 'max')
                if value is None or (maximum is not None and value >= maximum):
                    value = -1
            elif type == 'boolean':
                in_list = self._get_field_setting(field, 'in_list')
                not_in_list = self._get_field_setting(field, 'not_in_list')
                value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
            elif type == 'ordered':
                value = self._resolve_field_value(field, value, True)

            # try to convert to number
            val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
            is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
            if is_num:
                value = val_num

            return ((-10, 0) if value is None
                    else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                    else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                    else (0, value, 0) if not reverse and (limit is None or value <= limit)
                    else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                    else (-1, value, 0))

        def _calculate_field_preference(self, format, field):
            """Compute the sort tuple of *field* for one format dict."""
            type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
            get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
            if type == 'multiple':
                type = 'field'  # Only 'field' is allowed in multiple for now
                actual_fields = self._get_field_setting(field, 'field')

                value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
            else:
                value = get_value(field)
            return self._calculate_field_preference_from_value(format, field, type, value)

        def calculate_preference(self, format):
            """Return the complete sort key for *format*, filling in derived fields first."""
            # Determine missing protocol
            if not format.get('protocol'):
                format['protocol'] = determine_protocol(format)

            # Determine missing ext
            if not format.get('ext') and 'url' in format:
                format['ext'] = determine_ext(format['url'])
            if format.get('vcodec') == 'none':
                format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
                format['video_ext'] = 'none'
            else:
                format['video_ext'] = format['ext']
                format['audio_ext'] = 'none'
            # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
            #    format['preference'] = -1000

            # Determine missing bitrates
            if format.get('tbr') is None:
                if format.get('vbr') is not None and format.get('abr') is not None:
                    format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
            else:
                if format.get('vcodec') != 'none' and format.get('vbr') is None:
                    format['vbr'] = format.get('tbr') - format.get('abr', 0)
                if format.get('acodec') != 'none' and format.get('abr') is None:
                    format['abr'] = format.get('tbr') - format.get('vbr', 0)

            return tuple(self._calculate_field_preference(format, field) for field in self._order)
1877
1878 def _sort_formats(self, formats, field_preference=[]):
1879 if not formats:
1880 return
1881 format_sort = self.FormatSort(self, field_preference)
1882 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1883
1884 def _check_formats(self, formats, video_id):
1885 if formats:
1886 formats[:] = filter(
1887 lambda f: self._is_valid_url(
1888 f['url'], video_id,
1889 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1890 formats)
1891
1892 @staticmethod
1893 def _remove_duplicate_formats(formats):
1894 format_urls = set()
1895 unique_formats = []
1896 for f in formats:
1897 if f['url'] not in format_urls:
1898 format_urls.add(f['url'])
1899 unique_formats.append(f)
1900 formats[:] = unique_formats
1901
1902 def _is_valid_url(self, url, video_id, item='video', headers={}):
1903 url = self._proto_relative_url(url, scheme='http:')
1904 # For now assume non HTTP(S) URLs always valid
1905 if not (url.startswith('http://') or url.startswith('https://')):
1906 return True
1907 try:
1908 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1909 return True
1910 except ExtractorError as e:
1911 self.to_screen(
1912 '%s: %s URL is invalid, skipping: %s'
1913 % (video_id, item, error_to_compat_str(e.cause)))
1914 return False
1915
1916 def http_scheme(self):
1917 """ Either "http:" or "https:", depending on the user's preferences """
1918 return (
1919 'http:'
1920 if self.get_param('prefer_insecure', False)
1921 else 'https:')
1922
1923 def _proto_relative_url(self, url, scheme=None):
1924 if url is None:
1925 return url
1926 if url.startswith('//'):
1927 if scheme is None:
1928 scheme = self.http_scheme()
1929 return scheme + url
1930 else:
1931 return url
1932
1933 def _sleep(self, timeout, video_id, msg_template=None):
1934 if msg_template is None:
1935 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1936 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1937 self.to_screen(msg)
1938 time.sleep(timeout)
1939
1940 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1941 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1942 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1943 manifest = self._download_xml(
1944 manifest_url, video_id, 'Downloading f4m manifest',
1945 'Unable to download f4m manifest',
1946 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1947 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1948 transform_source=transform_source,
1949 fatal=fatal, data=data, headers=headers, query=query)
1950
1951 if manifest is False:
1952 return []
1953
1954 return self._parse_f4m_formats(
1955 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1956 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1957
1958 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1959 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1960 fatal=True, m3u8_id=None):
1961 if not isinstance(manifest, compat_etree_Element) and not fatal:
1962 return []
1963
1964 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1965 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1966 if akamai_pv is not None and ';' in akamai_pv.text:
1967 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1968 if playerVerificationChallenge.strip() != '':
1969 return []
1970
1971 formats = []
1972 manifest_version = '1.0'
1973 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1974 if not media_nodes:
1975 manifest_version = '2.0'
1976 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1977 # Remove unsupported DRM protected media from final formats
1978 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1979 media_nodes = remove_encrypted_media(media_nodes)
1980 if not media_nodes:
1981 return formats
1982
1983 manifest_base_url = get_base_url(manifest)
1984
1985 bootstrap_info = xpath_element(
1986 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1987 'bootstrap info', default=None)
1988
1989 vcodec = None
1990 mime_type = xpath_text(
1991 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1992 'base URL', default=None)
1993 if mime_type and mime_type.startswith('audio/'):
1994 vcodec = 'none'
1995
1996 for i, media_el in enumerate(media_nodes):
1997 tbr = int_or_none(media_el.attrib.get('bitrate'))
1998 width = int_or_none(media_el.attrib.get('width'))
1999 height = int_or_none(media_el.attrib.get('height'))
2000 format_id = join_nonempty(f4m_id, tbr or i)
2001 # If <bootstrapInfo> is present, the specified f4m is a
2002 # stream-level manifest, and only set-level manifests may refer to
2003 # external resources. See section 11.4 and section 4 of F4M spec
2004 if bootstrap_info is None:
2005 media_url = None
2006 # @href is introduced in 2.0, see section 11.6 of F4M spec
2007 if manifest_version == '2.0':
2008 media_url = media_el.attrib.get('href')
2009 if media_url is None:
2010 media_url = media_el.attrib.get('url')
2011 if not media_url:
2012 continue
2013 manifest_url = (
2014 media_url if media_url.startswith('http://') or media_url.startswith('https://')
2015 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2016 # If media_url is itself a f4m manifest do the recursive extraction
2017 # since bitrates in parent manifest (this one) and media_url manifest
2018 # may differ leading to inability to resolve the format by requested
2019 # bitrate in f4m downloader
2020 ext = determine_ext(manifest_url)
2021 if ext == 'f4m':
2022 f4m_formats = self._extract_f4m_formats(
2023 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2024 transform_source=transform_source, fatal=fatal)
2025 # Sometimes stream-level manifest contains single media entry that
2026 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2027 # At the same time parent's media entry in set-level manifest may
2028 # contain it. We will copy it from parent in such cases.
2029 if len(f4m_formats) == 1:
2030 f = f4m_formats[0]
2031 f.update({
2032 'tbr': f.get('tbr') or tbr,
2033 'width': f.get('width') or width,
2034 'height': f.get('height') or height,
2035 'format_id': f.get('format_id') if not tbr else format_id,
2036 'vcodec': vcodec,
2037 })
2038 formats.extend(f4m_formats)
2039 continue
2040 elif ext == 'm3u8':
2041 formats.extend(self._extract_m3u8_formats(
2042 manifest_url, video_id, 'mp4', preference=preference,
2043 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2044 continue
2045 formats.append({
2046 'format_id': format_id,
2047 'url': manifest_url,
2048 'manifest_url': manifest_url,
2049 'ext': 'flv' if bootstrap_info is not None else None,
2050 'protocol': 'f4m',
2051 'tbr': tbr,
2052 'width': width,
2053 'height': height,
2054 'vcodec': vcodec,
2055 'preference': preference,
2056 'quality': quality,
2057 })
2058 return formats
2059
2060 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2061 return {
2062 'format_id': join_nonempty(m3u8_id, 'meta'),
2063 'url': m3u8_url,
2064 'ext': ext,
2065 'protocol': 'm3u8',
2066 'preference': preference - 100 if preference else -100,
2067 'quality': quality,
2068 'resolution': 'multiple',
2069 'format_note': 'Quality selection URL',
2070 }
2071
2072 def _report_ignoring_subs(self, name):
2073 self.report_warning(bug_reports_message(
2074 f'Ignoring subtitle tracks found in the {name} manifest; '
2075 'if any subtitle tracks are missing,'
2076 ), only_once=True)
2077
2078 def _extract_m3u8_formats(self, *args, **kwargs):
2079 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2080 if subs:
2081 self._report_ignoring_subs('HLS')
2082 return fmts
2083
2084 def _extract_m3u8_formats_and_subtitles(
2085 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2086 preference=None, quality=None, m3u8_id=None, note=None,
2087 errnote=None, fatal=True, live=False, data=None, headers={},
2088 query={}):
2089
2090 res = self._download_webpage_handle(
2091 m3u8_url, video_id,
2092 note='Downloading m3u8 information' if note is None else note,
2093 errnote='Failed to download m3u8 information' if errnote is None else errnote,
2094 fatal=fatal, data=data, headers=headers, query=query)
2095
2096 if res is False:
2097 return [], {}
2098
2099 m3u8_doc, urlh = res
2100 m3u8_url = urlh.geturl()
2101
2102 return self._parse_m3u8_formats_and_subtitles(
2103 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2104 preference=preference, quality=quality, m3u8_id=m3u8_id,
2105 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2106 headers=headers, query=query, video_id=video_id)
2107
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """Parse an HLS playlist document into (formats, subtitles).

        Media playlists are returned as a single format; master playlists are
        expanded into one format per variant stream and per audio/video
        rendition, with subtitle renditions collected into the subtitles dict
        (language -> list of {url, ext, ...}).
        """
        formats, subtitles = [], {}

        # Detect DRM schemes that the downloader cannot handle
        has_drm = re.search('|'.join([
            r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
            r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
        ]), m3u8_doc)

        def format_url(url):
            # Resolve a possibly-relative playlist reference against m3u8_url
            return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                # One format index per discontinuity-separated section of the
                # media playlist (downloading it if only the URL is known)
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                # Splitting disabled: a single, unindexed format per playlist
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        groups = {}
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Handle one #EXT-X-MEDIA line: register the rendition in its
            # group and emit a subtitle entry or an audio/video format
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # A non-comment line right after EXT-X-STREAM-INF is the
                # variant stream's playlist URI
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected, for example, [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        # Mirror the HLS format as a plain-HTTP download
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles
2328
2329 def _extract_m3u8_vod_duration(
2330 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2331
2332 m3u8_vod = self._download_webpage(
2333 m3u8_vod_url, video_id,
2334 note='Downloading m3u8 VOD manifest' if note is None else note,
2335 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2336 fatal=False, data=data, headers=headers, query=query)
2337
2338 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2339
2340 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2341 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2342 return None
2343
2344 return int(sum(
2345 float(line[len('#EXTINF:'):].split(',')[0])
2346 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2347
2348 @staticmethod
2349 def _xpath_ns(path, namespace=None):
2350 if not namespace:
2351 return path
2352 out = []
2353 for c in path.split('/'):
2354 if not c or c == '.':
2355 out.append(c)
2356 else:
2357 out.append('{%s}%s' % (namespace, c))
2358 return '/'.join(out)
2359
2360 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2361 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2362
2363 if smil is False:
2364 assert not fatal
2365 return [], {}
2366
2367 namespace = self._parse_smil_namespace(smil)
2368
2369 fmts = self._parse_smil_formats(
2370 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2371 subs = self._parse_smil_subtitles(
2372 smil, namespace=namespace)
2373
2374 return fmts, subs
2375
2376 def _extract_smil_formats(self, *args, **kwargs):
2377 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2378 if subs:
2379 self._report_ignoring_subs('SMIL')
2380 return fmts
2381
2382 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2383 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2384 if smil is False:
2385 return {}
2386 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2387
2388 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2389 return self._download_xml(
2390 smil_url, video_id, 'Downloading SMIL file',
2391 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2392
2393 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2394 namespace = self._parse_smil_namespace(smil)
2395
2396 formats = self._parse_smil_formats(
2397 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2398 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2399
2400 video_id = os.path.splitext(url_basename(smil_url))[0]
2401 title = None
2402 description = None
2403 upload_date = None
2404 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2405 name = meta.attrib.get('name')
2406 content = meta.attrib.get('content')
2407 if not name or not content:
2408 continue
2409 if not title and name == 'title':
2410 title = content
2411 elif not description and name in ('description', 'abstract'):
2412 description = content
2413 elif not upload_date and name == 'date':
2414 upload_date = unified_strdate(content)
2415
2416 thumbnails = [{
2417 'id': image.get('type'),
2418 'url': image.get('src'),
2419 'width': int_or_none(image.get('width')),
2420 'height': int_or_none(image.get('height')),
2421 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2422
2423 return {
2424 'id': video_id,
2425 'title': title or video_id,
2426 'description': description,
2427 'upload_date': upload_date,
2428 'thumbnails': thumbnails,
2429 'formats': formats,
2430 'subtitles': subtitles,
2431 }
2432
2433 def _parse_smil_namespace(self, smil):
2434 return self._search_regex(
2435 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2436
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract formats from a parsed SMIL document.

        Recognizes RTMP streams, HLS (m3u8), HDS (f4m), DASH (mpd) and ISM
        manifests as well as plain HTTP <video>/<audio> sources, plus
        <imagestream> storyboard tracks.
        """
        # A <meta base=...> (or httpBase) in <head> overrides the manifest
        # URL as the base for resolving relative media URLs
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        # per-protocol counters, used as fallback format_id suffixes when no
        # bitrate is declared
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0
        imgs_count = 0

        srcs = set()  # dedupe media entries by src URL
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.add(src)

            # both hyphenated and camelCase attribute spellings occur in the wild
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                # optional caller hook to rewrite the RTMP URL/play path
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # a single-rendition playlist carries no variant info of its
                # own, so fill in what the SMIL medium declared
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    # default Adobe HDS query parameters
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                # NOTE(review): validity is checked against the raw src, not
                # the resolved src_url — confirm whether that is intentional
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        # <imagestream> elements describe storyboard (thumbnail sheet) tracks
        for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.add(src)

            imgs_count += 1
            formats.append({
                'format_id': 'imagestream-%d' % (imgs_count),
                'url': src,
                'ext': mimetype2ext(medium.get('type')),
                'acodec': 'none',
                'vcodec': 'none',
                'width': int_or_none(medium.get('width')),
                'height': int_or_none(medium.get('height')),
                'format_note': 'SMIL storyboards',
            })

        return formats
2550
2551 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2552 urls = []
2553 subtitles = {}
2554 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2555 src = textstream.get('src')
2556 if not src or src in urls:
2557 continue
2558 urls.append(src)
2559 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2560 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2561 subtitles.setdefault(lang, []).append({
2562 'url': src,
2563 'ext': ext,
2564 })
2565 return subtitles
2566
2567 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2568 xspf = self._download_xml(
2569 xspf_url, playlist_id, 'Downloading xpsf playlist',
2570 'Unable to download xspf manifest', fatal=fatal)
2571 if xspf is False:
2572 return []
2573 return self._parse_xspf(
2574 xspf, playlist_id, xspf_url=xspf_url,
2575 xspf_base_url=base_url(xspf_url))
2576
2577 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2578 NS_MAP = {
2579 'xspf': 'http://xspf.org/ns/0/',
2580 's1': 'http://static.streamone.nl/player/ns/0',
2581 }
2582
2583 entries = []
2584 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2585 title = xpath_text(
2586 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2587 description = xpath_text(
2588 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2589 thumbnail = xpath_text(
2590 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2591 duration = float_or_none(
2592 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2593
2594 formats = []
2595 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2596 format_url = urljoin(xspf_base_url, location.text)
2597 if not format_url:
2598 continue
2599 formats.append({
2600 'url': format_url,
2601 'manifest_url': xspf_url,
2602 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2603 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2604 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2605 })
2606 self._sort_formats(formats)
2607
2608 entries.append({
2609 'id': playlist_id,
2610 'title': title,
2611 'description': description,
2612 'thumbnail': thumbnail,
2613 'duration': duration,
2614 'formats': formats,
2615 })
2616 return entries
2617
2618 def _extract_mpd_formats(self, *args, **kwargs):
2619 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2620 if subs:
2621 self._report_ignoring_subs('DASH')
2622 return fmts
2623
2624 def _extract_mpd_formats_and_subtitles(
2625 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2626 fatal=True, data=None, headers={}, query={}):
2627 res = self._download_xml_handle(
2628 mpd_url, video_id,
2629 note='Downloading MPD manifest' if note is None else note,
2630 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2631 fatal=fatal, data=data, headers=headers, query=query)
2632 if res is False:
2633 return [], {}
2634 mpd_doc, urlh = res
2635 if mpd_doc is None:
2636 return [], {}
2637 mpd_base_url = base_url(urlh.geturl())
2638
2639 return self._parse_mpd_formats_and_subtitles(
2640 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2641
2642 def _parse_mpd_formats(self, *args, **kwargs):
2643 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2644 if subs:
2645 self._report_ignoring_subs('DASH')
2646 return fmts
2647
2648 def _parse_mpd_formats_and_subtitles(
2649 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2650 """
2651 Parse formats from MPD manifest.
2652 References:
2653 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2654 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2655 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2656 """
2657 if not self.get_param('dynamic_mpd', True):
2658 if mpd_doc.get('type') == 'dynamic':
2659 return [], {}
2660
2661 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2662
2663 def _add_ns(path):
2664 return self._xpath_ns(path, namespace)
2665
2666 def is_drm_protected(element):
2667 return element.find(_add_ns('ContentProtection')) is not None
2668
2669 def extract_multisegment_info(element, ms_parent_info):
2670 ms_info = ms_parent_info.copy()
2671
2672 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2673 # common attributes and elements. We will only extract relevant
2674 # for us.
2675 def extract_common(source):
2676 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2677 if segment_timeline is not None:
2678 s_e = segment_timeline.findall(_add_ns('S'))
2679 if s_e:
2680 ms_info['total_number'] = 0
2681 ms_info['s'] = []
2682 for s in s_e:
2683 r = int(s.get('r', 0))
2684 ms_info['total_number'] += 1 + r
2685 ms_info['s'].append({
2686 't': int(s.get('t', 0)),
2687 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2688 'd': int(s.attrib['d']),
2689 'r': r,
2690 })
2691 start_number = source.get('startNumber')
2692 if start_number:
2693 ms_info['start_number'] = int(start_number)
2694 timescale = source.get('timescale')
2695 if timescale:
2696 ms_info['timescale'] = int(timescale)
2697 segment_duration = source.get('duration')
2698 if segment_duration:
2699 ms_info['segment_duration'] = float(segment_duration)
2700
2701 def extract_Initialization(source):
2702 initialization = source.find(_add_ns('Initialization'))
2703 if initialization is not None:
2704 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2705
2706 segment_list = element.find(_add_ns('SegmentList'))
2707 if segment_list is not None:
2708 extract_common(segment_list)
2709 extract_Initialization(segment_list)
2710 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2711 if segment_urls_e:
2712 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2713 else:
2714 segment_template = element.find(_add_ns('SegmentTemplate'))
2715 if segment_template is not None:
2716 extract_common(segment_template)
2717 media = segment_template.get('media')
2718 if media:
2719 ms_info['media'] = media
2720 initialization = segment_template.get('initialization')
2721 if initialization:
2722 ms_info['initialization'] = initialization
2723 else:
2724 extract_Initialization(segment_template)
2725 return ms_info
2726
2727 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2728 formats, subtitles = [], {}
2729 stream_numbers = collections.defaultdict(int)
2730 for period in mpd_doc.findall(_add_ns('Period')):
2731 period_duration = parse_duration(period.get('duration')) or mpd_duration
2732 period_ms_info = extract_multisegment_info(period, {
2733 'start_number': 1,
2734 'timescale': 1,
2735 })
2736 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2737 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2738 for representation in adaptation_set.findall(_add_ns('Representation')):
2739 representation_attrib = adaptation_set.attrib.copy()
2740 representation_attrib.update(representation.attrib)
2741 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2742 mime_type = representation_attrib['mimeType']
2743 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2744
2745 codecs = parse_codecs(representation_attrib.get('codecs', ''))
2746 if content_type not in ('video', 'audio', 'text'):
2747 if mime_type == 'image/jpeg':
2748 content_type = mime_type
2749 elif codecs['vcodec'] != 'none':
2750 content_type = 'video'
2751 elif codecs['acodec'] != 'none':
2752 content_type = 'audio'
2753 elif codecs.get('tcodec', 'none') != 'none':
2754 content_type = 'text'
2755 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2756 content_type = 'text'
2757 else:
2758 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2759 continue
2760
2761 base_url = ''
2762 for element in (representation, adaptation_set, period, mpd_doc):
2763 base_url_e = element.find(_add_ns('BaseURL'))
2764 if base_url_e is not None:
2765 base_url = base_url_e.text + base_url
2766 if re.match(r'^https?://', base_url):
2767 break
2768 if mpd_base_url and base_url.startswith('/'):
2769 base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2770 elif mpd_base_url and not re.match(r'^https?://', base_url):
2771 if not mpd_base_url.endswith('/'):
2772 mpd_base_url += '/'
2773 base_url = mpd_base_url + base_url
2774 representation_id = representation_attrib.get('id')
2775 lang = representation_attrib.get('lang')
2776 url_el = representation.find(_add_ns('BaseURL'))
2777 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2778 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2779 if representation_id is not None:
2780 format_id = representation_id
2781 else:
2782 format_id = content_type
2783 if mpd_id:
2784 format_id = mpd_id + '-' + format_id
2785 if content_type in ('video', 'audio'):
2786 f = {
2787 'format_id': format_id,
2788 'manifest_url': mpd_url,
2789 'ext': mimetype2ext(mime_type),
2790 'width': int_or_none(representation_attrib.get('width')),
2791 'height': int_or_none(representation_attrib.get('height')),
2792 'tbr': float_or_none(bandwidth, 1000),
2793 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2794 'fps': int_or_none(representation_attrib.get('frameRate')),
2795 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2796 'format_note': 'DASH %s' % content_type,
2797 'filesize': filesize,
2798 'container': mimetype2ext(mime_type) + '_dash',
2799 **codecs
2800 }
2801 elif content_type == 'text':
2802 f = {
2803 'ext': mimetype2ext(mime_type),
2804 'manifest_url': mpd_url,
2805 'filesize': filesize,
2806 }
2807 elif content_type == 'image/jpeg':
2808 # See test case in VikiIE
2809 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2810 f = {
2811 'format_id': format_id,
2812 'ext': 'mhtml',
2813 'manifest_url': mpd_url,
2814 'format_note': 'DASH storyboards (jpeg)',
2815 'acodec': 'none',
2816 'vcodec': 'none',
2817 }
2818 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2819 f['has_drm'] = True
2820 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2821
2822 def prepare_template(template_name, identifiers):
2823 tmpl = representation_ms_info[template_name]
2824 # First of, % characters outside $...$ templates
2825 # must be escaped by doubling for proper processing
2826 # by % operator string formatting used further (see
2827 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2828 t = ''
2829 in_template = False
2830 for c in tmpl:
2831 t += c
2832 if c == '$':
2833 in_template = not in_template
2834 elif c == '%' and not in_template:
2835 t += c
2836 # Next, $...$ templates are translated to their
2837 # %(...) counterparts to be used with % operator
2838 if representation_id is not None:
2839 t = t.replace('$RepresentationID$', representation_id)
2840 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2841 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2842 t.replace('$$', '$')
2843 return t
2844
2845 # @initialization is a regular template like @media one
2846 # so it should be handled just the same way (see
2847 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2848 if 'initialization' in representation_ms_info:
2849 initialization_template = prepare_template(
2850 'initialization',
2851 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2852 # $Time$ shall not be included for @initialization thus
2853 # only $Bandwidth$ remains
2854 ('Bandwidth', ))
2855 representation_ms_info['initialization_url'] = initialization_template % {
2856 'Bandwidth': bandwidth,
2857 }
2858
2859 def location_key(location):
2860 return 'url' if re.match(r'^https?://', location) else 'path'
2861
2862 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2863
2864 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2865 media_location_key = location_key(media_template)
2866
2867 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2868 # can't be used at the same time
2869 if '%(Number' in media_template and 's' not in representation_ms_info:
2870 segment_duration = None
2871 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2872 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2873 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2874 representation_ms_info['fragments'] = [{
2875 media_location_key: media_template % {
2876 'Number': segment_number,
2877 'Bandwidth': bandwidth,
2878 },
2879 'duration': segment_duration,
2880 } for segment_number in range(
2881 representation_ms_info['start_number'],
2882 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2883 else:
2884 # $Number*$ or $Time$ in media template with S list available
2885 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2886 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2887 representation_ms_info['fragments'] = []
2888 segment_time = 0
2889 segment_d = None
2890 segment_number = representation_ms_info['start_number']
2891
2892 def add_segment_url():
2893 segment_url = media_template % {
2894 'Time': segment_time,
2895 'Bandwidth': bandwidth,
2896 'Number': segment_number,
2897 }
2898 representation_ms_info['fragments'].append({
2899 media_location_key: segment_url,
2900 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2901 })
2902
2903 for num, s in enumerate(representation_ms_info['s']):
2904 segment_time = s.get('t') or segment_time
2905 segment_d = s['d']
2906 add_segment_url()
2907 segment_number += 1
2908 for r in range(s.get('r', 0)):
2909 segment_time += segment_d
2910 add_segment_url()
2911 segment_number += 1
2912 segment_time += segment_d
2913 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2914 # No media template
2915 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2916 # or any YouTube dashsegments video
2917 fragments = []
2918 segment_index = 0
2919 timescale = representation_ms_info['timescale']
2920 for s in representation_ms_info['s']:
2921 duration = float_or_none(s['d'], timescale)
2922 for r in range(s.get('r', 0) + 1):
2923 segment_uri = representation_ms_info['segment_urls'][segment_index]
2924 fragments.append({
2925 location_key(segment_uri): segment_uri,
2926 'duration': duration,
2927 })
2928 segment_index += 1
2929 representation_ms_info['fragments'] = fragments
2930 elif 'segment_urls' in representation_ms_info:
2931 # Segment URLs with no SegmentTimeline
2932 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2933 # https://github.com/ytdl-org/youtube-dl/pull/14844
2934 fragments = []
2935 segment_duration = float_or_none(
2936 representation_ms_info['segment_duration'],
2937 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2938 for segment_url in representation_ms_info['segment_urls']:
2939 fragment = {
2940 location_key(segment_url): segment_url,
2941 }
2942 if segment_duration:
2943 fragment['duration'] = segment_duration
2944 fragments.append(fragment)
2945 representation_ms_info['fragments'] = fragments
2946 # If there is a fragments key available then we correctly recognized fragmented media.
2947 # Otherwise we will assume unfragmented media with direct access. Technically, such
2948 # assumption is not necessarily correct since we may simply have no support for
2949 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2950 if 'fragments' in representation_ms_info:
2951 f.update({
2952 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2953 'url': mpd_url or base_url,
2954 'fragment_base_url': base_url,
2955 'fragments': [],
2956 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2957 })
2958 if 'initialization_url' in representation_ms_info:
2959 initialization_url = representation_ms_info['initialization_url']
2960 if not f.get('url'):
2961 f['url'] = initialization_url
2962 f['fragments'].append({location_key(initialization_url): initialization_url})
2963 f['fragments'].extend(representation_ms_info['fragments'])
2964 else:
2965 # Assuming direct URL to unfragmented media.
2966 f['url'] = base_url
2967 if content_type in ('video', 'audio', 'image/jpeg'):
2968 f['manifest_stream_number'] = stream_numbers[f['url']]
2969 stream_numbers[f['url']] += 1
2970 formats.append(f)
2971 elif content_type == 'text':
2972 subtitles.setdefault(lang or 'und', []).append(f)
2973
2974 return formats, subtitles
2975
2976 def _extract_ism_formats(self, *args, **kwargs):
2977 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2978 if subs:
2979 self._report_ignoring_subs('ISM')
2980 return fmts
2981
2982 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2983 res = self._download_xml_handle(
2984 ism_url, video_id,
2985 note='Downloading ISM manifest' if note is None else note,
2986 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2987 fatal=fatal, data=data, headers=headers, query=query)
2988 if res is False:
2989 return [], {}
2990 ism_doc, urlh = res
2991 if ism_doc is None:
2992 return [], {}
2993
2994 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2995
2996 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2997 """
2998 Parse formats from ISM manifest.
2999 References:
3000 1. [MS-SSTR]: Smooth Streaming Protocol,
3001 https://msdn.microsoft.com/en-us/library/ff469518.aspx
3002 """
3003 if ism_doc.get('IsLive') == 'TRUE':
3004 return [], {}
3005
3006 duration = int(ism_doc.attrib['Duration'])
3007 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3008
3009 formats = []
3010 subtitles = {}
3011 for stream in ism_doc.findall('StreamIndex'):
3012 stream_type = stream.get('Type')
3013 if stream_type not in ('video', 'audio', 'text'):
3014 continue
3015 url_pattern = stream.attrib['Url']
3016 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3017 stream_name = stream.get('Name')
3018 stream_language = stream.get('Language', 'und')
3019 for track in stream.findall('QualityLevel'):
3020 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
3021 # TODO: add support for WVC1 and WMAP
3022 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
3023 self.report_warning('%s is not a supported codec' % fourcc)
3024 continue
3025 tbr = int(track.attrib['Bitrate']) // 1000
3026 # [1] does not mention Width and Height attributes. However,
3027 # they're often present while MaxWidth and MaxHeight are
3028 # missing, so should be used as fallbacks
3029 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3030 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3031 sampling_rate = int_or_none(track.get('SamplingRate'))
3032
3033 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3034 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3035
3036 fragments = []
3037 fragment_ctx = {
3038 'time': 0,
3039 }
3040 stream_fragments = stream.findall('c')
3041 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3042 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3043 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3044 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3045 if not fragment_ctx['duration']:
3046 try:
3047 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3048 except IndexError:
3049 next_fragment_time = duration
3050 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3051 for _ in range(fragment_repeat):
3052 fragments.append({
3053 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
3054 'duration': fragment_ctx['duration'] / stream_timescale,
3055 })
3056 fragment_ctx['time'] += fragment_ctx['duration']
3057
3058 if stream_type == 'text':
3059 subtitles.setdefault(stream_language, []).append({
3060 'ext': 'ismt',
3061 'protocol': 'ism',
3062 'url': ism_url,
3063 'manifest_url': ism_url,
3064 'fragments': fragments,
3065 '_download_params': {
3066 'stream_type': stream_type,
3067 'duration': duration,
3068 'timescale': stream_timescale,
3069 'fourcc': fourcc,
3070 'language': stream_language,
3071 'codec_private_data': track.get('CodecPrivateData'),
3072 }
3073 })
3074 elif stream_type in ('video', 'audio'):
3075 formats.append({
3076 'format_id': join_nonempty(ism_id, stream_name, tbr),
3077 'url': ism_url,
3078 'manifest_url': ism_url,
3079 'ext': 'ismv' if stream_type == 'video' else 'isma',
3080 'width': width,
3081 'height': height,
3082 'tbr': tbr,
3083 'asr': sampling_rate,
3084 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3085 'acodec': 'none' if stream_type == 'video' else fourcc,
3086 'protocol': 'ism',
3087 'fragments': fragments,
3088 'has_drm': ism_doc.find('Protection') is not None,
3089 '_download_params': {
3090 'stream_type': stream_type,
3091 'duration': duration,
3092 'timescale': stream_timescale,
3093 'width': width or 0,
3094 'height': height or 0,
3095 'fourcc': fourcc,
3096 'language': stream_language,
3097 'codec_private_data': track.get('CodecPrivateData'),
3098 'sampling_rate': sampling_rate,
3099 'channels': int_or_none(track.get('Channels', 2)),
3100 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3101 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3102 },
3103 })
3104 return formats, subtitles
3105
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
        """Extract media entries from HTML5 <video>/<audio> tags in a webpage.

        Also matches amp-video/amp-audio and dl8-video/dl8-audio variants.
        Returns a list of dicts with 'formats', 'subtitles' and 'thumbnail'.
        """
        def absolute_url(item_url):
            # resolve against the page URL
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # map a "type" attribute (mimetype plus optional codecs) to ext/codec fields
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # NOTE(review): mutable default is never mutated here, but fragile
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we wll include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # self-closing tags first (no inner content)
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                # scan nested <source> tags for additional renditions
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # fall back to parsing e.g. "720p" out of the label
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # values from the format dict win over attribute-derived ones
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # <track> tags carry subtitles/captions
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3228
3229 def _extract_akamai_formats(self, *args, **kwargs):
3230 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3231 if subs:
3232 self._report_ignoring_subs('akamai')
3233 return fmts
3234
3235 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3236 signed = 'hdnea=' in manifest_url
3237 if not signed:
3238 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3239 manifest_url = re.sub(
3240 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3241 '', manifest_url).strip('?')
3242
3243 formats = []
3244 subtitles = {}
3245
3246 hdcore_sign = 'hdcore=3.7.0'
3247 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3248 hds_host = hosts.get('hds')
3249 if hds_host:
3250 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3251 if 'hdcore=' not in f4m_url:
3252 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3253 f4m_formats = self._extract_f4m_formats(
3254 f4m_url, video_id, f4m_id='hds', fatal=False)
3255 for entry in f4m_formats:
3256 entry.update({'extra_param_to_segment_url': hdcore_sign})
3257 formats.extend(f4m_formats)
3258
3259 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3260 hls_host = hosts.get('hls')
3261 if hls_host:
3262 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3263 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3264 m3u8_url, video_id, 'mp4', 'm3u8_native',
3265 m3u8_id='hls', fatal=False)
3266 formats.extend(m3u8_formats)
3267 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3268
3269 http_host = hosts.get('http')
3270 if http_host and m3u8_formats and not signed:
3271 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3272 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3273 qualities_length = len(qualities)
3274 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3275 i = 0
3276 for f in m3u8_formats:
3277 if f['vcodec'] != 'none':
3278 for protocol in ('http', 'https'):
3279 http_f = f.copy()
3280 del http_f['manifest_url']
3281 http_url = re.sub(
3282 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3283 http_f.update({
3284 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3285 'url': http_url,
3286 'protocol': protocol,
3287 })
3288 formats.append(http_f)
3289 i += 1
3290
3291 return formats, subtitles
3292
3293 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3294 query = compat_urlparse.urlparse(url).query
3295 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3296 mobj = re.search(
3297 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3298 url_base = mobj.group('url')
3299 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3300 formats = []
3301
3302 def manifest_url(manifest):
3303 m_url = '%s/%s' % (http_base_url, manifest)
3304 if query:
3305 m_url += '?%s' % query
3306 return m_url
3307
3308 if 'm3u8' not in skip_protocols:
3309 formats.extend(self._extract_m3u8_formats(
3310 manifest_url('playlist.m3u8'), video_id, 'mp4',
3311 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3312 if 'f4m' not in skip_protocols:
3313 formats.extend(self._extract_f4m_formats(
3314 manifest_url('manifest.f4m'),
3315 video_id, f4m_id='hds', fatal=False))
3316 if 'dash' not in skip_protocols:
3317 formats.extend(self._extract_mpd_formats(
3318 manifest_url('manifest.mpd'),
3319 video_id, mpd_id='dash', fatal=False))
3320 if re.search(r'(?:/smil:|\.smil)', url_base):
3321 if 'smil' not in skip_protocols:
3322 rtmp_formats = self._extract_smil_formats(
3323 manifest_url('jwplayer.smil'),
3324 video_id, fatal=False)
3325 for rtmp_format in rtmp_formats:
3326 rtsp_format = rtmp_format.copy()
3327 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3328 del rtsp_format['play_path']
3329 del rtsp_format['ext']
3330 rtsp_format.update({
3331 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3332 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3333 'protocol': 'rtsp',
3334 })
3335 formats.extend([rtmp_format, rtsp_format])
3336 else:
3337 for protocol in ('rtmp', 'rtsp'):
3338 if protocol not in skip_protocols:
3339 formats.append({
3340 'url': '%s:%s' % (protocol, url_base),
3341 'format_id': protocol,
3342 'protocol': protocol,
3343 })
3344 return formats
3345
3346 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3347 mobj = re.search(
3348 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3349 webpage)
3350 if mobj:
3351 try:
3352 jwplayer_data = self._parse_json(mobj.group('options'),
3353 video_id=video_id,
3354 transform_source=transform_source)
3355 except ExtractorError:
3356 pass
3357 else:
3358 if isinstance(jwplayer_data, dict):
3359 return jwplayer_data
3360
3361 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3362 jwplayer_data = self._find_jwplayer_data(
3363 webpage, video_id, transform_source=js_to_json)
3364 return self._parse_jwplayer_data(
3365 jwplayer_data, video_id, *args, **kwargs)
3366
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a JWPlayer setup/config dict into yt-dlp info dict(s).

        @param jwplayer_data  Parsed JWPlayer options (legacy flattened forms
                              are normalized below)
        @param require_title  When true, a playlist item without a title raises
                              KeyError; otherwise title may be None
        @returns a single info dict when the playlist has exactly one entry,
                 otherwise a playlist result of all entries
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Collect caption/subtitle tracks; other track kinds are ignored
            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    # fall back to 'en' when the track carries no label
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            # A single YouTube source is delegated to the YouTube extractor
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
3434
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a JWPlayer 'sources' list into a list of format dicts.

        Each source is dispatched on its MIME type/extension: HLS, DASH and
        SMIL manifests are expanded via the corresponding helpers; audio-only
        and progressive sources become single formats; rtmp:// URLs are split
        into url + play_path. Duplicate source URLs are skipped.
        """
        urls = []  # source URLs seen so far, for de-duplication
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
3498
3499 def _live_title(self, name):
3500 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3501 return name
3502
3503 def _int(self, v, name, fatal=False, **kwargs):
3504 res = int_or_none(v, **kwargs)
3505 if 'get_attr' in kwargs:
3506 print(getattr(v, kwargs['get_attr']))
3507 if res is None:
3508 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3509 if fatal:
3510 raise ExtractorError(msg)
3511 else:
3512 self.report_warning(msg)
3513 return res
3514
3515 def _float(self, v, name, fatal=False, **kwargs):
3516 res = float_or_none(v, **kwargs)
3517 if res is None:
3518 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3519 if fatal:
3520 raise ExtractorError(msg)
3521 else:
3522 self.report_warning(msg)
3523 return res
3524
3525 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3526 path='/', secure=False, discard=False, rest={}, **kwargs):
3527 cookie = compat_cookiejar_Cookie(
3528 0, name, value, port, port is not None, domain, True,
3529 domain.startswith('.'), path, True, secure, expire_time,
3530 discard, None, None, rest)
3531 self._downloader.cookiejar.set_cookie(cookie)
3532
3533 def _get_cookies(self, url):
3534 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3535 req = sanitized_Request(url)
3536 self._downloader.cookiejar.add_cookie_header(req)
3537 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3538
3539 def _apply_first_set_cookie_header(self, url_handle, cookie):
3540 """
3541 Apply first Set-Cookie header instead of the last. Experimental.
3542
3543 Some sites (e.g. [1-3]) may serve two cookies under the same name
3544 in Set-Cookie header and expect the first (old) one to be set rather
3545 than second (new). However, as of RFC6265 the newer one cookie
3546 should be set into cookie store what actually happens.
3547 We will workaround this issue by resetting the cookie to
3548 the first one manually.
3549 1. https://new.vk.com/
3550 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3551 3. https://learning.oreilly.com/
3552 """
3553 for header, cookies in url_handle.headers.items():
3554 if header.lower() != 'set-cookie':
3555 continue
3556 if sys.version_info[0] >= 3:
3557 cookies = cookies.encode('iso-8859-1')
3558 cookies = cookies.decode('utf-8')
3559 cookie_value = re.search(
3560 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3561 if cookie_value:
3562 value, domain = cookie_value.groups()
3563 self._set_cookie(domain, cookie, value)
3564 break
3565
3566 def get_testcases(self, include_onlymatching=False):
3567 t = getattr(self, '_TEST', None)
3568 if t:
3569 assert not hasattr(self, '_TESTS'), \
3570 '%s has _TEST and _TESTS' % type(self).__name__
3571 tests = [t]
3572 else:
3573 tests = getattr(self, '_TESTS', [])
3574 for t in tests:
3575 if not include_onlymatching and t.get('only_matching', False):
3576 continue
3577 t['name'] = type(self).__name__[:-len('IE')]
3578 yield t
3579
3580 def is_suitable(self, age_limit):
3581 """ Test whether the extractor is generally suitable for the given
3582 age limit (i.e. pornographic sites are not, all others usually are) """
3583
3584 any_restricted = False
3585 for tc in self.get_testcases(include_onlymatching=False):
3586 if tc.get('playlist', []):
3587 tc = tc['playlist'][0]
3588 is_restricted = age_restricted(
3589 tc.get('info_dict', {}).get('age_limit'), age_limit)
3590 if not is_restricted:
3591 return True
3592 any_restricted = any_restricted or is_restricted
3593 return not any_restricted
3594
3595 def extract_subtitles(self, *args, **kwargs):
3596 if (self.get_param('writesubtitles', False)
3597 or self.get_param('listsubtitles')):
3598 return self._get_subtitles(*args, **kwargs)
3599 return {}
3600
3601 def _get_subtitles(self, *args, **kwargs):
3602 raise NotImplementedError('This method must be implemented by subclasses')
3603
    def extract_comments(self, *args, **kwargs):
        """Return a lazy comment-extraction callable, or None when the user
        did not request comments (getcomments parameter unset).

        The returned function drains the _get_comments generator and returns
        {'comments': [...], 'comment_count': ...}; comment_count is None when
        extraction was interrupted before completion.
        """
        if not self.get_param('getcomments'):
            return None
        generator = self._get_comments(*args, **kwargs)

        def extractor():
            comments = []
            # assume interruption until the generator is fully exhausted
            interrupted = True
            try:
                while True:
                    comments.append(next(generator))
            except StopIteration:
                interrupted = False
            except KeyboardInterrupt:
                self.to_screen('Interrupted by user')
            except Exception as e:
                # other errors abort extraction unless ignoreerrors is True
                if self.get_param('ignoreerrors') is not True:
                    raise
                self._downloader.report_error(e)
            comment_count = len(comments)
            self.to_screen(f'Extracted {comment_count} comments')
            return {
                'comments': comments,
                # None signals an incomplete count
                'comment_count': None if interrupted else comment_count
            }
        return extractor
3630
3631 def _get_comments(self, *args, **kwargs):
3632 raise NotImplementedError('This method must be implemented by subclasses')
3633
3634 @staticmethod
3635 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3636 """ Merge subtitle items for one language. Items with duplicated URLs
3637 will be dropped. """
3638 list1_urls = set([item['url'] for item in subtitle_list1])
3639 ret = list(subtitle_list1)
3640 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3641 return ret
3642
3643 @classmethod
3644 def _merge_subtitles(cls, *dicts, target=None):
3645 """ Merge subtitle dictionaries, language by language. """
3646 if target is None:
3647 target = {}
3648 for d in dicts:
3649 for lang, subs in d.items():
3650 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3651 return target
3652
3653 def extract_automatic_captions(self, *args, **kwargs):
3654 if (self.get_param('writeautomaticsub', False)
3655 or self.get_param('listsubtitles')):
3656 return self._get_automatic_captions(*args, **kwargs)
3657 return {}
3658
3659 def _get_automatic_captions(self, *args, **kwargs):
3660 raise NotImplementedError('This method must be implemented by subclasses')
3661
3662 def mark_watched(self, *args, **kwargs):
3663 if not self.get_param('mark_watched', False):
3664 return
3665 if (self._get_login_info()[0] is not None
3666 or self.get_param('cookiefile')
3667 or self.get_param('cookiesfrombrowser')):
3668 self._mark_watched(*args, **kwargs)
3669
3670 def _mark_watched(self, *args, **kwargs):
3671 raise NotImplementedError('This method must be implemented by subclasses')
3672
3673 def geo_verification_headers(self):
3674 headers = {}
3675 geo_verification_proxy = self.get_param('geo_verification_proxy')
3676 if geo_verification_proxy:
3677 headers['Ytdl-request-proxy'] = geo_verification_proxy
3678 return headers
3679
3680 def _generic_id(self, url):
3681 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3682
3683 def _generic_title(self, url):
3684 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3685
3686 @staticmethod
3687 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3688 all_known = all(map(
3689 lambda x: x is not None,
3690 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3691 return (
3692 'private' if is_private
3693 else 'premium_only' if needs_premium
3694 else 'subscriber_only' if needs_subscription
3695 else 'needs_auth' if needs_auth
3696 else 'unlisted' if is_unlisted
3697 else 'public' if all_known
3698 else None)
3699
3700 def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3701 '''
3702 @returns A list of values for the extractor argument given by "key"
3703 or "default" if no such key is present
3704 @param default The default value to return when the key is not present (default: [])
3705 @param casesense When false, the values are converted to lower case
3706 '''
3707 val = traverse_obj(
3708 self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
3709 if val is None:
3710 return [] if default is NO_DEFAULT else default
3711 return list(val) if casesense else [x.lower() for x in val]
3712
3713
3714 class SearchInfoExtractor(InfoExtractor):
3715 """
3716 Base class for paged search queries extractors.
3717 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3718 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3719 """
3720
3721 _MAX_RESULTS = float('inf')
3722
3723 @classmethod
3724 def _make_valid_url(cls):
3725 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3726
3727 def _real_extract(self, query):
3728 prefix, query = self._match_valid_url(query).group('prefix', 'query')
3729 if prefix == '':
3730 return self._get_n_results(query, 1)
3731 elif prefix == 'all':
3732 return self._get_n_results(query, self._MAX_RESULTS)
3733 else:
3734 n = int(prefix)
3735 if n <= 0:
3736 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3737 elif n > self._MAX_RESULTS:
3738 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3739 n = self._MAX_RESULTS
3740 return self._get_n_results(query, n)
3741
3742 def _get_n_results(self, query, n):
3743 """Get a specified number of results for a query.
3744 Either this function or _search_results must be overridden by subclasses """
3745 return self.playlist_result(
3746 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3747 query, query)
3748
3749 def _search_results(self, query):
3750 """Returns an iterator of search results"""
3751 raise NotImplementedError('This method must be implemented by subclasses')
3752
3753 @property
3754 def SEARCH_KEY(self):
3755 return self._SEARCH_KEY