yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import sys
  13 import time
  14 import math
  15
  16 from ..compat import (
  17     compat_cookiejar_Cookie,
  18     compat_cookies_SimpleCookie,
  19     compat_etree_Element,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_http_client,
  23     compat_os_name,
  24     compat_str,
  25     compat_urllib_error,
  26     compat_urllib_parse_unquote,
  27     compat_urllib_parse_urlencode,
  28     compat_urllib_request,
  29     compat_urlparse,
  30     compat_xml_parse_error,
  31 )
  32 from ..downloader import FileDownloader
  33 from ..downloader.f4m import (
  34     get_base_url,
  35     remove_encrypted_media,
  36 )
  37 from ..utils import (
  38     age_restricted,
  39     base_url,
  40     bug_reports_message,
  41     clean_html,
  42     compiled_regex_type,
  43     determine_ext,
  44     determine_protocol,
  45     dict_get,
  46     error_to_compat_str,
  47     extract_attributes,
  48     ExtractorError,
  49     fix_xml_ampersands,
  50     float_or_none,
  51     format_field,
  52     GeoRestrictedError,
  53     GeoUtils,
  54     int_or_none,
  55     js_to_json,
  56     JSON_LD_RE,
  57     mimetype2ext,
  58     network_exceptions,
  59     NO_DEFAULT,
  60     orderedSet,
  61     parse_bitrate,
  62     parse_codecs,
  63     parse_duration,
  64     parse_iso8601,
  65     parse_m3u8_attributes,
  66     parse_resolution,
  67     RegexNotFoundError,
  68     sanitize_filename,
  69     sanitized_Request,
  70     str_or_none,
  71     str_to_int,
  72     strip_or_none,
  73     traverse_obj,
  74     unescapeHTML,
  75     unified_strdate,
  76     unified_timestamp,
  77     update_Request,
  78     update_url_query,
  79     url_basename,
  80     url_or_none,
  81     urljoin,
  82     variadic,
  83     xpath_element,
  84     xpath_text,
  85     xpath_with_ns,
  86 )
  87
  88
  89 class InfoExtractor(object):
  90     """Information Extractor class.
  91
  92     Information extractors are the classes that, given a URL, extract
  93     information about the video (or videos) the URL refers to. This
  94     information includes the real video URL, the video title, author and
  95     others. The information is stored in a dictionary which is then
  96     passed to the YoutubeDL. The YoutubeDL processes this
  97     information possibly downloading the video to the file system, among
  98     other possible outcomes.
  99
 100     The type field determines the type of the result.
 101     By far the most common value (and the default if _type is missing) is
 102     "video", which indicates a single video.
 103
 104     For a video, the dictionaries must include the following fields:
 105
 106     id:             Video identifier.
 107     title:          Video title, unescaped.
 108
 109     Additionally, it must contain either a formats entry or a url one:
 110
 111     formats:        A list of dictionaries for each format available, ordered
 112                     from worst to best quality.
 113
 114                     Potential fields:
 115                     * url        The mandatory URL representing the media:
 116                                    for plain file media - HTTP URL of this file,
 117                                    for RTMP - RTMP URL,
 118                                    for HLS - URL of the M3U8 media playlist,
 119                                    for HDS - URL of the F4M manifest,
 120                                    for DASH
 121                                      - HTTP URL to plain file media (in case of
 122                                        unfragmented media)
 123                                      - URL of the MPD manifest or base URL
 124                                        representing the media if MPD manifest
 125                                        is parsed from a string (in case of
 126                                        fragmented media)
 127                                    for MSS - URL of the ISM manifest.
 128                     * manifest_url
 129                                  The URL of the manifest file in case of
 130                                  fragmented media:
 131                                    for HLS - URL of the M3U8 master playlist,
 132                                    for HDS - URL of the F4M manifest,
 133                                    for DASH - URL of the MPD manifest,
 134                                    for MSS - URL of the ISM manifest.
 135                     * ext        Will be calculated from URL if missing
 136                     * format     A human-readable description of the format
 137                                  ("mp4 container with h264/opus").
 138                                  Calculated from the format_id, width, height.
 139                                  and format_note fields if missing.
 140                     * format_id  A short description of the format
 141                                  ("mp4_h264_opus" or "19").
 142                                 Technically optional, but strongly recommended.
 143                     * format_note Additional info about the format
 144                                  ("3D" or "DASH video")
 145                     * width      Width of the video, if known
 146                     * height     Height of the video, if known
 147                     * resolution Textual description of width and height
 148                     * tbr        Average bitrate of audio and video in KBit/s
 149                     * abr        Average audio bitrate in KBit/s
 150                     * acodec     Name of the audio codec in use
 151                     * asr        Audio sampling rate in Hertz
 152                     * vbr        Average video bitrate in KBit/s
 153                     * fps        Frame rate
 154                     * vcodec     Name of the video codec in use
 155                     * container  Name of the container format
 156                     * filesize   The number of bytes, if known in advance
 157                     * filesize_approx  An estimate for the number of bytes
 158                     * player_url SWF Player URL (used for rtmpdump).
 159                     * protocol   The protocol that will be used for the actual
 160                                  download, lower-case.
 161                                  "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
 162                                  "m3u8", "m3u8_native" or "http_dash_segments".
 163                     * fragment_base_url
 164                                  Base URL for fragments. Each fragment's path
 165                                  value (if present) will be relative to
 166                                  this URL.
 167                     * fragments  A list of fragments of a fragmented media.
 168                                  Each fragment entry must contain either an url
 169                                  or a path. If an url is present it should be
 170                                  considered by a client. Otherwise both path and
 171                                  fragment_base_url must be present. Here is
 172                                  the list of all potential fields:
 173                                  * "url" - fragment's URL
 174                                  * "path" - fragment's path relative to
 175                                             fragment_base_url
 176                                  * "duration" (optional, int or float)
 177                                  * "filesize" (optional, int)
 178                     * preference Order number of this format. If this field is
 179                                  present and not None, the formats get sorted
 180                                  by this field, regardless of all other values.
 181                                  -1 for default (order by other properties),
 182                                  -2 or smaller for less than default.
 183                                  < -1000 to hide the format (if there is
 184                                     another one which is strictly better)
 185                     * language   Language code, e.g. "de" or "en-US".
 186                     * language_preference  Is this in the language mentioned in
 187                                  the URL?
 188                                  10 if it's what the URL is about,
 189                                  -1 for default (don't know),
 190                                  -10 otherwise, other values reserved for now.
 191                     * quality    Order number of the video quality of this
 192                                  format, irrespective of the file format.
 193                                  -1 for default (order by other properties),
 194                                  -2 or smaller for less than default.
 195                     * source_preference  Order number for this video source
 196                                   (quality takes higher priority)
 197                                  -1 for default (order by other properties),
 198                                  -2 or smaller for less than default.
 199                     * http_headers  A dictionary of additional HTTP headers
 200                                  to add to the request.
 201                     * stretched_ratio  If given and not 1, indicates that the
 202                                  video's pixels are not square.
 203                                  width : height ratio as float.
 204                     * no_resume  The server does not support resuming the
 205                                  (HTTP or RTMP) download. Boolean.
 206                     * downloader_options  A dictionary of downloader options as
 207                                  described in FileDownloader
 208                     RTMP formats can also have the additional fields: page_url,
 209                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 210                     rtmp_protocol, rtmp_real_time
 211
 212     url:            Final video URL.
 213     ext:            Video filename extension.
 214     format:         The video format, defaults to ext (used for --get-format)
 215     player_url:     SWF Player URL (used for rtmpdump).
 216
 217     The following fields are optional:
 218
 219     alt_title:      A secondary title of the video.
 220     display_id      An alternative identifier for the video, not necessarily
 221                     unique, but available before title. Typically, id is
 222                     something like "4234987", title "Dancing naked mole rats",
 223                     and display_id "dancing-naked-mole-rats"
 224     thumbnails:     A list of dictionaries, with the following entries:
 225                         * "id" (optional, string) - Thumbnail format ID
 226                         * "url"
 227                         * "preference" (optional, int) - quality of the image
 228                         * "width" (optional, int)
 229                         * "height" (optional, int)
 230                         * "resolution" (optional, string "{width}x{height}",
 231                                         deprecated)
 232                         * "filesize" (optional, int)
 233                         * "_test_url" (optional, bool) - If true, test the URL
 234     thumbnail:      Full URL to a video thumbnail image.
 235     description:    Full video description.
 236     uploader:       Full name of the video uploader.
 237     license:        License name the video is licensed under.
 238     creator:        The creator of the video.
 239     release_timestamp: UNIX timestamp of the moment the video was released.
 240     release_date:   The date (YYYYMMDD) when the video was released.
 241     timestamp:      UNIX timestamp of the moment the video was uploaded
 242     upload_date:    Video upload date (YYYYMMDD).
 243                     If not explicitly set, calculated from timestamp.
 244     uploader_id:    Nickname or id of the video uploader.
 245     uploader_url:   Full URL to a personal webpage of the video uploader.
 246     channel:        Full name of the channel the video is uploaded on.
 247                     Note that channel fields may or may not repeat uploader
 248                     fields. This depends on a particular extractor.
 249     channel_id:     Id of the channel.
 250     channel_url:    Full URL to a channel webpage.
 251     location:       Physical location where the video was filmed.
 252     subtitles:      The available subtitles as a dictionary in the format
 253                     {tag: subformats}. "tag" is usually a language code, and
 254                     "subformats" is a list sorted from lower to higher
 255                     preference, each element is a dictionary with the "ext"
 256                     entry and one of:
 257                         * "data": The subtitles file contents
 258                         * "url": A URL pointing to the subtitles file
 259                     It can optionally also have:
 260                         * "name": Name or description of the subtitles
 261                     "ext" will be calculated from URL if missing
 262     automatic_captions: Like 'subtitles'; contains automatically generated
 263                     captions instead of normal subtitles
 264     duration:       Length of the video in seconds, as an integer or float.
 265     view_count:     How many users have watched the video on the platform.
 266     like_count:     Number of positive ratings of the video
 267     dislike_count:  Number of negative ratings of the video
 268     repost_count:   Number of reposts of the video
 269     average_rating: Average rating give by users, the scale used depends on the webpage
 270     comment_count:  Number of comments on the video
 271     comments:       A list of comments, each with one or more of the following
 272                     properties (all but one of text or html optional):
 273                         * "author" - human-readable name of the comment author
 274                         * "author_id" - user ID of the comment author
 275                         * "author_thumbnail" - The thumbnail of the comment author
 276                         * "id" - Comment ID
 277                         * "html" - Comment as HTML
 278                         * "text" - Plain text of the comment
 279                         * "timestamp" - UNIX timestamp of comment
 280                         * "parent" - ID of the comment this one is replying to.
 281                                      Set to "root" to indicate that this is a
 282                                      comment to the original video.
 283                         * "like_count" - Number of positive ratings of the comment
 284                         * "dislike_count" - Number of negative ratings of the comment
 285                         * "is_favorited" - Whether the comment is marked as
 286                                            favorite by the video uploader
 287                         * "author_is_uploader" - Whether the comment is made by
 288                                                  the video uploader
 289     age_limit:      Age restriction for the video, as an integer (years)
 290     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 291                     should allow to get the same result again. (It will be set
 292                     by YoutubeDL if it's missing)
 293     categories:     A list of categories that the video falls in, for example
 294                     ["Sports", "Berlin"]
 295     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 296     cast:           A list of the video cast
 297     is_live:        True, False, or None (=unknown). Whether this video is a
 298                     live stream that goes on instead of a fixed-length video.
 299     was_live:       True, False, or None (=unknown). Whether this video was
 300                     originally a live stream.
 301     live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
 302                     If absent, automatically set from is_live, was_live
 303     start_time:     Time in seconds where the reproduction should start, as
 304                     specified in the URL.
 305     end_time:       Time in seconds where the reproduction should end, as
 306                     specified in the URL.
 307     chapters:       A list of dictionaries, with the following entries:
 308                         * "start_time" - The start time of the chapter in seconds
 309                         * "end_time" - The end time of the chapter in seconds
 310                         * "title" (optional, string)
 311     playable_in_embed: Whether this video is allowed to play in embedded
 312                     players on other sites. Can be True (=always allowed),
 313                     False (=never allowed), None (=unknown), or a string
 314                     specifying the criteria for embedability (Eg: 'whitelist')
 315     availability:   Under what condition the video is available. One of
 316                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 317                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 318                     to set it
 319     __post_extractor: A function to be called just before the metadata is
 320                     written to either disk, logger or console. The function
 321                     must return a dict which will be added to the info_dict.
 322                     This is usefull for additional information that is
 323                     time-consuming to extract. Note that the fields thus
 324                     extracted will not be available to output template and
 325                     match_filter. So, only "comments" and "comment_count" are
 326                     currently allowed to be extracted via this method.
 327
 328     The following fields should only be used when the video belongs to some logical
 329     chapter or section:
 330
 331     chapter:        Name or title of the chapter the video belongs to.
 332     chapter_number: Number of the chapter the video belongs to, as an integer.
 333     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 334
 335     The following fields should only be used when the video is an episode of some
 336     series, programme or podcast:
 337
 338     series:         Title of the series or programme the video episode belongs to.
 339     season:         Title of the season the video episode belongs to.
 340     season_number:  Number of the season the video episode belongs to, as an integer.
 341     season_id:      Id of the season the video episode belongs to, as a unicode string.
 342     episode:        Title of the video episode. Unlike mandatory video title field,
 343                     this field should denote the exact title of the video episode
 344                     without any kind of decoration.
 345     episode_number: Number of the video episode within a season, as an integer.
 346     episode_id:     Id of the video episode, as a unicode string.
 347
 348     The following fields should only be used when the media is a track or a part of
 349     a music album:
 350
 351     track:          Title of the track.
 352     track_number:   Number of the track within an album or a disc, as an integer.
 353     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 354                     as a unicode string.
 355     artist:         Artist(s) of the track.
 356     genre:          Genre(s) of the track.
 357     album:          Title of the album the track belongs to.
 358     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 359     album_artist:   List of all artists appeared on the album (e.g.
 360                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 361                     and compilations).
 362     disc_number:    Number of the disc or other physical medium the track belongs to,
 363                     as an integer.
 364     release_year:   Year (YYYY) when the album was released.
 365
 366     Unless mentioned otherwise, the fields should be Unicode strings.
 367
 368     Unless mentioned otherwise, None is equivalent to absence of information.
 369
 370
 371     _type "playlist" indicates multiple videos.
 372     There must be a key "entries", which is a list, an iterable, or a PagedList
 373     object, each element of which is a valid dictionary by this specification.
 374
 375     Additionally, playlists can have "id", "title", and any other relevent
 376     attributes with the same semantics as videos (see above).
 377
 378
 379     _type "multi_video" indicates that there are multiple videos that
 380     form a single show, for examples multiple acts of an opera or TV episode.
 381     It must have an entries key like a playlist and contain all the keys
 382     required for a video at the same time.
 383
 384
 385     _type "url" indicates that the video must be extracted from another
 386     location, possibly by a different extractor. Its only required key is:
 387     "url" - the next URL to extract.
 388     The key "ie_key" can be set to the class name (minus the trailing "IE",
 389     e.g. "Youtube") if the extractor class is known in advance.
 390     Additionally, the dictionary may have any properties of the resolved entity
 391     known in advance, for example "title" if the title of the referred video is
 392     known ahead of time.
 393
 394
 395     _type "url_transparent" entities have the same specification as "url", but
 396     indicate that the given additional information is more precise than the one
 397     associated with the resolved URL.
 398     This is useful when a site employs a video service that hosts the video and
 399     its technical metadata, but that video service does not embed a useful
 400     title, description etc.
 401
 402
 403     Subclasses of this one should re-define the _real_initialize() and
 404     _real_extract() methods and define a _VALID_URL regexp.
 405     Probably, they should also be added to the list of extractors.
 406
 407     _GEO_BYPASS attribute may be set to False in order to disable
 408     geo restriction bypass mechanisms for a particular extractor.
 409     Though it won't disable explicit geo restriction bypass based on
 410     country code provided with geo_bypass_country.
 411
 412     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 413     countries for this extractor. One of these countries will be used by
 414     geo restriction bypass mechanism right away in order to bypass
 415     geo restriction, of course, if the mechanism is not disabled.
 416
 417     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 418     IP blocks in CIDR notation for this extractor. One of these IP blocks
 419     will be used by geo restriction bypass mechanism similarly
 420     to _GEO_COUNTRIES.
 421
 422     Finally, the _WORKING attribute should be set to False for broken IEs
 423     in order to warn the users and skip the tests.
 424     """
 425
 426     _ready = False
 427     _downloader = None
 428     _x_forwarded_for_ip = None
 429     _GEO_BYPASS = True
 430     _GEO_COUNTRIES = None
 431     _GEO_IP_BLOCKS = None
 432     _WORKING = True
 433
 434     _LOGIN_HINTS = {
 435         'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
 436         'cookies': (
 437             'Use --cookies for the authentication. '
 438             'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to pass cookies'),
 439         'password': 'Use --username and --password or --netrc to provide account credentials',
 440     }
 441
 442     def __init__(self, downloader=None):
 443         """Constructor. Receives an optional downloader."""
 444         self._ready = False
 445         self._x_forwarded_for_ip = None
 446         self._printed_messages = set()
 447         self.set_downloader(downloader)
 448
 449     @classmethod
 450     def _match_valid_url(cls, url):
 451         # This does not use has/getattr intentionally - we want to know whether
 452         # we have cached the regexp for *this* class, whereas getattr would also
 453         # match the superclass
 454         if '_VALID_URL_RE' not in cls.__dict__:
 455             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 456         return cls._VALID_URL_RE.match(url)
 457
 458     @classmethod
 459     def suitable(cls, url):
 460         """Receives a URL and returns True if suitable for this IE."""
 461         return cls._match_valid_url(url) is not None
 462
 463     @classmethod
 464     def _match_id(cls, url):
 465         return cls._match_valid_url(url).group('id')
 466
 467     @classmethod
 468     def working(cls):
 469         """Getter method for _WORKING."""
 470         return cls._WORKING
 471
 472     def initialize(self):
 473         """Initializes an instance (authentication, etc)."""
 474         self._printed_messages = set()
 475         self._initialize_geo_bypass({
 476             'countries': self._GEO_COUNTRIES,
 477             'ip_blocks': self._GEO_IP_BLOCKS,
 478         })
 479         if not self._ready:
 480             self._real_initialize()
 481             self._ready = True
 482
 483     def _initialize_geo_bypass(self, geo_bypass_context):
 484         """
 485         Initialize geo restriction bypass mechanism.
 486
 487         This method is used to initialize geo bypass mechanism based on faking
 488         X-Forwarded-For HTTP header. A random country from provided country list
 489         is selected and a random IP belonging to this country is generated. This
 490         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 491         HTTP requests.
 492
 493         This method will be used for initial geo bypass mechanism initialization
 494         during the instance initialization with _GEO_COUNTRIES and
 495         _GEO_IP_BLOCKS.
 496
 497         You may also manually call it from extractor's code if geo bypass
 498         information is not available beforehand (e.g. obtained during
 499         extraction) or due to some other reason. In this case you should pass
 500         this information in geo bypass context passed as first argument. It may
 501         contain following fields:
 502
 503         countries:  List of geo unrestricted countries (similar
 504                     to _GEO_COUNTRIES)
 505         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 506                     (similar to _GEO_IP_BLOCKS)
 507
 508         """
 509         if not self._x_forwarded_for_ip:
 510
 511             # Geo bypass mechanism is explicitly disabled by user
 512             if not self.get_param('geo_bypass', True):
 513                 return
 514
 515             if not geo_bypass_context:
 516                 geo_bypass_context = {}
 517
 518             # Backward compatibility: previously _initialize_geo_bypass
 519             # expected a list of countries, some 3rd party code may still use
 520             # it this way
 521             if isinstance(geo_bypass_context, (list, tuple)):
 522                 geo_bypass_context = {
 523                     'countries': geo_bypass_context,
 524                 }
 525
 526             # The whole point of geo bypass mechanism is to fake IP
 527             # as X-Forwarded-For HTTP header based on some IP block or
 528             # country code.
 529
 530             # Path 1: bypassing based on IP block in CIDR notation
 531
 532             # Explicit IP block specified by user, use it right away
 533             # regardless of whether extractor is geo bypassable or not
 534             ip_block = self.get_param('geo_bypass_ip_block', None)
 535
 536             # Otherwise use random IP block from geo bypass context but only
 537             # if extractor is known as geo bypassable
 538             if not ip_block:
 539                 ip_blocks = geo_bypass_context.get('ip_blocks')
 540                 if self._GEO_BYPASS and ip_blocks:
 541                     ip_block = random.choice(ip_blocks)
 542
 543             if ip_block:
 544                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 545                 self._downloader.write_debug(
 546                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 547                 return
 548
 549             # Path 2: bypassing based on country code
 550
 551             # Explicit country code specified by user, use it right away
 552             # regardless of whether extractor is geo bypassable or not
 553             country = self.get_param('geo_bypass_country', None)
 554
 555             # Otherwise use random country code from geo bypass context but
 556             # only if extractor is known as geo bypassable
 557             if not country:
 558                 countries = geo_bypass_context.get('countries')
 559                 if self._GEO_BYPASS and countries:
 560                     country = random.choice(countries)
 561
 562             if country:
 563                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 564                 self._downloader.write_debug(
 565                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 566
 567     def extract(self, url):
 568         """Extracts URL information and returns it in list of dicts."""
 569         try:
 570             for _ in range(2):
 571                 try:
 572                     self.initialize()
 573                     self.write_debug('Extracting URL: %s' % url)
 574                     ie_result = self._real_extract(url)
 575                     if ie_result is None:
 576                         return None
 577                     if self._x_forwarded_for_ip:
 578                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 579                     subtitles = ie_result.get('subtitles')
 580                     if (subtitles and 'live_chat' in subtitles
 581                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 582                         del subtitles['live_chat']
 583                     return ie_result
 584                 except GeoRestrictedError as e:
 585                     if self.__maybe_fake_ip_and_retry(e.countries):
 586                         continue
 587                     raise
 588         except ExtractorError:
 589             raise
 590         except compat_http_client.IncompleteRead as e:
 591             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 592         except (KeyError, StopIteration) as e:
 593             raise ExtractorError('An extractor error has occurred.', cause=e)
 594
 595     def __maybe_fake_ip_and_retry(self, countries):
 596         if (not self.get_param('geo_bypass_country', None)
 597                 and self._GEO_BYPASS
 598                 and self.get_param('geo_bypass', True)
 599                 and not self._x_forwarded_for_ip
 600                 and countries):
 601             country_code = random.choice(countries)
 602             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 603             if self._x_forwarded_for_ip:
 604                 self.report_warning(
 605                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 606                     % (self._x_forwarded_for_ip, country_code.upper()))
 607                 return True
 608         return False
 609
 610     def set_downloader(self, downloader):
 611         """Sets the downloader for this IE."""
 612         self._downloader = downloader
 613
 614     def _real_initialize(self):
 615         """Real initialization process. Redefine in subclasses."""
 616         pass
 617
 618     def _real_extract(self, url):
 619         """Real extraction process. Redefine in subclasses."""
 620         pass
 621
 622     @classmethod
 623     def ie_key(cls):
 624         """A string for getting the InfoExtractor with get_info_extractor"""
 625         return compat_str(cls.__name__[:-2])
 626
 627     @property
 628     def IE_NAME(self):
 629         return compat_str(type(self).__name__[:-2])
 630
 631     @staticmethod
 632     def __can_accept_status_code(err, expected_status):
 633         assert isinstance(err, compat_urllib_error.HTTPError)
 634         if expected_status is None:
 635             return False
 636         elif callable(expected_status):
 637             return expected_status(err.code) is True
 638         else:
 639             return err.code in variadic(expected_status)
 640
 641     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 642         """
 643         Return the response handle.
 644
 645         See _download_webpage docstring for arguments specification.
 646         """
 647         if not self._downloader._first_webpage_request:
 648             sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
 649             if sleep_interval > 0:
 650                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 651                 time.sleep(sleep_interval)
 652         else:
 653             self._downloader._first_webpage_request = False
 654
 655         if note is None:
 656             self.report_download_webpage(video_id)
 657         elif note is not False:
 658             if video_id is None:
 659                 self.to_screen('%s' % (note,))
 660             else:
 661                 self.to_screen('%s: %s' % (video_id, note))
 662
 663         # Some sites check X-Forwarded-For HTTP header in order to figure out
 664         # the origin of the client behind proxy. This allows bypassing geo
 665         # restriction by faking this header's value to IP that belongs to some
 666         # geo unrestricted country. We will do so once we encounter any
 667         # geo restriction error.
 668         if self._x_forwarded_for_ip:
 669             if 'X-Forwarded-For' not in headers:
 670                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 671
 672         if isinstance(url_or_request, compat_urllib_request.Request):
 673             url_or_request = update_Request(
 674                 url_or_request, data=data, headers=headers, query=query)
 675         else:
 676             if query:
 677                 url_or_request = update_url_query(url_or_request, query)
 678             if data is not None or headers:
 679                 url_or_request = sanitized_Request(url_or_request, data, headers)
 680         try:
 681             return self._downloader.urlopen(url_or_request)
 682         except network_exceptions as err:
 683             if isinstance(err, compat_urllib_error.HTTPError):
 684                 if self.__can_accept_status_code(err, expected_status):
 685                     # Retain reference to error to prevent file object from
 686                     # being closed before it can be read. Works around the
 687                     # effects of <https://bugs.python.org/issue15002>
 688                     # introduced in Python 3.4.1.
 689                     err.fp._error = err
 690                     return err.fp
 691
 692             if errnote is False:
 693                 return False
 694             if errnote is None:
 695                 errnote = 'Unable to download webpage'
 696
 697             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 698             if fatal:
 699                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 700             else:
 701                 self.report_warning(errmsg)
 702                 return False
 703
 704     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 705         """
 706         Return a tuple (page content as string, URL handle).
 707
 708         See _download_webpage docstring for arguments specification.
 709         """
 710         # Strip hashes from the URL (#1038)
 711         if isinstance(url_or_request, (compat_str, str)):
 712             url_or_request = url_or_request.partition('#')[0]
 713
 714         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 715         if urlh is False:
 716             assert not fatal
 717             return False
 718         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 719         return (content, urlh)
 720
 721     @staticmethod
 722     def _guess_encoding_from_content(content_type, webpage_bytes):
 723         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 724         if m:
 725             encoding = m.group(1)
 726         else:
 727             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 728                           webpage_bytes[:1024])
 729             if m:
 730                 encoding = m.group(1).decode('ascii')
 731             elif webpage_bytes.startswith(b'\xff\xfe'):
 732                 encoding = 'utf-16'
 733             else:
 734                 encoding = 'utf-8'
 735
 736         return encoding
 737
 738     def __check_blocked(self, content):
 739         first_block = content[:512]
 740         if ('<title>Access to this site is blocked</title>' in content
 741                 and 'Websense' in first_block):
 742             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 743             blocked_iframe = self._html_search_regex(
 744                 r'<iframe src="([^"]+)"', content,
 745                 'Websense information URL', default=None)
 746             if blocked_iframe:
 747                 msg += ' Visit %s for more details' % blocked_iframe
 748             raise ExtractorError(msg, expected=True)
 749         if '<title>The URL you requested has been blocked</title>' in first_block:
 750             msg = (
 751                 'Access to this webpage has been blocked by Indian censorship. '
 752                 'Use a VPN or proxy server (with --proxy) to route around it.')
 753             block_msg = self._html_search_regex(
 754                 r'</h1><p>(.*?)</p>',
 755                 content, 'block message', default=None)
 756             if block_msg:
 757                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 758             raise ExtractorError(msg, expected=True)
 759         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 760                 and 'blocklist.rkn.gov.ru' in content):
 761             raise ExtractorError(
 762                 'Access to this webpage has been blocked by decision of the Russian government. '
 763                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 764                 expected=True)
 765
 766     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 767         content_type = urlh.headers.get('Content-Type', '')
 768         webpage_bytes = urlh.read()
 769         if prefix is not None:
 770             webpage_bytes = prefix + webpage_bytes
 771         if not encoding:
 772             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 773         if self.get_param('dump_intermediate_pages', False):
 774             self.to_screen('Dumping request to ' + urlh.geturl())
 775             dump = base64.b64encode(webpage_bytes).decode('ascii')
 776             self._downloader.to_screen(dump)
 777         if self.get_param('write_pages', False):
 778             basen = '%s_%s' % (video_id, urlh.geturl())
 779             if len(basen) > 240:
 780                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 781                 basen = basen[:240 - len(h)] + h
 782             raw_filename = basen + '.dump'
 783             filename = sanitize_filename(raw_filename, restricted=True)
 784             self.to_screen('Saving request to ' + filename)
 785             # Working around MAX_PATH limitation on Windows (see
 786             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 787             if compat_os_name == 'nt':
 788                 absfilepath = os.path.abspath(filename)
 789                 if len(absfilepath) > 259:
 790                     filename = '\\\\?\\' + absfilepath
 791             with open(filename, 'wb') as outf:
 792                 outf.write(webpage_bytes)
 793
 794         try:
 795             content = webpage_bytes.decode(encoding, 'replace')
 796         except LookupError:
 797             content = webpage_bytes.decode('utf-8', 'replace')
 798
 799         self.__check_blocked(content)
 800
 801         return content
 802
 803     def _download_webpage(
 804             self, url_or_request, video_id, note=None, errnote=None,
 805             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 806             headers={}, query={}, expected_status=None):
 807         """
 808         Return the data of the page as a string.
 809
 810         Arguments:
 811         url_or_request -- plain text URL as a string or
 812             a compat_urllib_request.Requestobject
 813         video_id -- Video/playlist/item identifier (string)
 814
 815         Keyword arguments:
 816         note -- note printed before downloading (string)
 817         errnote -- note printed in case of an error (string)
 818         fatal -- flag denoting whether error should be considered fatal,
 819             i.e. whether it should cause ExtractionError to be raised,
 820             otherwise a warning will be reported and extraction continued
 821         tries -- number of tries
 822         timeout -- sleep interval between tries
 823         encoding -- encoding for a page content decoding, guessed automatically
 824             when not explicitly specified
 825         data -- POST data (bytes)
 826         headers -- HTTP headers (dict)
 827         query -- URL query (dict)
 828         expected_status -- allows to accept failed HTTP requests (non 2xx
 829             status code) by explicitly specifying a set of accepted status
 830             codes. Can be any of the following entities:
 831                 - an integer type specifying an exact failed status code to
 832                   accept
 833                 - a list or a tuple of integer types specifying a list of
 834                   failed status codes to accept
 835                 - a callable accepting an actual failed status code and
 836                   returning True if it should be accepted
 837             Note that this argument does not affect success status codes (2xx)
 838             which are always accepted.
 839         """
 840
 841         success = False
 842         try_count = 0
 843         while success is False:
 844             try:
 845                 res = self._download_webpage_handle(
 846                     url_or_request, video_id, note, errnote, fatal,
 847                     encoding=encoding, data=data, headers=headers, query=query,
 848                     expected_status=expected_status)
 849                 success = True
 850             except compat_http_client.IncompleteRead as e:
 851                 try_count += 1
 852                 if try_count >= tries:
 853                     raise e
 854                 self._sleep(timeout, video_id)
 855         if res is False:
 856             return res
 857         else:
 858             content, _ = res
 859             return content
 860
 861     def _download_xml_handle(
 862             self, url_or_request, video_id, note='Downloading XML',
 863             errnote='Unable to download XML', transform_source=None,
 864             fatal=True, encoding=None, data=None, headers={}, query={},
 865             expected_status=None):
 866         """
 867         Return a tuple (xml as an compat_etree_Element, URL handle).
 868
 869         See _download_webpage docstring for arguments specification.
 870         """
 871         res = self._download_webpage_handle(
 872             url_or_request, video_id, note, errnote, fatal=fatal,
 873             encoding=encoding, data=data, headers=headers, query=query,
 874             expected_status=expected_status)
 875         if res is False:
 876             return res
 877         xml_string, urlh = res
 878         return self._parse_xml(
 879             xml_string, video_id, transform_source=transform_source,
 880             fatal=fatal), urlh
 881
 882     def _download_xml(
 883             self, url_or_request, video_id,
 884             note='Downloading XML', errnote='Unable to download XML',
 885             transform_source=None, fatal=True, encoding=None,
 886             data=None, headers={}, query={}, expected_status=None):
 887         """
 888         Return the xml as an compat_etree_Element.
 889
 890         See _download_webpage docstring for arguments specification.
 891         """
 892         res = self._download_xml_handle(
 893             url_or_request, video_id, note=note, errnote=errnote,
 894             transform_source=transform_source, fatal=fatal, encoding=encoding,
 895             data=data, headers=headers, query=query,
 896             expected_status=expected_status)
 897         return res if res is False else res[0]
 898
 899     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 900         if transform_source:
 901             xml_string = transform_source(xml_string)
 902         try:
 903             return compat_etree_fromstring(xml_string.encode('utf-8'))
 904         except compat_xml_parse_error as ve:
 905             errmsg = '%s: Failed to parse XML ' % video_id
 906             if fatal:
 907                 raise ExtractorError(errmsg, cause=ve)
 908             else:
 909                 self.report_warning(errmsg + str(ve))
 910
 911     def _download_json_handle(
 912             self, url_or_request, video_id, note='Downloading JSON metadata',
 913             errnote='Unable to download JSON metadata', transform_source=None,
 914             fatal=True, encoding=None, data=None, headers={}, query={},
 915             expected_status=None):
 916         """
 917         Return a tuple (JSON object, URL handle).
 918
 919         See _download_webpage docstring for arguments specification.
 920         """
 921         res = self._download_webpage_handle(
 922             url_or_request, video_id, note, errnote, fatal=fatal,
 923             encoding=encoding, data=data, headers=headers, query=query,
 924             expected_status=expected_status)
 925         if res is False:
 926             return res
 927         json_string, urlh = res
 928         return self._parse_json(
 929             json_string, video_id, transform_source=transform_source,
 930             fatal=fatal), urlh
 931
 932     def _download_json(
 933             self, url_or_request, video_id, note='Downloading JSON metadata',
 934             errnote='Unable to download JSON metadata', transform_source=None,
 935             fatal=True, encoding=None, data=None, headers={}, query={},
 936             expected_status=None):
 937         """
 938         Return the JSON object as a dict.
 939
 940         See _download_webpage docstring for arguments specification.
 941         """
 942         res = self._download_json_handle(
 943             url_or_request, video_id, note=note, errnote=errnote,
 944             transform_source=transform_source, fatal=fatal, encoding=encoding,
 945             data=data, headers=headers, query=query,
 946             expected_status=expected_status)
 947         return res if res is False else res[0]
 948
 949     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 950         if transform_source:
 951             json_string = transform_source(json_string)
 952         try:
 953             return json.loads(json_string)
 954         except ValueError as ve:
 955             errmsg = '%s: Failed to parse JSON ' % video_id
 956             if fatal:
 957                 raise ExtractorError(errmsg, cause=ve)
 958             else:
 959                 self.report_warning(errmsg + str(ve))
 960
 961     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 962         return self._parse_json(
 963             data[data.find('{'):data.rfind('}') + 1],
 964             video_id, transform_source, fatal)
 965
 966     def _download_socket_json_handle(
 967             self, url_or_request, video_id, note='Polling socket',
 968             errnote='Unable to poll socket', transform_source=None,
 969             fatal=True, encoding=None, data=None, headers={}, query={},
 970             expected_status=None):
 971         """
 972         Return a tuple (JSON object, URL handle).
 973
 974         See _download_webpage docstring for arguments specification.
 975         """
 976         res = self._download_webpage_handle(
 977             url_or_request, video_id, note, errnote, fatal=fatal,
 978             encoding=encoding, data=data, headers=headers, query=query,
 979             expected_status=expected_status)
 980         if res is False:
 981             return res
 982         webpage, urlh = res
 983         return self._parse_socket_response_as_json(
 984             webpage, video_id, transform_source=transform_source,
 985             fatal=fatal), urlh
 986
 987     def _download_socket_json(
 988             self, url_or_request, video_id, note='Polling socket',
 989             errnote='Unable to poll socket', transform_source=None,
 990             fatal=True, encoding=None, data=None, headers={}, query={},
 991             expected_status=None):
 992         """
 993         Return the JSON object as a dict.
 994
 995         See _download_webpage docstring for arguments specification.
 996         """
 997         res = self._download_socket_json_handle(
 998             url_or_request, video_id, note=note, errnote=errnote,
 999             transform_source=transform_source, fatal=fatal, encoding=encoding,
1000             data=data, headers=headers, query=query,
1001             expected_status=expected_status)
1002         return res if res is False else res[0]
1003
1004     def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1005         idstr = format_field(video_id, template='%s: ')
1006         msg = f'[{self.IE_NAME}] {idstr}{msg}'
1007         if only_once:
1008             if f'WARNING: {msg}' in self._printed_messages:
1009                 return
1010             self._printed_messages.add(f'WARNING: {msg}')
1011         self._downloader.report_warning(msg, *args, **kwargs)
1012
1013     def to_screen(self, msg, *args, **kwargs):
1014         """Print msg to screen, prefixing it with '[ie_name]'"""
1015         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1016
1017     def write_debug(self, msg, *args, **kwargs):
1018         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1019
1020     def get_param(self, name, default=None, *args, **kwargs):
1021         if self._downloader:
1022             return self._downloader.params.get(name, default, *args, **kwargs)
1023         return default
1024
1025     def report_extraction(self, id_or_name):
1026         """Report information extraction."""
1027         self.to_screen('%s: Extracting information' % id_or_name)
1028
1029     def report_download_webpage(self, video_id):
1030         """Report webpage download."""
1031         self.to_screen('%s: Downloading webpage' % video_id)
1032
1033     def report_age_confirmation(self):
1034         """Report attempt to confirm age."""
1035         self.to_screen('Confirming age')
1036
1037     def report_login(self):
1038         """Report attempt to log in."""
1039         self.to_screen('Logging in')
1040
1041     def raise_login_required(
1042             self, msg='This video is only available for registered users',
1043             metadata_available=False, method='any'):
1044         if metadata_available and self.get_param('ignore_no_formats_error'):
1045             self.report_warning(msg)
1046         if method is not None:
1047             msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1048         raise ExtractorError(msg, expected=True)
1049
1050     def raise_geo_restricted(
1051             self, msg='This video is not available from your location due to geo restriction',
1052             countries=None, metadata_available=False):
1053         if metadata_available and self.get_param('ignore_no_formats_error'):
1054             self.report_warning(msg)
1055         else:
1056             raise GeoRestrictedError(msg, countries=countries)
1057
1058     def raise_no_formats(self, msg, expected=False, video_id=None):
1059         if expected and self.get_param('ignore_no_formats_error'):
1060             self.report_warning(msg, video_id)
1061         elif isinstance(msg, ExtractorError):
1062             raise msg
1063         else:
1064             raise ExtractorError(msg, expected=expected, video_id=video_id)
1065
1066     # Methods for following #608
1067     @staticmethod
1068     def url_result(url, ie=None, video_id=None, video_title=None):
1069         """Returns a URL that points to a page that should be processed"""
1070         # TODO: ie should be the class used for getting the info
1071         video_info = {'_type': 'url',
1072                       'url': url,
1073                       'ie_key': ie}
1074         if video_id is not None:
1075             video_info['id'] = video_id
1076         if video_title is not None:
1077             video_info['title'] = video_title
1078         return video_info
1079
1080     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1081         urls = orderedSet(
1082             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1083             for m in matches)
1084         return self.playlist_result(
1085             urls, playlist_id=playlist_id, playlist_title=playlist_title)
1086
1087     @staticmethod
1088     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1089         """Returns a playlist"""
1090         video_info = {'_type': 'playlist',
1091                       'entries': entries}
1092         video_info.update(kwargs)
1093         if playlist_id:
1094             video_info['id'] = playlist_id
1095         if playlist_title:
1096             video_info['title'] = playlist_title
1097         if playlist_description is not None:
1098             video_info['description'] = playlist_description
1099         return video_info
1100
1101     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1102         """
1103         Perform a regex search on the given string, using a single or a list of
1104         patterns returning the first matching group.
1105         In case of failure return a default value or raise a WARNING or a
1106         RegexNotFoundError, depending on fatal, specifying the field name.
1107         """
1108         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1109             mobj = re.search(pattern, string, flags)
1110         else:
1111             for p in pattern:
1112                 mobj = re.search(p, string, flags)
1113                 if mobj:
1114                     break
1115
1116         if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
1117             _name = '\033[0;34m%s\033[0m' % name
1118         else:
1119             _name = name
1120
1121         if mobj:
1122             if group is None:
1123                 # return the first matching group
1124                 return next(g for g in mobj.groups() if g is not None)
1125             elif isinstance(group, (list, tuple)):
1126                 return tuple(mobj.group(g) for g in group)
1127             else:
1128                 return mobj.group(group)
1129         elif default is not NO_DEFAULT:
1130             return default
1131         elif fatal:
1132             raise RegexNotFoundError('Unable to extract %s' % _name)
1133         else:
1134             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1135             return None
1136
1137     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1138         """
1139         Like _search_regex, but strips HTML tags and unescapes entities.
1140         """
1141         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1142         if res:
1143             return clean_html(res).strip()
1144         else:
1145             return res
1146
1147     def _get_netrc_login_info(self, netrc_machine=None):
1148         username = None
1149         password = None
1150         netrc_machine = netrc_machine or self._NETRC_MACHINE
1151
1152         if self.get_param('usenetrc', False):
1153             try:
1154                 info = netrc.netrc().authenticators(netrc_machine)
1155                 if info is not None:
1156                     username = info[0]
1157                     password = info[2]
1158                 else:
1159                     raise netrc.NetrcParseError(
1160                         'No authenticators for %s' % netrc_machine)
1161             except (IOError, netrc.NetrcParseError) as err:
1162                 self.report_warning(
1163                     'parsing .netrc: %s' % error_to_compat_str(err))
1164
1165         return username, password
1166
1167     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1168         """
1169         Get the login info as (username, password)
1170         First look for the manually specified credentials using username_option
1171         and password_option as keys in params dictionary. If no such credentials
1172         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1173         value.
1174         If there's no info available, return (None, None)
1175         """
1176
1177         # Attempt to use provided username and password or .netrc data
1178         username = self.get_param(username_option)
1179         if username is not None:
1180             password = self.get_param(password_option)
1181         else:
1182             username, password = self._get_netrc_login_info(netrc_machine)
1183
1184         return username, password
1185
1186     def _get_tfa_info(self, note='two-factor verification code'):
1187         """
1188         Get the two-factor authentication info
1189         TODO - asking the user will be required for sms/phone verify
1190         currently just uses the command line option
1191         If there's no info available, return None
1192         """
1193
1194         tfa = self.get_param('twofactor')
1195         if tfa is not None:
1196             return tfa
1197
1198         return compat_getpass('Type %s and press [Return]: ' % note)
1199
1200     # Helper functions for extracting OpenGraph info
1201     @staticmethod
1202     def _og_regexes(prop):
1203         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1204         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1205                        % {'prop': re.escape(prop)})
1206         template = r'<meta[^>]+?%s[^>]+?%s'
1207         return [
1208             template % (property_re, content_re),
1209             template % (content_re, property_re),
1210         ]
1211
1212     @staticmethod
1213     def _meta_regex(prop):
1214         return r'''(?isx)<meta
1215                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1216                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1217
1218     def _og_search_property(self, prop, html, name=None, **kargs):
1219         prop = variadic(prop)
1220         if name is None:
1221             name = 'OpenGraph %s' % prop[0]
1222         og_regexes = []
1223         for p in prop:
1224             og_regexes.extend(self._og_regexes(p))
1225         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1226         if escaped is None:
1227             return None
1228         return unescapeHTML(escaped)
1229
1230     def _og_search_thumbnail(self, html, **kargs):
1231         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1232
1233     def _og_search_description(self, html, **kargs):
1234         return self._og_search_property('description', html, fatal=False, **kargs)
1235
1236     def _og_search_title(self, html, **kargs):
1237         return self._og_search_property('title', html, **kargs)
1238
1239     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1240         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1241         if secure:
1242             regexes = self._og_regexes('video:secure_url') + regexes
1243         return self._html_search_regex(regexes, html, name, **kargs)
1244
1245     def _og_search_url(self, html, **kargs):
1246         return self._og_search_property('url', html, **kargs)
1247
1248     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1249         name = variadic(name)
1250         if display_name is None:
1251             display_name = name[0]
1252         return self._html_search_regex(
1253             [self._meta_regex(n) for n in name],
1254             html, display_name, fatal=fatal, group='content', **kwargs)
1255
1256     def _dc_search_uploader(self, html):
1257         return self._html_search_meta('dc.creator', html, 'uploader')
1258
1259     def _rta_search(self, html):
1260         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1261         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1262                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1263                      html):
1264             return 18
1265         return 0
1266
1267     def _media_rating_search(self, html):
1268         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1269         rating = self._html_search_meta('rating', html)
1270
1271         if not rating:
1272             return None
1273
1274         RATING_TABLE = {
1275             'safe for kids': 0,
1276             'general': 8,
1277             '14 years': 14,
1278             'mature': 17,
1279             'restricted': 19,
1280         }
1281         return RATING_TABLE.get(rating.lower())
1282
1283     def _family_friendly_search(self, html):
1284         # See http://schema.org/VideoObject
1285         family_friendly = self._html_search_meta(
1286             'isFamilyFriendly', html, default=None)
1287
1288         if not family_friendly:
1289             return None
1290
1291         RATING_TABLE = {
1292             '1': 0,
1293             'true': 0,
1294             '0': 18,
1295             'false': 18,
1296         }
1297         return RATING_TABLE.get(family_friendly.lower())
1298
1299     def _twitter_search_player(self, html):
1300         return self._html_search_meta('twitter:player', html,
1301                                       'twitter card player')
1302
1303     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1304         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1305         default = kwargs.get('default', NO_DEFAULT)
1306         # JSON-LD may be malformed and thus `fatal` should be respected.
1307         # At the same time `default` may be passed that assumes `fatal=False`
1308         # for _search_regex. Let's simulate the same behavior here as well.
1309         fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
1310         json_ld = []
1311         for mobj in json_ld_list:
1312             json_ld_item = self._parse_json(
1313                 mobj.group('json_ld'), video_id, fatal=fatal)
1314             if not json_ld_item:
1315                 continue
1316             if isinstance(json_ld_item, dict):
1317                 json_ld.append(json_ld_item)
1318             elif isinstance(json_ld_item, (list, tuple)):
1319                 json_ld.extend(json_ld_item)
1320         if json_ld:
1321             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1322         if json_ld:
1323             return json_ld
1324         if default is not NO_DEFAULT:
1325             return default
1326         elif fatal:
1327             raise RegexNotFoundError('Unable to extract JSON-LD')
1328         else:
1329             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1330             return {}
1331
1332     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1333         if isinstance(json_ld, compat_str):
1334             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1335         if not json_ld:
1336             return {}
1337         info = {}
1338         if not isinstance(json_ld, (list, tuple, dict)):
1339             return info
1340         if isinstance(json_ld, dict):
1341             json_ld = [json_ld]
1342
1343         INTERACTION_TYPE_MAP = {
1344             'CommentAction': 'comment',
1345             'AgreeAction': 'like',
1346             'DisagreeAction': 'dislike',
1347             'LikeAction': 'like',
1348             'DislikeAction': 'dislike',
1349             'ListenAction': 'view',
1350             'WatchAction': 'view',
1351             'ViewAction': 'view',
1352         }
1353
1354         def extract_interaction_type(e):
1355             interaction_type = e.get('interactionType')
1356             if isinstance(interaction_type, dict):
1357                 interaction_type = interaction_type.get('@type')
1358             return str_or_none(interaction_type)
1359
1360         def extract_interaction_statistic(e):
1361             interaction_statistic = e.get('interactionStatistic')
1362             if isinstance(interaction_statistic, dict):
1363                 interaction_statistic = [interaction_statistic]
1364             if not isinstance(interaction_statistic, list):
1365                 return
1366             for is_e in interaction_statistic:
1367                 if not isinstance(is_e, dict):
1368                     continue
1369                 if is_e.get('@type') != 'InteractionCounter':
1370                     continue
1371                 interaction_type = extract_interaction_type(is_e)
1372                 if not interaction_type:
1373                     continue
1374                 # For interaction count some sites provide string instead of
1375                 # an integer (as per spec) with non digit characters (e.g. ",")
1376                 # so extracting count with more relaxed str_to_int
1377                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1378                 if interaction_count is None:
1379                     continue
1380                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1381                 if not count_kind:
1382                     continue
1383                 count_key = '%s_count' % count_kind
1384                 if info.get(count_key) is not None:
1385                     continue
1386                 info[count_key] = interaction_count
1387
1388         def extract_video_object(e):
1389             assert e['@type'] == 'VideoObject'
1390             author = e.get('author')
1391             info.update({
1392                 'url': url_or_none(e.get('contentUrl')),
1393                 'title': unescapeHTML(e.get('name')),
1394                 'description': unescapeHTML(e.get('description')),
1395                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1396                 'duration': parse_duration(e.get('duration')),
1397                 'timestamp': unified_timestamp(e.get('uploadDate')),
1398                 # author can be an instance of 'Organization' or 'Person' types.
1399                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1400                 # however some websites are using 'Text' type instead.
1401                 # 1. https://schema.org/VideoObject
1402                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1403                 'filesize': float_or_none(e.get('contentSize')),
1404                 'tbr': int_or_none(e.get('bitrate')),
1405                 'width': int_or_none(e.get('width')),
1406                 'height': int_or_none(e.get('height')),
1407                 'view_count': int_or_none(e.get('interactionCount')),
1408             })
1409             extract_interaction_statistic(e)
1410
1411         for e in json_ld:
1412             if '@context' in e:
1413                 item_type = e.get('@type')
1414                 if expected_type is not None and expected_type != item_type:
1415                     continue
1416                 if item_type in ('TVEpisode', 'Episode'):
1417                     episode_name = unescapeHTML(e.get('name'))
1418                     info.update({
1419                         'episode': episode_name,
1420                         'episode_number': int_or_none(e.get('episodeNumber')),
1421                         'description': unescapeHTML(e.get('description')),
1422                     })
1423                     if not info.get('title') and episode_name:
1424                         info['title'] = episode_name
1425                     part_of_season = e.get('partOfSeason')
1426                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1427                         info.update({
1428                             'season': unescapeHTML(part_of_season.get('name')),
1429                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1430                         })
1431                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1432                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1433                         info['series'] = unescapeHTML(part_of_series.get('name'))
1434                 elif item_type == 'Movie':
1435                     info.update({
1436                         'title': unescapeHTML(e.get('name')),
1437                         'description': unescapeHTML(e.get('description')),
1438                         'duration': parse_duration(e.get('duration')),
1439                         'timestamp': unified_timestamp(e.get('dateCreated')),
1440                     })
1441                 elif item_type in ('Article', 'NewsArticle'):
1442                     info.update({
1443                         'timestamp': parse_iso8601(e.get('datePublished')),
1444                         'title': unescapeHTML(e.get('headline')),
1445                         'description': unescapeHTML(e.get('articleBody')),
1446                     })
1447                 elif item_type == 'VideoObject':
1448                     extract_video_object(e)
1449                     if expected_type is None:
1450                         continue
1451                     else:
1452                         break
1453                 video = e.get('video')
1454                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1455                     extract_video_object(video)
1456                 if expected_type is None:
1457                     continue
1458                 else:
1459                     break
1460         return dict((k, v) for k, v in info.items() if v is not None)
1461
1462     @staticmethod
1463     def _hidden_inputs(html):
1464         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1465         hidden_inputs = {}
1466         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1467             attrs = extract_attributes(input)
1468             if not input:
1469                 continue
1470             if attrs.get('type') not in ('hidden', 'submit'):
1471                 continue
1472             name = attrs.get('name') or attrs.get('id')
1473             value = attrs.get('value')
1474             if name and value is not None:
1475                 hidden_inputs[name] = value
1476         return hidden_inputs
1477
1478     def _form_hidden_inputs(self, form_id, html):
1479         form = self._search_regex(
1480             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1481             html, '%s form' % form_id, group='form')
1482         return self._hidden_inputs(form)
1483
1484     class FormatSort:
1485         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1486
1487         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1488                    'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
1489                    'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
1490         ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
1491                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1492                         'fps', 'fs_approx', 'source', 'format_id')
1493
1494         settings = {
1495             'vcodec': {'type': 'ordered', 'regex': True,
1496                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1497             'acodec': {'type': 'ordered', 'regex': True,
1498                        'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1499             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1500                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
1501             'vext': {'type': 'ordered', 'field': 'video_ext',
1502                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1503                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1504             'aext': {'type': 'ordered', 'field': 'audio_ext',
1505                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1506                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1507             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1508             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1509                            'field': ('vcodec', 'acodec'),
1510                            'function': lambda it: int(any(v != 'none' for v in it))},
1511             'ie_pref': {'priority': True, 'type': 'extractor'},
1512             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1513             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1514             'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
1515             'quality': {'convert': 'float_none', 'default': -1},
1516             'filesize': {'convert': 'bytes'},
1517             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1518             'id': {'convert': 'string', 'field': 'format_id'},
1519             'height': {'convert': 'float_none'},
1520             'width': {'convert': 'float_none'},
1521             'fps': {'convert': 'float_none'},
1522             'tbr': {'convert': 'float_none'},
1523             'vbr': {'convert': 'float_none'},
1524             'abr': {'convert': 'float_none'},
1525             'asr': {'convert': 'float_none'},
1526             'source': {'convert': 'ignore', 'field': 'source_preference'},
1527
1528             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1529             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1530             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1531             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1532             'res': {'type': 'multiple', 'field': ('height', 'width'),
1533                     'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1534
1535             # Most of these exist only for compatibility reasons
1536             'dimension': {'type': 'alias', 'field': 'res'},
1537             'resolution': {'type': 'alias', 'field': 'res'},
1538             'extension': {'type': 'alias', 'field': 'ext'},
1539             'bitrate': {'type': 'alias', 'field': 'br'},
1540             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1541             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1542             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1543             'framerate': {'type': 'alias', 'field': 'fps'},
1544             'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
1545             'protocol': {'type': 'alias', 'field': 'proto'},
1546             'source_preference': {'type': 'alias', 'field': 'source'},
1547             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1548             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1549             'samplerate': {'type': 'alias', 'field': 'asr'},
1550             'video_ext': {'type': 'alias', 'field': 'vext'},
1551             'audio_ext': {'type': 'alias', 'field': 'aext'},
1552             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1553             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1554             'video': {'type': 'alias', 'field': 'hasvid'},
1555             'has_video': {'type': 'alias', 'field': 'hasvid'},
1556             'audio': {'type': 'alias', 'field': 'hasaud'},
1557             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1558             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1559             'preference': {'type': 'alias', 'field': 'ie_pref'},
1560             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1561             'format_id': {'type': 'alias', 'field': 'id'},
1562         }
1563
1564         _order = []
1565
1566         def _get_field_setting(self, field, key):
1567             if field not in self.settings:
1568                 self.settings[field] = {}
1569             propObj = self.settings[field]
1570             if key not in propObj:
1571                 type = propObj.get('type')
1572                 if key == 'field':
1573                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1574                 elif key == 'convert':
1575                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1576                 else:
1577                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1578                 propObj[key] = default
1579             return propObj[key]
1580
1581         def _resolve_field_value(self, field, value, convertNone=False):
1582             if value is None:
1583                 if not convertNone:
1584                     return None
1585             else:
1586                 value = value.lower()
1587             conversion = self._get_field_setting(field, 'convert')
1588             if conversion == 'ignore':
1589                 return None
1590             if conversion == 'string':
1591                 return value
1592             elif conversion == 'float_none':
1593                 return float_or_none(value)
1594             elif conversion == 'bytes':
1595                 return FileDownloader.parse_bytes(value)
1596             elif conversion == 'order':
1597                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1598                 use_regex = self._get_field_setting(field, 'regex')
1599                 list_length = len(order_list)
1600                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1601                 if use_regex and value is not None:
1602                     for i, regex in enumerate(order_list):
1603                         if regex and re.match(regex, value):
1604                             return list_length - i
1605                     return list_length - empty_pos  # not in list
1606                 else:  # not regex or  value = None
1607                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1608             else:
1609                 if value.isnumeric():
1610                     return float(value)
1611                 else:
1612                     self.settings[field]['convert'] = 'string'
1613                     return value
1614
1615         def evaluate_params(self, params, sort_extractor):
1616             self._use_free_order = params.get('prefer_free_formats', False)
1617             self._sort_user = params.get('format_sort', [])
1618             self._sort_extractor = sort_extractor
1619
1620             def add_item(field, reverse, closest, limit_text):
1621                 field = field.lower()
1622                 if field in self._order:
1623                     return
1624                 self._order.append(field)
1625                 limit = self._resolve_field_value(field, limit_text)
1626                 data = {
1627                     'reverse': reverse,
1628                     'closest': False if limit is None else closest,
1629                     'limit_text': limit_text,
1630                     'limit': limit}
1631                 if field in self.settings:
1632                     self.settings[field].update(data)
1633                 else:
1634                     self.settings[field] = data
1635
1636             sort_list = (
1637                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1638                 + (tuple() if params.get('format_sort_force', False)
1639                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1640                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1641
1642             for item in sort_list:
1643                 match = re.match(self.regex, item)
1644                 if match is None:
1645                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1646                 field = match.group('field')
1647                 if field is None:
1648                     continue
1649                 if self._get_field_setting(field, 'type') == 'alias':
1650                     field = self._get_field_setting(field, 'field')
1651                 reverse = match.group('reverse') is not None
1652                 closest = match.group('separator') == '~'
1653                 limit_text = match.group('limit')
1654
1655                 has_limit = limit_text is not None
1656                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1657                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1658
1659                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1660                 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1661                 limit_count = len(limits)
1662                 for (i, f) in enumerate(fields):
1663                     add_item(f, reverse, closest,
1664                              limits[i] if i < limit_count
1665                              else limits[0] if has_limit and not has_multiple_limits
1666                              else None)
1667
1668         def print_verbose_info(self, write_debug):
1669             if self._sort_user:
1670                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1671             if self._sort_extractor:
1672                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1673             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1674                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1675                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1676                               self._get_field_setting(field, 'limit_text'),
1677                               self._get_field_setting(field, 'limit'))
1678                 if self._get_field_setting(field, 'limit_text') is not None else '')
1679                 for field in self._order if self._get_field_setting(field, 'visible')]))
1680
1681         def _calculate_field_preference_from_value(self, format, field, type, value):
1682             reverse = self._get_field_setting(field, 'reverse')
1683             closest = self._get_field_setting(field, 'closest')
1684             limit = self._get_field_setting(field, 'limit')
1685
1686             if type == 'extractor':
1687                 maximum = self._get_field_setting(field, 'max')
1688                 if value is None or (maximum is not None and value >= maximum):
1689                     value = -1
1690             elif type == 'boolean':
1691                 in_list = self._get_field_setting(field, 'in_list')
1692                 not_in_list = self._get_field_setting(field, 'not_in_list')
1693                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1694             elif type == 'ordered':
1695                 value = self._resolve_field_value(field, value, True)
1696
1697             # try to convert to number
1698             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1699             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1700             if is_num:
1701                 value = val_num
1702
1703             return ((-10, 0) if value is None
1704                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1705                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1706                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1707                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1708                     else (-1, value, 0))
1709
1710         def _calculate_field_preference(self, format, field):
1711             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1712             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1713             if type == 'multiple':
1714                 type = 'field'  # Only 'field' is allowed in multiple for now
1715                 actual_fields = self._get_field_setting(field, 'field')
1716
1717                 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1718             else:
1719                 value = get_value(field)
1720             return self._calculate_field_preference_from_value(format, field, type, value)
1721
1722         def calculate_preference(self, format):
1723             # Determine missing protocol
1724             if not format.get('protocol'):
1725                 format['protocol'] = determine_protocol(format)
1726
1727             # Determine missing ext
1728             if not format.get('ext') and 'url' in format:
1729                 format['ext'] = determine_ext(format['url'])
1730             if format.get('vcodec') == 'none':
1731                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1732                 format['video_ext'] = 'none'
1733             else:
1734                 format['video_ext'] = format['ext']
1735                 format['audio_ext'] = 'none'
1736             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1737             #    format['preference'] = -1000
1738
1739             # Determine missing bitrates
1740             if format.get('tbr') is None:
1741                 if format.get('vbr') is not None and format.get('abr') is not None:
1742                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1743             else:
1744                 if format.get('vcodec') != "none" and format.get('vbr') is None:
1745                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1746                 if format.get('acodec') != "none" and format.get('abr') is None:
1747                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1748
1749             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1750
1751     def _sort_formats(self, formats, field_preference=[]):
1752         if not formats:
1753             if self.get_param('ignore_no_formats_error'):
1754                 return
1755             raise ExtractorError('No video formats found')
1756         format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
1757         format_sort.evaluate_params(self._downloader.params, field_preference)
1758         if self.get_param('verbose', False):
1759             format_sort.print_verbose_info(self._downloader.write_debug)
1760         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1761
1762     def _check_formats(self, formats, video_id):
1763         if formats:
1764             formats[:] = filter(
1765                 lambda f: self._is_valid_url(
1766                     f['url'], video_id,
1767                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1768                 formats)
1769
1770     @staticmethod
1771     def _remove_duplicate_formats(formats):
1772         format_urls = set()
1773         unique_formats = []
1774         for f in formats:
1775             if f['url'] not in format_urls:
1776                 format_urls.add(f['url'])
1777                 unique_formats.append(f)
1778         formats[:] = unique_formats
1779
1780     def _is_valid_url(self, url, video_id, item='video', headers={}):
1781         url = self._proto_relative_url(url, scheme='http:')
1782         # For now assume non HTTP(S) URLs always valid
1783         if not (url.startswith('http://') or url.startswith('https://')):
1784             return True
1785         try:
1786             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1787             return True
1788         except ExtractorError as e:
1789             self.to_screen(
1790                 '%s: %s URL is invalid, skipping: %s'
1791                 % (video_id, item, error_to_compat_str(e.cause)))
1792             return False
1793
1794     def http_scheme(self):
1795         """ Either "http:" or "https:", depending on the user's preferences """
1796         return (
1797             'http:'
1798             if self.get_param('prefer_insecure', False)
1799             else 'https:')
1800
1801     def _proto_relative_url(self, url, scheme=None):
1802         if url is None:
1803             return url
1804         if url.startswith('//'):
1805             if scheme is None:
1806                 scheme = self.http_scheme()
1807             return scheme + url
1808         else:
1809             return url
1810
1811     def _sleep(self, timeout, video_id, msg_template=None):
1812         if msg_template is None:
1813             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1814         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1815         self.to_screen(msg)
1816         time.sleep(timeout)
1817
1818     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1819                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1820                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1821         manifest = self._download_xml(
1822             manifest_url, video_id, 'Downloading f4m manifest',
1823             'Unable to download f4m manifest',
1824             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1825             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1826             transform_source=transform_source,
1827             fatal=fatal, data=data, headers=headers, query=query)
1828
1829         if manifest is False:
1830             return []
1831
1832         return self._parse_f4m_formats(
1833             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1834             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1835
1836     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1837                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1838                            fatal=True, m3u8_id=None):
1839         if not isinstance(manifest, compat_etree_Element) and not fatal:
1840             return []
1841
1842         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1843         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1844         if akamai_pv is not None and ';' in akamai_pv.text:
1845             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1846             if playerVerificationChallenge.strip() != '':
1847                 return []
1848
1849         formats = []
1850         manifest_version = '1.0'
1851         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1852         if not media_nodes:
1853             manifest_version = '2.0'
1854             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1855         # Remove unsupported DRM protected media from final formats
1856         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1857         media_nodes = remove_encrypted_media(media_nodes)
1858         if not media_nodes:
1859             return formats
1860
1861         manifest_base_url = get_base_url(manifest)
1862
1863         bootstrap_info = xpath_element(
1864             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1865             'bootstrap info', default=None)
1866
1867         vcodec = None
1868         mime_type = xpath_text(
1869             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1870             'base URL', default=None)
1871         if mime_type and mime_type.startswith('audio/'):
1872             vcodec = 'none'
1873
1874         for i, media_el in enumerate(media_nodes):
1875             tbr = int_or_none(media_el.attrib.get('bitrate'))
1876             width = int_or_none(media_el.attrib.get('width'))
1877             height = int_or_none(media_el.attrib.get('height'))
1878             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1879             # If <bootstrapInfo> is present, the specified f4m is a
1880             # stream-level manifest, and only set-level manifests may refer to
1881             # external resources.  See section 11.4 and section 4 of F4M spec
1882             if bootstrap_info is None:
1883                 media_url = None
1884                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1885                 if manifest_version == '2.0':
1886                     media_url = media_el.attrib.get('href')
1887                 if media_url is None:
1888                     media_url = media_el.attrib.get('url')
1889                 if not media_url:
1890                     continue
1891                 manifest_url = (
1892                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1893                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1894                 # If media_url is itself a f4m manifest do the recursive extraction
1895                 # since bitrates in parent manifest (this one) and media_url manifest
1896                 # may differ leading to inability to resolve the format by requested
1897                 # bitrate in f4m downloader
1898                 ext = determine_ext(manifest_url)
1899                 if ext == 'f4m':
1900                     f4m_formats = self._extract_f4m_formats(
1901                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1902                         transform_source=transform_source, fatal=fatal)
1903                     # Sometimes stream-level manifest contains single media entry that
1904                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1905                     # At the same time parent's media entry in set-level manifest may
1906                     # contain it. We will copy it from parent in such cases.
1907                     if len(f4m_formats) == 1:
1908                         f = f4m_formats[0]
1909                         f.update({
1910                             'tbr': f.get('tbr') or tbr,
1911                             'width': f.get('width') or width,
1912                             'height': f.get('height') or height,
1913                             'format_id': f.get('format_id') if not tbr else format_id,
1914                             'vcodec': vcodec,
1915                         })
1916                     formats.extend(f4m_formats)
1917                     continue
1918                 elif ext == 'm3u8':
1919                     formats.extend(self._extract_m3u8_formats(
1920                         manifest_url, video_id, 'mp4', preference=preference,
1921                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1922                     continue
1923             formats.append({
1924                 'format_id': format_id,
1925                 'url': manifest_url,
1926                 'manifest_url': manifest_url,
1927                 'ext': 'flv' if bootstrap_info is not None else None,
1928                 'protocol': 'f4m',
1929                 'tbr': tbr,
1930                 'width': width,
1931                 'height': height,
1932                 'vcodec': vcodec,
1933                 'preference': preference,
1934                 'quality': quality,
1935             })
1936         return formats
1937
1938     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1939         return {
1940             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1941             'url': m3u8_url,
1942             'ext': ext,
1943             'protocol': 'm3u8',
1944             'preference': preference - 100 if preference else -100,
1945             'quality': quality,
1946             'resolution': 'multiple',
1947             'format_note': 'Quality selection URL',
1948         }
1949
1950     def _extract_m3u8_formats(self, *args, **kwargs):
1951         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1952         if subs:
1953             self.report_warning(bug_reports_message(
1954                 "Ignoring subtitle tracks found in the HLS manifest; "
1955                 "if any subtitle tracks are missing,"
1956             ), only_once=True)
1957         return fmts
1958
1959     def _extract_m3u8_formats_and_subtitles(
1960             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1961             preference=None, quality=None, m3u8_id=None, note=None,
1962             errnote=None, fatal=True, live=False, data=None, headers={},
1963             query={}):
1964
1965         res = self._download_webpage_handle(
1966             m3u8_url, video_id,
1967             note='Downloading m3u8 information' if note is None else note,
1968             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1969             fatal=fatal, data=data, headers=headers, query=query)
1970
1971         if res is False:
1972             return [], {}
1973
1974         m3u8_doc, urlh = res
1975         m3u8_url = urlh.geturl()
1976
1977         return self._parse_m3u8_formats_and_subtitles(
1978             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1979             preference=preference, quality=quality, m3u8_id=m3u8_id,
1980             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1981             headers=headers, query=query, video_id=video_id)
1982
1983     def _parse_m3u8_formats_and_subtitles(
1984             self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
1985             preference=None, quality=None, m3u8_id=None, live=False, note=None,
1986             errnote=None, fatal=True, data=None, headers={}, query={},
1987             video_id=None):
1988         formats, subtitles = [], {}
1989
1990         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1991             return formats, subtitles
1992
1993         if (not self.get_param('allow_unplayable_formats')
1994                 and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)):  # Apple FairPlay
1995             return formats, subtitles
1996
1997         def format_url(url):
1998             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
1999
2000         if self.get_param('hls_split_discontinuity', False):
2001             def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2002                 if not m3u8_doc:
2003                     if not manifest_url:
2004                         return []
2005                     m3u8_doc = self._download_webpage(
2006                         manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2007                         note=False, errnote='Failed to download m3u8 playlist information')
2008                     if m3u8_doc is False:
2009                         return []
2010                 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2011
2012         else:
2013             def _extract_m3u8_playlist_indices(*args, **kwargs):
2014                 return [None]
2015
2016         # References:
2017         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2018         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2019         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2020
2021         # We should try extracting formats only from master playlists [1, 4.3.4],
2022         # i.e. playlists that describe available qualities. On the other hand
2023         # media playlists [1, 4.3.3] should be returned as is since they contain
2024         # just the media without qualities renditions.
2025         # Fortunately, master playlist can be easily distinguished from media
2026         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2027         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2028         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2029         # media playlist and MUST NOT appear in master playlist thus we can
2030         # clearly detect media playlist with this criterion.
2031
2032         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2033             formats = [{
2034                 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
2035                 'format_index': idx,
2036                 'url': m3u8_url,
2037                 'ext': ext,
2038                 'protocol': entry_protocol,
2039                 'preference': preference,
2040                 'quality': quality,
2041             } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2042
2043             return formats, subtitles
2044
2045         groups = {}
2046         last_stream_inf = {}
2047
2048         def extract_media(x_media_line):
2049             media = parse_m3u8_attributes(x_media_line)
2050             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2051             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2052             if not (media_type and group_id and name):
2053                 return
2054             groups.setdefault(group_id, []).append(media)
2055             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2056             if media_type == 'SUBTITLES':
2057                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2058                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2059                 # However, lack of URI has been spotted in the wild.
2060                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2061                 if not media.get('URI'):
2062                     return
2063                 url = format_url(media['URI'])
2064                 sub_info = {
2065                     'url': url,
2066                     'ext': determine_ext(url),
2067                 }
2068                 if sub_info['ext'] == 'm3u8':
2069                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2070                     # files may contain is WebVTT:
2071                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2072                     sub_info['ext'] = 'vtt'
2073                     sub_info['protocol'] = 'm3u8_native'
2074                 lang = media.get('LANGUAGE') or 'und'
2075                 subtitles.setdefault(lang, []).append(sub_info)
2076             if media_type not in ('VIDEO', 'AUDIO'):
2077                 return
2078             media_url = media.get('URI')
2079             if media_url:
2080                 manifest_url = format_url(media_url)
2081                 formats.extend({
2082                     'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
2083                     'format_note': name,
2084                     'format_index': idx,
2085                     'url': manifest_url,
2086                     'manifest_url': m3u8_url,
2087                     'language': media.get('LANGUAGE'),
2088                     'ext': ext,
2089                     'protocol': entry_protocol,
2090                     'preference': preference,
2091                     'quality': quality,
2092                     'vcodec': 'none' if media_type == 'AUDIO' else None,
2093                 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2094
2095         def build_stream_name():
2096             # Despite specification does not mention NAME attribute for
2097             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2098             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2099             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2100             stream_name = last_stream_inf.get('NAME')
2101             if stream_name:
2102                 return stream_name
2103             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2104             # from corresponding rendition group
2105             stream_group_id = last_stream_inf.get('VIDEO')
2106             if not stream_group_id:
2107                 return
2108             stream_group = groups.get(stream_group_id)
2109             if not stream_group:
2110                 return stream_group_id
2111             rendition = stream_group[0]
2112             return rendition.get('NAME') or stream_group_id
2113
2114         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2115         # chance to detect video only formats when EXT-X-STREAM-INF tags
2116         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2117         for line in m3u8_doc.splitlines():
2118             if line.startswith('#EXT-X-MEDIA:'):
2119                 extract_media(line)
2120
2121         for line in m3u8_doc.splitlines():
2122             if line.startswith('#EXT-X-STREAM-INF:'):
2123                 last_stream_inf = parse_m3u8_attributes(line)
2124             elif line.startswith('#') or not line.strip():
2125                 continue
2126             else:
2127                 tbr = float_or_none(
2128                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2129                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2130                 manifest_url = format_url(line.strip())
2131
2132                 for idx in _extract_m3u8_playlist_indices(manifest_url):
2133                     format_id = [m3u8_id, None, idx]
2134                     # Bandwidth of live streams may differ over time thus making
2135                     # format_id unpredictable. So it's better to keep provided
2136                     # format_id intact.
2137                     if not live:
2138                         stream_name = build_stream_name()
2139                         format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
2140                     f = {
2141                         'format_id': '-'.join(map(str, filter(None, format_id))),
2142                         'format_index': idx,
2143                         'url': manifest_url,
2144                         'manifest_url': m3u8_url,
2145                         'tbr': tbr,
2146                         'ext': ext,
2147                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2148                         'protocol': entry_protocol,
2149                         'preference': preference,
2150                         'quality': quality,
2151                     }
2152                     resolution = last_stream_inf.get('RESOLUTION')
2153                     if resolution:
2154                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2155                         if mobj:
2156                             f['width'] = int(mobj.group('width'))
2157                             f['height'] = int(mobj.group('height'))
2158                     # Unified Streaming Platform
2159                     mobj = re.search(
2160                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2161                     if mobj:
2162                         abr, vbr = mobj.groups()
2163                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2164                         f.update({
2165                             'vbr': vbr,
2166                             'abr': abr,
2167                         })
2168                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2169                     f.update(codecs)
2170                     audio_group_id = last_stream_inf.get('AUDIO')
2171                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2172                     # references a rendition group MUST have a CODECS attribute.
2173                     # However, this is not always respected, for example, [2]
2174                     # contains EXT-X-STREAM-INF tag which references AUDIO
2175                     # rendition group but does not have CODECS and despite
2176                     # referencing an audio group it represents a complete
2177                     # (with audio and video) format. So, for such cases we will
2178                     # ignore references to rendition groups and treat them
2179                     # as complete formats.
2180                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2181                         audio_group = groups.get(audio_group_id)
2182                         if audio_group and audio_group[0].get('URI'):
2183                             # TODO: update acodec for audio only formats with
2184                             # the same GROUP-ID
2185                             f['acodec'] = 'none'
2186                     if not f.get('ext'):
2187                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2188                     formats.append(f)
2189
2190                     # for DailyMotion
2191                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2192                     if progressive_uri:
2193                         http_f = f.copy()
2194                         del http_f['manifest_url']
2195                         http_f.update({
2196                             'format_id': f['format_id'].replace('hls-', 'http-'),
2197                             'protocol': 'http',
2198                             'url': progressive_uri,
2199                         })
2200                         formats.append(http_f)
2201
2202                 last_stream_inf = {}
2203         return formats, subtitles
2204
2205     @staticmethod
2206     def _xpath_ns(path, namespace=None):
2207         if not namespace:
2208             return path
2209         out = []
2210         for c in path.split('/'):
2211             if not c or c == '.':
2212                 out.append(c)
2213             else:
2214                 out.append('{%s}%s' % (namespace, c))
2215         return '/'.join(out)
2216
2217     def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2218         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2219
2220         if smil is False:
2221             assert not fatal
2222             return []
2223
2224         namespace = self._parse_smil_namespace(smil)
2225
2226         fmts = self._parse_smil_formats(
2227             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2228         subs = self._parse_smil_subtitles(
2229             smil, namespace=namespace)
2230
2231         return fmts, subs
2232
2233     def _extract_smil_formats(self, *args, **kwargs):
2234         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2235         if subs:
2236             self.report_warning(bug_reports_message(
2237                 "Ignoring subtitle tracks found in the SMIL manifest; "
2238                 "if any subtitle tracks are missing,"
2239             ), only_once=True)
2240         return fmts
2241
2242     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2243         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2244         if smil is False:
2245             return {}
2246         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2247
2248     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2249         return self._download_xml(
2250             smil_url, video_id, 'Downloading SMIL file',
2251             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2252
2253     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2254         namespace = self._parse_smil_namespace(smil)
2255
2256         formats = self._parse_smil_formats(
2257             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2258         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2259
2260         video_id = os.path.splitext(url_basename(smil_url))[0]
2261         title = None
2262         description = None
2263         upload_date = None
2264         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2265             name = meta.attrib.get('name')
2266             content = meta.attrib.get('content')
2267             if not name or not content:
2268                 continue
2269             if not title and name == 'title':
2270                 title = content
2271             elif not description and name in ('description', 'abstract'):
2272                 description = content
2273             elif not upload_date and name == 'date':
2274                 upload_date = unified_strdate(content)
2275
2276         thumbnails = [{
2277             'id': image.get('type'),
2278             'url': image.get('src'),
2279             'width': int_or_none(image.get('width')),
2280             'height': int_or_none(image.get('height')),
2281         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2282
2283         return {
2284             'id': video_id,
2285             'title': title or video_id,
2286             'description': description,
2287             'upload_date': upload_date,
2288             'thumbnails': thumbnails,
2289             'formats': formats,
2290             'subtitles': subtitles,
2291         }
2292
2293     def _parse_smil_namespace(self, smil):
2294         return self._search_regex(
2295             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2296
2297     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2298         base = smil_url
2299         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2300             b = meta.get('base') or meta.get('httpBase')
2301             if b:
2302                 base = b
2303                 break
2304
2305         formats = []
2306         rtmp_count = 0
2307         http_count = 0
2308         m3u8_count = 0
2309
2310         srcs = []
2311         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2312         for medium in media:
2313             src = medium.get('src')
2314             if not src or src in srcs:
2315                 continue
2316             srcs.append(src)
2317
2318             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2319             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2320             width = int_or_none(medium.get('width'))
2321             height = int_or_none(medium.get('height'))
2322             proto = medium.get('proto')
2323             ext = medium.get('ext')
2324             src_ext = determine_ext(src)
2325             streamer = medium.get('streamer') or base
2326
2327             if proto == 'rtmp' or streamer.startswith('rtmp'):
2328                 rtmp_count += 1
2329                 formats.append({
2330                     'url': streamer,
2331                     'play_path': src,
2332                     'ext': 'flv',
2333                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2334                     'tbr': bitrate,
2335                     'filesize': filesize,
2336                     'width': width,
2337                     'height': height,
2338                 })
2339                 if transform_rtmp_url:
2340                     streamer, src = transform_rtmp_url(streamer, src)
2341                     formats[-1].update({
2342                         'url': streamer,
2343                         'play_path': src,
2344                     })
2345                 continue
2346
2347             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2348             src_url = src_url.strip()
2349
2350             if proto == 'm3u8' or src_ext == 'm3u8':
2351                 m3u8_formats = self._extract_m3u8_formats(
2352                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2353                 if len(m3u8_formats) == 1:
2354                     m3u8_count += 1
2355                     m3u8_formats[0].update({
2356                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2357                         'tbr': bitrate,
2358                         'width': width,
2359                         'height': height,
2360                     })
2361                 formats.extend(m3u8_formats)
2362             elif src_ext == 'f4m':
2363                 f4m_url = src_url
2364                 if not f4m_params:
2365                     f4m_params = {
2366                         'hdcore': '3.2.0',
2367                         'plugin': 'flowplayer-3.2.0.1',
2368                     }
2369                 f4m_url += '&' if '?' in f4m_url else '?'
2370                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2371                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2372             elif src_ext == 'mpd':
2373                 formats.extend(self._extract_mpd_formats(
2374                     src_url, video_id, mpd_id='dash', fatal=False))
2375             elif re.search(r'\.ism/[Mm]anifest', src_url):
2376                 formats.extend(self._extract_ism_formats(
2377                     src_url, video_id, ism_id='mss', fatal=False))
2378             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2379                 http_count += 1
2380                 formats.append({
2381                     'url': src_url,
2382                     'ext': ext or src_ext or 'flv',
2383                     'format_id': 'http-%d' % (bitrate or http_count),
2384                     'tbr': bitrate,
2385                     'filesize': filesize,
2386                     'width': width,
2387                     'height': height,
2388                 })
2389
2390         return formats
2391
2392     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2393         urls = []
2394         subtitles = {}
2395         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2396             src = textstream.get('src')
2397             if not src or src in urls:
2398                 continue
2399             urls.append(src)
2400             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2401             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2402             subtitles.setdefault(lang, []).append({
2403                 'url': src,
2404                 'ext': ext,
2405             })
2406         return subtitles
2407
2408     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2409         xspf = self._download_xml(
2410             xspf_url, playlist_id, 'Downloading xpsf playlist',
2411             'Unable to download xspf manifest', fatal=fatal)
2412         if xspf is False:
2413             return []
2414         return self._parse_xspf(
2415             xspf, playlist_id, xspf_url=xspf_url,
2416             xspf_base_url=base_url(xspf_url))
2417
2418     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2419         NS_MAP = {
2420             'xspf': 'http://xspf.org/ns/0/',
2421             's1': 'http://static.streamone.nl/player/ns/0',
2422         }
2423
2424         entries = []
2425         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2426             title = xpath_text(
2427                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2428             description = xpath_text(
2429                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2430             thumbnail = xpath_text(
2431                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2432             duration = float_or_none(
2433                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2434
2435             formats = []
2436             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2437                 format_url = urljoin(xspf_base_url, location.text)
2438                 if not format_url:
2439                     continue
2440                 formats.append({
2441                     'url': format_url,
2442                     'manifest_url': xspf_url,
2443                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2444                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2445                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2446                 })
2447             self._sort_formats(formats)
2448
2449             entries.append({
2450                 'id': playlist_id,
2451                 'title': title,
2452                 'description': description,
2453                 'thumbnail': thumbnail,
2454                 'duration': duration,
2455                 'formats': formats,
2456             })
2457         return entries
2458
2459     def _extract_mpd_formats(self, *args, **kwargs):
2460         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2461         if subs:
2462             self.report_warning(bug_reports_message(
2463                 "Ignoring subtitle tracks found in the DASH manifest; "
2464                 "if any subtitle tracks are missing,"
2465             ), only_once=True)
2466         return fmts
2467
2468     def _extract_mpd_formats_and_subtitles(
2469             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2470             fatal=True, data=None, headers={}, query={}):
2471         res = self._download_xml_handle(
2472             mpd_url, video_id,
2473             note='Downloading MPD manifest' if note is None else note,
2474             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2475             fatal=fatal, data=data, headers=headers, query=query)
2476         if res is False:
2477             return [], {}
2478         mpd_doc, urlh = res
2479         if mpd_doc is None:
2480             return [], {}
2481         mpd_base_url = base_url(urlh.geturl())
2482
2483         return self._parse_mpd_formats_and_subtitles(
2484             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2485
2486     def _parse_mpd_formats(self, *args, **kwargs):
2487         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2488         if subs:
2489             self.report_warning(bug_reports_message(
2490                 "Ignoring subtitle tracks found in the DASH manifest; "
2491                 "if any subtitle tracks are missing,"
2492             ), only_once=True)
2493         return fmts
2494
2495     def _parse_mpd_formats_and_subtitles(
2496             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2497         """
2498         Parse formats from MPD manifest.
2499         References:
2500          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2501             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2502          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2503         """
2504         if not self.get_param('dynamic_mpd', True):
2505             if mpd_doc.get('type') == 'dynamic':
2506                 return [], {}
2507
2508         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2509
2510         def _add_ns(path):
2511             return self._xpath_ns(path, namespace)
2512
2513         def is_drm_protected(element):
2514             return element.find(_add_ns('ContentProtection')) is not None
2515
2516         def extract_multisegment_info(element, ms_parent_info):
2517             ms_info = ms_parent_info.copy()
2518
2519             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2520             # common attributes and elements.  We will only extract relevant
2521             # for us.
2522             def extract_common(source):
2523                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2524                 if segment_timeline is not None:
2525                     s_e = segment_timeline.findall(_add_ns('S'))
2526                     if s_e:
2527                         ms_info['total_number'] = 0
2528                         ms_info['s'] = []
2529                         for s in s_e:
2530                             r = int(s.get('r', 0))
2531                             ms_info['total_number'] += 1 + r
2532                             ms_info['s'].append({
2533                                 't': int(s.get('t', 0)),
2534                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2535                                 'd': int(s.attrib['d']),
2536                                 'r': r,
2537                             })
2538                 start_number = source.get('startNumber')
2539                 if start_number:
2540                     ms_info['start_number'] = int(start_number)
2541                 timescale = source.get('timescale')
2542                 if timescale:
2543                     ms_info['timescale'] = int(timescale)
2544                 segment_duration = source.get('duration')
2545                 if segment_duration:
2546                     ms_info['segment_duration'] = float(segment_duration)
2547
2548             def extract_Initialization(source):
2549                 initialization = source.find(_add_ns('Initialization'))
2550                 if initialization is not None:
2551                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2552
2553             segment_list = element.find(_add_ns('SegmentList'))
2554             if segment_list is not None:
2555                 extract_common(segment_list)
2556                 extract_Initialization(segment_list)
2557                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2558                 if segment_urls_e:
2559                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2560             else:
2561                 segment_template = element.find(_add_ns('SegmentTemplate'))
2562                 if segment_template is not None:
2563                     extract_common(segment_template)
2564                     media = segment_template.get('media')
2565                     if media:
2566                         ms_info['media'] = media
2567                     initialization = segment_template.get('initialization')
2568                     if initialization:
2569                         ms_info['initialization'] = initialization
2570                     else:
2571                         extract_Initialization(segment_template)
2572             return ms_info
2573
2574         skip_unplayable = not self.get_param('allow_unplayable_formats')
2575
2576         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2577         formats = []
2578         subtitles = {}
2579         for period in mpd_doc.findall(_add_ns('Period')):
2580             period_duration = parse_duration(period.get('duration')) or mpd_duration
2581             period_ms_info = extract_multisegment_info(period, {
2582                 'start_number': 1,
2583                 'timescale': 1,
2584             })
2585             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2586                 if skip_unplayable and is_drm_protected(adaptation_set):
2587                     continue
2588                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2589                 for representation in adaptation_set.findall(_add_ns('Representation')):
2590                     if skip_unplayable and is_drm_protected(representation):
2591                         continue
2592                     representation_attrib = adaptation_set.attrib.copy()
2593                     representation_attrib.update(representation.attrib)
2594                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2595                     mime_type = representation_attrib['mimeType']
2596                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2597
2598                     codecs = representation_attrib.get('codecs', '')
2599                     if content_type not in ('video', 'audio', 'text'):
2600                         if mime_type == 'image/jpeg':
2601                             content_type = mime_type
2602                         elif codecs.split('.')[0] == 'stpp':
2603                             content_type = 'text'
2604                         else:
2605                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2606                             continue
2607
2608                     base_url = ''
2609                     for element in (representation, adaptation_set, period, mpd_doc):
2610                         base_url_e = element.find(_add_ns('BaseURL'))
2611                         if base_url_e is not None:
2612                             base_url = base_url_e.text + base_url
2613                             if re.match(r'^https?://', base_url):
2614                                 break
2615                     if mpd_base_url and not re.match(r'^https?://', base_url):
2616                         if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2617                             mpd_base_url += '/'
2618                         base_url = mpd_base_url + base_url
2619                     representation_id = representation_attrib.get('id')
2620                     lang = representation_attrib.get('lang')
2621                     url_el = representation.find(_add_ns('BaseURL'))
2622                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2623                     bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2624                     if representation_id is not None:
2625                         format_id = representation_id
2626                     else:
2627                         format_id = content_type
2628                     if mpd_id:
2629                         format_id = mpd_id + '-' + format_id
2630                     if content_type in ('video', 'audio'):
2631                         f = {
2632                             'format_id': format_id,
2633                             'manifest_url': mpd_url,
2634                             'ext': mimetype2ext(mime_type),
2635                             'width': int_or_none(representation_attrib.get('width')),
2636                             'height': int_or_none(representation_attrib.get('height')),
2637                             'tbr': float_or_none(bandwidth, 1000),
2638                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2639                             'fps': int_or_none(representation_attrib.get('frameRate')),
2640                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2641                             'format_note': 'DASH %s' % content_type,
2642                             'filesize': filesize,
2643                             'container': mimetype2ext(mime_type) + '_dash',
2644                         }
2645                         f.update(parse_codecs(codecs))
2646                     elif content_type == 'text':
2647                         f = {
2648                             'ext': mimetype2ext(mime_type),
2649                             'manifest_url': mpd_url,
2650                             'filesize': filesize,
2651                         }
2652                     elif content_type == 'image/jpeg':
2653                         # See test case in VikiIE
2654                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2655                         f = {
2656                             'format_id': format_id,
2657                             'ext': 'mhtml',
2658                             'manifest_url': mpd_url,
2659                             'format_note': 'DASH storyboards (jpeg)',
2660                             'acodec': 'none',
2661                             'vcodec': 'none',
2662                         }
2663                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2664
2665                     def prepare_template(template_name, identifiers):
2666                         tmpl = representation_ms_info[template_name]
2667                         # First of, % characters outside $...$ templates
2668                         # must be escaped by doubling for proper processing
2669                         # by % operator string formatting used further (see
2670                         # https://github.com/ytdl-org/youtube-dl/issues/16867).
2671                         t = ''
2672                         in_template = False
2673                         for c in tmpl:
2674                             t += c
2675                             if c == '$':
2676                                 in_template = not in_template
2677                             elif c == '%' and not in_template:
2678                                 t += c
2679                         # Next, $...$ templates are translated to their
2680                         # %(...) counterparts to be used with % operator
2681                         if representation_id is not None:
2682                             t = t.replace('$RepresentationID$', representation_id)
2683                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2684                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2685                         t.replace('$$', '$')
2686                         return t
2687
2688                     # @initialization is a regular template like @media one
2689                     # so it should be handled just the same way (see
2690                     # https://github.com/ytdl-org/youtube-dl/issues/11605)
2691                     if 'initialization' in representation_ms_info:
2692                         initialization_template = prepare_template(
2693                             'initialization',
2694                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2695                             # $Time$ shall not be included for @initialization thus
2696                             # only $Bandwidth$ remains
2697                             ('Bandwidth', ))
2698                         representation_ms_info['initialization_url'] = initialization_template % {
2699                             'Bandwidth': bandwidth,
2700                         }
2701
2702                     def location_key(location):
2703                         return 'url' if re.match(r'^https?://', location) else 'path'
2704
2705                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2706
2707                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2708                         media_location_key = location_key(media_template)
2709
2710                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2711                         # can't be used at the same time
2712                         if '%(Number' in media_template and 's' not in representation_ms_info:
2713                             segment_duration = None
2714                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2715                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2716                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2717                             representation_ms_info['fragments'] = [{
2718                                 media_location_key: media_template % {
2719                                     'Number': segment_number,
2720                                     'Bandwidth': bandwidth,
2721                                 },
2722                                 'duration': segment_duration,
2723                             } for segment_number in range(
2724                                 representation_ms_info['start_number'],
2725                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2726                         else:
2727                             # $Number*$ or $Time$ in media template with S list available
2728                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2729                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2730                             representation_ms_info['fragments'] = []
2731                             segment_time = 0
2732                             segment_d = None
2733                             segment_number = representation_ms_info['start_number']
2734
2735                             def add_segment_url():
2736                                 segment_url = media_template % {
2737                                     'Time': segment_time,
2738                                     'Bandwidth': bandwidth,
2739                                     'Number': segment_number,
2740                                 }
2741                                 representation_ms_info['fragments'].append({
2742                                     media_location_key: segment_url,
2743                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2744                                 })
2745
2746                             for num, s in enumerate(representation_ms_info['s']):
2747                                 segment_time = s.get('t') or segment_time
2748                                 segment_d = s['d']
2749                                 add_segment_url()
2750                                 segment_number += 1
2751                                 for r in range(s.get('r', 0)):
2752                                     segment_time += segment_d
2753                                     add_segment_url()
2754                                     segment_number += 1
2755                                 segment_time += segment_d
2756                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2757                         # No media template
2758                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2759                         # or any YouTube dashsegments video
2760                         fragments = []
2761                         segment_index = 0
2762                         timescale = representation_ms_info['timescale']
2763                         for s in representation_ms_info['s']:
2764                             duration = float_or_none(s['d'], timescale)
2765                             for r in range(s.get('r', 0) + 1):
2766                                 segment_uri = representation_ms_info['segment_urls'][segment_index]
2767                                 fragments.append({
2768                                     location_key(segment_uri): segment_uri,
2769                                     'duration': duration,
2770                                 })
2771                                 segment_index += 1
2772                         representation_ms_info['fragments'] = fragments
2773                     elif 'segment_urls' in representation_ms_info:
2774                         # Segment URLs with no SegmentTimeline
2775                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2776                         # https://github.com/ytdl-org/youtube-dl/pull/14844
2777                         fragments = []
2778                         segment_duration = float_or_none(
2779                             representation_ms_info['segment_duration'],
2780                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2781                         for segment_url in representation_ms_info['segment_urls']:
2782                             fragment = {
2783                                 location_key(segment_url): segment_url,
2784                             }
2785                             if segment_duration:
2786                                 fragment['duration'] = segment_duration
2787                             fragments.append(fragment)
2788                         representation_ms_info['fragments'] = fragments
2789                     # If there is a fragments key available then we correctly recognized fragmented media.
2790                     # Otherwise we will assume unfragmented media with direct access. Technically, such
2791                     # assumption is not necessarily correct since we may simply have no support for
2792                     # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2793                     if 'fragments' in representation_ms_info:
2794                         f.update({
2795                             # NB: mpd_url may be empty when MPD manifest is parsed from a string
2796                             'url': mpd_url or base_url,
2797                             'fragment_base_url': base_url,
2798                             'fragments': [],
2799                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2800                         })
2801                         if 'initialization_url' in representation_ms_info:
2802                             initialization_url = representation_ms_info['initialization_url']
2803                             if not f.get('url'):
2804                                 f['url'] = initialization_url
2805                             f['fragments'].append({location_key(initialization_url): initialization_url})
2806                         f['fragments'].extend(representation_ms_info['fragments'])
2807                     else:
2808                         # Assuming direct URL to unfragmented media.
2809                         f['url'] = base_url
2810                     if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
2811                         formats.append(f)
2812                     elif content_type == 'text':
2813                         subtitles.setdefault(lang or 'und', []).append(f)
2814
2815         return formats, subtitles
2816
2817     def _extract_ism_formats(self, *args, **kwargs):
2818         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2819         if subs:
2820             self.report_warning(bug_reports_message(
2821                 "Ignoring subtitle tracks found in the ISM manifest; "
2822                 "if any subtitle tracks are missing,"
2823             ))
2824         return fmts
2825
2826     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2827         res = self._download_xml_handle(
2828             ism_url, video_id,
2829             note='Downloading ISM manifest' if note is None else note,
2830             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2831             fatal=fatal, data=data, headers=headers, query=query)
2832         if res is False:
2833             return [], {}
2834         ism_doc, urlh = res
2835         if ism_doc is None:
2836             return [], {}
2837
2838         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2839
2840     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2841         """
2842         Parse formats from ISM manifest.
2843         References:
2844          1. [MS-SSTR]: Smooth Streaming Protocol,
2845             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2846         """
2847         if ism_doc.get('IsLive') == 'TRUE':
2848             return [], {}
2849         if (not self.get_param('allow_unplayable_formats')
2850                 and ism_doc.find('Protection') is not None):
2851             return [], {}
2852
2853         duration = int(ism_doc.attrib['Duration'])
2854         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2855
2856         formats = []
2857         subtitles = {}
2858         for stream in ism_doc.findall('StreamIndex'):
2859             stream_type = stream.get('Type')
2860             if stream_type not in ('video', 'audio', 'text'):
2861                 continue
2862             url_pattern = stream.attrib['Url']
2863             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2864             stream_name = stream.get('Name')
2865             stream_language = stream.get('Language', 'und')
2866             for track in stream.findall('QualityLevel'):
2867                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2868                 # TODO: add support for WVC1 and WMAP
2869                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2870                     self.report_warning('%s is not a supported codec' % fourcc)
2871                     continue
2872                 tbr = int(track.attrib['Bitrate']) // 1000
2873                 # [1] does not mention Width and Height attributes. However,
2874                 # they're often present while MaxWidth and MaxHeight are
2875                 # missing, so should be used as fallbacks
2876                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2877                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2878                 sampling_rate = int_or_none(track.get('SamplingRate'))
2879
2880                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2881                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2882
2883                 fragments = []
2884                 fragment_ctx = {
2885                     'time': 0,
2886                 }
2887                 stream_fragments = stream.findall('c')
2888                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2889                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2890                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2891                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2892                     if not fragment_ctx['duration']:
2893                         try:
2894                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2895                         except IndexError:
2896                             next_fragment_time = duration
2897                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2898                     for _ in range(fragment_repeat):
2899                         fragments.append({
2900                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2901                             'duration': fragment_ctx['duration'] / stream_timescale,
2902                         })
2903                         fragment_ctx['time'] += fragment_ctx['duration']
2904
2905                 format_id = []
2906                 if ism_id:
2907                     format_id.append(ism_id)
2908                 if stream_name:
2909                     format_id.append(stream_name)
2910                 format_id.append(compat_str(tbr))
2911
2912                 if stream_type == 'text':
2913                     subtitles.setdefault(stream_language, []).append({
2914                         'ext': 'ismt',
2915                         'protocol': 'ism',
2916                         'url': ism_url,
2917                         'manifest_url': ism_url,
2918                         'fragments': fragments,
2919                         '_download_params': {
2920                             'stream_type': stream_type,
2921                             'duration': duration,
2922                             'timescale': stream_timescale,
2923                             'fourcc': fourcc,
2924                             'language': stream_language,
2925                             'codec_private_data': track.get('CodecPrivateData'),
2926                         }
2927                     })
2928                 elif stream_type in ('video', 'audio'):
2929                     formats.append({
2930                         'format_id': '-'.join(format_id),
2931                         'url': ism_url,
2932                         'manifest_url': ism_url,
2933                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2934                         'width': width,
2935                         'height': height,
2936                         'tbr': tbr,
2937                         'asr': sampling_rate,
2938                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
2939                         'acodec': 'none' if stream_type == 'video' else fourcc,
2940                         'protocol': 'ism',
2941                         'fragments': fragments,
2942                         '_download_params': {
2943                             'stream_type': stream_type,
2944                             'duration': duration,
2945                             'timescale': stream_timescale,
2946                             'width': width or 0,
2947                             'height': height or 0,
2948                             'fourcc': fourcc,
2949                             'language': stream_language,
2950                             'codec_private_data': track.get('CodecPrivateData'),
2951                             'sampling_rate': sampling_rate,
2952                             'channels': int_or_none(track.get('Channels', 2)),
2953                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2954                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2955                         },
2956                     })
2957         return formats, subtitles
2958
2959     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2960         def absolute_url(item_url):
2961             return urljoin(base_url, item_url)
2962
2963         def parse_content_type(content_type):
2964             if not content_type:
2965                 return {}
2966             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2967             if ctr:
2968                 mimetype, codecs = ctr.groups()
2969                 f = parse_codecs(codecs)
2970                 f['ext'] = mimetype2ext(mimetype)
2971                 return f
2972             return {}
2973
2974         def _media_formats(src, cur_media_type, type_info={}):
2975             full_url = absolute_url(src)
2976             ext = type_info.get('ext') or determine_ext(full_url)
2977             if ext == 'm3u8':
2978                 is_plain_url = False
2979                 formats = self._extract_m3u8_formats(
2980                     full_url, video_id, ext='mp4',
2981                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2982                     preference=preference, quality=quality, fatal=False)
2983             elif ext == 'mpd':
2984                 is_plain_url = False
2985                 formats = self._extract_mpd_formats(
2986                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2987             else:
2988                 is_plain_url = True
2989                 formats = [{
2990                     'url': full_url,
2991                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2992                 }]
2993             return is_plain_url, formats
2994
2995         entries = []
2996         # amp-video and amp-audio are very similar to their HTML5 counterparts
2997         # so we wll include them right here (see
2998         # https://www.ampproject.org/docs/reference/components/amp-video)
2999         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3000         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3001         media_tags = [(media_tag, media_tag_name, media_type, '')
3002                       for media_tag, media_tag_name, media_type
3003                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3004         media_tags.extend(re.findall(
3005             # We only allow video|audio followed by a whitespace or '>'.
3006             # Allowing more characters may end up in significant slow down (see
3007             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3008             # http://www.porntrex.com/maps/videositemap.xml).
3009             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3010         for media_tag, _, media_type, media_content in media_tags:
3011             media_info = {
3012                 'formats': [],
3013                 'subtitles': {},
3014             }
3015             media_attributes = extract_attributes(media_tag)
3016             src = strip_or_none(media_attributes.get('src'))
3017             if src:
3018                 _, formats = _media_formats(src, media_type)
3019                 media_info['formats'].extend(formats)
3020             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3021             if media_content:
3022                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3023                     s_attr = extract_attributes(source_tag)
3024                     # data-video-src and data-src are non standard but seen
3025                     # several times in the wild
3026                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3027                     if not src:
3028                         continue
3029                     f = parse_content_type(s_attr.get('type'))
3030                     is_plain_url, formats = _media_formats(src, media_type, f)
3031                     if is_plain_url:
3032                         # width, height, res, label and title attributes are
3033                         # all not standard but seen several times in the wild
3034                         labels = [
3035                             s_attr.get(lbl)
3036                             for lbl in ('label', 'title')
3037                             if str_or_none(s_attr.get(lbl))
3038                         ]
3039                         width = int_or_none(s_attr.get('width'))
3040                         height = (int_or_none(s_attr.get('height'))
3041                                   or int_or_none(s_attr.get('res')))
3042                         if not width or not height:
3043                             for lbl in labels:
3044                                 resolution = parse_resolution(lbl)
3045                                 if not resolution:
3046                                     continue
3047                                 width = width or resolution.get('width')
3048                                 height = height or resolution.get('height')
3049                         for lbl in labels:
3050                             tbr = parse_bitrate(lbl)
3051                             if tbr:
3052                                 break
3053                         else:
3054                             tbr = None
3055                         f.update({
3056                             'width': width,
3057                             'height': height,
3058                             'tbr': tbr,
3059                             'format_id': s_attr.get('label') or s_attr.get('title'),
3060                         })
3061                         f.update(formats[0])
3062                         media_info['formats'].append(f)
3063                     else:
3064                         media_info['formats'].extend(formats)
3065                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3066                     track_attributes = extract_attributes(track_tag)
3067                     kind = track_attributes.get('kind')
3068                     if not kind or kind in ('subtitles', 'captions'):
3069                         src = strip_or_none(track_attributes.get('src'))
3070                         if not src:
3071                             continue
3072                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3073                         media_info['subtitles'].setdefault(lang, []).append({
3074                             'url': absolute_url(src),
3075                         })
3076             for f in media_info['formats']:
3077                 f.setdefault('http_headers', {})['Referer'] = base_url
3078             if media_info['formats'] or media_info['subtitles']:
3079                 entries.append(media_info)
3080         return entries
3081
3082     def _extract_akamai_formats(self, *args, **kwargs):
3083         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3084         if subs:
3085             self.report_warning(bug_reports_message(
3086                 "Ignoring subtitle tracks found in the manifests; "
3087                 "if any subtitle tracks are missing,"
3088             ))
3089         return fmts
3090
3091     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3092         signed = 'hdnea=' in manifest_url
3093         if not signed:
3094             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3095             manifest_url = re.sub(
3096                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3097                 '', manifest_url).strip('?')
3098
3099         formats = []
3100         subtitles = {}
3101
3102         hdcore_sign = 'hdcore=3.7.0'
3103         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3104         hds_host = hosts.get('hds')
3105         if hds_host:
3106             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3107         if 'hdcore=' not in f4m_url:
3108             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3109         f4m_formats = self._extract_f4m_formats(
3110             f4m_url, video_id, f4m_id='hds', fatal=False)
3111         for entry in f4m_formats:
3112             entry.update({'extra_param_to_segment_url': hdcore_sign})
3113         formats.extend(f4m_formats)
3114
3115         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3116         hls_host = hosts.get('hls')
3117         if hls_host:
3118             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3119         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3120             m3u8_url, video_id, 'mp4', 'm3u8_native',
3121             m3u8_id='hls', fatal=False)
3122         formats.extend(m3u8_formats)
3123         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3124
3125         http_host = hosts.get('http')
3126         if http_host and m3u8_formats and not signed:
3127             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3128             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3129             qualities_length = len(qualities)
3130             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3131                 i = 0
3132                 for f in m3u8_formats:
3133                     if f['vcodec'] != 'none':
3134                         for protocol in ('http', 'https'):
3135                             http_f = f.copy()
3136                             del http_f['manifest_url']
3137                             http_url = re.sub(
3138                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3139                             http_f.update({
3140                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3141                                 'url': http_url,
3142                                 'protocol': protocol,
3143                             })
3144                             formats.append(http_f)
3145                         i += 1
3146
3147         return formats, subtitles
3148
3149     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3150         query = compat_urlparse.urlparse(url).query
3151         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3152         mobj = re.search(
3153             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3154         url_base = mobj.group('url')
3155         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3156         formats = []
3157
3158         def manifest_url(manifest):
3159             m_url = '%s/%s' % (http_base_url, manifest)
3160             if query:
3161                 m_url += '?%s' % query
3162             return m_url
3163
3164         if 'm3u8' not in skip_protocols:
3165             formats.extend(self._extract_m3u8_formats(
3166                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3167                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3168         if 'f4m' not in skip_protocols:
3169             formats.extend(self._extract_f4m_formats(
3170                 manifest_url('manifest.f4m'),
3171                 video_id, f4m_id='hds', fatal=False))
3172         if 'dash' not in skip_protocols:
3173             formats.extend(self._extract_mpd_formats(
3174                 manifest_url('manifest.mpd'),
3175                 video_id, mpd_id='dash', fatal=False))
3176         if re.search(r'(?:/smil:|\.smil)', url_base):
3177             if 'smil' not in skip_protocols:
3178                 rtmp_formats = self._extract_smil_formats(
3179                     manifest_url('jwplayer.smil'),
3180                     video_id, fatal=False)
3181                 for rtmp_format in rtmp_formats:
3182                     rtsp_format = rtmp_format.copy()
3183                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3184                     del rtsp_format['play_path']
3185                     del rtsp_format['ext']
3186                     rtsp_format.update({
3187                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3188                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3189                         'protocol': 'rtsp',
3190                     })
3191                     formats.extend([rtmp_format, rtsp_format])
3192         else:
3193             for protocol in ('rtmp', 'rtsp'):
3194                 if protocol not in skip_protocols:
3195                     formats.append({
3196                         'url': '%s:%s' % (protocol, url_base),
3197                         'format_id': protocol,
3198                         'protocol': protocol,
3199                     })
3200         return formats
3201
3202     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3203         mobj = re.search(
3204             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3205             webpage)
3206         if mobj:
3207             try:
3208                 jwplayer_data = self._parse_json(mobj.group('options'),
3209                                                  video_id=video_id,
3210                                                  transform_source=transform_source)
3211             except ExtractorError:
3212                 pass
3213             else:
3214                 if isinstance(jwplayer_data, dict):
3215                     return jwplayer_data
3216
3217     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3218         jwplayer_data = self._find_jwplayer_data(
3219             webpage, video_id, transform_source=js_to_json)
3220         return self._parse_jwplayer_data(
3221             jwplayer_data, video_id, *args, **kwargs)
3222
3223     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3224                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3225         # JWPlayer backward compatibility: flattened playlists
3226         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3227         if 'playlist' not in jwplayer_data:
3228             jwplayer_data = {'playlist': [jwplayer_data]}
3229
3230         entries = []
3231
3232         # JWPlayer backward compatibility: single playlist item
3233         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3234         if not isinstance(jwplayer_data['playlist'], list):
3235             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3236
3237         for video_data in jwplayer_data['playlist']:
3238             # JWPlayer backward compatibility: flattened sources
3239             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3240             if 'sources' not in video_data:
3241                 video_data['sources'] = [video_data]
3242
3243             this_video_id = video_id or video_data['mediaid']
3244
3245             formats = self._parse_jwplayer_formats(
3246                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3247                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3248
3249             subtitles = {}
3250             tracks = video_data.get('tracks')
3251             if tracks and isinstance(tracks, list):
3252                 for track in tracks:
3253                     if not isinstance(track, dict):
3254                         continue
3255                     track_kind = track.get('kind')
3256                     if not track_kind or not isinstance(track_kind, compat_str):
3257                         continue
3258                     if track_kind.lower() not in ('captions', 'subtitles'):
3259                         continue
3260                     track_url = urljoin(base_url, track.get('file'))
3261                     if not track_url:
3262                         continue
3263                     subtitles.setdefault(track.get('label') or 'en', []).append({
3264                         'url': self._proto_relative_url(track_url)
3265                     })
3266
3267             entry = {
3268                 'id': this_video_id,
3269                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3270                 'description': clean_html(video_data.get('description')),
3271                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3272                 'timestamp': int_or_none(video_data.get('pubdate')),
3273                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3274                 'subtitles': subtitles,
3275             }
3276             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3277             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3278                 entry.update({
3279                     '_type': 'url_transparent',
3280                     'url': formats[0]['url'],
3281                 })
3282             else:
3283                 self._sort_formats(formats)
3284                 entry['formats'] = formats
3285             entries.append(entry)
3286         if len(entries) == 1:
3287             return entries[0]
3288         else:
3289             return self.playlist_result(entries)
3290
3291     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3292                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3293         urls = []
3294         formats = []
3295         for source in jwplayer_sources_data:
3296             if not isinstance(source, dict):
3297                 continue
3298             source_url = urljoin(
3299                 base_url, self._proto_relative_url(source.get('file')))
3300             if not source_url or source_url in urls:
3301                 continue
3302             urls.append(source_url)
3303             source_type = source.get('type') or ''
3304             ext = mimetype2ext(source_type) or determine_ext(source_url)
3305             if source_type == 'hls' or ext == 'm3u8':
3306                 formats.extend(self._extract_m3u8_formats(
3307                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3308                     m3u8_id=m3u8_id, fatal=False))
3309             elif source_type == 'dash' or ext == 'mpd':
3310                 formats.extend(self._extract_mpd_formats(
3311                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3312             elif ext == 'smil':
3313                 formats.extend(self._extract_smil_formats(
3314                     source_url, video_id, fatal=False))
3315             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3316             elif source_type.startswith('audio') or ext in (
3317                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3318                 formats.append({
3319                     'url': source_url,
3320                     'vcodec': 'none',
3321                     'ext': ext,
3322                 })
3323             else:
3324                 height = int_or_none(source.get('height'))
3325                 if height is None:
3326                     # Often no height is provided but there is a label in
3327                     # format like "1080p", "720p SD", or 1080.
3328                     height = int_or_none(self._search_regex(
3329                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3330                         'height', default=None))
3331                 a_format = {
3332                     'url': source_url,
3333                     'width': int_or_none(source.get('width')),
3334                     'height': height,
3335                     'tbr': int_or_none(source.get('bitrate')),
3336                     'ext': ext,
3337                 }
3338                 if source_url.startswith('rtmp'):
3339                     a_format['ext'] = 'flv'
3340                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3341                     # of jwplayer.flash.swf
3342                     rtmp_url_parts = re.split(
3343                         r'((?:mp4|mp3|flv):)', source_url, 1)
3344                     if len(rtmp_url_parts) == 3:
3345                         rtmp_url, prefix, play_path = rtmp_url_parts
3346                         a_format.update({
3347                             'url': rtmp_url,
3348                             'play_path': prefix + play_path,
3349                         })
3350                     if rtmp_params:
3351                         a_format.update(rtmp_params)
3352                 formats.append(a_format)
3353         return formats
3354
3355     def _live_title(self, name):
3356         """ Generate the title for a live video """
3357         now = datetime.datetime.now()
3358         now_str = now.strftime('%Y-%m-%d %H:%M')
3359         return name + ' ' + now_str
3360
3361     def _int(self, v, name, fatal=False, **kwargs):
3362         res = int_or_none(v, **kwargs)
3363         if 'get_attr' in kwargs:
3364             print(getattr(v, kwargs['get_attr']))
3365         if res is None:
3366             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3367             if fatal:
3368                 raise ExtractorError(msg)
3369             else:
3370                 self.report_warning(msg)
3371         return res
3372
3373     def _float(self, v, name, fatal=False, **kwargs):
3374         res = float_or_none(v, **kwargs)
3375         if res is None:
3376             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3377             if fatal:
3378                 raise ExtractorError(msg)
3379             else:
3380                 self.report_warning(msg)
3381         return res
3382
3383     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3384                     path='/', secure=False, discard=False, rest={}, **kwargs):
3385         cookie = compat_cookiejar_Cookie(
3386             0, name, value, port, port is not None, domain, True,
3387             domain.startswith('.'), path, True, secure, expire_time,
3388             discard, None, None, rest)
3389         self._downloader.cookiejar.set_cookie(cookie)
3390
3391     def _get_cookies(self, url):
3392         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3393         req = sanitized_Request(url)
3394         self._downloader.cookiejar.add_cookie_header(req)
3395         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3396
3397     def _apply_first_set_cookie_header(self, url_handle, cookie):
3398         """
3399         Apply first Set-Cookie header instead of the last. Experimental.
3400
3401         Some sites (e.g. [1-3]) may serve two cookies under the same name
3402         in Set-Cookie header and expect the first (old) one to be set rather
3403         than second (new). However, as of RFC6265 the newer one cookie
3404         should be set into cookie store what actually happens.
3405         We will workaround this issue by resetting the cookie to
3406         the first one manually.
3407         1. https://new.vk.com/
3408         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3409         3. https://learning.oreilly.com/
3410         """
3411         for header, cookies in url_handle.headers.items():
3412             if header.lower() != 'set-cookie':
3413                 continue
3414             if sys.version_info[0] >= 3:
3415                 cookies = cookies.encode('iso-8859-1')
3416             cookies = cookies.decode('utf-8')
3417             cookie_value = re.search(
3418                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3419             if cookie_value:
3420                 value, domain = cookie_value.groups()
3421                 self._set_cookie(domain, cookie, value)
3422                 break
3423
3424     def get_testcases(self, include_onlymatching=False):
3425         t = getattr(self, '_TEST', None)
3426         if t:
3427             assert not hasattr(self, '_TESTS'), \
3428                 '%s has _TEST and _TESTS' % type(self).__name__
3429             tests = [t]
3430         else:
3431             tests = getattr(self, '_TESTS', [])
3432         for t in tests:
3433             if not include_onlymatching and t.get('only_matching', False):
3434                 continue
3435             t['name'] = type(self).__name__[:-len('IE')]
3436             yield t
3437
3438     def is_suitable(self, age_limit):
3439         """ Test whether the extractor is generally suitable for the given
3440         age limit (i.e. pornographic sites are not, all others usually are) """
3441
3442         any_restricted = False
3443         for tc in self.get_testcases(include_onlymatching=False):
3444             if tc.get('playlist', []):
3445                 tc = tc['playlist'][0]
3446             is_restricted = age_restricted(
3447                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3448             if not is_restricted:
3449                 return True
3450             any_restricted = any_restricted or is_restricted
3451         return not any_restricted
3452
3453     def extract_subtitles(self, *args, **kwargs):
3454         if (self.get_param('writesubtitles', False)
3455                 or self.get_param('listsubtitles')):
3456             return self._get_subtitles(*args, **kwargs)
3457         return {}
3458
3459     def _get_subtitles(self, *args, **kwargs):
3460         raise NotImplementedError('This method must be implemented by subclasses')
3461
3462     @staticmethod
3463     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3464         """ Merge subtitle items for one language. Items with duplicated URLs
3465         will be dropped. """
3466         list1_urls = set([item['url'] for item in subtitle_list1])
3467         ret = list(subtitle_list1)
3468         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3469         return ret
3470
3471     @classmethod
3472     def _merge_subtitles(cls, *dicts, target=None):
3473         """ Merge subtitle dictionaries, language by language. """
3474         if target is None:
3475             target = {}
3476         for d in dicts:
3477             for lang, subs in d.items():
3478                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3479         return target
3480
3481     def extract_automatic_captions(self, *args, **kwargs):
3482         if (self.get_param('writeautomaticsub', False)
3483                 or self.get_param('listsubtitles')):
3484             return self._get_automatic_captions(*args, **kwargs)
3485         return {}
3486
3487     def _get_automatic_captions(self, *args, **kwargs):
3488         raise NotImplementedError('This method must be implemented by subclasses')
3489
3490     def mark_watched(self, *args, **kwargs):
3491         if (self.get_param('mark_watched', False)
3492                 and (self._get_login_info()[0] is not None
3493                      or self.get_param('cookiefile') is not None)):
3494             self._mark_watched(*args, **kwargs)
3495
3496     def _mark_watched(self, *args, **kwargs):
3497         raise NotImplementedError('This method must be implemented by subclasses')
3498
3499     def geo_verification_headers(self):
3500         headers = {}
3501         geo_verification_proxy = self.get_param('geo_verification_proxy')
3502         if geo_verification_proxy:
3503             headers['Ytdl-request-proxy'] = geo_verification_proxy
3504         return headers
3505
3506     def _generic_id(self, url):
3507         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3508
3509     def _generic_title(self, url):
3510         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3511
3512     @staticmethod
3513     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3514         all_known = all(map(
3515             lambda x: x is not None,
3516             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3517         return (
3518             'private' if is_private
3519             else 'premium_only' if needs_premium
3520             else 'subscriber_only' if needs_subscription
3521             else 'needs_auth' if needs_auth
3522             else 'unlisted' if is_unlisted
3523             else 'public' if all_known
3524             else None)
3525
3526     def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3527         '''
3528         @returns            A list of values for the extractor argument given by "key"
3529                             or "default" if no such key is present
3530         @param default      The default value to return when the key is not present (default: [])
3531         @param casesense    When false, the values are converted to lower case
3532         '''
3533         val = traverse_obj(
3534             self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3535         if val is None:
3536             return [] if default is NO_DEFAULT else default
3537         return list(val) if casesense else [x.lower() for x in val]
3538
3539
3540 class SearchInfoExtractor(InfoExtractor):
3541     """
3542     Base class for paged search queries extractors.
3543     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3544     Instances should define _SEARCH_KEY and _MAX_RESULTS.
3545     """
3546
3547     @classmethod
3548     def _make_valid_url(cls):
3549         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3550
3551     @classmethod
3552     def suitable(cls, url):
3553         return re.match(cls._make_valid_url(), url) is not None
3554
3555     def _real_extract(self, query):
3556         mobj = re.match(self._make_valid_url(), query)
3557         if mobj is None:
3558             raise ExtractorError('Invalid search query "%s"' % query)
3559
3560         prefix = mobj.group('prefix')
3561         query = mobj.group('query')
3562         if prefix == '':
3563             return self._get_n_results(query, 1)
3564         elif prefix == 'all':
3565             return self._get_n_results(query, self._MAX_RESULTS)
3566         else:
3567             n = int(prefix)
3568             if n <= 0:
3569                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3570             elif n > self._MAX_RESULTS:
3571                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3572                 n = self._MAX_RESULTS
3573             return self._get_n_results(query, n)
3574
3575     def _get_n_results(self, query, n):
3576         """Get a specified number of results for a query"""
3577         raise NotImplementedError('This method must be implemented by subclasses')
3578
3579     @property
3580     def SEARCH_KEY(self):
3581         return self._SEARCH_KEY