yt_dlp/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import sys
  13 import time
  14 import math
  15
  16 from ..compat import (
  17     compat_cookiejar_Cookie,
  18     compat_cookies_SimpleCookie,
  19     compat_etree_Element,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_integer_types,
  23     compat_http_client,
  24     compat_os_name,
  25     compat_str,
  26     compat_urllib_error,
  27     compat_urllib_parse_unquote,
  28     compat_urllib_parse_urlencode,
  29     compat_urllib_request,
  30     compat_urlparse,
  31     compat_xml_parse_error,
  32 )
  33 from ..downloader import FileDownloader
  34 from ..downloader.f4m import (
  35     get_base_url,
  36     remove_encrypted_media,
  37 )
  38 from ..utils import (
  39     NO_DEFAULT,
  40     age_restricted,
  41     base_url,
  42     bug_reports_message,
  43     clean_html,
  44     compiled_regex_type,
  45     determine_ext,
  46     determine_protocol,
  47     dict_get,
  48     error_to_compat_str,
  49     ExtractorError,
  50     extract_attributes,
  51     fix_xml_ampersands,
  52     float_or_none,
  53     GeoRestrictedError,
  54     GeoUtils,
  55     int_or_none,
  56     js_to_json,
  57     JSON_LD_RE,
  58     mimetype2ext,
  59     network_exceptions,
  60     orderedSet,
  61     parse_bitrate,
  62     parse_codecs,
  63     parse_duration,
  64     parse_iso8601,
  65     parse_m3u8_attributes,
  66     parse_resolution,
  67     RegexNotFoundError,
  68     sanitized_Request,
  69     sanitize_filename,
  70     str_or_none,
  71     str_to_int,
  72     strip_or_none,
  73     traverse_obj,
  74     unescapeHTML,
  75     unified_strdate,
  76     unified_timestamp,
  77     update_Request,
  78     update_url_query,
  79     urljoin,
  80     url_basename,
  81     url_or_none,
  82     xpath_element,
  83     xpath_text,
  84     xpath_with_ns,
  85 )
  86
  87
  88 class InfoExtractor(object):
  89     """Information Extractor class.
  90
  91     Information extractors are the classes that, given a URL, extract
  92     information about the video (or videos) the URL refers to. This
  93     information includes the real video URL, the video title, author and
  94     others. The information is stored in a dictionary which is then
  95     passed to the YoutubeDL. The YoutubeDL processes this
  96     information possibly downloading the video to the file system, among
  97     other possible outcomes.
  98
  99     The type field determines the type of the result.
 100     By far the most common value (and the default if _type is missing) is
 101     "video", which indicates a single video.
 102
 103     For a video, the dictionaries must include the following fields:
 104
 105     id:             Video identifier.
 106     title:          Video title, unescaped.
 107
 108     Additionally, it must contain either a formats entry or a url one:
 109
 110     formats:        A list of dictionaries for each format available, ordered
 111                     from worst to best quality.
 112
 113                     Potential fields:
 114                     * url        The mandatory URL representing the media:
 115                                    for plain file media - HTTP URL of this file,
 116                                    for RTMP - RTMP URL,
 117                                    for HLS - URL of the M3U8 media playlist,
 118                                    for HDS - URL of the F4M manifest,
 119                                    for DASH
 120                                      - HTTP URL to plain file media (in case of
 121                                        unfragmented media)
 122                                      - URL of the MPD manifest or base URL
 123                                        representing the media if MPD manifest
 124                                        is parsed from a string (in case of
 125                                        fragmented media)
 126                                    for MSS - URL of the ISM manifest.
 127                     * manifest_url
 128                                  The URL of the manifest file in case of
 129                                  fragmented media:
 130                                    for HLS - URL of the M3U8 master playlist,
 131                                    for HDS - URL of the F4M manifest,
 132                                    for DASH - URL of the MPD manifest,
 133                                    for MSS - URL of the ISM manifest.
 134                     * ext        Will be calculated from URL if missing
 135                     * format     A human-readable description of the format
 136                                  ("mp4 container with h264/opus").
 137                                  Calculated from the format_id, width, height.
 138                                  and format_note fields if missing.
 139                     * format_id  A short description of the format
 140                                  ("mp4_h264_opus" or "19").
 141                                 Technically optional, but strongly recommended.
 142                     * format_note Additional info about the format
 143                                  ("3D" or "DASH video")
 144                     * width      Width of the video, if known
 145                     * height     Height of the video, if known
 146                     * resolution Textual description of width and height
 147                     * tbr        Average bitrate of audio and video in KBit/s
 148                     * abr        Average audio bitrate in KBit/s
 149                     * acodec     Name of the audio codec in use
 150                     * asr        Audio sampling rate in Hertz
 151                     * vbr        Average video bitrate in KBit/s
 152                     * fps        Frame rate
 153                     * vcodec     Name of the video codec in use
 154                     * container  Name of the container format
 155                     * filesize   The number of bytes, if known in advance
 156                     * filesize_approx  An estimate for the number of bytes
 157                     * player_url SWF Player URL (used for rtmpdump).
 158                     * protocol   The protocol that will be used for the actual
 159                                  download, lower-case.
 160                                  "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
 161                                  "m3u8", "m3u8_native" or "http_dash_segments".
 162                     * fragment_base_url
 163                                  Base URL for fragments. Each fragment's path
 164                                  value (if present) will be relative to
 165                                  this URL.
 166                     * fragments  A list of fragments of a fragmented media.
 167                                  Each fragment entry must contain either an url
 168                                  or a path. If an url is present it should be
 169                                  considered by a client. Otherwise both path and
 170                                  fragment_base_url must be present. Here is
 171                                  the list of all potential fields:
 172                                  * "url" - fragment's URL
 173                                  * "path" - fragment's path relative to
 174                                             fragment_base_url
 175                                  * "duration" (optional, int or float)
 176                                  * "filesize" (optional, int)
 177                     * preference Order number of this format. If this field is
 178                                  present and not None, the formats get sorted
 179                                  by this field, regardless of all other values.
 180                                  -1 for default (order by other properties),
 181                                  -2 or smaller for less than default.
 182                                  < -1000 to hide the format (if there is
 183                                     another one which is strictly better)
 184                     * language   Language code, e.g. "de" or "en-US".
 185                     * language_preference  Is this in the language mentioned in
 186                                  the URL?
 187                                  10 if it's what the URL is about,
 188                                  -1 for default (don't know),
 189                                  -10 otherwise, other values reserved for now.
 190                     * quality    Order number of the video quality of this
 191                                  format, irrespective of the file format.
 192                                  -1 for default (order by other properties),
 193                                  -2 or smaller for less than default.
 194                     * source_preference  Order number for this video source
 195                                   (quality takes higher priority)
 196                                  -1 for default (order by other properties),
 197                                  -2 or smaller for less than default.
 198                     * http_headers  A dictionary of additional HTTP headers
 199                                  to add to the request.
 200                     * stretched_ratio  If given and not 1, indicates that the
 201                                  video's pixels are not square.
 202                                  width : height ratio as float.
 203                     * no_resume  The server does not support resuming the
 204                                  (HTTP or RTMP) download. Boolean.
 205                     * downloader_options  A dictionary of downloader options as
 206                                  described in FileDownloader
 207                     RTMP formats can also have the additional fields: page_url,
 208                     app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 209                     rtmp_protocol, rtmp_real_time
 210
 211     url:            Final video URL.
 212     ext:            Video filename extension.
 213     format:         The video format, defaults to ext (used for --get-format)
 214     player_url:     SWF Player URL (used for rtmpdump).
 215
 216     The following fields are optional:
 217
 218     alt_title:      A secondary title of the video.
 219     display_id      An alternative identifier for the video, not necessarily
 220                     unique, but available before title. Typically, id is
 221                     something like "4234987", title "Dancing naked mole rats",
 222                     and display_id "dancing-naked-mole-rats"
 223     thumbnails:     A list of dictionaries, with the following entries:
 224                         * "id" (optional, string) - Thumbnail format ID
 225                         * "url"
 226                         * "preference" (optional, int) - quality of the image
 227                         * "width" (optional, int)
 228                         * "height" (optional, int)
 229                         * "resolution" (optional, string "{width}x{height}",
 230                                         deprecated)
 231                         * "filesize" (optional, int)
 232     thumbnail:      Full URL to a video thumbnail image.
 233     description:    Full video description.
 234     uploader:       Full name of the video uploader.
 235     license:        License name the video is licensed under.
 236     creator:        The creator of the video.
 237     release_timestamp: UNIX timestamp of the moment the video was released.
 238     release_date:   The date (YYYYMMDD) when the video was released.
 239     timestamp:      UNIX timestamp of the moment the video was uploaded
 240     upload_date:    Video upload date (YYYYMMDD).
 241                     If not explicitly set, calculated from timestamp.
 242     uploader_id:    Nickname or id of the video uploader.
 243     uploader_url:   Full URL to a personal webpage of the video uploader.
 244     channel:        Full name of the channel the video is uploaded on.
 245                     Note that channel fields may or may not repeat uploader
 246                     fields. This depends on a particular extractor.
 247     channel_id:     Id of the channel.
 248     channel_url:    Full URL to a channel webpage.
 249     location:       Physical location where the video was filmed.
 250     subtitles:      The available subtitles as a dictionary in the format
 251                     {tag: subformats}. "tag" is usually a language code, and
 252                     "subformats" is a list sorted from lower to higher
 253                     preference, each element is a dictionary with the "ext"
 254                     entry and one of:
 255                         * "data": The subtitles file contents
 256                         * "url": A URL pointing to the subtitles file
 257                     It can optionally also have:
 258                         * "name": Name or description of the subtitles
 259                     "ext" will be calculated from URL if missing
 260     automatic_captions: Like 'subtitles'; contains automatically generated
 261                     captions instead of normal subtitles
 262     duration:       Length of the video in seconds, as an integer or float.
 263     view_count:     How many users have watched the video on the platform.
 264     like_count:     Number of positive ratings of the video
 265     dislike_count:  Number of negative ratings of the video
 266     repost_count:   Number of reposts of the video
 267     average_rating: Average rating give by users, the scale used depends on the webpage
 268     comment_count:  Number of comments on the video
 269     comments:       A list of comments, each with one or more of the following
 270                     properties (all but one of text or html optional):
 271                         * "author" - human-readable name of the comment author
 272                         * "author_id" - user ID of the comment author
 273                         * "author_thumbnail" - The thumbnail of the comment author
 274                         * "id" - Comment ID
 275                         * "html" - Comment as HTML
 276                         * "text" - Plain text of the comment
 277                         * "timestamp" - UNIX timestamp of comment
 278                         * "parent" - ID of the comment this one is replying to.
 279                                      Set to "root" to indicate that this is a
 280                                      comment to the original video.
 281                         * "like_count" - Number of positive ratings of the comment
 282                         * "dislike_count" - Number of negative ratings of the comment
 283                         * "is_favorited" - Whether the comment is marked as
 284                                            favorite by the video uploader
 285                         * "author_is_uploader" - Whether the comment is made by
 286                                                  the video uploader
 287     age_limit:      Age restriction for the video, as an integer (years)
 288     webpage_url:    The URL to the video webpage, if given to yt-dlp it
 289                     should allow to get the same result again. (It will be set
 290                     by YoutubeDL if it's missing)
 291     categories:     A list of categories that the video falls in, for example
 292                     ["Sports", "Berlin"]
 293     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 294     cast:           A list of the video cast
 295     is_live:        True, False, or None (=unknown). Whether this video is a
 296                     live stream that goes on instead of a fixed-length video.
 297     was_live:       True, False, or None (=unknown). Whether this video was
 298                     originally a live stream.
 299     start_time:     Time in seconds where the reproduction should start, as
 300                     specified in the URL.
 301     end_time:       Time in seconds where the reproduction should end, as
 302                     specified in the URL.
 303     chapters:       A list of dictionaries, with the following entries:
 304                         * "start_time" - The start time of the chapter in seconds
 305                         * "end_time" - The end time of the chapter in seconds
 306                         * "title" (optional, string)
 307     playable_in_embed: Whether this video is allowed to play in embedded
 308                     players on other sites. Can be True (=always allowed),
 309                     False (=never allowed), None (=unknown), or a string
 310                     specifying the criteria for embedability (Eg: 'whitelist')
 311     availability:   Under what condition the video is available. One of
 312                     'private', 'premium_only', 'subscriber_only', 'needs_auth',
 313                     'unlisted' or 'public'. Use 'InfoExtractor._availability'
 314                     to set it
 315     __post_extractor: A function to be called just before the metadata is
 316                     written to either disk, logger or console. The function
 317                     must return a dict which will be added to the info_dict.
 318                     This is usefull for additional information that is
 319                     time-consuming to extract. Note that the fields thus
 320                     extracted will not be available to output template and
 321                     match_filter. So, only "comments" and "comment_count" are
 322                     currently allowed to be extracted via this method.
 323
 324     The following fields should only be used when the video belongs to some logical
 325     chapter or section:
 326
 327     chapter:        Name or title of the chapter the video belongs to.
 328     chapter_number: Number of the chapter the video belongs to, as an integer.
 329     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 330
 331     The following fields should only be used when the video is an episode of some
 332     series, programme or podcast:
 333
 334     series:         Title of the series or programme the video episode belongs to.
 335     season:         Title of the season the video episode belongs to.
 336     season_number:  Number of the season the video episode belongs to, as an integer.
 337     season_id:      Id of the season the video episode belongs to, as a unicode string.
 338     episode:        Title of the video episode. Unlike mandatory video title field,
 339                     this field should denote the exact title of the video episode
 340                     without any kind of decoration.
 341     episode_number: Number of the video episode within a season, as an integer.
 342     episode_id:     Id of the video episode, as a unicode string.
 343
 344     The following fields should only be used when the media is a track or a part of
 345     a music album:
 346
 347     track:          Title of the track.
 348     track_number:   Number of the track within an album or a disc, as an integer.
 349     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 350                     as a unicode string.
 351     artist:         Artist(s) of the track.
 352     genre:          Genre(s) of the track.
 353     album:          Title of the album the track belongs to.
 354     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 355     album_artist:   List of all artists appeared on the album (e.g.
 356                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 357                     and compilations).
 358     disc_number:    Number of the disc or other physical medium the track belongs to,
 359                     as an integer.
 360     release_year:   Year (YYYY) when the album was released.
 361
 362     Unless mentioned otherwise, the fields should be Unicode strings.
 363
 364     Unless mentioned otherwise, None is equivalent to absence of information.
 365
 366
 367     _type "playlist" indicates multiple videos.
 368     There must be a key "entries", which is a list, an iterable, or a PagedList
 369     object, each element of which is a valid dictionary by this specification.
 370
 371     Additionally, playlists can have "id", "title", and any other relevent
 372     attributes with the same semantics as videos (see above).
 373
 374
 375     _type "multi_video" indicates that there are multiple videos that
 376     form a single show, for examples multiple acts of an opera or TV episode.
 377     It must have an entries key like a playlist and contain all the keys
 378     required for a video at the same time.
 379
 380
 381     _type "url" indicates that the video must be extracted from another
 382     location, possibly by a different extractor. Its only required key is:
 383     "url" - the next URL to extract.
 384     The key "ie_key" can be set to the class name (minus the trailing "IE",
 385     e.g. "Youtube") if the extractor class is known in advance.
 386     Additionally, the dictionary may have any properties of the resolved entity
 387     known in advance, for example "title" if the title of the referred video is
 388     known ahead of time.
 389
 390
 391     _type "url_transparent" entities have the same specification as "url", but
 392     indicate that the given additional information is more precise than the one
 393     associated with the resolved URL.
 394     This is useful when a site employs a video service that hosts the video and
 395     its technical metadata, but that video service does not embed a useful
 396     title, description etc.
 397
 398
 399     Subclasses of this one should re-define the _real_initialize() and
 400     _real_extract() methods and define a _VALID_URL regexp.
 401     Probably, they should also be added to the list of extractors.
 402
 403     _GEO_BYPASS attribute may be set to False in order to disable
 404     geo restriction bypass mechanisms for a particular extractor.
 405     Though it won't disable explicit geo restriction bypass based on
 406     country code provided with geo_bypass_country.
 407
 408     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 409     countries for this extractor. One of these countries will be used by
 410     geo restriction bypass mechanism right away in order to bypass
 411     geo restriction, of course, if the mechanism is not disabled.
 412
 413     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 414     IP blocks in CIDR notation for this extractor. One of these IP blocks
 415     will be used by geo restriction bypass mechanism similarly
 416     to _GEO_COUNTRIES.
 417
 418     Finally, the _WORKING attribute should be set to False for broken IEs
 419     in order to warn the users and skip the tests.
 420     """
 421
 422     _ready = False
 423     _downloader = None
 424     _x_forwarded_for_ip = None
 425     _GEO_BYPASS = True
 426     _GEO_COUNTRIES = None
 427     _GEO_IP_BLOCKS = None
 428     _WORKING = True
 429
 430     _LOGIN_HINTS = {
 431         'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
 432         'cookies': (
 433             'Use --cookies for the authentication. '
 434             'See  https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl  for how to pass cookies'),
 435         'password': 'Use --username and --password or --netrc to provide account credentials',
 436     }
 437
 438     def __init__(self, downloader=None):
 439         """Constructor. Receives an optional downloader."""
 440         self._ready = False
 441         self._x_forwarded_for_ip = None
 442         self.set_downloader(downloader)
 443
 444     @classmethod
 445     def suitable(cls, url):
 446         """Receives a URL and returns True if suitable for this IE."""
 447
 448         # This does not use has/getattr intentionally - we want to know whether
 449         # we have cached the regexp for *this* class, whereas getattr would also
 450         # match the superclass
 451         if '_VALID_URL_RE' not in cls.__dict__:
 452             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 453         return cls._VALID_URL_RE.match(url) is not None
 454
 455     @classmethod
 456     def _match_id(cls, url):
 457         if '_VALID_URL_RE' not in cls.__dict__:
 458             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 459         m = cls._VALID_URL_RE.match(url)
 460         assert m
 461         return compat_str(m.group('id'))
 462
 463     @classmethod
 464     def working(cls):
 465         """Getter method for _WORKING."""
 466         return cls._WORKING
 467
 468     def initialize(self):
 469         """Initializes an instance (authentication, etc)."""
 470         self._initialize_geo_bypass({
 471             'countries': self._GEO_COUNTRIES,
 472             'ip_blocks': self._GEO_IP_BLOCKS,
 473         })
 474         if not self._ready:
 475             self._real_initialize()
 476             self._ready = True
 477
 478     def _initialize_geo_bypass(self, geo_bypass_context):
 479         """
 480         Initialize geo restriction bypass mechanism.
 481
 482         This method is used to initialize geo bypass mechanism based on faking
 483         X-Forwarded-For HTTP header. A random country from provided country list
 484         is selected and a random IP belonging to this country is generated. This
 485         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 486         HTTP requests.
 487
 488         This method will be used for initial geo bypass mechanism initialization
 489         during the instance initialization with _GEO_COUNTRIES and
 490         _GEO_IP_BLOCKS.
 491
 492         You may also manually call it from extractor's code if geo bypass
 493         information is not available beforehand (e.g. obtained during
 494         extraction) or due to some other reason. In this case you should pass
 495         this information in geo bypass context passed as first argument. It may
 496         contain following fields:
 497
 498         countries:  List of geo unrestricted countries (similar
 499                     to _GEO_COUNTRIES)
 500         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 501                     (similar to _GEO_IP_BLOCKS)
 502
 503         """
 504         if not self._x_forwarded_for_ip:
 505
 506             # Geo bypass mechanism is explicitly disabled by user
 507             if not self.get_param('geo_bypass', True):
 508                 return
 509
 510             if not geo_bypass_context:
 511                 geo_bypass_context = {}
 512
 513             # Backward compatibility: previously _initialize_geo_bypass
 514             # expected a list of countries, some 3rd party code may still use
 515             # it this way
 516             if isinstance(geo_bypass_context, (list, tuple)):
 517                 geo_bypass_context = {
 518                     'countries': geo_bypass_context,
 519                 }
 520
 521             # The whole point of geo bypass mechanism is to fake IP
 522             # as X-Forwarded-For HTTP header based on some IP block or
 523             # country code.
 524
 525             # Path 1: bypassing based on IP block in CIDR notation
 526
 527             # Explicit IP block specified by user, use it right away
 528             # regardless of whether extractor is geo bypassable or not
 529             ip_block = self.get_param('geo_bypass_ip_block', None)
 530
 531             # Otherwise use random IP block from geo bypass context but only
 532             # if extractor is known as geo bypassable
 533             if not ip_block:
 534                 ip_blocks = geo_bypass_context.get('ip_blocks')
 535                 if self._GEO_BYPASS and ip_blocks:
 536                     ip_block = random.choice(ip_blocks)
 537
 538             if ip_block:
 539                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 540                 self._downloader.write_debug(
 541                     '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
 542                 return
 543
 544             # Path 2: bypassing based on country code
 545
 546             # Explicit country code specified by user, use it right away
 547             # regardless of whether extractor is geo bypassable or not
 548             country = self.get_param('geo_bypass_country', None)
 549
 550             # Otherwise use random country code from geo bypass context but
 551             # only if extractor is known as geo bypassable
 552             if not country:
 553                 countries = geo_bypass_context.get('countries')
 554                 if self._GEO_BYPASS and countries:
 555                     country = random.choice(countries)
 556
 557             if country:
 558                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 559                 self._downloader.write_debug(
 560                     'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
 561
 562     def extract(self, url):
 563         """Extracts URL information and returns it in list of dicts."""
 564         try:
 565             for _ in range(2):
 566                 try:
 567                     self.initialize()
 568                     self.write_debug('Extracting URL: %s' % url)
 569                     ie_result = self._real_extract(url)
 570                     if ie_result is None:
 571                         return None
 572                     if self._x_forwarded_for_ip:
 573                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 574                     subtitles = ie_result.get('subtitles')
 575                     if (subtitles and 'live_chat' in subtitles
 576                             and 'no-live-chat' in self.get_param('compat_opts', [])):
 577                         del subtitles['live_chat']
 578                     return ie_result
 579                 except GeoRestrictedError as e:
 580                     if self.__maybe_fake_ip_and_retry(e.countries):
 581                         continue
 582                     raise
 583         except ExtractorError:
 584             raise
 585         except compat_http_client.IncompleteRead as e:
 586             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 587         except (KeyError, StopIteration) as e:
 588             raise ExtractorError('An extractor error has occurred.', cause=e)
 589
 590     def __maybe_fake_ip_and_retry(self, countries):
 591         if (not self.get_param('geo_bypass_country', None)
 592                 and self._GEO_BYPASS
 593                 and self.get_param('geo_bypass', True)
 594                 and not self._x_forwarded_for_ip
 595                 and countries):
 596             country_code = random.choice(countries)
 597             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 598             if self._x_forwarded_for_ip:
 599                 self.report_warning(
 600                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 601                     % (self._x_forwarded_for_ip, country_code.upper()))
 602                 return True
 603         return False
 604
 605     def set_downloader(self, downloader):
 606         """Sets the downloader for this IE."""
 607         self._downloader = downloader
 608
 609     def _real_initialize(self):
 610         """Real initialization process. Redefine in subclasses."""
 611         pass
 612
 613     def _real_extract(self, url):
 614         """Real extraction process. Redefine in subclasses."""
 615         pass
 616
 617     @classmethod
 618     def ie_key(cls):
 619         """A string for getting the InfoExtractor with get_info_extractor"""
 620         return compat_str(cls.__name__[:-2])
 621
 622     @property
 623     def IE_NAME(self):
 624         return compat_str(type(self).__name__[:-2])
 625
 626     @staticmethod
 627     def __can_accept_status_code(err, expected_status):
 628         assert isinstance(err, compat_urllib_error.HTTPError)
 629         if expected_status is None:
 630             return False
 631         if isinstance(expected_status, compat_integer_types):
 632             return err.code == expected_status
 633         elif isinstance(expected_status, (list, tuple)):
 634             return err.code in expected_status
 635         elif callable(expected_status):
 636             return expected_status(err.code) is True
 637         else:
 638             assert False
 639
 640     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 641         """
 642         Return the response handle.
 643
 644         See _download_webpage docstring for arguments specification.
 645         """
 646         if not self._downloader._first_webpage_request:
 647             sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
 648             if sleep_interval > 0:
 649                 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
 650                 time.sleep(sleep_interval)
 651         else:
 652             self._downloader._first_webpage_request = False
 653
 654         if note is None:
 655             self.report_download_webpage(video_id)
 656         elif note is not False:
 657             if video_id is None:
 658                 self.to_screen('%s' % (note,))
 659             else:
 660                 self.to_screen('%s: %s' % (video_id, note))
 661
 662         # Some sites check X-Forwarded-For HTTP header in order to figure out
 663         # the origin of the client behind proxy. This allows bypassing geo
 664         # restriction by faking this header's value to IP that belongs to some
 665         # geo unrestricted country. We will do so once we encounter any
 666         # geo restriction error.
 667         if self._x_forwarded_for_ip:
 668             if 'X-Forwarded-For' not in headers:
 669                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 670
 671         if isinstance(url_or_request, compat_urllib_request.Request):
 672             url_or_request = update_Request(
 673                 url_or_request, data=data, headers=headers, query=query)
 674         else:
 675             if query:
 676                 url_or_request = update_url_query(url_or_request, query)
 677             if data is not None or headers:
 678                 url_or_request = sanitized_Request(url_or_request, data, headers)
 679         try:
 680             return self._downloader.urlopen(url_or_request)
 681         except network_exceptions as err:
 682             if isinstance(err, compat_urllib_error.HTTPError):
 683                 if self.__can_accept_status_code(err, expected_status):
 684                     # Retain reference to error to prevent file object from
 685                     # being closed before it can be read. Works around the
 686                     # effects of <https://bugs.python.org/issue15002>
 687                     # introduced in Python 3.4.1.
 688                     err.fp._error = err
 689                     return err.fp
 690
 691             if errnote is False:
 692                 return False
 693             if errnote is None:
 694                 errnote = 'Unable to download webpage'
 695
 696             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 697             if fatal:
 698                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 699             else:
 700                 self.report_warning(errmsg)
 701                 return False
 702
 703     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 704         """
 705         Return a tuple (page content as string, URL handle).
 706
 707         See _download_webpage docstring for arguments specification.
 708         """
 709         # Strip hashes from the URL (#1038)
 710         if isinstance(url_or_request, (compat_str, str)):
 711             url_or_request = url_or_request.partition('#')[0]
 712
 713         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 714         if urlh is False:
 715             assert not fatal
 716             return False
 717         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 718         return (content, urlh)
 719
 720     @staticmethod
 721     def _guess_encoding_from_content(content_type, webpage_bytes):
 722         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 723         if m:
 724             encoding = m.group(1)
 725         else:
 726             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 727                           webpage_bytes[:1024])
 728             if m:
 729                 encoding = m.group(1).decode('ascii')
 730             elif webpage_bytes.startswith(b'\xff\xfe'):
 731                 encoding = 'utf-16'
 732             else:
 733                 encoding = 'utf-8'
 734
 735         return encoding
 736
 737     def __check_blocked(self, content):
 738         first_block = content[:512]
 739         if ('<title>Access to this site is blocked</title>' in content
 740                 and 'Websense' in first_block):
 741             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 742             blocked_iframe = self._html_search_regex(
 743                 r'<iframe src="([^"]+)"', content,
 744                 'Websense information URL', default=None)
 745             if blocked_iframe:
 746                 msg += ' Visit %s for more details' % blocked_iframe
 747             raise ExtractorError(msg, expected=True)
 748         if '<title>The URL you requested has been blocked</title>' in first_block:
 749             msg = (
 750                 'Access to this webpage has been blocked by Indian censorship. '
 751                 'Use a VPN or proxy server (with --proxy) to route around it.')
 752             block_msg = self._html_search_regex(
 753                 r'</h1><p>(.*?)</p>',
 754                 content, 'block message', default=None)
 755             if block_msg:
 756                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 757             raise ExtractorError(msg, expected=True)
 758         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 759                 and 'blocklist.rkn.gov.ru' in content):
 760             raise ExtractorError(
 761                 'Access to this webpage has been blocked by decision of the Russian government. '
 762                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 763                 expected=True)
 764
 765     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 766         content_type = urlh.headers.get('Content-Type', '')
 767         webpage_bytes = urlh.read()
 768         if prefix is not None:
 769             webpage_bytes = prefix + webpage_bytes
 770         if not encoding:
 771             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 772         if self.get_param('dump_intermediate_pages', False):
 773             self.to_screen('Dumping request to ' + urlh.geturl())
 774             dump = base64.b64encode(webpage_bytes).decode('ascii')
 775             self._downloader.to_screen(dump)
 776         if self.get_param('write_pages', False):
 777             basen = '%s_%s' % (video_id, urlh.geturl())
 778             if len(basen) > 240:
 779                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 780                 basen = basen[:240 - len(h)] + h
 781             raw_filename = basen + '.dump'
 782             filename = sanitize_filename(raw_filename, restricted=True)
 783             self.to_screen('Saving request to ' + filename)
 784             # Working around MAX_PATH limitation on Windows (see
 785             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 786             if compat_os_name == 'nt':
 787                 absfilepath = os.path.abspath(filename)
 788                 if len(absfilepath) > 259:
 789                     filename = '\\\\?\\' + absfilepath
 790             with open(filename, 'wb') as outf:
 791                 outf.write(webpage_bytes)
 792
 793         try:
 794             content = webpage_bytes.decode(encoding, 'replace')
 795         except LookupError:
 796             content = webpage_bytes.decode('utf-8', 'replace')
 797
 798         self.__check_blocked(content)
 799
 800         return content
 801
 802     def _download_webpage(
 803             self, url_or_request, video_id, note=None, errnote=None,
 804             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 805             headers={}, query={}, expected_status=None):
 806         """
 807         Return the data of the page as a string.
 808
 809         Arguments:
 810         url_or_request -- plain text URL as a string or
 811             a compat_urllib_request.Requestobject
 812         video_id -- Video/playlist/item identifier (string)
 813
 814         Keyword arguments:
 815         note -- note printed before downloading (string)
 816         errnote -- note printed in case of an error (string)
 817         fatal -- flag denoting whether error should be considered fatal,
 818             i.e. whether it should cause ExtractionError to be raised,
 819             otherwise a warning will be reported and extraction continued
 820         tries -- number of tries
 821         timeout -- sleep interval between tries
 822         encoding -- encoding for a page content decoding, guessed automatically
 823             when not explicitly specified
 824         data -- POST data (bytes)
 825         headers -- HTTP headers (dict)
 826         query -- URL query (dict)
 827         expected_status -- allows to accept failed HTTP requests (non 2xx
 828             status code) by explicitly specifying a set of accepted status
 829             codes. Can be any of the following entities:
 830                 - an integer type specifying an exact failed status code to
 831                   accept
 832                 - a list or a tuple of integer types specifying a list of
 833                   failed status codes to accept
 834                 - a callable accepting an actual failed status code and
 835                   returning True if it should be accepted
 836             Note that this argument does not affect success status codes (2xx)
 837             which are always accepted.
 838         """
 839
 840         success = False
 841         try_count = 0
 842         while success is False:
 843             try:
 844                 res = self._download_webpage_handle(
 845                     url_or_request, video_id, note, errnote, fatal,
 846                     encoding=encoding, data=data, headers=headers, query=query,
 847                     expected_status=expected_status)
 848                 success = True
 849             except compat_http_client.IncompleteRead as e:
 850                 try_count += 1
 851                 if try_count >= tries:
 852                     raise e
 853                 self._sleep(timeout, video_id)
 854         if res is False:
 855             return res
 856         else:
 857             content, _ = res
 858             return content
 859
 860     def _download_xml_handle(
 861             self, url_or_request, video_id, note='Downloading XML',
 862             errnote='Unable to download XML', transform_source=None,
 863             fatal=True, encoding=None, data=None, headers={}, query={},
 864             expected_status=None):
 865         """
 866         Return a tuple (xml as an compat_etree_Element, URL handle).
 867
 868         See _download_webpage docstring for arguments specification.
 869         """
 870         res = self._download_webpage_handle(
 871             url_or_request, video_id, note, errnote, fatal=fatal,
 872             encoding=encoding, data=data, headers=headers, query=query,
 873             expected_status=expected_status)
 874         if res is False:
 875             return res
 876         xml_string, urlh = res
 877         return self._parse_xml(
 878             xml_string, video_id, transform_source=transform_source,
 879             fatal=fatal), urlh
 880
 881     def _download_xml(
 882             self, url_or_request, video_id,
 883             note='Downloading XML', errnote='Unable to download XML',
 884             transform_source=None, fatal=True, encoding=None,
 885             data=None, headers={}, query={}, expected_status=None):
 886         """
 887         Return the xml as an compat_etree_Element.
 888
 889         See _download_webpage docstring for arguments specification.
 890         """
 891         res = self._download_xml_handle(
 892             url_or_request, video_id, note=note, errnote=errnote,
 893             transform_source=transform_source, fatal=fatal, encoding=encoding,
 894             data=data, headers=headers, query=query,
 895             expected_status=expected_status)
 896         return res if res is False else res[0]
 897
 898     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 899         if transform_source:
 900             xml_string = transform_source(xml_string)
 901         try:
 902             return compat_etree_fromstring(xml_string.encode('utf-8'))
 903         except compat_xml_parse_error as ve:
 904             errmsg = '%s: Failed to parse XML ' % video_id
 905             if fatal:
 906                 raise ExtractorError(errmsg, cause=ve)
 907             else:
 908                 self.report_warning(errmsg + str(ve))
 909
 910     def _download_json_handle(
 911             self, url_or_request, video_id, note='Downloading JSON metadata',
 912             errnote='Unable to download JSON metadata', transform_source=None,
 913             fatal=True, encoding=None, data=None, headers={}, query={},
 914             expected_status=None):
 915         """
 916         Return a tuple (JSON object, URL handle).
 917
 918         See _download_webpage docstring for arguments specification.
 919         """
 920         res = self._download_webpage_handle(
 921             url_or_request, video_id, note, errnote, fatal=fatal,
 922             encoding=encoding, data=data, headers=headers, query=query,
 923             expected_status=expected_status)
 924         if res is False:
 925             return res
 926         json_string, urlh = res
 927         return self._parse_json(
 928             json_string, video_id, transform_source=transform_source,
 929             fatal=fatal), urlh
 930
 931     def _download_json(
 932             self, url_or_request, video_id, note='Downloading JSON metadata',
 933             errnote='Unable to download JSON metadata', transform_source=None,
 934             fatal=True, encoding=None, data=None, headers={}, query={},
 935             expected_status=None):
 936         """
 937         Return the JSON object as a dict.
 938
 939         See _download_webpage docstring for arguments specification.
 940         """
 941         res = self._download_json_handle(
 942             url_or_request, video_id, note=note, errnote=errnote,
 943             transform_source=transform_source, fatal=fatal, encoding=encoding,
 944             data=data, headers=headers, query=query,
 945             expected_status=expected_status)
 946         return res if res is False else res[0]
 947
 948     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 949         if transform_source:
 950             json_string = transform_source(json_string)
 951         try:
 952             return json.loads(json_string)
 953         except ValueError as ve:
 954             errmsg = '%s: Failed to parse JSON ' % video_id
 955             if fatal:
 956                 raise ExtractorError(errmsg, cause=ve)
 957             else:
 958                 self.report_warning(errmsg + str(ve))
 959
 960     def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
 961         return self._parse_json(
 962             data[data.find('{'):data.rfind('}') + 1],
 963             video_id, transform_source, fatal)
 964
 965     def _download_socket_json_handle(
 966             self, url_or_request, video_id, note='Polling socket',
 967             errnote='Unable to poll socket', transform_source=None,
 968             fatal=True, encoding=None, data=None, headers={}, query={},
 969             expected_status=None):
 970         """
 971         Return a tuple (JSON object, URL handle).
 972
 973         See _download_webpage docstring for arguments specification.
 974         """
 975         res = self._download_webpage_handle(
 976             url_or_request, video_id, note, errnote, fatal=fatal,
 977             encoding=encoding, data=data, headers=headers, query=query,
 978             expected_status=expected_status)
 979         if res is False:
 980             return res
 981         webpage, urlh = res
 982         return self._parse_socket_response_as_json(
 983             webpage, video_id, transform_source=transform_source,
 984             fatal=fatal), urlh
 985
 986     def _download_socket_json(
 987             self, url_or_request, video_id, note='Polling socket',
 988             errnote='Unable to poll socket', transform_source=None,
 989             fatal=True, encoding=None, data=None, headers={}, query={},
 990             expected_status=None):
 991         """
 992         Return the JSON object as a dict.
 993
 994         See _download_webpage docstring for arguments specification.
 995         """
 996         res = self._download_socket_json_handle(
 997             url_or_request, video_id, note=note, errnote=errnote,
 998             transform_source=transform_source, fatal=fatal, encoding=encoding,
 999             data=data, headers=headers, query=query,
1000             expected_status=expected_status)
1001         return res if res is False else res[0]
1002
1003     def report_warning(self, msg, video_id=None, *args, **kwargs):
1004         idstr = '' if video_id is None else '%s: ' % video_id
1005         self._downloader.report_warning(
1006             '[%s] %s%s' % (self.IE_NAME, idstr, msg), *args, **kwargs)
1007
1008     def to_screen(self, msg, *args, **kwargs):
1009         """Print msg to screen, prefixing it with '[ie_name]'"""
1010         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1011
1012     def write_debug(self, msg, *args, **kwargs):
1013         self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1014
1015     def get_param(self, name, default=None, *args, **kwargs):
1016         if self._downloader:
1017             return self._downloader.params.get(name, default, *args, **kwargs)
1018         return default
1019
1020     def report_extraction(self, id_or_name):
1021         """Report information extraction."""
1022         self.to_screen('%s: Extracting information' % id_or_name)
1023
1024     def report_download_webpage(self, video_id):
1025         """Report webpage download."""
1026         self.to_screen('%s: Downloading webpage' % video_id)
1027
1028     def report_age_confirmation(self):
1029         """Report attempt to confirm age."""
1030         self.to_screen('Confirming age')
1031
1032     def report_login(self):
1033         """Report attempt to log in."""
1034         self.to_screen('Logging in')
1035
1036     def raise_login_required(
1037             self, msg='This video is only available for registered users',
1038             metadata_available=False, method='any'):
1039         if metadata_available and self.get_param('ignore_no_formats_error'):
1040             self.report_warning(msg)
1041         raise ExtractorError('%s. %s' % (msg, self._LOGIN_HINTS[method]), expected=True)
1042
1043     def raise_geo_restricted(
1044             self, msg='This video is not available from your location due to geo restriction',
1045             countries=None, metadata_available=False):
1046         if metadata_available and self.get_param('ignore_no_formats_error'):
1047             self.report_warning(msg)
1048         else:
1049             raise GeoRestrictedError(msg, countries=countries)
1050
1051     def raise_no_formats(self, msg, expected=False, video_id=None):
1052         if expected and self.get_param('ignore_no_formats_error'):
1053             self.report_warning(msg, video_id)
1054         else:
1055             raise ExtractorError(msg, expected=expected, video_id=video_id)
1056
1057     # Methods for following #608
1058     @staticmethod
1059     def url_result(url, ie=None, video_id=None, video_title=None):
1060         """Returns a URL that points to a page that should be processed"""
1061         # TODO: ie should be the class used for getting the info
1062         video_info = {'_type': 'url',
1063                       'url': url,
1064                       'ie_key': ie}
1065         if video_id is not None:
1066             video_info['id'] = video_id
1067         if video_title is not None:
1068             video_info['title'] = video_title
1069         return video_info
1070
1071     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1072         urls = orderedSet(
1073             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1074             for m in matches)
1075         return self.playlist_result(
1076             urls, playlist_id=playlist_id, playlist_title=playlist_title)
1077
1078     @staticmethod
1079     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1080         """Returns a playlist"""
1081         video_info = {'_type': 'playlist',
1082                       'entries': entries}
1083         video_info.update(kwargs)
1084         if playlist_id:
1085             video_info['id'] = playlist_id
1086         if playlist_title:
1087             video_info['title'] = playlist_title
1088         if playlist_description is not None:
1089             video_info['description'] = playlist_description
1090         return video_info
1091
1092     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1093         """
1094         Perform a regex search on the given string, using a single or a list of
1095         patterns returning the first matching group.
1096         In case of failure return a default value or raise a WARNING or a
1097         RegexNotFoundError, depending on fatal, specifying the field name.
1098         """
1099         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1100             mobj = re.search(pattern, string, flags)
1101         else:
1102             for p in pattern:
1103                 mobj = re.search(p, string, flags)
1104                 if mobj:
1105                     break
1106
1107         if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
1108             _name = '\033[0;34m%s\033[0m' % name
1109         else:
1110             _name = name
1111
1112         if mobj:
1113             if group is None:
1114                 # return the first matching group
1115                 return next(g for g in mobj.groups() if g is not None)
1116             else:
1117                 return mobj.group(group)
1118         elif default is not NO_DEFAULT:
1119             return default
1120         elif fatal:
1121             raise RegexNotFoundError('Unable to extract %s' % _name)
1122         else:
1123             self.report_warning('unable to extract %s' % _name + bug_reports_message())
1124             return None
1125
1126     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1127         """
1128         Like _search_regex, but strips HTML tags and unescapes entities.
1129         """
1130         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1131         if res:
1132             return clean_html(res).strip()
1133         else:
1134             return res
1135
1136     def _get_netrc_login_info(self, netrc_machine=None):
1137         username = None
1138         password = None
1139         netrc_machine = netrc_machine or self._NETRC_MACHINE
1140
1141         if self.get_param('usenetrc', False):
1142             try:
1143                 info = netrc.netrc().authenticators(netrc_machine)
1144                 if info is not None:
1145                     username = info[0]
1146                     password = info[2]
1147                 else:
1148                     raise netrc.NetrcParseError(
1149                         'No authenticators for %s' % netrc_machine)
1150             except (IOError, netrc.NetrcParseError) as err:
1151                 self.report_warning(
1152                     'parsing .netrc: %s' % error_to_compat_str(err))
1153
1154         return username, password
1155
1156     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1157         """
1158         Get the login info as (username, password)
1159         First look for the manually specified credentials using username_option
1160         and password_option as keys in params dictionary. If no such credentials
1161         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1162         value.
1163         If there's no info available, return (None, None)
1164         """
1165
1166         # Attempt to use provided username and password or .netrc data
1167         username = self.get_param(username_option)
1168         if username is not None:
1169             password = self.get_param(password_option)
1170         else:
1171             username, password = self._get_netrc_login_info(netrc_machine)
1172
1173         return username, password
1174
1175     def _get_tfa_info(self, note='two-factor verification code'):
1176         """
1177         Get the two-factor authentication info
1178         TODO - asking the user will be required for sms/phone verify
1179         currently just uses the command line option
1180         If there's no info available, return None
1181         """
1182
1183         tfa = self.get_param('twofactor')
1184         if tfa is not None:
1185             return tfa
1186
1187         return compat_getpass('Type %s and press [Return]: ' % note)
1188
1189     # Helper functions for extracting OpenGraph info
1190     @staticmethod
1191     def _og_regexes(prop):
1192         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1193         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1194                        % {'prop': re.escape(prop)})
1195         template = r'<meta[^>]+?%s[^>]+?%s'
1196         return [
1197             template % (property_re, content_re),
1198             template % (content_re, property_re),
1199         ]
1200
1201     @staticmethod
1202     def _meta_regex(prop):
1203         return r'''(?isx)<meta
1204                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1205                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1206
1207     def _og_search_property(self, prop, html, name=None, **kargs):
1208         if not isinstance(prop, (list, tuple)):
1209             prop = [prop]
1210         if name is None:
1211             name = 'OpenGraph %s' % prop[0]
1212         og_regexes = []
1213         for p in prop:
1214             og_regexes.extend(self._og_regexes(p))
1215         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1216         if escaped is None:
1217             return None
1218         return unescapeHTML(escaped)
1219
1220     def _og_search_thumbnail(self, html, **kargs):
1221         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1222
1223     def _og_search_description(self, html, **kargs):
1224         return self._og_search_property('description', html, fatal=False, **kargs)
1225
1226     def _og_search_title(self, html, **kargs):
1227         return self._og_search_property('title', html, **kargs)
1228
1229     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1230         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1231         if secure:
1232             regexes = self._og_regexes('video:secure_url') + regexes
1233         return self._html_search_regex(regexes, html, name, **kargs)
1234
1235     def _og_search_url(self, html, **kargs):
1236         return self._og_search_property('url', html, **kargs)
1237
1238     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1239         if not isinstance(name, (list, tuple)):
1240             name = [name]
1241         if display_name is None:
1242             display_name = name[0]
1243         return self._html_search_regex(
1244             [self._meta_regex(n) for n in name],
1245             html, display_name, fatal=fatal, group='content', **kwargs)
1246
1247     def _dc_search_uploader(self, html):
1248         return self._html_search_meta('dc.creator', html, 'uploader')
1249
1250     def _rta_search(self, html):
1251         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1252         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1253                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1254                      html):
1255             return 18
1256         return 0
1257
1258     def _media_rating_search(self, html):
1259         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1260         rating = self._html_search_meta('rating', html)
1261
1262         if not rating:
1263             return None
1264
1265         RATING_TABLE = {
1266             'safe for kids': 0,
1267             'general': 8,
1268             '14 years': 14,
1269             'mature': 17,
1270             'restricted': 19,
1271         }
1272         return RATING_TABLE.get(rating.lower())
1273
1274     def _family_friendly_search(self, html):
1275         # See http://schema.org/VideoObject
1276         family_friendly = self._html_search_meta(
1277             'isFamilyFriendly', html, default=None)
1278
1279         if not family_friendly:
1280             return None
1281
1282         RATING_TABLE = {
1283             '1': 0,
1284             'true': 0,
1285             '0': 18,
1286             'false': 18,
1287         }
1288         return RATING_TABLE.get(family_friendly.lower())
1289
1290     def _twitter_search_player(self, html):
1291         return self._html_search_meta('twitter:player', html,
1292                                       'twitter card player')
1293
1294     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1295         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1296         default = kwargs.get('default', NO_DEFAULT)
1297         # JSON-LD may be malformed and thus `fatal` should be respected.
1298         # At the same time `default` may be passed that assumes `fatal=False`
1299         # for _search_regex. Let's simulate the same behavior here as well.
1300         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1301         json_ld = []
1302         for mobj in json_ld_list:
1303             json_ld_item = self._parse_json(
1304                 mobj.group('json_ld'), video_id, fatal=fatal)
1305             if not json_ld_item:
1306                 continue
1307             if isinstance(json_ld_item, dict):
1308                 json_ld.append(json_ld_item)
1309             elif isinstance(json_ld_item, (list, tuple)):
1310                 json_ld.extend(json_ld_item)
1311         if json_ld:
1312             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1313         if json_ld:
1314             return json_ld
1315         if default is not NO_DEFAULT:
1316             return default
1317         elif fatal:
1318             raise RegexNotFoundError('Unable to extract JSON-LD')
1319         else:
1320             self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1321             return {}
1322
1323     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1324         if isinstance(json_ld, compat_str):
1325             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1326         if not json_ld:
1327             return {}
1328         info = {}
1329         if not isinstance(json_ld, (list, tuple, dict)):
1330             return info
1331         if isinstance(json_ld, dict):
1332             json_ld = [json_ld]
1333
1334         INTERACTION_TYPE_MAP = {
1335             'CommentAction': 'comment',
1336             'AgreeAction': 'like',
1337             'DisagreeAction': 'dislike',
1338             'LikeAction': 'like',
1339             'DislikeAction': 'dislike',
1340             'ListenAction': 'view',
1341             'WatchAction': 'view',
1342             'ViewAction': 'view',
1343         }
1344
1345         def extract_interaction_type(e):
1346             interaction_type = e.get('interactionType')
1347             if isinstance(interaction_type, dict):
1348                 interaction_type = interaction_type.get('@type')
1349             return str_or_none(interaction_type)
1350
1351         def extract_interaction_statistic(e):
1352             interaction_statistic = e.get('interactionStatistic')
1353             if isinstance(interaction_statistic, dict):
1354                 interaction_statistic = [interaction_statistic]
1355             if not isinstance(interaction_statistic, list):
1356                 return
1357             for is_e in interaction_statistic:
1358                 if not isinstance(is_e, dict):
1359                     continue
1360                 if is_e.get('@type') != 'InteractionCounter':
1361                     continue
1362                 interaction_type = extract_interaction_type(is_e)
1363                 if not interaction_type:
1364                     continue
1365                 # For interaction count some sites provide string instead of
1366                 # an integer (as per spec) with non digit characters (e.g. ",")
1367                 # so extracting count with more relaxed str_to_int
1368                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1369                 if interaction_count is None:
1370                     continue
1371                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1372                 if not count_kind:
1373                     continue
1374                 count_key = '%s_count' % count_kind
1375                 if info.get(count_key) is not None:
1376                     continue
1377                 info[count_key] = interaction_count
1378
1379         def extract_video_object(e):
1380             assert e['@type'] == 'VideoObject'
1381             author = e.get('author')
1382             info.update({
1383                 'url': url_or_none(e.get('contentUrl')),
1384                 'title': unescapeHTML(e.get('name')),
1385                 'description': unescapeHTML(e.get('description')),
1386                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1387                 'duration': parse_duration(e.get('duration')),
1388                 'timestamp': unified_timestamp(e.get('uploadDate')),
1389                 # author can be an instance of 'Organization' or 'Person' types.
1390                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1391                 # however some websites are using 'Text' type instead.
1392                 # 1. https://schema.org/VideoObject
1393                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1394                 'filesize': float_or_none(e.get('contentSize')),
1395                 'tbr': int_or_none(e.get('bitrate')),
1396                 'width': int_or_none(e.get('width')),
1397                 'height': int_or_none(e.get('height')),
1398                 'view_count': int_or_none(e.get('interactionCount')),
1399             })
1400             extract_interaction_statistic(e)
1401
1402         for e in json_ld:
1403             if '@context' in e:
1404                 item_type = e.get('@type')
1405                 if expected_type is not None and expected_type != item_type:
1406                     continue
1407                 if item_type in ('TVEpisode', 'Episode'):
1408                     episode_name = unescapeHTML(e.get('name'))
1409                     info.update({
1410                         'episode': episode_name,
1411                         'episode_number': int_or_none(e.get('episodeNumber')),
1412                         'description': unescapeHTML(e.get('description')),
1413                     })
1414                     if not info.get('title') and episode_name:
1415                         info['title'] = episode_name
1416                     part_of_season = e.get('partOfSeason')
1417                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1418                         info.update({
1419                             'season': unescapeHTML(part_of_season.get('name')),
1420                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1421                         })
1422                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1423                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1424                         info['series'] = unescapeHTML(part_of_series.get('name'))
1425                 elif item_type == 'Movie':
1426                     info.update({
1427                         'title': unescapeHTML(e.get('name')),
1428                         'description': unescapeHTML(e.get('description')),
1429                         'duration': parse_duration(e.get('duration')),
1430                         'timestamp': unified_timestamp(e.get('dateCreated')),
1431                     })
1432                 elif item_type in ('Article', 'NewsArticle'):
1433                     info.update({
1434                         'timestamp': parse_iso8601(e.get('datePublished')),
1435                         'title': unescapeHTML(e.get('headline')),
1436                         'description': unescapeHTML(e.get('articleBody')),
1437                     })
1438                 elif item_type == 'VideoObject':
1439                     extract_video_object(e)
1440                     if expected_type is None:
1441                         continue
1442                     else:
1443                         break
1444                 video = e.get('video')
1445                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1446                     extract_video_object(video)
1447                 if expected_type is None:
1448                     continue
1449                 else:
1450                     break
1451         return dict((k, v) for k, v in info.items() if v is not None)
1452
1453     @staticmethod
1454     def _hidden_inputs(html):
1455         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1456         hidden_inputs = {}
1457         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1458             attrs = extract_attributes(input)
1459             if not input:
1460                 continue
1461             if attrs.get('type') not in ('hidden', 'submit'):
1462                 continue
1463             name = attrs.get('name') or attrs.get('id')
1464             value = attrs.get('value')
1465             if name and value is not None:
1466                 hidden_inputs[name] = value
1467         return hidden_inputs
1468
1469     def _form_hidden_inputs(self, form_id, html):
1470         form = self._search_regex(
1471             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1472             html, '%s form' % form_id, group='form')
1473         return self._hidden_inputs(form)
1474
1475     class FormatSort:
1476         regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1477
1478         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1479                    'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
1480                    'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
1481         ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
1482                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
1483                         'fps', 'fs_approx', 'source', 'format_id')
1484
1485         settings = {
1486             'vcodec': {'type': 'ordered', 'regex': True,
1487                        'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1488             'acodec': {'type': 'ordered', 'regex': True,
1489                        'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
1490             'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1491                       'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
1492             'vext': {'type': 'ordered', 'field': 'video_ext',
1493                      'order': ('mp4', 'webm', 'flv', '', 'none'),
1494                      'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1495             'aext': {'type': 'ordered', 'field': 'audio_ext',
1496                      'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1497                      'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1498             'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1499             'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple', 'default': 1,
1500                            'field': ('vcodec', 'acodec'),
1501                            'function': lambda it: int(any(v != 'none' for v in it))},
1502             'ie_pref': {'priority': True, 'type': 'extractor'},
1503             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1504             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1505             'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
1506             'quality': {'convert': 'float_none', 'default': -1},
1507             'filesize': {'convert': 'bytes'},
1508             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1509             'id': {'convert': 'string', 'field': 'format_id'},
1510             'height': {'convert': 'float_none'},
1511             'width': {'convert': 'float_none'},
1512             'fps': {'convert': 'float_none'},
1513             'tbr': {'convert': 'float_none'},
1514             'vbr': {'convert': 'float_none'},
1515             'abr': {'convert': 'float_none'},
1516             'asr': {'convert': 'float_none'},
1517             'source': {'convert': 'ignore', 'field': 'source_preference'},
1518
1519             'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1520             'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1521             'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1522             'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1523             'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
1524
1525             # Most of these exist only for compatibility reasons
1526             'dimension': {'type': 'alias', 'field': 'res'},
1527             'resolution': {'type': 'alias', 'field': 'res'},
1528             'extension': {'type': 'alias', 'field': 'ext'},
1529             'bitrate': {'type': 'alias', 'field': 'br'},
1530             'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1531             'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1532             'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1533             'framerate': {'type': 'alias', 'field': 'fps'},
1534             'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
1535             'protocol': {'type': 'alias', 'field': 'proto'},
1536             'source_preference': {'type': 'alias', 'field': 'source'},
1537             'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1538             'filesize_estimate': {'type': 'alias', 'field': 'size'},
1539             'samplerate': {'type': 'alias', 'field': 'asr'},
1540             'video_ext': {'type': 'alias', 'field': 'vext'},
1541             'audio_ext': {'type': 'alias', 'field': 'aext'},
1542             'video_codec': {'type': 'alias', 'field': 'vcodec'},
1543             'audio_codec': {'type': 'alias', 'field': 'acodec'},
1544             'video': {'type': 'alias', 'field': 'hasvid'},
1545             'has_video': {'type': 'alias', 'field': 'hasvid'},
1546             'audio': {'type': 'alias', 'field': 'hasaud'},
1547             'has_audio': {'type': 'alias', 'field': 'hasaud'},
1548             'extractor': {'type': 'alias', 'field': 'ie_pref'},
1549             'preference': {'type': 'alias', 'field': 'ie_pref'},
1550             'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1551             'format_id': {'type': 'alias', 'field': 'id'},
1552         }
1553
1554         _order = []
1555
1556         def _get_field_setting(self, field, key):
1557             if field not in self.settings:
1558                 self.settings[field] = {}
1559             propObj = self.settings[field]
1560             if key not in propObj:
1561                 type = propObj.get('type')
1562                 if key == 'field':
1563                     default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1564                 elif key == 'convert':
1565                     default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1566                 else:
1567                     default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1568                 propObj[key] = default
1569             return propObj[key]
1570
1571         def _resolve_field_value(self, field, value, convertNone=False):
1572             if value is None:
1573                 if not convertNone:
1574                     return None
1575             else:
1576                 value = value.lower()
1577             conversion = self._get_field_setting(field, 'convert')
1578             if conversion == 'ignore':
1579                 return None
1580             if conversion == 'string':
1581                 return value
1582             elif conversion == 'float_none':
1583                 return float_or_none(value)
1584             elif conversion == 'bytes':
1585                 return FileDownloader.parse_bytes(value)
1586             elif conversion == 'order':
1587                 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1588                 use_regex = self._get_field_setting(field, 'regex')
1589                 list_length = len(order_list)
1590                 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1591                 if use_regex and value is not None:
1592                     for i, regex in enumerate(order_list):
1593                         if regex and re.match(regex, value):
1594                             return list_length - i
1595                     return list_length - empty_pos  # not in list
1596                 else:  # not regex or  value = None
1597                     return list_length - (order_list.index(value) if value in order_list else empty_pos)
1598             else:
1599                 if value.isnumeric():
1600                     return float(value)
1601                 else:
1602                     self.settings[field]['convert'] = 'string'
1603                     return value
1604
1605         def evaluate_params(self, params, sort_extractor):
1606             self._use_free_order = params.get('prefer_free_formats', False)
1607             self._sort_user = params.get('format_sort', [])
1608             self._sort_extractor = sort_extractor
1609
1610             def add_item(field, reverse, closest, limit_text):
1611                 field = field.lower()
1612                 if field in self._order:
1613                     return
1614                 self._order.append(field)
1615                 limit = self._resolve_field_value(field, limit_text)
1616                 data = {
1617                     'reverse': reverse,
1618                     'closest': False if limit is None else closest,
1619                     'limit_text': limit_text,
1620                     'limit': limit}
1621                 if field in self.settings:
1622                     self.settings[field].update(data)
1623                 else:
1624                     self.settings[field] = data
1625
1626             sort_list = (
1627                 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1628                 + (tuple() if params.get('format_sort_force', False)
1629                    else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1630                 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1631
1632             for item in sort_list:
1633                 match = re.match(self.regex, item)
1634                 if match is None:
1635                     raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1636                 field = match.group('field')
1637                 if field is None:
1638                     continue
1639                 if self._get_field_setting(field, 'type') == 'alias':
1640                     field = self._get_field_setting(field, 'field')
1641                 reverse = match.group('reverse') is not None
1642                 closest = match.group('separator') == '~'
1643                 limit_text = match.group('limit')
1644
1645                 has_limit = limit_text is not None
1646                 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1647                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1648
1649                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1650                 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1651                 limit_count = len(limits)
1652                 for (i, f) in enumerate(fields):
1653                     add_item(f, reverse, closest,
1654                              limits[i] if i < limit_count
1655                              else limits[0] if has_limit and not has_multiple_limits
1656                              else None)
1657
1658         def print_verbose_info(self, write_debug):
1659             if self._sort_user:
1660                 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1661             if self._sort_extractor:
1662                 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1663             write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1664                 '+' if self._get_field_setting(field, 'reverse') else '', field,
1665                 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1666                               self._get_field_setting(field, 'limit_text'),
1667                               self._get_field_setting(field, 'limit'))
1668                 if self._get_field_setting(field, 'limit_text') is not None else '')
1669                 for field in self._order if self._get_field_setting(field, 'visible')]))
1670
1671         def _calculate_field_preference_from_value(self, format, field, type, value):
1672             reverse = self._get_field_setting(field, 'reverse')
1673             closest = self._get_field_setting(field, 'closest')
1674             limit = self._get_field_setting(field, 'limit')
1675
1676             if type == 'extractor':
1677                 maximum = self._get_field_setting(field, 'max')
1678                 if value is None or (maximum is not None and value >= maximum):
1679                     value = -1
1680             elif type == 'boolean':
1681                 in_list = self._get_field_setting(field, 'in_list')
1682                 not_in_list = self._get_field_setting(field, 'not_in_list')
1683                 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1684             elif type == 'ordered':
1685                 value = self._resolve_field_value(field, value, True)
1686
1687             # try to convert to number
1688             val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1689             is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1690             if is_num:
1691                 value = val_num
1692
1693             return ((-10, 0) if value is None
1694                     else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
1695                     else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1696                     else (0, value, 0) if not reverse and (limit is None or value <= limit)
1697                     else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1698                     else (-1, value, 0))
1699
1700         def _calculate_field_preference(self, format, field):
1701             type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
1702             get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1703             if type == 'multiple':
1704                 type = 'field'  # Only 'field' is allowed in multiple for now
1705                 actual_fields = self._get_field_setting(field, 'field')
1706
1707                 def wrapped_function(values):
1708                     values = tuple(filter(lambda x: x is not None, values))
1709                     return self._get_field_setting(field, 'function')(values) if values else None
1710
1711                 value = wrapped_function((get_value(f) for f in actual_fields))
1712             else:
1713                 value = get_value(field)
1714             return self._calculate_field_preference_from_value(format, field, type, value)
1715
1716         def calculate_preference(self, format):
1717             # Determine missing protocol
1718             if not format.get('protocol'):
1719                 format['protocol'] = determine_protocol(format)
1720
1721             # Determine missing ext
1722             if not format.get('ext') and 'url' in format:
1723                 format['ext'] = determine_ext(format['url'])
1724             if format.get('vcodec') == 'none':
1725                 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1726                 format['video_ext'] = 'none'
1727             else:
1728                 format['video_ext'] = format['ext']
1729                 format['audio_ext'] = 'none'
1730             # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
1731             #    format['preference'] = -1000
1732
1733             # Determine missing bitrates
1734             if format.get('tbr') is None:
1735                 if format.get('vbr') is not None and format.get('abr') is not None:
1736                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1737             else:
1738                 if format.get('vcodec') != "none" and format.get('vbr') is None:
1739                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
1740                 if format.get('acodec') != "none" and format.get('abr') is None:
1741                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
1742
1743             return tuple(self._calculate_field_preference(format, field) for field in self._order)
1744
1745     def _sort_formats(self, formats, field_preference=[]):
1746         if not formats:
1747             if self.get_param('ignore_no_formats_error'):
1748                 return
1749             raise ExtractorError('No video formats found')
1750         format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
1751         format_sort.evaluate_params(self._downloader.params, field_preference)
1752         if self.get_param('verbose', False):
1753             format_sort.print_verbose_info(self._downloader.write_debug)
1754         formats.sort(key=lambda f: format_sort.calculate_preference(f))
1755
1756     def _check_formats(self, formats, video_id):
1757         if formats:
1758             formats[:] = filter(
1759                 lambda f: self._is_valid_url(
1760                     f['url'], video_id,
1761                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1762                 formats)
1763
1764     @staticmethod
1765     def _remove_duplicate_formats(formats):
1766         format_urls = set()
1767         unique_formats = []
1768         for f in formats:
1769             if f['url'] not in format_urls:
1770                 format_urls.add(f['url'])
1771                 unique_formats.append(f)
1772         formats[:] = unique_formats
1773
1774     def _is_valid_url(self, url, video_id, item='video', headers={}):
1775         url = self._proto_relative_url(url, scheme='http:')
1776         # For now assume non HTTP(S) URLs always valid
1777         if not (url.startswith('http://') or url.startswith('https://')):
1778             return True
1779         try:
1780             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1781             return True
1782         except ExtractorError as e:
1783             self.to_screen(
1784                 '%s: %s URL is invalid, skipping: %s'
1785                 % (video_id, item, error_to_compat_str(e.cause)))
1786             return False
1787
1788     def http_scheme(self):
1789         """ Either "http:" or "https:", depending on the user's preferences """
1790         return (
1791             'http:'
1792             if self.get_param('prefer_insecure', False)
1793             else 'https:')
1794
1795     def _proto_relative_url(self, url, scheme=None):
1796         if url is None:
1797             return url
1798         if url.startswith('//'):
1799             if scheme is None:
1800                 scheme = self.http_scheme()
1801             return scheme + url
1802         else:
1803             return url
1804
1805     def _sleep(self, timeout, video_id, msg_template=None):
1806         if msg_template is None:
1807             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1808         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1809         self.to_screen(msg)
1810         time.sleep(timeout)
1811
1812     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1813                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1814                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1815         manifest = self._download_xml(
1816             manifest_url, video_id, 'Downloading f4m manifest',
1817             'Unable to download f4m manifest',
1818             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1819             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1820             transform_source=transform_source,
1821             fatal=fatal, data=data, headers=headers, query=query)
1822
1823         if manifest is False:
1824             return []
1825
1826         return self._parse_f4m_formats(
1827             manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1828             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1829
1830     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1831                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1832                            fatal=True, m3u8_id=None):
1833         if not isinstance(manifest, compat_etree_Element) and not fatal:
1834             return []
1835
1836         # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1837         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1838         if akamai_pv is not None and ';' in akamai_pv.text:
1839             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1840             if playerVerificationChallenge.strip() != '':
1841                 return []
1842
1843         formats = []
1844         manifest_version = '1.0'
1845         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1846         if not media_nodes:
1847             manifest_version = '2.0'
1848             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1849         # Remove unsupported DRM protected media from final formats
1850         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1851         media_nodes = remove_encrypted_media(media_nodes)
1852         if not media_nodes:
1853             return formats
1854
1855         manifest_base_url = get_base_url(manifest)
1856
1857         bootstrap_info = xpath_element(
1858             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1859             'bootstrap info', default=None)
1860
1861         vcodec = None
1862         mime_type = xpath_text(
1863             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1864             'base URL', default=None)
1865         if mime_type and mime_type.startswith('audio/'):
1866             vcodec = 'none'
1867
1868         for i, media_el in enumerate(media_nodes):
1869             tbr = int_or_none(media_el.attrib.get('bitrate'))
1870             width = int_or_none(media_el.attrib.get('width'))
1871             height = int_or_none(media_el.attrib.get('height'))
1872             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1873             # If <bootstrapInfo> is present, the specified f4m is a
1874             # stream-level manifest, and only set-level manifests may refer to
1875             # external resources.  See section 11.4 and section 4 of F4M spec
1876             if bootstrap_info is None:
1877                 media_url = None
1878                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1879                 if manifest_version == '2.0':
1880                     media_url = media_el.attrib.get('href')
1881                 if media_url is None:
1882                     media_url = media_el.attrib.get('url')
1883                 if not media_url:
1884                     continue
1885                 manifest_url = (
1886                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1887                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1888                 # If media_url is itself a f4m manifest do the recursive extraction
1889                 # since bitrates in parent manifest (this one) and media_url manifest
1890                 # may differ leading to inability to resolve the format by requested
1891                 # bitrate in f4m downloader
1892                 ext = determine_ext(manifest_url)
1893                 if ext == 'f4m':
1894                     f4m_formats = self._extract_f4m_formats(
1895                         manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1896                         transform_source=transform_source, fatal=fatal)
1897                     # Sometimes stream-level manifest contains single media entry that
1898                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1899                     # At the same time parent's media entry in set-level manifest may
1900                     # contain it. We will copy it from parent in such cases.
1901                     if len(f4m_formats) == 1:
1902                         f = f4m_formats[0]
1903                         f.update({
1904                             'tbr': f.get('tbr') or tbr,
1905                             'width': f.get('width') or width,
1906                             'height': f.get('height') or height,
1907                             'format_id': f.get('format_id') if not tbr else format_id,
1908                             'vcodec': vcodec,
1909                         })
1910                     formats.extend(f4m_formats)
1911                     continue
1912                 elif ext == 'm3u8':
1913                     formats.extend(self._extract_m3u8_formats(
1914                         manifest_url, video_id, 'mp4', preference=preference,
1915                         quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1916                     continue
1917             formats.append({
1918                 'format_id': format_id,
1919                 'url': manifest_url,
1920                 'manifest_url': manifest_url,
1921                 'ext': 'flv' if bootstrap_info is not None else None,
1922                 'protocol': 'f4m',
1923                 'tbr': tbr,
1924                 'width': width,
1925                 'height': height,
1926                 'vcodec': vcodec,
1927                 'preference': preference,
1928                 'quality': quality,
1929             })
1930         return formats
1931
1932     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1933         return {
1934             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1935             'url': m3u8_url,
1936             'ext': ext,
1937             'protocol': 'm3u8',
1938             'preference': preference - 100 if preference else -100,
1939             'quality': quality,
1940             'resolution': 'multiple',
1941             'format_note': 'Quality selection URL',
1942         }
1943
1944     def _extract_m3u8_formats(self, *args, **kwargs):
1945         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1946         if subs:
1947             self.report_warning(bug_reports_message(
1948                 "Ignoring subtitle tracks found in the HLS manifest; "
1949                 "if any subtitle tracks are missing,"
1950             ))
1951         return fmts
1952
1953     def _extract_m3u8_formats_and_subtitles(
1954             self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1955             preference=None, quality=None, m3u8_id=None, note=None,
1956             errnote=None, fatal=True, live=False, data=None, headers={},
1957             query={}):
1958
1959         res = self._download_webpage_handle(
1960             m3u8_url, video_id,
1961             note='Downloading m3u8 information' if note is None else note,
1962             errnote='Failed to download m3u8 information' if errnote is None else errnote,
1963             fatal=fatal, data=data, headers=headers, query=query)
1964
1965         if res is False:
1966             return [], {}
1967
1968         m3u8_doc, urlh = res
1969         m3u8_url = urlh.geturl()
1970
1971         return self._parse_m3u8_formats_and_subtitles(
1972             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1973             preference=preference, quality=quality, m3u8_id=m3u8_id,
1974             note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1975             headers=headers, query=query, video_id=video_id)
1976
1977     def _parse_m3u8_formats_and_subtitles(
1978             self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
1979             preference=None, quality=None, m3u8_id=None, live=False, note=None,
1980             errnote=None, fatal=True, data=None, headers={}, query={},
1981             video_id=None):
1982
1983         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1984             return [], {}
1985
1986         if (not self.get_param('allow_unplayable_formats')
1987                 and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)):  # Apple FairPlay
1988             return [], {}
1989
1990         formats = []
1991
1992         subtitles = {}
1993
1994         format_url = lambda u: (
1995             u
1996             if re.match(r'^https?://', u)
1997             else compat_urlparse.urljoin(m3u8_url, u))
1998
1999         split_discontinuity = self.get_param('hls_split_discontinuity', False)
2000
2001         # References:
2002         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2003         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2004         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2005
2006         # We should try extracting formats only from master playlists [1, 4.3.4],
2007         # i.e. playlists that describe available qualities. On the other hand
2008         # media playlists [1, 4.3.3] should be returned as is since they contain
2009         # just the media without qualities renditions.
2010         # Fortunately, master playlist can be easily distinguished from media
2011         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2012         # master playlist tags MUST NOT appear in a media playlist and vice versa.
2013         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2014         # media playlist and MUST NOT appear in master playlist thus we can
2015         # clearly detect media playlist with this criterion.
2016
2017         def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None,
2018                                            fatal=True, data=None, headers={}):
2019             if not m3u8_doc:
2020                 if not format_url:
2021                     return []
2022                 res = self._download_webpage_handle(
2023                     format_url, video_id,
2024                     note=False,
2025                     errnote='Failed to download m3u8 playlist information',
2026                     fatal=fatal, data=data, headers=headers)
2027
2028                 if res is False:
2029                     return []
2030
2031                 m3u8_doc, urlh = res
2032                 format_url = urlh.geturl()
2033
2034             playlist_formats = []
2035             i = (
2036                 0
2037                 if split_discontinuity
2038                 else None)
2039             format_info = {
2040                 'index': i,
2041                 'key_data': None,
2042                 'files': [],
2043             }
2044             for line in m3u8_doc.splitlines():
2045                 if not line.startswith('#'):
2046                     format_info['files'].append(line)
2047                 elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
2048                     i += 1
2049                     playlist_formats.append(format_info)
2050                     format_info = {
2051                         'index': i,
2052                         'url': format_url,
2053                         'files': [],
2054                     }
2055             playlist_formats.append(format_info)
2056             return playlist_formats
2057
2058         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
2059
2060             playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)
2061
2062             for format in playlist_formats:
2063                 format_id = []
2064                 if m3u8_id:
2065                     format_id.append(m3u8_id)
2066                 format_index = format.get('index')
2067                 if format_index:
2068                     format_id.append(str(format_index))
2069                 f = {
2070                     'format_id': '-'.join(format_id),
2071                     'format_index': format_index,
2072                     'url': m3u8_url,
2073                     'ext': ext,
2074                     'protocol': entry_protocol,
2075                     'preference': preference,
2076                     'quality': quality,
2077                 }
2078                 formats.append(f)
2079
2080             return formats, subtitles
2081
2082         groups = {}
2083         last_stream_inf = {}
2084
2085         def extract_media(x_media_line):
2086             media = parse_m3u8_attributes(x_media_line)
2087             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2088             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2089             if not (media_type and group_id and name):
2090                 return
2091             groups.setdefault(group_id, []).append(media)
2092             # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2093             if media_type == 'SUBTITLES':
2094                 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2095                 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2096                 # However, lack of URI has been spotted in the wild.
2097                 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2098                 if not media.get('URI'):
2099                     return
2100                 url = format_url(media['URI'])
2101                 sub_info = {
2102                     'url': url,
2103                     'ext': determine_ext(url),
2104                 }
2105                 if sub_info['ext'] == 'm3u8':
2106                     # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2107                     # files may contain is WebVTT:
2108                     # <https://tools.ietf.org/html/rfc8216#section-3.1>
2109                     sub_info['ext'] = 'vtt'
2110                     sub_info['protocol'] = 'm3u8_native'
2111                 lang = media.get('LANGUAGE') or 'und'
2112                 subtitles.setdefault(lang, []).append(sub_info)
2113             if media_type not in ('VIDEO', 'AUDIO'):
2114                 return
2115             media_url = media.get('URI')
2116             if media_url:
2117                 manifest_url = format_url(media_url)
2118                 format_id = []
2119                 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2120                                                                   fatal=fatal, data=data, headers=headers)
2121
2122                 for format in playlist_formats:
2123                     format_index = format.get('index')
2124                     for v in (m3u8_id, group_id, name):
2125                         if v:
2126                             format_id.append(v)
2127                     if format_index:
2128                         format_id.append(str(format_index))
2129                     f = {
2130                         'format_id': '-'.join(format_id),
2131                         'format_note': name,
2132                         'format_index': format_index,
2133                         'url': manifest_url,
2134                         'manifest_url': m3u8_url,
2135                         'language': media.get('LANGUAGE'),
2136                         'ext': ext,
2137                         'protocol': entry_protocol,
2138                         'preference': preference,
2139                         'quality': quality,
2140                     }
2141                     if media_type == 'AUDIO':
2142                         f['vcodec'] = 'none'
2143                     formats.append(f)
2144
2145         def build_stream_name():
2146             # Despite specification does not mention NAME attribute for
2147             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2148             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2149             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2150             stream_name = last_stream_inf.get('NAME')
2151             if stream_name:
2152                 return stream_name
2153             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2154             # from corresponding rendition group
2155             stream_group_id = last_stream_inf.get('VIDEO')
2156             if not stream_group_id:
2157                 return
2158             stream_group = groups.get(stream_group_id)
2159             if not stream_group:
2160                 return stream_group_id
2161             rendition = stream_group[0]
2162             return rendition.get('NAME') or stream_group_id
2163
2164         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2165         # chance to detect video only formats when EXT-X-STREAM-INF tags
2166         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2167         for line in m3u8_doc.splitlines():
2168             if line.startswith('#EXT-X-MEDIA:'):
2169                 extract_media(line)
2170
2171         for line in m3u8_doc.splitlines():
2172             if line.startswith('#EXT-X-STREAM-INF:'):
2173                 last_stream_inf = parse_m3u8_attributes(line)
2174             elif line.startswith('#') or not line.strip():
2175                 continue
2176             else:
2177                 tbr = float_or_none(
2178                     last_stream_inf.get('AVERAGE-BANDWIDTH')
2179                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
2180                 manifest_url = format_url(line.strip())
2181
2182                 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2183                                                                   fatal=fatal, data=data, headers=headers)
2184
2185                 for frmt in playlist_formats:
2186                     format_id = []
2187                     if m3u8_id:
2188                         format_id.append(m3u8_id)
2189                     format_index = frmt.get('index')
2190                     stream_name = build_stream_name()
2191                     # Bandwidth of live streams may differ over time thus making
2192                     # format_id unpredictable. So it's better to keep provided
2193                     # format_id intact.
2194                     if not live:
2195                         format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
2196                     if format_index:
2197                         format_id.append(str(format_index))
2198                     f = {
2199                         'format_id': '-'.join(format_id),
2200                         'format_index': format_index,
2201                         'url': manifest_url,
2202                         'manifest_url': m3u8_url,
2203                         'tbr': tbr,
2204                         'ext': ext,
2205                         'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2206                         'protocol': entry_protocol,
2207                         'preference': preference,
2208                         'quality': quality,
2209                     }
2210                     resolution = last_stream_inf.get('RESOLUTION')
2211                     if resolution:
2212                         mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2213                         if mobj:
2214                             f['width'] = int(mobj.group('width'))
2215                             f['height'] = int(mobj.group('height'))
2216                     # Unified Streaming Platform
2217                     mobj = re.search(
2218                         r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2219                     if mobj:
2220                         abr, vbr = mobj.groups()
2221                         abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2222                         f.update({
2223                             'vbr': vbr,
2224                             'abr': abr,
2225                         })
2226                     codecs = parse_codecs(last_stream_inf.get('CODECS'))
2227                     f.update(codecs)
2228                     audio_group_id = last_stream_inf.get('AUDIO')
2229                     # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2230                     # references a rendition group MUST have a CODECS attribute.
2231                     # However, this is not always respected, for example, [2]
2232                     # contains EXT-X-STREAM-INF tag which references AUDIO
2233                     # rendition group but does not have CODECS and despite
2234                     # referencing an audio group it represents a complete
2235                     # (with audio and video) format. So, for such cases we will
2236                     # ignore references to rendition groups and treat them
2237                     # as complete formats.
2238                     if audio_group_id and codecs and f.get('vcodec') != 'none':
2239                         audio_group = groups.get(audio_group_id)
2240                         if audio_group and audio_group[0].get('URI'):
2241                             # TODO: update acodec for audio only formats with
2242                             # the same GROUP-ID
2243                             f['acodec'] = 'none'
2244                     if not f.get('ext'):
2245                         f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2246                     formats.append(f)
2247
2248                     # for DailyMotion
2249                     progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2250                     if progressive_uri:
2251                         http_f = f.copy()
2252                         del http_f['manifest_url']
2253                         http_f.update({
2254                             'format_id': f['format_id'].replace('hls-', 'http-'),
2255                             'protocol': 'http',
2256                             'url': progressive_uri,
2257                         })
2258                         formats.append(http_f)
2259
2260                 last_stream_inf = {}
2261         return formats, subtitles
2262
2263     @staticmethod
2264     def _xpath_ns(path, namespace=None):
2265         if not namespace:
2266             return path
2267         out = []
2268         for c in path.split('/'):
2269             if not c or c == '.':
2270                 out.append(c)
2271             else:
2272                 out.append('{%s}%s' % (namespace, c))
2273         return '/'.join(out)
2274
2275     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2276         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2277
2278         if smil is False:
2279             assert not fatal
2280             return []
2281
2282         namespace = self._parse_smil_namespace(smil)
2283
2284         return self._parse_smil_formats(
2285             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2286
2287     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2288         smil = self._download_smil(smil_url, video_id, fatal=fatal)
2289         if smil is False:
2290             return {}
2291         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2292
2293     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2294         return self._download_xml(
2295             smil_url, video_id, 'Downloading SMIL file',
2296             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2297
2298     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2299         namespace = self._parse_smil_namespace(smil)
2300
2301         formats = self._parse_smil_formats(
2302             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2303         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2304
2305         video_id = os.path.splitext(url_basename(smil_url))[0]
2306         title = None
2307         description = None
2308         upload_date = None
2309         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2310             name = meta.attrib.get('name')
2311             content = meta.attrib.get('content')
2312             if not name or not content:
2313                 continue
2314             if not title and name == 'title':
2315                 title = content
2316             elif not description and name in ('description', 'abstract'):
2317                 description = content
2318             elif not upload_date and name == 'date':
2319                 upload_date = unified_strdate(content)
2320
2321         thumbnails = [{
2322             'id': image.get('type'),
2323             'url': image.get('src'),
2324             'width': int_or_none(image.get('width')),
2325             'height': int_or_none(image.get('height')),
2326         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2327
2328         return {
2329             'id': video_id,
2330             'title': title or video_id,
2331             'description': description,
2332             'upload_date': upload_date,
2333             'thumbnails': thumbnails,
2334             'formats': formats,
2335             'subtitles': subtitles,
2336         }
2337
2338     def _parse_smil_namespace(self, smil):
2339         return self._search_regex(
2340             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2341
2342     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2343         base = smil_url
2344         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2345             b = meta.get('base') or meta.get('httpBase')
2346             if b:
2347                 base = b
2348                 break
2349
2350         formats = []
2351         rtmp_count = 0
2352         http_count = 0
2353         m3u8_count = 0
2354
2355         srcs = []
2356         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2357         for medium in media:
2358             src = medium.get('src')
2359             if not src or src in srcs:
2360                 continue
2361             srcs.append(src)
2362
2363             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2364             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2365             width = int_or_none(medium.get('width'))
2366             height = int_or_none(medium.get('height'))
2367             proto = medium.get('proto')
2368             ext = medium.get('ext')
2369             src_ext = determine_ext(src)
2370             streamer = medium.get('streamer') or base
2371
2372             if proto == 'rtmp' or streamer.startswith('rtmp'):
2373                 rtmp_count += 1
2374                 formats.append({
2375                     'url': streamer,
2376                     'play_path': src,
2377                     'ext': 'flv',
2378                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2379                     'tbr': bitrate,
2380                     'filesize': filesize,
2381                     'width': width,
2382                     'height': height,
2383                 })
2384                 if transform_rtmp_url:
2385                     streamer, src = transform_rtmp_url(streamer, src)
2386                     formats[-1].update({
2387                         'url': streamer,
2388                         'play_path': src,
2389                     })
2390                 continue
2391
2392             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2393             src_url = src_url.strip()
2394
2395             if proto == 'm3u8' or src_ext == 'm3u8':
2396                 m3u8_formats = self._extract_m3u8_formats(
2397                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2398                 if len(m3u8_formats) == 1:
2399                     m3u8_count += 1
2400                     m3u8_formats[0].update({
2401                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2402                         'tbr': bitrate,
2403                         'width': width,
2404                         'height': height,
2405                     })
2406                 formats.extend(m3u8_formats)
2407             elif src_ext == 'f4m':
2408                 f4m_url = src_url
2409                 if not f4m_params:
2410                     f4m_params = {
2411                         'hdcore': '3.2.0',
2412                         'plugin': 'flowplayer-3.2.0.1',
2413                     }
2414                 f4m_url += '&' if '?' in f4m_url else '?'
2415                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2416                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2417             elif src_ext == 'mpd':
2418                 formats.extend(self._extract_mpd_formats(
2419                     src_url, video_id, mpd_id='dash', fatal=False))
2420             elif re.search(r'\.ism/[Mm]anifest', src_url):
2421                 formats.extend(self._extract_ism_formats(
2422                     src_url, video_id, ism_id='mss', fatal=False))
2423             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2424                 http_count += 1
2425                 formats.append({
2426                     'url': src_url,
2427                     'ext': ext or src_ext or 'flv',
2428                     'format_id': 'http-%d' % (bitrate or http_count),
2429                     'tbr': bitrate,
2430                     'filesize': filesize,
2431                     'width': width,
2432                     'height': height,
2433                 })
2434
2435         return formats
2436
2437     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2438         urls = []
2439         subtitles = {}
2440         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2441             src = textstream.get('src')
2442             if not src or src in urls:
2443                 continue
2444             urls.append(src)
2445             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2446             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2447             subtitles.setdefault(lang, []).append({
2448                 'url': src,
2449                 'ext': ext,
2450             })
2451         return subtitles
2452
2453     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2454         xspf = self._download_xml(
2455             xspf_url, playlist_id, 'Downloading xpsf playlist',
2456             'Unable to download xspf manifest', fatal=fatal)
2457         if xspf is False:
2458             return []
2459         return self._parse_xspf(
2460             xspf, playlist_id, xspf_url=xspf_url,
2461             xspf_base_url=base_url(xspf_url))
2462
2463     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2464         NS_MAP = {
2465             'xspf': 'http://xspf.org/ns/0/',
2466             's1': 'http://static.streamone.nl/player/ns/0',
2467         }
2468
2469         entries = []
2470         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2471             title = xpath_text(
2472                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2473             description = xpath_text(
2474                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2475             thumbnail = xpath_text(
2476                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2477             duration = float_or_none(
2478                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2479
2480             formats = []
2481             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2482                 format_url = urljoin(xspf_base_url, location.text)
2483                 if not format_url:
2484                     continue
2485                 formats.append({
2486                     'url': format_url,
2487                     'manifest_url': xspf_url,
2488                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2489                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2490                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2491                 })
2492             self._sort_formats(formats)
2493
2494             entries.append({
2495                 'id': playlist_id,
2496                 'title': title,
2497                 'description': description,
2498                 'thumbnail': thumbnail,
2499                 'duration': duration,
2500                 'formats': formats,
2501             })
2502         return entries
2503
2504     def _extract_mpd_formats(self, *args, **kwargs):
2505         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2506         if subs:
2507             self.report_warning(bug_reports_message(
2508                 "Ignoring subtitle tracks found in the DASH manifest; "
2509                 "if any subtitle tracks are missing,"
2510             ))
2511         return fmts
2512
2513     def _extract_mpd_formats_and_subtitles(
2514             self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2515             fatal=True, data=None, headers={}, query={}):
2516         res = self._download_xml_handle(
2517             mpd_url, video_id,
2518             note='Downloading MPD manifest' if note is None else note,
2519             errnote='Failed to download MPD manifest' if errnote is None else errnote,
2520             fatal=fatal, data=data, headers=headers, query=query)
2521         if res is False:
2522             return [], {}
2523         mpd_doc, urlh = res
2524         if mpd_doc is None:
2525             return [], {}
2526         mpd_base_url = base_url(urlh.geturl())
2527
2528         return self._parse_mpd_formats_and_subtitles(
2529             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2530
2531     def _parse_mpd_formats(self, *args, **kwargs):
2532         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2533         if subs:
2534             self.report_warning(bug_reports_message(
2535                 "Ignoring subtitle tracks found in the DASH manifest; "
2536                 "if any subtitle tracks are missing,"
2537             ))
2538         return fmts
2539
2540     def _parse_mpd_formats_and_subtitles(
2541             self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2542         """
2543         Parse formats from MPD manifest.
2544         References:
2545          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2546             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2547          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2548         """
2549         if not self.get_param('dynamic_mpd', True):
2550             if mpd_doc.get('type') == 'dynamic':
2551                 return [], {}
2552
2553         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2554
2555         def _add_ns(path):
2556             return self._xpath_ns(path, namespace)
2557
2558         def is_drm_protected(element):
2559             return element.find(_add_ns('ContentProtection')) is not None
2560
2561         def extract_multisegment_info(element, ms_parent_info):
2562             ms_info = ms_parent_info.copy()
2563
2564             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2565             # common attributes and elements.  We will only extract relevant
2566             # for us.
2567             def extract_common(source):
2568                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2569                 if segment_timeline is not None:
2570                     s_e = segment_timeline.findall(_add_ns('S'))
2571                     if s_e:
2572                         ms_info['total_number'] = 0
2573                         ms_info['s'] = []
2574                         for s in s_e:
2575                             r = int(s.get('r', 0))
2576                             ms_info['total_number'] += 1 + r
2577                             ms_info['s'].append({
2578                                 't': int(s.get('t', 0)),
2579                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2580                                 'd': int(s.attrib['d']),
2581                                 'r': r,
2582                             })
2583                 start_number = source.get('startNumber')
2584                 if start_number:
2585                     ms_info['start_number'] = int(start_number)
2586                 timescale = source.get('timescale')
2587                 if timescale:
2588                     ms_info['timescale'] = int(timescale)
2589                 segment_duration = source.get('duration')
2590                 if segment_duration:
2591                     ms_info['segment_duration'] = float(segment_duration)
2592
2593             def extract_Initialization(source):
2594                 initialization = source.find(_add_ns('Initialization'))
2595                 if initialization is not None:
2596                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2597
2598             segment_list = element.find(_add_ns('SegmentList'))
2599             if segment_list is not None:
2600                 extract_common(segment_list)
2601                 extract_Initialization(segment_list)
2602                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2603                 if segment_urls_e:
2604                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2605             else:
2606                 segment_template = element.find(_add_ns('SegmentTemplate'))
2607                 if segment_template is not None:
2608                     extract_common(segment_template)
2609                     media = segment_template.get('media')
2610                     if media:
2611                         ms_info['media'] = media
2612                     initialization = segment_template.get('initialization')
2613                     if initialization:
2614                         ms_info['initialization'] = initialization
2615                     else:
2616                         extract_Initialization(segment_template)
2617             return ms_info
2618
2619         skip_unplayable = not self.get_param('allow_unplayable_formats')
2620
2621         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2622         formats = []
2623         subtitles = {}
2624         for period in mpd_doc.findall(_add_ns('Period')):
2625             period_duration = parse_duration(period.get('duration')) or mpd_duration
2626             period_ms_info = extract_multisegment_info(period, {
2627                 'start_number': 1,
2628                 'timescale': 1,
2629             })
2630             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2631                 if skip_unplayable and is_drm_protected(adaptation_set):
2632                     continue
2633                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2634                 for representation in adaptation_set.findall(_add_ns('Representation')):
2635                     if skip_unplayable and is_drm_protected(representation):
2636                         continue
2637                     representation_attrib = adaptation_set.attrib.copy()
2638                     representation_attrib.update(representation.attrib)
2639                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2640                     mime_type = representation_attrib['mimeType']
2641                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2642
2643                     if content_type in ('video', 'audio', 'text') or mime_type == 'image/jpeg':
2644                         base_url = ''
2645                         for element in (representation, adaptation_set, period, mpd_doc):
2646                             base_url_e = element.find(_add_ns('BaseURL'))
2647                             if base_url_e is not None:
2648                                 base_url = base_url_e.text + base_url
2649                                 if re.match(r'^https?://', base_url):
2650                                     break
2651                         if mpd_base_url and not re.match(r'^https?://', base_url):
2652                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2653                                 mpd_base_url += '/'
2654                             base_url = mpd_base_url + base_url
2655                         representation_id = representation_attrib.get('id')
2656                         lang = representation_attrib.get('lang')
2657                         url_el = representation.find(_add_ns('BaseURL'))
2658                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2659                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2660                         if representation_id is not None:
2661                             format_id = representation_id
2662                         else:
2663                             format_id = content_type
2664                         if mpd_id:
2665                             format_id = mpd_id + '-' + format_id
2666                         if content_type in ('video', 'audio'):
2667                             f = {
2668                                 'format_id': format_id,
2669                                 'manifest_url': mpd_url,
2670                                 'ext': mimetype2ext(mime_type),
2671                                 'width': int_or_none(representation_attrib.get('width')),
2672                                 'height': int_or_none(representation_attrib.get('height')),
2673                                 'tbr': float_or_none(bandwidth, 1000),
2674                                 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2675                                 'fps': int_or_none(representation_attrib.get('frameRate')),
2676                                 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2677                                 'format_note': 'DASH %s' % content_type,
2678                                 'filesize': filesize,
2679                                 'container': mimetype2ext(mime_type) + '_dash',
2680                             }
2681                             f.update(parse_codecs(representation_attrib.get('codecs')))
2682                         elif content_type == 'text':
2683                             f = {
2684                                 'ext': mimetype2ext(mime_type),
2685                                 'manifest_url': mpd_url,
2686                                 'filesize': filesize,
2687                             }
2688                         elif mime_type == 'image/jpeg':
2689                             # See test case in VikiIE
2690                             # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2691                             f = {
2692                                 'format_id': format_id,
2693                                 'ext': 'mhtml',
2694                                 'manifest_url': mpd_url,
2695                                 'format_note': 'DASH storyboards (jpeg)',
2696                                 'acodec': 'none',
2697                                 'vcodec': 'none',
2698                             }
2699                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2700
2701                         def prepare_template(template_name, identifiers):
2702                             tmpl = representation_ms_info[template_name]
2703                             # First of, % characters outside $...$ templates
2704                             # must be escaped by doubling for proper processing
2705                             # by % operator string formatting used further (see
2706                             # https://github.com/ytdl-org/youtube-dl/issues/16867).
2707                             t = ''
2708                             in_template = False
2709                             for c in tmpl:
2710                                 t += c
2711                                 if c == '$':
2712                                     in_template = not in_template
2713                                 elif c == '%' and not in_template:
2714                                     t += c
2715                             # Next, $...$ templates are translated to their
2716                             # %(...) counterparts to be used with % operator
2717                             if representation_id is not None:
2718                                 t = t.replace('$RepresentationID$', representation_id)
2719                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2720                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2721                             t.replace('$$', '$')
2722                             return t
2723
2724                         # @initialization is a regular template like @media one
2725                         # so it should be handled just the same way (see
2726                         # https://github.com/ytdl-org/youtube-dl/issues/11605)
2727                         if 'initialization' in representation_ms_info:
2728                             initialization_template = prepare_template(
2729                                 'initialization',
2730                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2731                                 # $Time$ shall not be included for @initialization thus
2732                                 # only $Bandwidth$ remains
2733                                 ('Bandwidth', ))
2734                             representation_ms_info['initialization_url'] = initialization_template % {
2735                                 'Bandwidth': bandwidth,
2736                             }
2737
2738                         def location_key(location):
2739                             return 'url' if re.match(r'^https?://', location) else 'path'
2740
2741                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2742
2743                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2744                             media_location_key = location_key(media_template)
2745
2746                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2747                             # can't be used at the same time
2748                             if '%(Number' in media_template and 's' not in representation_ms_info:
2749                                 segment_duration = None
2750                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2751                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2752                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2753                                 representation_ms_info['fragments'] = [{
2754                                     media_location_key: media_template % {
2755                                         'Number': segment_number,
2756                                         'Bandwidth': bandwidth,
2757                                     },
2758                                     'duration': segment_duration,
2759                                 } for segment_number in range(
2760                                     representation_ms_info['start_number'],
2761                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2762                             else:
2763                                 # $Number*$ or $Time$ in media template with S list available
2764                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2765                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2766                                 representation_ms_info['fragments'] = []
2767                                 segment_time = 0
2768                                 segment_d = None
2769                                 segment_number = representation_ms_info['start_number']
2770
2771                                 def add_segment_url():
2772                                     segment_url = media_template % {
2773                                         'Time': segment_time,
2774                                         'Bandwidth': bandwidth,
2775                                         'Number': segment_number,
2776                                     }
2777                                     representation_ms_info['fragments'].append({
2778                                         media_location_key: segment_url,
2779                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2780                                     })
2781
2782                                 for num, s in enumerate(representation_ms_info['s']):
2783                                     segment_time = s.get('t') or segment_time
2784                                     segment_d = s['d']
2785                                     add_segment_url()
2786                                     segment_number += 1
2787                                     for r in range(s.get('r', 0)):
2788                                         segment_time += segment_d
2789                                         add_segment_url()
2790                                         segment_number += 1
2791                                     segment_time += segment_d
2792                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2793                             # No media template
2794                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2795                             # or any YouTube dashsegments video
2796                             fragments = []
2797                             segment_index = 0
2798                             timescale = representation_ms_info['timescale']
2799                             for s in representation_ms_info['s']:
2800                                 duration = float_or_none(s['d'], timescale)
2801                                 for r in range(s.get('r', 0) + 1):
2802                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2803                                     fragments.append({
2804                                         location_key(segment_uri): segment_uri,
2805                                         'duration': duration,
2806                                     })
2807                                     segment_index += 1
2808                             representation_ms_info['fragments'] = fragments
2809                         elif 'segment_urls' in representation_ms_info:
2810                             # Segment URLs with no SegmentTimeline
2811                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2812                             # https://github.com/ytdl-org/youtube-dl/pull/14844
2813                             fragments = []
2814                             segment_duration = float_or_none(
2815                                 representation_ms_info['segment_duration'],
2816                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2817                             for segment_url in representation_ms_info['segment_urls']:
2818                                 fragment = {
2819                                     location_key(segment_url): segment_url,
2820                                 }
2821                                 if segment_duration:
2822                                     fragment['duration'] = segment_duration
2823                                 fragments.append(fragment)
2824                             representation_ms_info['fragments'] = fragments
2825                         # If there is a fragments key available then we correctly recognized fragmented media.
2826                         # Otherwise we will assume unfragmented media with direct access. Technically, such
2827                         # assumption is not necessarily correct since we may simply have no support for
2828                         # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2829                         if 'fragments' in representation_ms_info:
2830                             f.update({
2831                                 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2832                                 'url': mpd_url or base_url,
2833                                 'fragment_base_url': base_url,
2834                                 'fragments': [],
2835                                 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2836                             })
2837                             if 'initialization_url' in representation_ms_info:
2838                                 initialization_url = representation_ms_info['initialization_url']
2839                                 if not f.get('url'):
2840                                     f['url'] = initialization_url
2841                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2842                             f['fragments'].extend(representation_ms_info['fragments'])
2843                         else:
2844                             # Assuming direct URL to unfragmented media.
2845                             f['url'] = base_url
2846                         if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
2847                             formats.append(f)
2848                         elif content_type == 'text':
2849                             subtitles.setdefault(lang or 'und', []).append(f)
2850                     else:
2851                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2852         return formats, subtitles
2853
2854     def _extract_ism_formats(self, *args, **kwargs):
2855         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2856         if subs:
2857             self.report_warning(bug_reports_message(
2858                 "Ignoring subtitle tracks found in the ISM manifest; "
2859                 "if any subtitle tracks are missing,"
2860             ))
2861         return fmts
2862
2863     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2864         res = self._download_xml_handle(
2865             ism_url, video_id,
2866             note='Downloading ISM manifest' if note is None else note,
2867             errnote='Failed to download ISM manifest' if errnote is None else errnote,
2868             fatal=fatal, data=data, headers=headers, query=query)
2869         if res is False:
2870             return [], {}
2871         ism_doc, urlh = res
2872         if ism_doc is None:
2873             return [], {}
2874
2875         return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2876
2877     def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2878         """
2879         Parse formats from ISM manifest.
2880         References:
2881          1. [MS-SSTR]: Smooth Streaming Protocol,
2882             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2883         """
2884         if ism_doc.get('IsLive') == 'TRUE':
2885             return [], {}
2886         if (not self.get_param('allow_unplayable_formats')
2887                 and ism_doc.find('Protection') is not None):
2888             return [], {}
2889
2890         duration = int(ism_doc.attrib['Duration'])
2891         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2892
2893         formats = []
2894         subtitles = {}
2895         for stream in ism_doc.findall('StreamIndex'):
2896             stream_type = stream.get('Type')
2897             if stream_type not in ('video', 'audio', 'text'):
2898                 continue
2899             url_pattern = stream.attrib['Url']
2900             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2901             stream_name = stream.get('Name')
2902             stream_language = stream.get('Language', 'und')
2903             for track in stream.findall('QualityLevel'):
2904                 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
2905                 # TODO: add support for WVC1 and WMAP
2906                 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
2907                     self.report_warning('%s is not a supported codec' % fourcc)
2908                     continue
2909                 tbr = int(track.attrib['Bitrate']) // 1000
2910                 # [1] does not mention Width and Height attributes. However,
2911                 # they're often present while MaxWidth and MaxHeight are
2912                 # missing, so should be used as fallbacks
2913                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2914                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2915                 sampling_rate = int_or_none(track.get('SamplingRate'))
2916
2917                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2918                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2919
2920                 fragments = []
2921                 fragment_ctx = {
2922                     'time': 0,
2923                 }
2924                 stream_fragments = stream.findall('c')
2925                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2926                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2927                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2928                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2929                     if not fragment_ctx['duration']:
2930                         try:
2931                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2932                         except IndexError:
2933                             next_fragment_time = duration
2934                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2935                     for _ in range(fragment_repeat):
2936                         fragments.append({
2937                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2938                             'duration': fragment_ctx['duration'] / stream_timescale,
2939                         })
2940                         fragment_ctx['time'] += fragment_ctx['duration']
2941
2942                 format_id = []
2943                 if ism_id:
2944                     format_id.append(ism_id)
2945                 if stream_name:
2946                     format_id.append(stream_name)
2947                 format_id.append(compat_str(tbr))
2948
2949                 if stream_type == 'text':
2950                     subtitles.setdefault(stream_language, []).append({
2951                         'ext': 'ismt',
2952                         'protocol': 'ism',
2953                         'url': ism_url,
2954                         'manifest_url': ism_url,
2955                         'fragments': fragments,
2956                         '_download_params': {
2957                             'stream_type': stream_type,
2958                             'duration': duration,
2959                             'timescale': stream_timescale,
2960                             'fourcc': fourcc,
2961                             'language': stream_language,
2962                             'codec_private_data': track.get('CodecPrivateData'),
2963                         }
2964                     })
2965                 elif stream_type in ('video', 'audio'):
2966                     formats.append({
2967                         'format_id': '-'.join(format_id),
2968                         'url': ism_url,
2969                         'manifest_url': ism_url,
2970                         'ext': 'ismv' if stream_type == 'video' else 'isma',
2971                         'width': width,
2972                         'height': height,
2973                         'tbr': tbr,
2974                         'asr': sampling_rate,
2975                         'vcodec': 'none' if stream_type == 'audio' else fourcc,
2976                         'acodec': 'none' if stream_type == 'video' else fourcc,
2977                         'protocol': 'ism',
2978                         'fragments': fragments,
2979                         '_download_params': {
2980                             'stream_type': stream_type,
2981                             'duration': duration,
2982                             'timescale': stream_timescale,
2983                             'width': width or 0,
2984                             'height': height or 0,
2985                             'fourcc': fourcc,
2986                             'language': stream_language,
2987                             'codec_private_data': track.get('CodecPrivateData'),
2988                             'sampling_rate': sampling_rate,
2989                             'channels': int_or_none(track.get('Channels', 2)),
2990                             'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2991                             'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2992                         },
2993                     })
2994         return formats, subtitles
2995
2996     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
2997         def absolute_url(item_url):
2998             return urljoin(base_url, item_url)
2999
3000         def parse_content_type(content_type):
3001             if not content_type:
3002                 return {}
3003             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3004             if ctr:
3005                 mimetype, codecs = ctr.groups()
3006                 f = parse_codecs(codecs)
3007                 f['ext'] = mimetype2ext(mimetype)
3008                 return f
3009             return {}
3010
3011         def _media_formats(src, cur_media_type, type_info={}):
3012             full_url = absolute_url(src)
3013             ext = type_info.get('ext') or determine_ext(full_url)
3014             if ext == 'm3u8':
3015                 is_plain_url = False
3016                 formats = self._extract_m3u8_formats(
3017                     full_url, video_id, ext='mp4',
3018                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3019                     preference=preference, quality=quality, fatal=False)
3020             elif ext == 'mpd':
3021                 is_plain_url = False
3022                 formats = self._extract_mpd_formats(
3023                     full_url, video_id, mpd_id=mpd_id, fatal=False)
3024             else:
3025                 is_plain_url = True
3026                 formats = [{
3027                     'url': full_url,
3028                     'vcodec': 'none' if cur_media_type == 'audio' else None,
3029                 }]
3030             return is_plain_url, formats
3031
3032         entries = []
3033         # amp-video and amp-audio are very similar to their HTML5 counterparts
3034         # so we wll include them right here (see
3035         # https://www.ampproject.org/docs/reference/components/amp-video)
3036         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3037         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3038         media_tags = [(media_tag, media_tag_name, media_type, '')
3039                       for media_tag, media_tag_name, media_type
3040                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3041         media_tags.extend(re.findall(
3042             # We only allow video|audio followed by a whitespace or '>'.
3043             # Allowing more characters may end up in significant slow down (see
3044             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3045             # http://www.porntrex.com/maps/videositemap.xml).
3046             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3047         for media_tag, _, media_type, media_content in media_tags:
3048             media_info = {
3049                 'formats': [],
3050                 'subtitles': {},
3051             }
3052             media_attributes = extract_attributes(media_tag)
3053             src = strip_or_none(media_attributes.get('src'))
3054             if src:
3055                 _, formats = _media_formats(src, media_type)
3056                 media_info['formats'].extend(formats)
3057             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3058             if media_content:
3059                 for source_tag in re.findall(r'<source[^>]+>', media_content):
3060                     s_attr = extract_attributes(source_tag)
3061                     # data-video-src and data-src are non standard but seen
3062                     # several times in the wild
3063                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
3064                     if not src:
3065                         continue
3066                     f = parse_content_type(s_attr.get('type'))
3067                     is_plain_url, formats = _media_formats(src, media_type, f)
3068                     if is_plain_url:
3069                         # width, height, res, label and title attributes are
3070                         # all not standard but seen several times in the wild
3071                         labels = [
3072                             s_attr.get(lbl)
3073                             for lbl in ('label', 'title')
3074                             if str_or_none(s_attr.get(lbl))
3075                         ]
3076                         width = int_or_none(s_attr.get('width'))
3077                         height = (int_or_none(s_attr.get('height'))
3078                                   or int_or_none(s_attr.get('res')))
3079                         if not width or not height:
3080                             for lbl in labels:
3081                                 resolution = parse_resolution(lbl)
3082                                 if not resolution:
3083                                     continue
3084                                 width = width or resolution.get('width')
3085                                 height = height or resolution.get('height')
3086                         for lbl in labels:
3087                             tbr = parse_bitrate(lbl)
3088                             if tbr:
3089                                 break
3090                         else:
3091                             tbr = None
3092                         f.update({
3093                             'width': width,
3094                             'height': height,
3095                             'tbr': tbr,
3096                             'format_id': s_attr.get('label') or s_attr.get('title'),
3097                         })
3098                         f.update(formats[0])
3099                         media_info['formats'].append(f)
3100                     else:
3101                         media_info['formats'].extend(formats)
3102                 for track_tag in re.findall(r'<track[^>]+>', media_content):
3103                     track_attributes = extract_attributes(track_tag)
3104                     kind = track_attributes.get('kind')
3105                     if not kind or kind in ('subtitles', 'captions'):
3106                         src = strip_or_none(track_attributes.get('src'))
3107                         if not src:
3108                             continue
3109                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3110                         media_info['subtitles'].setdefault(lang, []).append({
3111                             'url': absolute_url(src),
3112                         })
3113             for f in media_info['formats']:
3114                 f.setdefault('http_headers', {})['Referer'] = base_url
3115             if media_info['formats'] or media_info['subtitles']:
3116                 entries.append(media_info)
3117         return entries
3118
3119     def _extract_akamai_formats(self, *args, **kwargs):
3120         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3121         if subs:
3122             self.report_warning(bug_reports_message(
3123                 "Ignoring subtitle tracks found in the manifests; "
3124                 "if any subtitle tracks are missing,"
3125             ))
3126         return fmts
3127
3128     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3129         signed = 'hdnea=' in manifest_url
3130         if not signed:
3131             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3132             manifest_url = re.sub(
3133                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3134                 '', manifest_url).strip('?')
3135
3136         formats = []
3137         subtitles = {}
3138
3139         hdcore_sign = 'hdcore=3.7.0'
3140         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3141         hds_host = hosts.get('hds')
3142         if hds_host:
3143             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3144         if 'hdcore=' not in f4m_url:
3145             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3146         f4m_formats = self._extract_f4m_formats(
3147             f4m_url, video_id, f4m_id='hds', fatal=False)
3148         for entry in f4m_formats:
3149             entry.update({'extra_param_to_segment_url': hdcore_sign})
3150         formats.extend(f4m_formats)
3151
3152         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3153         hls_host = hosts.get('hls')
3154         if hls_host:
3155             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3156         m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3157             m3u8_url, video_id, 'mp4', 'm3u8_native',
3158             m3u8_id='hls', fatal=False)
3159         formats.extend(m3u8_formats)
3160         subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3161
3162         http_host = hosts.get('http')
3163         if http_host and m3u8_formats and not signed:
3164             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3165             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3166             qualities_length = len(qualities)
3167             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3168                 i = 0
3169                 for f in m3u8_formats:
3170                     if f['vcodec'] != 'none':
3171                         for protocol in ('http', 'https'):
3172                             http_f = f.copy()
3173                             del http_f['manifest_url']
3174                             http_url = re.sub(
3175                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
3176                             http_f.update({
3177                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3178                                 'url': http_url,
3179                                 'protocol': protocol,
3180                             })
3181                             formats.append(http_f)
3182                         i += 1
3183
3184         return formats, subtitles
3185
3186     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3187         query = compat_urlparse.urlparse(url).query
3188         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3189         mobj = re.search(
3190             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3191         url_base = mobj.group('url')
3192         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3193         formats = []
3194
3195         def manifest_url(manifest):
3196             m_url = '%s/%s' % (http_base_url, manifest)
3197             if query:
3198                 m_url += '?%s' % query
3199             return m_url
3200
3201         if 'm3u8' not in skip_protocols:
3202             formats.extend(self._extract_m3u8_formats(
3203                 manifest_url('playlist.m3u8'), video_id, 'mp4',
3204                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3205         if 'f4m' not in skip_protocols:
3206             formats.extend(self._extract_f4m_formats(
3207                 manifest_url('manifest.f4m'),
3208                 video_id, f4m_id='hds', fatal=False))
3209         if 'dash' not in skip_protocols:
3210             formats.extend(self._extract_mpd_formats(
3211                 manifest_url('manifest.mpd'),
3212                 video_id, mpd_id='dash', fatal=False))
3213         if re.search(r'(?:/smil:|\.smil)', url_base):
3214             if 'smil' not in skip_protocols:
3215                 rtmp_formats = self._extract_smil_formats(
3216                     manifest_url('jwplayer.smil'),
3217                     video_id, fatal=False)
3218                 for rtmp_format in rtmp_formats:
3219                     rtsp_format = rtmp_format.copy()
3220                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3221                     del rtsp_format['play_path']
3222                     del rtsp_format['ext']
3223                     rtsp_format.update({
3224                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3225                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3226                         'protocol': 'rtsp',
3227                     })
3228                     formats.extend([rtmp_format, rtsp_format])
3229         else:
3230             for protocol in ('rtmp', 'rtsp'):
3231                 if protocol not in skip_protocols:
3232                     formats.append({
3233                         'url': '%s:%s' % (protocol, url_base),
3234                         'format_id': protocol,
3235                         'protocol': protocol,
3236                     })
3237         return formats
3238
3239     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3240         mobj = re.search(
3241             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
3242             webpage)
3243         if mobj:
3244             try:
3245                 jwplayer_data = self._parse_json(mobj.group('options'),
3246                                                  video_id=video_id,
3247                                                  transform_source=transform_source)
3248             except ExtractorError:
3249                 pass
3250             else:
3251                 if isinstance(jwplayer_data, dict):
3252                     return jwplayer_data
3253
3254     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3255         jwplayer_data = self._find_jwplayer_data(
3256             webpage, video_id, transform_source=js_to_json)
3257         return self._parse_jwplayer_data(
3258             jwplayer_data, video_id, *args, **kwargs)
3259
3260     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3261                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3262         # JWPlayer backward compatibility: flattened playlists
3263         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3264         if 'playlist' not in jwplayer_data:
3265             jwplayer_data = {'playlist': [jwplayer_data]}
3266
3267         entries = []
3268
3269         # JWPlayer backward compatibility: single playlist item
3270         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3271         if not isinstance(jwplayer_data['playlist'], list):
3272             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3273
3274         for video_data in jwplayer_data['playlist']:
3275             # JWPlayer backward compatibility: flattened sources
3276             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3277             if 'sources' not in video_data:
3278                 video_data['sources'] = [video_data]
3279
3280             this_video_id = video_id or video_data['mediaid']
3281
3282             formats = self._parse_jwplayer_formats(
3283                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3284                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3285
3286             subtitles = {}
3287             tracks = video_data.get('tracks')
3288             if tracks and isinstance(tracks, list):
3289                 for track in tracks:
3290                     if not isinstance(track, dict):
3291                         continue
3292                     track_kind = track.get('kind')
3293                     if not track_kind or not isinstance(track_kind, compat_str):
3294                         continue
3295                     if track_kind.lower() not in ('captions', 'subtitles'):
3296                         continue
3297                     track_url = urljoin(base_url, track.get('file'))
3298                     if not track_url:
3299                         continue
3300                     subtitles.setdefault(track.get('label') or 'en', []).append({
3301                         'url': self._proto_relative_url(track_url)
3302                     })
3303
3304             entry = {
3305                 'id': this_video_id,
3306                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3307                 'description': clean_html(video_data.get('description')),
3308                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3309                 'timestamp': int_or_none(video_data.get('pubdate')),
3310                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3311                 'subtitles': subtitles,
3312             }
3313             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3314             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3315                 entry.update({
3316                     '_type': 'url_transparent',
3317                     'url': formats[0]['url'],
3318                 })
3319             else:
3320                 self._sort_formats(formats)
3321                 entry['formats'] = formats
3322             entries.append(entry)
3323         if len(entries) == 1:
3324             return entries[0]
3325         else:
3326             return self.playlist_result(entries)
3327
3328     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3329                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3330         urls = []
3331         formats = []
3332         for source in jwplayer_sources_data:
3333             if not isinstance(source, dict):
3334                 continue
3335             source_url = urljoin(
3336                 base_url, self._proto_relative_url(source.get('file')))
3337             if not source_url or source_url in urls:
3338                 continue
3339             urls.append(source_url)
3340             source_type = source.get('type') or ''
3341             ext = mimetype2ext(source_type) or determine_ext(source_url)
3342             if source_type == 'hls' or ext == 'm3u8':
3343                 formats.extend(self._extract_m3u8_formats(
3344                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3345                     m3u8_id=m3u8_id, fatal=False))
3346             elif source_type == 'dash' or ext == 'mpd':
3347                 formats.extend(self._extract_mpd_formats(
3348                     source_url, video_id, mpd_id=mpd_id, fatal=False))
3349             elif ext == 'smil':
3350                 formats.extend(self._extract_smil_formats(
3351                     source_url, video_id, fatal=False))
3352             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3353             elif source_type.startswith('audio') or ext in (
3354                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3355                 formats.append({
3356                     'url': source_url,
3357                     'vcodec': 'none',
3358                     'ext': ext,
3359                 })
3360             else:
3361                 height = int_or_none(source.get('height'))
3362                 if height is None:
3363                     # Often no height is provided but there is a label in
3364                     # format like "1080p", "720p SD", or 1080.
3365                     height = int_or_none(self._search_regex(
3366                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
3367                         'height', default=None))
3368                 a_format = {
3369                     'url': source_url,
3370                     'width': int_or_none(source.get('width')),
3371                     'height': height,
3372                     'tbr': int_or_none(source.get('bitrate')),
3373                     'ext': ext,
3374                 }
3375                 if source_url.startswith('rtmp'):
3376                     a_format['ext'] = 'flv'
3377                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3378                     # of jwplayer.flash.swf
3379                     rtmp_url_parts = re.split(
3380                         r'((?:mp4|mp3|flv):)', source_url, 1)
3381                     if len(rtmp_url_parts) == 3:
3382                         rtmp_url, prefix, play_path = rtmp_url_parts
3383                         a_format.update({
3384                             'url': rtmp_url,
3385                             'play_path': prefix + play_path,
3386                         })
3387                     if rtmp_params:
3388                         a_format.update(rtmp_params)
3389                 formats.append(a_format)
3390         return formats
3391
3392     def _live_title(self, name):
3393         """ Generate the title for a live video """
3394         now = datetime.datetime.now()
3395         now_str = now.strftime('%Y-%m-%d %H:%M')
3396         return name + ' ' + now_str
3397
3398     def _int(self, v, name, fatal=False, **kwargs):
3399         res = int_or_none(v, **kwargs)
3400         if 'get_attr' in kwargs:
3401             print(getattr(v, kwargs['get_attr']))
3402         if res is None:
3403             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3404             if fatal:
3405                 raise ExtractorError(msg)
3406             else:
3407                 self.report_warning(msg)
3408         return res
3409
3410     def _float(self, v, name, fatal=False, **kwargs):
3411         res = float_or_none(v, **kwargs)
3412         if res is None:
3413             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3414             if fatal:
3415                 raise ExtractorError(msg)
3416             else:
3417                 self.report_warning(msg)
3418         return res
3419
3420     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3421                     path='/', secure=False, discard=False, rest={}, **kwargs):
3422         cookie = compat_cookiejar_Cookie(
3423             0, name, value, port, port is not None, domain, True,
3424             domain.startswith('.'), path, True, secure, expire_time,
3425             discard, None, None, rest)
3426         self._downloader.cookiejar.set_cookie(cookie)
3427
3428     def _get_cookies(self, url):
3429         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
3430         req = sanitized_Request(url)
3431         self._downloader.cookiejar.add_cookie_header(req)
3432         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3433
3434     def _apply_first_set_cookie_header(self, url_handle, cookie):
3435         """
3436         Apply first Set-Cookie header instead of the last. Experimental.
3437
3438         Some sites (e.g. [1-3]) may serve two cookies under the same name
3439         in Set-Cookie header and expect the first (old) one to be set rather
3440         than second (new). However, as of RFC6265 the newer one cookie
3441         should be set into cookie store what actually happens.
3442         We will workaround this issue by resetting the cookie to
3443         the first one manually.
3444         1. https://new.vk.com/
3445         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3446         3. https://learning.oreilly.com/
3447         """
3448         for header, cookies in url_handle.headers.items():
3449             if header.lower() != 'set-cookie':
3450                 continue
3451             if sys.version_info[0] >= 3:
3452                 cookies = cookies.encode('iso-8859-1')
3453             cookies = cookies.decode('utf-8')
3454             cookie_value = re.search(
3455                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3456             if cookie_value:
3457                 value, domain = cookie_value.groups()
3458                 self._set_cookie(domain, cookie, value)
3459                 break
3460
3461     def get_testcases(self, include_onlymatching=False):
3462         t = getattr(self, '_TEST', None)
3463         if t:
3464             assert not hasattr(self, '_TESTS'), \
3465                 '%s has _TEST and _TESTS' % type(self).__name__
3466             tests = [t]
3467         else:
3468             tests = getattr(self, '_TESTS', [])
3469         for t in tests:
3470             if not include_onlymatching and t.get('only_matching', False):
3471                 continue
3472             t['name'] = type(self).__name__[:-len('IE')]
3473             yield t
3474
3475     def is_suitable(self, age_limit):
3476         """ Test whether the extractor is generally suitable for the given
3477         age limit (i.e. pornographic sites are not, all others usually are) """
3478
3479         any_restricted = False
3480         for tc in self.get_testcases(include_onlymatching=False):
3481             if tc.get('playlist', []):
3482                 tc = tc['playlist'][0]
3483             is_restricted = age_restricted(
3484                 tc.get('info_dict', {}).get('age_limit'), age_limit)
3485             if not is_restricted:
3486                 return True
3487             any_restricted = any_restricted or is_restricted
3488         return not any_restricted
3489
3490     def extract_subtitles(self, *args, **kwargs):
3491         if (self.get_param('writesubtitles', False)
3492                 or self.get_param('listsubtitles')):
3493             return self._get_subtitles(*args, **kwargs)
3494         return {}
3495
3496     def _get_subtitles(self, *args, **kwargs):
3497         raise NotImplementedError('This method must be implemented by subclasses')
3498
3499     @staticmethod
3500     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3501         """ Merge subtitle items for one language. Items with duplicated URLs
3502         will be dropped. """
3503         list1_urls = set([item['url'] for item in subtitle_list1])
3504         ret = list(subtitle_list1)
3505         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3506         return ret
3507
3508     @classmethod
3509     def _merge_subtitles(cls, *dicts, **kwargs):
3510         """ Merge subtitle dictionaries, language by language. """
3511
3512         target = (lambda target=None: target)(**kwargs)
3513         # The above lambda extracts the keyword argument 'target' from kwargs
3514         # while ensuring there are no stray ones. When Python 2 support
3515         # is dropped, remove it and change the function signature to:
3516         #
3517         #     def _merge_subtitles(cls, *dicts, target=None):
3518
3519         if target is None:
3520             target = {}
3521         for d in dicts:
3522             for lang, subs in d.items():
3523                 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3524         return target
3525
3526     def extract_automatic_captions(self, *args, **kwargs):
3527         if (self.get_param('writeautomaticsub', False)
3528                 or self.get_param('listsubtitles')):
3529             return self._get_automatic_captions(*args, **kwargs)
3530         return {}
3531
3532     def _get_automatic_captions(self, *args, **kwargs):
3533         raise NotImplementedError('This method must be implemented by subclasses')
3534
3535     def mark_watched(self, *args, **kwargs):
3536         if (self.get_param('mark_watched', False)
3537                 and (self._get_login_info()[0] is not None
3538                      or self.get_param('cookiefile') is not None)):
3539             self._mark_watched(*args, **kwargs)
3540
3541     def _mark_watched(self, *args, **kwargs):
3542         raise NotImplementedError('This method must be implemented by subclasses')
3543
3544     def geo_verification_headers(self):
3545         headers = {}
3546         geo_verification_proxy = self.get_param('geo_verification_proxy')
3547         if geo_verification_proxy:
3548             headers['Ytdl-request-proxy'] = geo_verification_proxy
3549         return headers
3550
3551     def _generic_id(self, url):
3552         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3553
3554     def _generic_title(self, url):
3555         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3556
3557     @staticmethod
3558     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3559         all_known = all(map(
3560             lambda x: x is not None,
3561             (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3562         return (
3563             'private' if is_private
3564             else 'premium_only' if needs_premium
3565             else 'subscriber_only' if needs_subscription
3566             else 'needs_auth' if needs_auth
3567             else 'unlisted' if is_unlisted
3568             else 'public' if all_known
3569             else None)
3570
3571     def _configuration_arg(self, key):
3572         return traverse_obj(
3573             self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
3574
3575
3576 class SearchInfoExtractor(InfoExtractor):
3577     """
3578     Base class for paged search queries extractors.
3579     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3580     Instances should define _SEARCH_KEY and _MAX_RESULTS.
3581     """
3582
3583     @classmethod
3584     def _make_valid_url(cls):
3585         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3586
3587     @classmethod
3588     def suitable(cls, url):
3589         return re.match(cls._make_valid_url(), url) is not None
3590
3591     def _real_extract(self, query):
3592         mobj = re.match(self._make_valid_url(), query)
3593         if mobj is None:
3594             raise ExtractorError('Invalid search query "%s"' % query)
3595
3596         prefix = mobj.group('prefix')
3597         query = mobj.group('query')
3598         if prefix == '':
3599             return self._get_n_results(query, 1)
3600         elif prefix == 'all':
3601             return self._get_n_results(query, self._MAX_RESULTS)
3602         else:
3603             n = int(prefix)
3604             if n <= 0:
3605                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3606             elif n > self._MAX_RESULTS:
3607                 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3608                 n = self._MAX_RESULTS
3609             return self._get_n_results(query, n)
3610
3611     def _get_n_results(self, query, n):
3612         """Get a specified number of results for a query"""
3613         raise NotImplementedError('This method must be implemented by subclasses')
3614
3615     @property
3616     def SEARCH_KEY(self):
3617         return self._SEARCH_KEY