youtube_dl/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar,
  19     compat_cookies,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_http_client,
  23     compat_os_name,
  24     compat_str,
  25     compat_urllib_error,
  26     compat_urllib_parse_unquote,
  27     compat_urllib_parse_urlencode,
  28     compat_urllib_request,
  29     compat_urlparse,
  30     compat_xml_parse_error,
  31 )
  32 from ..downloader.f4m import (
  33     get_base_url,
  34     remove_encrypted_media,
  35 )
  36 from ..utils import (
  37     NO_DEFAULT,
  38     age_restricted,
  39     base_url,
  40     bug_reports_message,
  41     clean_html,
  42     compiled_regex_type,
  43     determine_ext,
  44     determine_protocol,
  45     error_to_compat_str,
  46     ExtractorError,
  47     extract_attributes,
  48     fix_xml_ampersands,
  49     float_or_none,
  50     GeoRestrictedError,
  51     GeoUtils,
  52     int_or_none,
  53     js_to_json,
  54     mimetype2ext,
  55     orderedSet,
  56     parse_codecs,
  57     parse_duration,
  58     parse_iso8601,
  59     parse_m3u8_attributes,
  60     RegexNotFoundError,
  61     sanitized_Request,
  62     sanitize_filename,
  63     unescapeHTML,
  64     unified_strdate,
  65     unified_timestamp,
  66     update_Request,
  67     update_url_query,
  68     urljoin,
  69     url_basename,
  70     xpath_element,
  71     xpath_text,
  72     xpath_with_ns,
  73 )
  74
  75
  76 class InfoExtractor(object):
  77     """Information Extractor class.
  78
  79     Information extractors are the classes that, given a URL, extract
  80     information about the video (or videos) the URL refers to. This
  81     information includes the real video URL, the video title, author and
  82     others. The information is stored in a dictionary which is then
  83     passed to the YoutubeDL. The YoutubeDL processes this
  84     information possibly downloading the video to the file system, among
  85     other possible outcomes.
  86
  87     The type field determines the type of the result.
  88     By far the most common value (and the default if _type is missing) is
  89     "video", which indicates a single video.
  90
  91     For a video, the dictionaries must include the following fields:
  92
  93     id:             Video identifier.
  94     title:          Video title, unescaped.
  95
  96     Additionally, it must contain either a formats entry or a url one:
  97
  98     formats:        A list of dictionaries for each format available, ordered
  99                     from worst to best quality.
 100
 101                     Potential fields:
 102                     * url        Mandatory. The URL of the video file
 103                     * manifest_url
 104                                  The URL of the manifest file in case of
 105                                  fragmented media (DASH, hls, hds)
 106                     * ext        Will be calculated from URL if missing
 107                     * format     A human-readable description of the format
 108                                  ("mp4 container with h264/opus").
 109                                  Calculated from the format_id, width, height,
 110                                  and format_note fields if missing.
 111                     * format_id  A short description of the format
 112                                  ("mp4_h264_opus" or "19").
 113                                 Technically optional, but strongly recommended.
 114                     * format_note Additional info about the format
 115                                  ("3D" or "DASH video")
 116                     * width      Width of the video, if known
 117                     * height     Height of the video, if known
 118                     * resolution Textual description of width and height
 119                     * tbr        Average bitrate of audio and video in KBit/s
 120                     * abr        Average audio bitrate in KBit/s
 121                     * acodec     Name of the audio codec in use
 122                     * asr        Audio sampling rate in Hertz
 123                     * vbr        Average video bitrate in KBit/s
 124                     * fps        Frame rate
 125                     * vcodec     Name of the video codec in use
 126                     * container  Name of the container format
 127                     * filesize   The number of bytes, if known in advance
 128                     * filesize_approx  An estimate for the number of bytes
 129                     * player_url SWF Player URL (used for rtmpdump).
 130                     * protocol   The protocol that will be used for the actual
 131                                  download, lower-case.
 132                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 133                                  "m3u8", "m3u8_native" or "http_dash_segments".
 134                     * fragment_base_url
 135                                  Base URL for fragments. Each fragment's path
 136                                  value (if present) will be relative to
 137                                  this URL.
 138                     * fragments  A list of fragments of a fragmented media.
 139                                  Each fragment entry must contain either an url
 140                                  or a path. If an url is present it should be
 141                                  considered by a client. Otherwise both path and
 142                                  fragment_base_url must be present. Here is
 143                                  the list of all potential fields:
 144                                  * "url" - fragment's URL
 145                                  * "path" - fragment's path relative to
 146                                             fragment_base_url
 147                                  * "duration" (optional, int or float)
 148                                  * "filesize" (optional, int)
 149                     * preference Order number of this format. If this field is
 150                                  present and not None, the formats get sorted
 151                                  by this field, regardless of all other values.
 152                                  -1 for default (order by other properties),
 153                                  -2 or smaller for less than default.
 154                                  < -1000 to hide the format (if there is
 155                                     another one which is strictly better)
 156                     * language   Language code, e.g. "de" or "en-US".
 157                     * language_preference  Is this in the language mentioned in
 158                                  the URL?
 159                                  10 if it's what the URL is about,
 160                                  -1 for default (don't know),
 161                                  -10 otherwise, other values reserved for now.
 162                     * quality    Order number of the video quality of this
 163                                  format, irrespective of the file format.
 164                                  -1 for default (order by other properties),
 165                                  -2 or smaller for less than default.
 166                     * source_preference  Order number for this video source
 167                                   (quality takes higher priority)
 168                                  -1 for default (order by other properties),
 169                                  -2 or smaller for less than default.
 170                     * http_headers  A dictionary of additional HTTP headers
 171                                  to add to the request.
 172                     * stretched_ratio  If given and not 1, indicates that the
 173                                  video's pixels are not square.
 174                                  width : height ratio as float.
 175                     * no_resume  The server does not support resuming the
 176                                  (HTTP or RTMP) download. Boolean.
 177                     * downloader_options  A dictionary of downloader options as
 178                                  described in FileDownloader
 179
 180     url:            Final video URL.
 181     ext:            Video filename extension.
 182     format:         The video format, defaults to ext (used for --get-format)
 183     player_url:     SWF Player URL (used for rtmpdump).
 184
 185     The following fields are optional:
 186
 187     alt_title:      A secondary title of the video.
 188     display_id:     An alternative identifier for the video, not necessarily
 189                     unique, but available before title. Typically, id is
 190                     something like "4234987", title "Dancing naked mole rats",
 191                     and display_id "dancing-naked-mole-rats"
 192     thumbnails:     A list of dictionaries, with the following entries:
 193                         * "id" (optional, string) - Thumbnail format ID
 194                         * "url"
 195                         * "preference" (optional, int) - quality of the image
 196                         * "width" (optional, int)
 197                         * "height" (optional, int)
 198                         * "resolution" (optional, string "{width}x{height}",
 199                                         deprecated)
 200                         * "filesize" (optional, int)
 201     thumbnail:      Full URL to a video thumbnail image.
 202     description:    Full video description.
 203     uploader:       Full name of the video uploader.
 204     license:        License name the video is licensed under.
 205     creator:        The creator of the video.
 206     release_date:   The date (YYYYMMDD) when the video was released.
 207     timestamp:      UNIX timestamp of the moment the video became available.
 208     upload_date:    Video upload date (YYYYMMDD).
 209                     If not explicitly set, calculated from timestamp.
 210     uploader_id:    Nickname or id of the video uploader.
 211     uploader_url:   Full URL to a personal webpage of the video uploader.
 212     location:       Physical location where the video was filmed.
 213     subtitles:      The available subtitles as a dictionary in the format
 214                     {tag: subformats}. "tag" is usually a language code, and
 215                     "subformats" is a list sorted from lower to higher
 216                     preference, each element is a dictionary with the "ext"
 217                     entry and one of:
 218                         * "data": The subtitles file contents
 219                         * "url": A URL pointing to the subtitles file
 220                     "ext" will be calculated from URL if missing
 221     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 222                     automatically generated captions
 223     duration:       Length of the video in seconds, as an integer or float.
 224     view_count:     How many users have watched the video on the platform.
 225     like_count:     Number of positive ratings of the video
 226     dislike_count:  Number of negative ratings of the video
 227     repost_count:   Number of reposts of the video
 228     average_rating: Average rating given by users, the scale used depends on the webpage
 229     comment_count:  Number of comments on the video
 230     comments:       A list of comments, each with one or more of the following
 231                     properties (all but one of text or html optional):
 232                         * "author" - human-readable name of the comment author
 233                         * "author_id" - user ID of the comment author
 234                         * "id" - Comment ID
 235                         * "html" - Comment as HTML
 236                         * "text" - Plain text of the comment
 237                         * "timestamp" - UNIX timestamp of comment
 238                         * "parent" - ID of the comment this one is replying to.
 239                                      Set to "root" to indicate that this is a
 240                                      comment to the original video.
 241     age_limit:      Age restriction for the video, as an integer (years)
 242     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 243                     should allow to get the same result again. (It will be set
 244                     by YoutubeDL if it's missing)
 245     categories:     A list of categories that the video falls in, for example
 246                     ["Sports", "Berlin"]
 247     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 248     is_live:        True, False, or None (=unknown). Whether this video is a
 249                     live stream that goes on instead of a fixed-length video.
 250     start_time:     Time in seconds where the reproduction should start, as
 251                     specified in the URL.
 252     end_time:       Time in seconds where the reproduction should end, as
 253                     specified in the URL.
 254     chapters:       A list of dictionaries, with the following entries:
 255                         * "start_time" - The start time of the chapter in seconds
 256                         * "end_time" - The end time of the chapter in seconds
 257                         * "title" (optional, string)
 258
 259     The following fields should only be used when the video belongs to some logical
 260     chapter or section:
 261
 262     chapter:        Name or title of the chapter the video belongs to.
 263     chapter_number: Number of the chapter the video belongs to, as an integer.
 264     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 265
 266     The following fields should only be used when the video is an episode of some
 267     series, programme or podcast:
 268
 269     series:         Title of the series or programme the video episode belongs to.
 270     season:         Title of the season the video episode belongs to.
 271     season_number:  Number of the season the video episode belongs to, as an integer.
 272     season_id:      Id of the season the video episode belongs to, as a unicode string.
 273     episode:        Title of the video episode. Unlike mandatory video title field,
 274                     this field should denote the exact title of the video episode
 275                     without any kind of decoration.
 276     episode_number: Number of the video episode within a season, as an integer.
 277     episode_id:     Id of the video episode, as a unicode string.
 278
 279     The following fields should only be used when the media is a track or a part of
 280     a music album:
 281
 282     track:          Title of the track.
 283     track_number:   Number of the track within an album or a disc, as an integer.
 284     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 285                     as a unicode string.
 286     artist:         Artist(s) of the track.
 287     genre:          Genre(s) of the track.
 288     album:          Title of the album the track belongs to.
 289     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 290     album_artist:   List of all artists appeared on the album (e.g.
 291                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 292                     and compilations).
 293     disc_number:    Number of the disc or other physical medium the track belongs to,
 294                     as an integer.
 295     release_year:   Year (YYYY) when the album was released.
 296
 297     Unless mentioned otherwise, the fields should be Unicode strings.
 298
 299     Unless mentioned otherwise, None is equivalent to absence of information.
 300
 301
 302     _type "playlist" indicates multiple videos.
 303     There must be a key "entries", which is a list, an iterable, or a PagedList
 304     object, each element of which is a valid dictionary by this specification.
 305
 306     Additionally, playlists can have "id", "title", "description", "uploader",
 307     "uploader_id", "uploader_url" attributes with the same semantics as videos
 308     (see above).
 309
 310
 311     _type "multi_video" indicates that there are multiple videos that
 312     form a single show, for example multiple acts of an opera or TV episode.
 313     It must have an entries key like a playlist and contain all the keys
 314     required for a video at the same time.
 315
 316
 317     _type "url" indicates that the video must be extracted from another
 318     location, possibly by a different extractor. Its only required key is:
 319     "url" - the next URL to extract.
 320     The key "ie_key" can be set to the class name (minus the trailing "IE",
 321     e.g. "Youtube") if the extractor class is known in advance.
 322     Additionally, the dictionary may have any properties of the resolved entity
 323     known in advance, for example "title" if the title of the referred video is
 324     known ahead of time.
 325
 326
 327     _type "url_transparent" entities have the same specification as "url", but
 328     indicate that the given additional information is more precise than the one
 329     associated with the resolved URL.
 330     This is useful when a site employs a video service that hosts the video and
 331     its technical metadata, but that video service does not embed a useful
 332     title, description etc.
 333
 334
 335     Subclasses of this one should re-define the _real_initialize() and
 336     _real_extract() methods and define a _VALID_URL regexp.
 337     Probably, they should also be added to the list of extractors.
 338
 339     _GEO_BYPASS attribute may be set to False in order to disable
 340     geo restriction bypass mechanisms for a particular extractor.
 341     Though it won't disable explicit geo restriction bypass based on
 342     country code provided with geo_bypass_country. (experimental)
 343
 344     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 345     countries for this extractor. One of these countries will be used by
 346     geo restriction bypass mechanism right away in order to bypass
 347     geo restriction, of course, if the mechanism is not disabled. (experimental)
 348
 349     NB: both these geo attributes are experimental and may change in future
 350     or be completely removed.
 351
 352     Finally, the _WORKING attribute should be set to False for broken IEs
 353     in order to warn the users and skip the tests.
 354     """
 355
    # Set to True by initialize() once _real_initialize() has run.
    _ready = False
    # Downloader object this extractor works for; assigned by set_downloader().
    _downloader = None
    # Fake client IP sent as X-Forwarded-For for geo bypass, None when unused.
    _x_forwarded_for_ip = None
    # Set to False in a subclass to disable the geo bypass mechanisms.
    _GEO_BYPASS = True
    # Optional list of presumably geo unrestricted countries for the extractor.
    _GEO_COUNTRIES = None
    # Set to False for broken extractors; reported by working().
    _WORKING = True
 362
 363     def __init__(self, downloader=None):
 364         """Constructor. Receives an optional downloader."""
 365         self._ready = False
 366         self._x_forwarded_for_ip = None
 367         self.set_downloader(downloader)
 368
 369     @classmethod
 370     def suitable(cls, url):
 371         """Receives a URL and returns True if suitable for this IE."""
 372
 373         # This does not use has/getattr intentionally - we want to know whether
 374         # we have cached the regexp for *this* class, whereas getattr would also
 375         # match the superclass
 376         if '_VALID_URL_RE' not in cls.__dict__:
 377             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 378         return cls._VALID_URL_RE.match(url) is not None
 379
 380     @classmethod
 381     def _match_id(cls, url):
 382         if '_VALID_URL_RE' not in cls.__dict__:
 383             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 384         m = cls._VALID_URL_RE.match(url)
 385         assert m
 386         return compat_str(m.group('id'))
 387
 388     @classmethod
 389     def working(cls):
 390         """Getter method for _WORKING."""
 391         return cls._WORKING
 392
 393     def initialize(self):
 394         """Initializes an instance (authentication, etc)."""
 395         self._initialize_geo_bypass(self._GEO_COUNTRIES)
 396         if not self._ready:
 397             self._real_initialize()
 398             self._ready = True
 399
 400     def _initialize_geo_bypass(self, countries):
 401         """
 402         Initialize geo restriction bypass mechanism.
 403
 404         This method is used to initialize geo bypass mechanism based on faking
 405         X-Forwarded-For HTTP header. A random country from provided country list
 406         is selected and a random IP belonging to this country is generated. This
 407         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 408         HTTP requests.
 409
 410         This method will be used for initial geo bypass mechanism initialization
 411         during the instance initialization with _GEO_COUNTRIES.
 412
 413         You may also manually call it from extractor's code if geo countries
 414         information is not available beforehand (e.g. obtained during
 415         extraction) or due to some another reason.
 416         """
 417         if not self._x_forwarded_for_ip:
 418             country_code = self._downloader.params.get('geo_bypass_country', None)
 419             # If there is no explicit country for geo bypass specified and
 420             # the extractor is known to be geo restricted let's fake IP
 421             # as X-Forwarded-For right away.
 422             if (not country_code and
 423                     self._GEO_BYPASS and
 424                     self._downloader.params.get('geo_bypass', True) and
 425                     countries):
 426                 country_code = random.choice(countries)
 427             if country_code:
 428                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 429                 if self._downloader.params.get('verbose', False):
 430                     self._downloader.to_screen(
 431                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 432                         % (self._x_forwarded_for_ip, country_code.upper()))
 433
 434     def extract(self, url):
 435         """Extracts URL information and returns it in list of dicts."""
 436         try:
 437             for _ in range(2):
 438                 try:
 439                     self.initialize()
 440                     ie_result = self._real_extract(url)
 441                     if self._x_forwarded_for_ip:
 442                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 443                     return ie_result
 444                 except GeoRestrictedError as e:
 445                     if self.__maybe_fake_ip_and_retry(e.countries):
 446                         continue
 447                     raise
 448         except ExtractorError:
 449             raise
 450         except compat_http_client.IncompleteRead as e:
 451             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 452         except (KeyError, StopIteration) as e:
 453             raise ExtractorError('An extractor error has occurred.', cause=e)
 454
 455     def __maybe_fake_ip_and_retry(self, countries):
 456         if (not self._downloader.params.get('geo_bypass_country', None) and
 457                 self._GEO_BYPASS and
 458                 self._downloader.params.get('geo_bypass', True) and
 459                 not self._x_forwarded_for_ip and
 460                 countries):
 461             country_code = random.choice(countries)
 462             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 463             if self._x_forwarded_for_ip:
 464                 self.report_warning(
 465                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 466                     % (self._x_forwarded_for_ip, country_code.upper()))
 467                 return True
 468         return False
 469
 470     def set_downloader(self, downloader):
 471         """Sets the downloader for this IE."""
 472         self._downloader = downloader
 473
 474     def _real_initialize(self):
 475         """Real initialization process. Redefine in subclasses."""
 476         pass
 477
 478     def _real_extract(self, url):
 479         """Real extraction process. Redefine in subclasses."""
 480         pass
 481
 482     @classmethod
 483     def ie_key(cls):
 484         """A string for getting the InfoExtractor with get_info_extractor"""
 485         return compat_str(cls.__name__[:-2])
 486
 487     @property
 488     def IE_NAME(self):
 489         return compat_str(type(self).__name__[:-2])
 490
 491     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 492         """ Returns the response handle """
 493         if note is None:
 494             self.report_download_webpage(video_id)
 495         elif note is not False:
 496             if video_id is None:
 497                 self.to_screen('%s' % (note,))
 498             else:
 499                 self.to_screen('%s: %s' % (video_id, note))
 500
 501         # Some sites check X-Forwarded-For HTTP header in order to figure out
 502         # the origin of the client behind proxy. This allows bypassing geo
 503         # restriction by faking this header's value to IP that belongs to some
 504         # geo unrestricted country. We will do so once we encounter any
 505         # geo restriction error.
 506         if self._x_forwarded_for_ip:
 507             if 'X-Forwarded-For' not in headers:
 508                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 509
 510         if isinstance(url_or_request, compat_urllib_request.Request):
 511             url_or_request = update_Request(
 512                 url_or_request, data=data, headers=headers, query=query)
 513         else:
 514             if query:
 515                 url_or_request = update_url_query(url_or_request, query)
 516             if data is not None or headers:
 517                 url_or_request = sanitized_Request(url_or_request, data, headers)
 518         try:
 519             return self._downloader.urlopen(url_or_request)
 520         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 521             if errnote is False:
 522                 return False
 523             if errnote is None:
 524                 errnote = 'Unable to download webpage'
 525
 526             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 527             if fatal:
 528                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 529             else:
 530                 self._downloader.report_warning(errmsg)
 531                 return False
 532
 533     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 534         """ Returns a tuple (page content as string, URL handle) """
 535         # Strip hashes from the URL (#1038)
 536         if isinstance(url_or_request, (compat_str, str)):
 537             url_or_request = url_or_request.partition('#')[0]
 538
 539         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 540         if urlh is False:
 541             assert not fatal
 542             return False
 543         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 544         return (content, urlh)
 545
 546     @staticmethod
 547     def _guess_encoding_from_content(content_type, webpage_bytes):
 548         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 549         if m:
 550             encoding = m.group(1)
 551         else:
 552             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 553                           webpage_bytes[:1024])
 554             if m:
 555                 encoding = m.group(1).decode('ascii')
 556             elif webpage_bytes.startswith(b'\xff\xfe'):
 557                 encoding = 'utf-16'
 558             else:
 559                 encoding = 'utf-8'
 560
 561         return encoding
 562
 563     def __check_blocked(self, content):
 564         first_block = content[:512]
 565         if ('<title>Access to this site is blocked</title>' in content and
 566                 'Websense' in first_block):
 567             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 568             blocked_iframe = self._html_search_regex(
 569                 r'<iframe src="([^"]+)"', content,
 570                 'Websense information URL', default=None)
 571             if blocked_iframe:
 572                 msg += ' Visit %s for more details' % blocked_iframe
 573             raise ExtractorError(msg, expected=True)
 574         if '<title>The URL you requested has been blocked</title>' in first_block:
 575             msg = (
 576                 'Access to this webpage has been blocked by Indian censorship. '
 577                 'Use a VPN or proxy server (with --proxy) to route around it.')
 578             block_msg = self._html_search_regex(
 579                 r'</h1><p>(.*?)</p>',
 580                 content, 'block message', default=None)
 581             if block_msg:
 582                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 583             raise ExtractorError(msg, expected=True)
 584         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
 585                 'blocklist.rkn.gov.ru' in content):
 586             raise ExtractorError(
 587                 'Access to this webpage has been blocked by decision of the Russian government. '
 588                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 589                 expected=True)
 590
 591     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 592         content_type = urlh.headers.get('Content-Type', '')
 593         webpage_bytes = urlh.read()
 594         if prefix is not None:
 595             webpage_bytes = prefix + webpage_bytes
 596         if not encoding:
 597             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 598         if self._downloader.params.get('dump_intermediate_pages', False):
 599             self.to_screen('Dumping request to ' + urlh.geturl())
 600             dump = base64.b64encode(webpage_bytes).decode('ascii')
 601             self._downloader.to_screen(dump)
 602         if self._downloader.params.get('write_pages', False):
 603             basen = '%s_%s' % (video_id, urlh.geturl())
 604             if len(basen) > 240:
 605                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 606                 basen = basen[:240 - len(h)] + h
 607             raw_filename = basen + '.dump'
 608             filename = sanitize_filename(raw_filename, restricted=True)
 609             self.to_screen('Saving request to ' + filename)
 610             # Working around MAX_PATH limitation on Windows (see
 611             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 612             if compat_os_name == 'nt':
 613                 absfilepath = os.path.abspath(filename)
 614                 if len(absfilepath) > 259:
 615                     filename = '\\\\?\\' + absfilepath
 616             with open(filename, 'wb') as outf:
 617                 outf.write(webpage_bytes)
 618
 619         try:
 620             content = webpage_bytes.decode(encoding, 'replace')
 621         except LookupError:
 622             content = webpage_bytes.decode('utf-8', 'replace')
 623
 624         self.__check_blocked(content)
 625
 626         return content
 627
 628     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 629         """ Returns the data of the page as a string """
 630         success = False
 631         try_count = 0
 632         while success is False:
 633             try:
 634                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 635                 success = True
 636             except compat_http_client.IncompleteRead as e:
 637                 try_count += 1
 638                 if try_count >= tries:
 639                     raise e
 640                 self._sleep(timeout, video_id)
 641         if res is False:
 642             return res
 643         else:
 644             content, _ = res
 645             return content
 646
 647     def _download_xml(self, url_or_request, video_id,
 648                       note='Downloading XML', errnote='Unable to download XML',
 649                       transform_source=None, fatal=True, encoding=None,
 650                       data=None, headers={}, query={}):
 651         """Return the xml as an xml.etree.ElementTree.Element"""
 652         xml_string = self._download_webpage(
 653             url_or_request, video_id, note, errnote, fatal=fatal,
 654             encoding=encoding, data=data, headers=headers, query=query)
 655         if xml_string is False:
 656             return xml_string
 657         return self._parse_xml(
 658             xml_string, video_id, transform_source=transform_source,
 659             fatal=fatal)
 660
 661     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 662         if transform_source:
 663             xml_string = transform_source(xml_string)
 664         try:
 665             return compat_etree_fromstring(xml_string.encode('utf-8'))
 666         except compat_xml_parse_error as ve:
 667             errmsg = '%s: Failed to parse XML ' % video_id
 668             if fatal:
 669                 raise ExtractorError(errmsg, cause=ve)
 670             else:
 671                 self.report_warning(errmsg + str(ve))
 672
 673     def _download_json(self, url_or_request, video_id,
 674                        note='Downloading JSON metadata',
 675                        errnote='Unable to download JSON metadata',
 676                        transform_source=None,
 677                        fatal=True, encoding=None, data=None, headers={}, query={}):
 678         json_string = self._download_webpage(
 679             url_or_request, video_id, note, errnote, fatal=fatal,
 680             encoding=encoding, data=data, headers=headers, query=query)
 681         if (not fatal) and json_string is False:
 682             return None
 683         return self._parse_json(
 684             json_string, video_id, transform_source=transform_source, fatal=fatal)
 685
 686     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 687         if transform_source:
 688             json_string = transform_source(json_string)
 689         try:
 690             return json.loads(json_string)
 691         except ValueError as ve:
 692             errmsg = '%s: Failed to parse JSON ' % video_id
 693             if fatal:
 694                 raise ExtractorError(errmsg, cause=ve)
 695             else:
 696                 self.report_warning(errmsg + str(ve))
 697
 698     def report_warning(self, msg, video_id=None):
 699         idstr = '' if video_id is None else '%s: ' % video_id
 700         self._downloader.report_warning(
 701             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 702
 703     def to_screen(self, msg):
 704         """Print msg to screen, prefixing it with '[ie_name]'"""
 705         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 706
 707     def report_extraction(self, id_or_name):
 708         """Report information extraction."""
 709         self.to_screen('%s: Extracting information' % id_or_name)
 710
 711     def report_download_webpage(self, video_id):
 712         """Report webpage download."""
 713         self.to_screen('%s: Downloading webpage' % video_id)
 714
 715     def report_age_confirmation(self):
 716         """Report attempt to confirm age."""
 717         self.to_screen('Confirming age')
 718
 719     def report_login(self):
 720         """Report attempt to log in."""
 721         self.to_screen('Logging in')
 722
 723     @staticmethod
 724     def raise_login_required(msg='This video is only available for registered users'):
 725         raise ExtractorError(
 726             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 727             expected=True)
 728
 729     @staticmethod
 730     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 731         raise GeoRestrictedError(msg, countries=countries)
 732
 733     # Methods for following #608
 734     @staticmethod
 735     def url_result(url, ie=None, video_id=None, video_title=None):
 736         """Returns a URL that points to a page that should be processed"""
 737         # TODO: ie should be the class used for getting the info
 738         video_info = {'_type': 'url',
 739                       'url': url,
 740                       'ie_key': ie}
 741         if video_id is not None:
 742             video_info['id'] = video_id
 743         if video_title is not None:
 744             video_info['title'] = video_title
 745         return video_info
 746
 747     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 748         urls = orderedSet(
 749             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 750             for m in matches)
 751         return self.playlist_result(
 752             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 753
 754     @staticmethod
 755     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 756         """Returns a playlist"""
 757         video_info = {'_type': 'playlist',
 758                       'entries': entries}
 759         if playlist_id:
 760             video_info['id'] = playlist_id
 761         if playlist_title:
 762             video_info['title'] = playlist_title
 763         if playlist_description:
 764             video_info['description'] = playlist_description
 765         return video_info
 766
 767     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 768         """
 769         Perform a regex search on the given string, using a single or a list of
 770         patterns returning the first matching group.
 771         In case of failure return a default value or raise a WARNING or a
 772         RegexNotFoundError, depending on fatal, specifying the field name.
 773         """
 774         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 775             mobj = re.search(pattern, string, flags)
 776         else:
 777             for p in pattern:
 778                 mobj = re.search(p, string, flags)
 779                 if mobj:
 780                     break
 781
 782         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 783             _name = '\033[0;34m%s\033[0m' % name
 784         else:
 785             _name = name
 786
 787         if mobj:
 788             if group is None:
 789                 # return the first matching group
 790                 return next(g for g in mobj.groups() if g is not None)
 791             else:
 792                 return mobj.group(group)
 793         elif default is not NO_DEFAULT:
 794             return default
 795         elif fatal:
 796             raise RegexNotFoundError('Unable to extract %s' % _name)
 797         else:
 798             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 799             return None
 800
 801     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 802         """
 803         Like _search_regex, but strips HTML tags and unescapes entities.
 804         """
 805         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 806         if res:
 807             return clean_html(res).strip()
 808         else:
 809             return res
 810
 811     def _get_netrc_login_info(self, netrc_machine=None):
 812         username = None
 813         password = None
 814         netrc_machine = netrc_machine or self._NETRC_MACHINE
 815
 816         if self._downloader.params.get('usenetrc', False):
 817             try:
 818                 info = netrc.netrc().authenticators(netrc_machine)
 819                 if info is not None:
 820                     username = info[0]
 821                     password = info[2]
 822                 else:
 823                     raise netrc.NetrcParseError(
 824                         'No authenticators for %s' % netrc_machine)
 825             except (IOError, netrc.NetrcParseError) as err:
 826                 self._downloader.report_warning(
 827                     'parsing .netrc: %s' % error_to_compat_str(err))
 828
 829         return username, password
 830
 831     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 832         """
 833         Get the login info as (username, password)
 834         First look for the manually specified credentials using username_option
 835         and password_option as keys in params dictionary. If no such credentials
 836         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 837         value.
 838         If there's no info available, return (None, None)
 839         """
 840         if self._downloader is None:
 841             return (None, None)
 842
 843         downloader_params = self._downloader.params
 844
 845         # Attempt to use provided username and password or .netrc data
 846         if downloader_params.get(username_option) is not None:
 847             username = downloader_params[username_option]
 848             password = downloader_params[password_option]
 849         else:
 850             username, password = self._get_netrc_login_info(netrc_machine)
 851
 852         return username, password
 853
 854     def _get_tfa_info(self, note='two-factor verification code'):
 855         """
 856         Get the two-factor authentication info
 857         TODO - asking the user will be required for sms/phone verify
 858         currently just uses the command line option
 859         If there's no info available, return None
 860         """
 861         if self._downloader is None:
 862             return None
 863         downloader_params = self._downloader.params
 864
 865         if downloader_params.get('twofactor') is not None:
 866             return downloader_params['twofactor']
 867
 868         return compat_getpass('Type %s and press [Return]: ' % note)
 869
 870     # Helper functions for extracting OpenGraph info
 871     @staticmethod
 872     def _og_regexes(prop):
 873         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 874         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 875                        % {'prop': re.escape(prop)})
 876         template = r'<meta[^>]+?%s[^>]+?%s'
 877         return [
 878             template % (property_re, content_re),
 879             template % (content_re, property_re),
 880         ]
 881
 882     @staticmethod
 883     def _meta_regex(prop):
 884         return r'''(?isx)<meta
 885                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 886                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 887
 888     def _og_search_property(self, prop, html, name=None, **kargs):
 889         if not isinstance(prop, (list, tuple)):
 890             prop = [prop]
 891         if name is None:
 892             name = 'OpenGraph %s' % prop[0]
 893         og_regexes = []
 894         for p in prop:
 895             og_regexes.extend(self._og_regexes(p))
 896         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 897         if escaped is None:
 898             return None
 899         return unescapeHTML(escaped)
 900
 901     def _og_search_thumbnail(self, html, **kargs):
 902         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 903
 904     def _og_search_description(self, html, **kargs):
 905         return self._og_search_property('description', html, fatal=False, **kargs)
 906
 907     def _og_search_title(self, html, **kargs):
 908         return self._og_search_property('title', html, **kargs)
 909
 910     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 911         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 912         if secure:
 913             regexes = self._og_regexes('video:secure_url') + regexes
 914         return self._html_search_regex(regexes, html, name, **kargs)
 915
 916     def _og_search_url(self, html, **kargs):
 917         return self._og_search_property('url', html, **kargs)
 918
 919     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 920         if not isinstance(name, (list, tuple)):
 921             name = [name]
 922         if display_name is None:
 923             display_name = name[0]
 924         return self._html_search_regex(
 925             [self._meta_regex(n) for n in name],
 926             html, display_name, fatal=fatal, group='content', **kwargs)
 927
 928     def _dc_search_uploader(self, html):
 929         return self._html_search_meta('dc.creator', html, 'uploader')
 930
 931     def _rta_search(self, html):
 932         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 933         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 934                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 935                      html):
 936             return 18
 937         return 0
 938
 939     def _media_rating_search(self, html):
 940         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 941         rating = self._html_search_meta('rating', html)
 942
 943         if not rating:
 944             return None
 945
 946         RATING_TABLE = {
 947             'safe for kids': 0,
 948             'general': 8,
 949             '14 years': 14,
 950             'mature': 17,
 951             'restricted': 19,
 952         }
 953         return RATING_TABLE.get(rating.lower())
 954
 955     def _family_friendly_search(self, html):
 956         # See http://schema.org/VideoObject
 957         family_friendly = self._html_search_meta(
 958             'isFamilyFriendly', html, default=None)
 959
 960         if not family_friendly:
 961             return None
 962
 963         RATING_TABLE = {
 964             '1': 0,
 965             'true': 0,
 966             '0': 18,
 967             'false': 18,
 968         }
 969         return RATING_TABLE.get(family_friendly.lower())
 970
 971     def _twitter_search_player(self, html):
 972         return self._html_search_meta('twitter:player', html,
 973                                       'twitter card player')
 974
 975     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 976         json_ld = self._search_regex(
 977             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 978             html, 'JSON-LD', group='json_ld', **kwargs)
 979         default = kwargs.get('default', NO_DEFAULT)
 980         if not json_ld:
 981             return default if default is not NO_DEFAULT else {}
 982         # JSON-LD may be malformed and thus `fatal` should be respected.
 983         # At the same time `default` may be passed that assumes `fatal=False`
 984         # for _search_regex. Let's simulate the same behavior here as well.
 985         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
 986         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 987
 988     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 989         if isinstance(json_ld, compat_str):
 990             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 991         if not json_ld:
 992             return {}
 993         info = {}
 994         if not isinstance(json_ld, (list, tuple, dict)):
 995             return info
 996         if isinstance(json_ld, dict):
 997             json_ld = [json_ld]
 998
 999         def extract_video_object(e):
1000             assert e['@type'] == 'VideoObject'
1001             info.update({
1002                 'url': e.get('contentUrl'),
1003                 'title': unescapeHTML(e.get('name')),
1004                 'description': unescapeHTML(e.get('description')),
1005                 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
1006                 'duration': parse_duration(e.get('duration')),
1007                 'timestamp': unified_timestamp(e.get('uploadDate')),
1008                 'filesize': float_or_none(e.get('contentSize')),
1009                 'tbr': int_or_none(e.get('bitrate')),
1010                 'width': int_or_none(e.get('width')),
1011                 'height': int_or_none(e.get('height')),
1012                 'view_count': int_or_none(e.get('interactionCount')),
1013             })
1014
1015         for e in json_ld:
1016             if e.get('@context') == 'http://schema.org':
1017                 item_type = e.get('@type')
1018                 if expected_type is not None and expected_type != item_type:
1019                     return info
1020                 if item_type in ('TVEpisode', 'Episode'):
1021                     info.update({
1022                         'episode': unescapeHTML(e.get('name')),
1023                         'episode_number': int_or_none(e.get('episodeNumber')),
1024                         'description': unescapeHTML(e.get('description')),
1025                     })
1026                     part_of_season = e.get('partOfSeason')
1027                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1028                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1029                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1030                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1031                         info['series'] = unescapeHTML(part_of_series.get('name'))
1032                 elif item_type in ('Article', 'NewsArticle'):
1033                     info.update({
1034                         'timestamp': parse_iso8601(e.get('datePublished')),
1035                         'title': unescapeHTML(e.get('headline')),
1036                         'description': unescapeHTML(e.get('articleBody')),
1037                     })
1038                 elif item_type == 'VideoObject':
1039                     extract_video_object(e)
1040                     continue
1041                 video = e.get('video')
1042                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1043                     extract_video_object(video)
1044                 break
1045         return dict((k, v) for k, v in info.items() if v is not None)
1046
1047     @staticmethod
1048     def _hidden_inputs(html):
1049         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1050         hidden_inputs = {}
1051         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1052             attrs = extract_attributes(input)
1053             if not input:
1054                 continue
1055             if attrs.get('type') not in ('hidden', 'submit'):
1056                 continue
1057             name = attrs.get('name') or attrs.get('id')
1058             value = attrs.get('value')
1059             if name and value is not None:
1060                 hidden_inputs[name] = value
1061         return hidden_inputs
1062
1063     def _form_hidden_inputs(self, form_id, html):
1064         form = self._search_regex(
1065             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1066             html, '%s form' % form_id, group='form')
1067         return self._hidden_inputs(form)
1068
1069     def _sort_formats(self, formats, field_preference=None):
1070         if not formats:
1071             raise ExtractorError('No video formats found')
1072
1073         for f in formats:
1074             # Automatically determine tbr when missing based on abr and vbr (improves
1075             # formats sorting in some cases)
1076             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1077                 f['tbr'] = f['abr'] + f['vbr']
1078
1079         def _formats_key(f):
1080             # TODO remove the following workaround
1081             from ..utils import determine_ext
1082             if not f.get('ext') and 'url' in f:
1083                 f['ext'] = determine_ext(f['url'])
1084
1085             if isinstance(field_preference, (list, tuple)):
1086                 return tuple(
1087                     f.get(field)
1088                     if f.get(field) is not None
1089                     else ('' if field == 'format_id' else -1)
1090                     for field in field_preference)
1091
1092             preference = f.get('preference')
1093             if preference is None:
1094                 preference = 0
1095                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
1096                     preference -= 0.5
1097
1098             protocol = f.get('protocol') or determine_protocol(f)
1099             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1100
1101             if f.get('vcodec') == 'none':  # audio only
1102                 preference -= 50
1103                 if self._downloader.params.get('prefer_free_formats'):
1104                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1105                 else:
1106                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1107                 ext_preference = 0
1108                 try:
1109                     audio_ext_preference = ORDER.index(f['ext'])
1110                 except ValueError:
1111                     audio_ext_preference = -1
1112             else:
1113                 if f.get('acodec') == 'none':  # video only
1114                     preference -= 40
1115                 if self._downloader.params.get('prefer_free_formats'):
1116                     ORDER = ['flv', 'mp4', 'webm']
1117                 else:
1118                     ORDER = ['webm', 'flv', 'mp4']
1119                 try:
1120                     ext_preference = ORDER.index(f['ext'])
1121                 except ValueError:
1122                     ext_preference = -1
1123                 audio_ext_preference = 0
1124
1125             return (
1126                 preference,
1127                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1128                 f.get('quality') if f.get('quality') is not None else -1,
1129                 f.get('tbr') if f.get('tbr') is not None else -1,
1130                 f.get('filesize') if f.get('filesize') is not None else -1,
1131                 f.get('vbr') if f.get('vbr') is not None else -1,
1132                 f.get('height') if f.get('height') is not None else -1,
1133                 f.get('width') if f.get('width') is not None else -1,
1134                 proto_preference,
1135                 ext_preference,
1136                 f.get('abr') if f.get('abr') is not None else -1,
1137                 audio_ext_preference,
1138                 f.get('fps') if f.get('fps') is not None else -1,
1139                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1140                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1141                 f.get('format_id') if f.get('format_id') is not None else '',
1142             )
1143         formats.sort(key=_formats_key)
1144
1145     def _check_formats(self, formats, video_id):
1146         if formats:
1147             formats[:] = filter(
1148                 lambda f: self._is_valid_url(
1149                     f['url'], video_id,
1150                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1151                 formats)
1152
1153     @staticmethod
1154     def _remove_duplicate_formats(formats):
1155         format_urls = set()
1156         unique_formats = []
1157         for f in formats:
1158             if f['url'] not in format_urls:
1159                 format_urls.add(f['url'])
1160                 unique_formats.append(f)
1161         formats[:] = unique_formats
1162
1163     def _is_valid_url(self, url, video_id, item='video', headers={}):
1164         url = self._proto_relative_url(url, scheme='http:')
1165         # For now assume non HTTP(S) URLs always valid
1166         if not (url.startswith('http://') or url.startswith('https://')):
1167             return True
1168         try:
1169             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1170             return True
1171         except ExtractorError as e:
1172             if isinstance(e.cause, compat_urllib_error.URLError):
1173                 self.to_screen(
1174                     '%s: %s URL is invalid, skipping' % (video_id, item))
1175                 return False
1176             raise
1177
1178     def http_scheme(self):
1179         """ Either "http:" or "https:", depending on the user's preferences """
1180         return (
1181             'http:'
1182             if self._downloader.params.get('prefer_insecure', False)
1183             else 'https:')
1184
1185     def _proto_relative_url(self, url, scheme=None):
1186         if url is None:
1187             return url
1188         if url.startswith('//'):
1189             if scheme is None:
1190                 scheme = self.http_scheme()
1191             return scheme + url
1192         else:
1193             return url
1194
1195     def _sleep(self, timeout, video_id, msg_template=None):
1196         if msg_template is None:
1197             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1198         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1199         self.to_screen(msg)
1200         time.sleep(timeout)
1201
1202     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1203                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1204                              fatal=True, m3u8_id=None):
1205         manifest = self._download_xml(
1206             manifest_url, video_id, 'Downloading f4m manifest',
1207             'Unable to download f4m manifest',
1208             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1209             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1210             transform_source=transform_source,
1211             fatal=fatal)
1212
1213         if manifest is False:
1214             return []
1215
1216         return self._parse_f4m_formats(
1217             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1218             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1219
1220     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1221                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1222                            fatal=True, m3u8_id=None):
1223         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1224         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1225         if akamai_pv is not None and ';' in akamai_pv.text:
1226             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1227             if playerVerificationChallenge.strip() != '':
1228                 return []
1229
1230         formats = []
1231         manifest_version = '1.0'
1232         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1233         if not media_nodes:
1234             manifest_version = '2.0'
1235             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1236         # Remove unsupported DRM protected media from final formats
1237         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1238         media_nodes = remove_encrypted_media(media_nodes)
1239         if not media_nodes:
1240             return formats
1241
1242         manifest_base_url = get_base_url(manifest)
1243
1244         bootstrap_info = xpath_element(
1245             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1246             'bootstrap info', default=None)
1247
1248         vcodec = None
1249         mime_type = xpath_text(
1250             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1251             'base URL', default=None)
1252         if mime_type and mime_type.startswith('audio/'):
1253             vcodec = 'none'
1254
1255         for i, media_el in enumerate(media_nodes):
1256             tbr = int_or_none(media_el.attrib.get('bitrate'))
1257             width = int_or_none(media_el.attrib.get('width'))
1258             height = int_or_none(media_el.attrib.get('height'))
1259             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1260             # If <bootstrapInfo> is present, the specified f4m is a
1261             # stream-level manifest, and only set-level manifests may refer to
1262             # external resources.  See section 11.4 and section 4 of F4M spec
1263             if bootstrap_info is None:
1264                 media_url = None
1265                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1266                 if manifest_version == '2.0':
1267                     media_url = media_el.attrib.get('href')
1268                 if media_url is None:
1269                     media_url = media_el.attrib.get('url')
1270                 if not media_url:
1271                     continue
1272                 manifest_url = (
1273                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1274                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1275                 # If media_url is itself a f4m manifest do the recursive extraction
1276                 # since bitrates in parent manifest (this one) and media_url manifest
1277                 # may differ leading to inability to resolve the format by requested
1278                 # bitrate in f4m downloader
1279                 ext = determine_ext(manifest_url)
1280                 if ext == 'f4m':
1281                     f4m_formats = self._extract_f4m_formats(
1282                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1283                         transform_source=transform_source, fatal=fatal)
1284                     # Sometimes stream-level manifest contains single media entry that
1285                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1286                     # At the same time parent's media entry in set-level manifest may
1287                     # contain it. We will copy it from parent in such cases.
1288                     if len(f4m_formats) == 1:
1289                         f = f4m_formats[0]
1290                         f.update({
1291                             'tbr': f.get('tbr') or tbr,
1292                             'width': f.get('width') or width,
1293                             'height': f.get('height') or height,
1294                             'format_id': f.get('format_id') if not tbr else format_id,
1295                             'vcodec': vcodec,
1296                         })
1297                     formats.extend(f4m_formats)
1298                     continue
1299                 elif ext == 'm3u8':
1300                     formats.extend(self._extract_m3u8_formats(
1301                         manifest_url, video_id, 'mp4', preference=preference,
1302                         m3u8_id=m3u8_id, fatal=fatal))
1303                     continue
1304             formats.append({
1305                 'format_id': format_id,
1306                 'url': manifest_url,
1307                 'manifest_url': manifest_url,
1308                 'ext': 'flv' if bootstrap_info is not None else None,
1309                 'protocol': 'f4m',
1310                 'tbr': tbr,
1311                 'width': width,
1312                 'height': height,
1313                 'vcodec': vcodec,
1314                 'preference': preference,
1315             })
1316         return formats
1317
1318     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1319         return {
1320             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1321             'url': m3u8_url,
1322             'ext': ext,
1323             'protocol': 'm3u8',
1324             'preference': preference - 100 if preference else -100,
1325             'resolution': 'multiple',
1326             'format_note': 'Quality selection URL',
1327         }
1328
1329     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1330                               entry_protocol='m3u8', preference=None,
1331                               m3u8_id=None, note=None, errnote=None,
1332                               fatal=True, live=False):
1333         res = self._download_webpage_handle(
1334             m3u8_url, video_id,
1335             note=note or 'Downloading m3u8 information',
1336             errnote=errnote or 'Failed to download m3u8 information',
1337             fatal=fatal)
1338
1339         if res is False:
1340             return []
1341
1342         m3u8_doc, urlh = res
1343         m3u8_url = urlh.geturl()
1344
1345         return self._parse_m3u8_formats(
1346             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1347             preference=preference, m3u8_id=m3u8_id, live=live)
1348
1349     def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1350                             entry_protocol='m3u8', preference=None,
1351                             m3u8_id=None, live=False):
1352         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1353             return []
1354
1355         if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
1356             return []
1357
1358         formats = []
1359
1360         format_url = lambda u: (
1361             u
1362             if re.match(r'^https?://', u)
1363             else compat_urlparse.urljoin(m3u8_url, u))
1364
1365         # References:
1366         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1367         # 2. https://github.com/rg3/youtube-dl/issues/12211
1368
1369         # We should try extracting formats only from master playlists [1, 4.3.4],
1370         # i.e. playlists that describe available qualities. On the other hand
1371         # media playlists [1, 4.3.3] should be returned as is since they contain
1372         # just the media without qualities renditions.
1373         # Fortunately, master playlist can be easily distinguished from media
1374         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1375         # master playlist tags MUST NOT appear in a media playist and vice versa.
1376         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1377         # media playlist and MUST NOT appear in master playlist thus we can
1378         # clearly detect media playlist with this criterion.
1379
1380         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1381             return [{
1382                 'url': m3u8_url,
1383                 'format_id': m3u8_id,
1384                 'ext': ext,
1385                 'protocol': entry_protocol,
1386                 'preference': preference,
1387             }]
1388
1389         groups = {}
1390         last_stream_inf = {}
1391
1392         def extract_media(x_media_line):
1393             media = parse_m3u8_attributes(x_media_line)
1394             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1395             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1396             if not (media_type and group_id and name):
1397                 return
1398             groups.setdefault(group_id, []).append(media)
1399             if media_type not in ('VIDEO', 'AUDIO'):
1400                 return
1401             media_url = media.get('URI')
1402             if media_url:
1403                 format_id = []
1404                 for v in (m3u8_id, group_id, name):
1405                     if v:
1406                         format_id.append(v)
1407                 f = {
1408                     'format_id': '-'.join(format_id),
1409                     'url': format_url(media_url),
1410                     'manifest_url': m3u8_url,
1411                     'language': media.get('LANGUAGE'),
1412                     'ext': ext,
1413                     'protocol': entry_protocol,
1414                     'preference': preference,
1415                 }
1416                 if media_type == 'AUDIO':
1417                     f['vcodec'] = 'none'
1418                 formats.append(f)
1419
1420         def build_stream_name():
1421             # Despite specification does not mention NAME attribute for
1422             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
1423             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
1424             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
1425             stream_name = last_stream_inf.get('NAME')
1426             if stream_name:
1427                 return stream_name
1428             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1429             # from corresponding rendition group
1430             stream_group_id = last_stream_inf.get('VIDEO')
1431             if not stream_group_id:
1432                 return
1433             stream_group = groups.get(stream_group_id)
1434             if not stream_group:
1435                 return stream_group_id
1436             rendition = stream_group[0]
1437             return rendition.get('NAME') or stream_group_id
1438
1439         for line in m3u8_doc.splitlines():
1440             if line.startswith('#EXT-X-STREAM-INF:'):
1441                 last_stream_inf = parse_m3u8_attributes(line)
1442             elif line.startswith('#EXT-X-MEDIA:'):
1443                 extract_media(line)
1444             elif line.startswith('#') or not line.strip():
1445                 continue
1446             else:
1447                 tbr = float_or_none(
1448                     last_stream_inf.get('AVERAGE-BANDWIDTH') or
1449                     last_stream_inf.get('BANDWIDTH'), scale=1000)
1450                 format_id = []
1451                 if m3u8_id:
1452                     format_id.append(m3u8_id)
1453                 stream_name = build_stream_name()
1454                 # Bandwidth of live streams may differ over time thus making
1455                 # format_id unpredictable. So it's better to keep provided
1456                 # format_id intact.
1457                 if not live:
1458                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1459                 manifest_url = format_url(line.strip())
1460                 f = {
1461                     'format_id': '-'.join(format_id),
1462                     'url': manifest_url,
1463                     'manifest_url': m3u8_url,
1464                     'tbr': tbr,
1465                     'ext': ext,
1466                     'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
1467                     'protocol': entry_protocol,
1468                     'preference': preference,
1469                 }
1470                 resolution = last_stream_inf.get('RESOLUTION')
1471                 if resolution:
1472                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1473                     if mobj:
1474                         f['width'] = int(mobj.group('width'))
1475                         f['height'] = int(mobj.group('height'))
1476                 # Unified Streaming Platform
1477                 mobj = re.search(
1478                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1479                 if mobj:
1480                     abr, vbr = mobj.groups()
1481                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1482                     f.update({
1483                         'vbr': vbr,
1484                         'abr': abr,
1485                     })
1486                 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1487                 f.update(codecs)
1488                 audio_group_id = last_stream_inf.get('AUDIO')
1489                 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
1490                 # references a rendition group MUST have a CODECS attribute.
1491                 # However, this is not always respected, for example, [2]
1492                 # contains EXT-X-STREAM-INF tag which references AUDIO
1493                 # rendition group but does not have CODECS and despite
1494                 # referencing audio group an audio group, it represents
1495                 # a complete (with audio and video) format. So, for such cases
1496                 # we will ignore references to rendition groups and treat them
1497                 # as complete formats.
1498                 if audio_group_id and codecs and f.get('vcodec') != 'none':
1499                     audio_group = groups.get(audio_group_id)
1500                     if audio_group and audio_group[0].get('URI'):
1501                         # TODO: update acodec for audio only formats with
1502                         # the same GROUP-ID
1503                         f['acodec'] = 'none'
1504                 formats.append(f)
1505                 last_stream_inf = {}
1506         return formats
1507
1508     @staticmethod
1509     def _xpath_ns(path, namespace=None):
1510         if not namespace:
1511             return path
1512         out = []
1513         for c in path.split('/'):
1514             if not c or c == '.':
1515                 out.append(c)
1516             else:
1517                 out.append('{%s}%s' % (namespace, c))
1518         return '/'.join(out)
1519
1520     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1521         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1522
1523         if smil is False:
1524             assert not fatal
1525             return []
1526
1527         namespace = self._parse_smil_namespace(smil)
1528
1529         return self._parse_smil_formats(
1530             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1531
1532     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1533         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1534         if smil is False:
1535             return {}
1536         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1537
1538     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1539         return self._download_xml(
1540             smil_url, video_id, 'Downloading SMIL file',
1541             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1542
1543     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1544         namespace = self._parse_smil_namespace(smil)
1545
1546         formats = self._parse_smil_formats(
1547             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1548         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1549
1550         video_id = os.path.splitext(url_basename(smil_url))[0]
1551         title = None
1552         description = None
1553         upload_date = None
1554         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1555             name = meta.attrib.get('name')
1556             content = meta.attrib.get('content')
1557             if not name or not content:
1558                 continue
1559             if not title and name == 'title':
1560                 title = content
1561             elif not description and name in ('description', 'abstract'):
1562                 description = content
1563             elif not upload_date and name == 'date':
1564                 upload_date = unified_strdate(content)
1565
1566         thumbnails = [{
1567             'id': image.get('type'),
1568             'url': image.get('src'),
1569             'width': int_or_none(image.get('width')),
1570             'height': int_or_none(image.get('height')),
1571         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1572
1573         return {
1574             'id': video_id,
1575             'title': title or video_id,
1576             'description': description,
1577             'upload_date': upload_date,
1578             'thumbnails': thumbnails,
1579             'formats': formats,
1580             'subtitles': subtitles,
1581         }
1582
1583     def _parse_smil_namespace(self, smil):
1584         return self._search_regex(
1585             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1586
1587     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1588         base = smil_url
1589         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1590             b = meta.get('base') or meta.get('httpBase')
1591             if b:
1592                 base = b
1593                 break
1594
1595         formats = []
1596         rtmp_count = 0
1597         http_count = 0
1598         m3u8_count = 0
1599
1600         srcs = []
1601         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1602         for medium in media:
1603             src = medium.get('src')
1604             if not src or src in srcs:
1605                 continue
1606             srcs.append(src)
1607
1608             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1609             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1610             width = int_or_none(medium.get('width'))
1611             height = int_or_none(medium.get('height'))
1612             proto = medium.get('proto')
1613             ext = medium.get('ext')
1614             src_ext = determine_ext(src)
1615             streamer = medium.get('streamer') or base
1616
1617             if proto == 'rtmp' or streamer.startswith('rtmp'):
1618                 rtmp_count += 1
1619                 formats.append({
1620                     'url': streamer,
1621                     'play_path': src,
1622                     'ext': 'flv',
1623                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1624                     'tbr': bitrate,
1625                     'filesize': filesize,
1626                     'width': width,
1627                     'height': height,
1628                 })
1629                 if transform_rtmp_url:
1630                     streamer, src = transform_rtmp_url(streamer, src)
1631                     formats[-1].update({
1632                         'url': streamer,
1633                         'play_path': src,
1634                     })
1635                 continue
1636
1637             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1638             src_url = src_url.strip()
1639
1640             if proto == 'm3u8' or src_ext == 'm3u8':
1641                 m3u8_formats = self._extract_m3u8_formats(
1642                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1643                 if len(m3u8_formats) == 1:
1644                     m3u8_count += 1
1645                     m3u8_formats[0].update({
1646                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1647                         'tbr': bitrate,
1648                         'width': width,
1649                         'height': height,
1650                     })
1651                 formats.extend(m3u8_formats)
1652                 continue
1653
1654             if src_ext == 'f4m':
1655                 f4m_url = src_url
1656                 if not f4m_params:
1657                     f4m_params = {
1658                         'hdcore': '3.2.0',
1659                         'plugin': 'flowplayer-3.2.0.1',
1660                     }
1661                 f4m_url += '&' if '?' in f4m_url else '?'
1662                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1663                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1664                 continue
1665
1666             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1667                 http_count += 1
1668                 formats.append({
1669                     'url': src_url,
1670                     'ext': ext or src_ext or 'flv',
1671                     'format_id': 'http-%d' % (bitrate or http_count),
1672                     'tbr': bitrate,
1673                     'filesize': filesize,
1674                     'width': width,
1675                     'height': height,
1676                 })
1677                 continue
1678
1679         return formats
1680
1681     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1682         urls = []
1683         subtitles = {}
1684         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1685             src = textstream.get('src')
1686             if not src or src in urls:
1687                 continue
1688             urls.append(src)
1689             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1690             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1691             subtitles.setdefault(lang, []).append({
1692                 'url': src,
1693                 'ext': ext,
1694             })
1695         return subtitles
1696
1697     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1698         xspf = self._download_xml(
1699             playlist_url, playlist_id, 'Downloading xpsf playlist',
1700             'Unable to download xspf manifest', fatal=fatal)
1701         if xspf is False:
1702             return []
1703         return self._parse_xspf(xspf, playlist_id)
1704
1705     def _parse_xspf(self, playlist, playlist_id):
1706         NS_MAP = {
1707             'xspf': 'http://xspf.org/ns/0/',
1708             's1': 'http://static.streamone.nl/player/ns/0',
1709         }
1710
1711         entries = []
1712         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1713             title = xpath_text(
1714                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1715             description = xpath_text(
1716                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1717             thumbnail = xpath_text(
1718                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1719             duration = float_or_none(
1720                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1721
1722             formats = [{
1723                 'url': location.text,
1724                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1725                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1726                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1727             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1728             self._sort_formats(formats)
1729
1730             entries.append({
1731                 'id': playlist_id,
1732                 'title': title,
1733                 'description': description,
1734                 'thumbnail': thumbnail,
1735                 'duration': duration,
1736                 'formats': formats,
1737             })
1738         return entries
1739
1740     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1741         res = self._download_webpage_handle(
1742             mpd_url, video_id,
1743             note=note or 'Downloading MPD manifest',
1744             errnote=errnote or 'Failed to download MPD manifest',
1745             fatal=fatal)
1746         if res is False:
1747             return []
1748         mpd, urlh = res
1749         mpd_base_url = base_url(urlh.geturl())
1750
1751         return self._parse_mpd_formats(
1752             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1753             formats_dict=formats_dict, mpd_url=mpd_url)
1754
1755     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1756         """
1757         Parse formats from MPD manifest.
1758         References:
1759          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1760             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1761          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1762         """
1763         if mpd_doc.get('type') == 'dynamic':
1764             return []
1765
1766         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1767
1768         def _add_ns(path):
1769             return self._xpath_ns(path, namespace)
1770
1771         def is_drm_protected(element):
1772             return element.find(_add_ns('ContentProtection')) is not None
1773
1774         def extract_multisegment_info(element, ms_parent_info):
1775             ms_info = ms_parent_info.copy()
1776
1777             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1778             # common attributes and elements.  We will only extract relevant
1779             # for us.
1780             def extract_common(source):
1781                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1782                 if segment_timeline is not None:
1783                     s_e = segment_timeline.findall(_add_ns('S'))
1784                     if s_e:
1785                         ms_info['total_number'] = 0
1786                         ms_info['s'] = []
1787                         for s in s_e:
1788                             r = int(s.get('r', 0))
1789                             ms_info['total_number'] += 1 + r
1790                             ms_info['s'].append({
1791                                 't': int(s.get('t', 0)),
1792                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1793                                 'd': int(s.attrib['d']),
1794                                 'r': r,
1795                             })
1796                 start_number = source.get('startNumber')
1797                 if start_number:
1798                     ms_info['start_number'] = int(start_number)
1799                 timescale = source.get('timescale')
1800                 if timescale:
1801                     ms_info['timescale'] = int(timescale)
1802                 segment_duration = source.get('duration')
1803                 if segment_duration:
1804                     ms_info['segment_duration'] = float(segment_duration)
1805
1806             def extract_Initialization(source):
1807                 initialization = source.find(_add_ns('Initialization'))
1808                 if initialization is not None:
1809                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1810
1811             segment_list = element.find(_add_ns('SegmentList'))
1812             if segment_list is not None:
1813                 extract_common(segment_list)
1814                 extract_Initialization(segment_list)
1815                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1816                 if segment_urls_e:
1817                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1818             else:
1819                 segment_template = element.find(_add_ns('SegmentTemplate'))
1820                 if segment_template is not None:
1821                     extract_common(segment_template)
1822                     media = segment_template.get('media')
1823                     if media:
1824                         ms_info['media'] = media
1825                     initialization = segment_template.get('initialization')
1826                     if initialization:
1827                         ms_info['initialization'] = initialization
1828                     else:
1829                         extract_Initialization(segment_template)
1830             return ms_info
1831
1832         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1833         formats = []
1834         for period in mpd_doc.findall(_add_ns('Period')):
1835             period_duration = parse_duration(period.get('duration')) or mpd_duration
1836             period_ms_info = extract_multisegment_info(period, {
1837                 'start_number': 1,
1838                 'timescale': 1,
1839             })
1840             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1841                 if is_drm_protected(adaptation_set):
1842                     continue
1843                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1844                 for representation in adaptation_set.findall(_add_ns('Representation')):
1845                     if is_drm_protected(representation):
1846                         continue
1847                     representation_attrib = adaptation_set.attrib.copy()
1848                     representation_attrib.update(representation.attrib)
1849                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1850                     mime_type = representation_attrib['mimeType']
1851                     content_type = mime_type.split('/')[0]
1852                     if content_type == 'text':
1853                         # TODO implement WebVTT downloading
1854                         pass
1855                     elif content_type in ('video', 'audio'):
1856                         base_url = ''
1857                         for element in (representation, adaptation_set, period, mpd_doc):
1858                             base_url_e = element.find(_add_ns('BaseURL'))
1859                             if base_url_e is not None:
1860                                 base_url = base_url_e.text + base_url
1861                                 if re.match(r'^https?://', base_url):
1862                                     break
1863                         if mpd_base_url and not re.match(r'^https?://', base_url):
1864                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1865                                 mpd_base_url += '/'
1866                             base_url = mpd_base_url + base_url
1867                         representation_id = representation_attrib.get('id')
1868                         lang = representation_attrib.get('lang')
1869                         url_el = representation.find(_add_ns('BaseURL'))
1870                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1871                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1872                         f = {
1873                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1874                             'url': base_url,
1875                             'manifest_url': mpd_url,
1876                             'ext': mimetype2ext(mime_type),
1877                             'width': int_or_none(representation_attrib.get('width')),
1878                             'height': int_or_none(representation_attrib.get('height')),
1879                             'tbr': float_or_none(bandwidth, 1000),
1880                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1881                             'fps': int_or_none(representation_attrib.get('frameRate')),
1882                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1883                             'format_note': 'DASH %s' % content_type,
1884                             'filesize': filesize,
1885                             'container': mimetype2ext(mime_type) + '_dash',
1886                         }
1887                         f.update(parse_codecs(representation_attrib.get('codecs')))
1888                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1889
1890                         def prepare_template(template_name, identifiers):
1891                             t = representation_ms_info[template_name]
1892                             t = t.replace('$RepresentationID$', representation_id)
1893                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1894                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1895                             t.replace('$$', '$')
1896                             return t
1897
1898                         # @initialization is a regular template like @media one
1899                         # so it should be handled just the same way (see
1900                         # https://github.com/rg3/youtube-dl/issues/11605)
1901                         if 'initialization' in representation_ms_info:
1902                             initialization_template = prepare_template(
1903                                 'initialization',
1904                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1905                                 # $Time$ shall not be included for @initialization thus
1906                                 # only $Bandwidth$ remains
1907                                 ('Bandwidth', ))
1908                             representation_ms_info['initialization_url'] = initialization_template % {
1909                                 'Bandwidth': bandwidth,
1910                             }
1911
1912                         def location_key(location):
1913                             return 'url' if re.match(r'^https?://', location) else 'path'
1914
1915                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1916
1917                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1918                             media_location_key = location_key(media_template)
1919
1920                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1921                             # can't be used at the same time
1922                             if '%(Number' in media_template and 's' not in representation_ms_info:
1923                                 segment_duration = None
1924                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
1925                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1926                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1927                                 representation_ms_info['fragments'] = [{
1928                                     media_location_key: media_template % {
1929                                         'Number': segment_number,
1930                                         'Bandwidth': bandwidth,
1931                                     },
1932                                     'duration': segment_duration,
1933                                 } for segment_number in range(
1934                                     representation_ms_info['start_number'],
1935                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1936                             else:
1937                                 # $Number*$ or $Time$ in media template with S list available
1938                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1939                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1940                                 representation_ms_info['fragments'] = []
1941                                 segment_time = 0
1942                                 segment_d = None
1943                                 segment_number = representation_ms_info['start_number']
1944
1945                                 def add_segment_url():
1946                                     segment_url = media_template % {
1947                                         'Time': segment_time,
1948                                         'Bandwidth': bandwidth,
1949                                         'Number': segment_number,
1950                                     }
1951                                     representation_ms_info['fragments'].append({
1952                                         media_location_key: segment_url,
1953                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1954                                     })
1955
1956                                 for num, s in enumerate(representation_ms_info['s']):
1957                                     segment_time = s.get('t') or segment_time
1958                                     segment_d = s['d']
1959                                     add_segment_url()
1960                                     segment_number += 1
1961                                     for r in range(s.get('r', 0)):
1962                                         segment_time += segment_d
1963                                         add_segment_url()
1964                                         segment_number += 1
1965                                     segment_time += segment_d
1966                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1967                             # No media template
1968                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1969                             # or any YouTube dashsegments video
1970                             fragments = []
1971                             segment_index = 0
1972                             timescale = representation_ms_info['timescale']
1973                             for s in representation_ms_info['s']:
1974                                 duration = float_or_none(s['d'], timescale)
1975                                 for r in range(s.get('r', 0) + 1):
1976                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
1977                                     fragments.append({
1978                                         location_key(segment_uri): segment_uri,
1979                                         'duration': duration,
1980                                     })
1981                                     segment_index += 1
1982                             representation_ms_info['fragments'] = fragments
1983                         elif 'segment_urls' in representation_ms_info:
1984                             # Segment URLs with no SegmentTimeline
1985                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
1986                             # https://github.com/rg3/youtube-dl/pull/14844
1987                             fragments = []
1988                             segment_duration = float_or_none(
1989                                 representation_ms_info['segment_duration'],
1990                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
1991                             for segment_url in representation_ms_info['segment_urls']:
1992                                 fragment = {
1993                                     location_key(segment_url): segment_url,
1994                                 }
1995                                 if segment_duration:
1996                                     fragment['duration'] = segment_duration
1997                                 fragments.append(fragment)
1998                             representation_ms_info['fragments'] = fragments
1999                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2000                         # No fragments key is present in this case.
2001                         if 'fragments' in representation_ms_info:
2002                             f.update({
2003                                 'fragment_base_url': base_url,
2004                                 'fragments': [],
2005                                 'protocol': 'http_dash_segments',
2006                             })
2007                             if 'initialization_url' in representation_ms_info:
2008                                 initialization_url = representation_ms_info['initialization_url']
2009                                 if not f.get('url'):
2010                                     f['url'] = initialization_url
2011                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2012                             f['fragments'].extend(representation_ms_info['fragments'])
2013                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2014                         # is not necessarily unique within a Period thus formats with
2015                         # the same `format_id` are quite possible. There are numerous examples
2016                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2017                         # https://github.com/rg3/youtube-dl/issues/13919)
2018                         full_info = formats_dict.get(representation_id, {}).copy()
2019                         full_info.update(f)
2020                         formats.append(full_info)
2021                     else:
2022                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2023         return formats
2024
2025     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2026         res = self._download_webpage_handle(
2027             ism_url, video_id,
2028             note=note or 'Downloading ISM manifest',
2029             errnote=errnote or 'Failed to download ISM manifest',
2030             fatal=fatal)
2031         if res is False:
2032             return []
2033         ism, urlh = res
2034
2035         return self._parse_ism_formats(
2036             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
2037
2038     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2039         """
2040         Parse formats from ISM manifest.
2041         References:
2042          1. [MS-SSTR]: Smooth Streaming Protocol,
2043             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2044         """
2045         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2046             return []
2047
2048         duration = int(ism_doc.attrib['Duration'])
2049         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2050
2051         formats = []
2052         for stream in ism_doc.findall('StreamIndex'):
2053             stream_type = stream.get('Type')
2054             if stream_type not in ('video', 'audio'):
2055                 continue
2056             url_pattern = stream.attrib['Url']
2057             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2058             stream_name = stream.get('Name')
2059             for track in stream.findall('QualityLevel'):
2060                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2061                 # TODO: add support for WVC1 and WMAP
2062                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2063                     self.report_warning('%s is not a supported codec' % fourcc)
2064                     continue
2065                 tbr = int(track.attrib['Bitrate']) // 1000
2066                 # [1] does not mention Width and Height attributes. However,
2067                 # they're often present while MaxWidth and MaxHeight are
2068                 # missing, so should be used as fallbacks
2069                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2070                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2071                 sampling_rate = int_or_none(track.get('SamplingRate'))
2072
2073                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2074                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2075
2076                 fragments = []
2077                 fragment_ctx = {
2078                     'time': 0,
2079                 }
2080                 stream_fragments = stream.findall('c')
2081                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2082                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2083                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2084                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2085                     if not fragment_ctx['duration']:
2086                         try:
2087                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2088                         except IndexError:
2089                             next_fragment_time = duration
2090                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2091                     for _ in range(fragment_repeat):
2092                         fragments.append({
2093                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2094                             'duration': fragment_ctx['duration'] / stream_timescale,
2095                         })
2096                         fragment_ctx['time'] += fragment_ctx['duration']
2097
2098                 format_id = []
2099                 if ism_id:
2100                     format_id.append(ism_id)
2101                 if stream_name:
2102                     format_id.append(stream_name)
2103                 format_id.append(compat_str(tbr))
2104
2105                 formats.append({
2106                     'format_id': '-'.join(format_id),
2107                     'url': ism_url,
2108                     'manifest_url': ism_url,
2109                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2110                     'width': width,
2111                     'height': height,
2112                     'tbr': tbr,
2113                     'asr': sampling_rate,
2114                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2115                     'acodec': 'none' if stream_type == 'video' else fourcc,
2116                     'protocol': 'ism',
2117                     'fragments': fragments,
2118                     '_download_params': {
2119                         'duration': duration,
2120                         'timescale': stream_timescale,
2121                         'width': width or 0,
2122                         'height': height or 0,
2123                         'fourcc': fourcc,
2124                         'codec_private_data': track.get('CodecPrivateData'),
2125                         'sampling_rate': sampling_rate,
2126                         'channels': int_or_none(track.get('Channels', 2)),
2127                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2128                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2129                     },
2130                 })
2131         return formats
2132
2133     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2134         def absolute_url(video_url):
2135             return compat_urlparse.urljoin(base_url, video_url)
2136
2137         def parse_content_type(content_type):
2138             if not content_type:
2139                 return {}
2140             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2141             if ctr:
2142                 mimetype, codecs = ctr.groups()
2143                 f = parse_codecs(codecs)
2144                 f['ext'] = mimetype2ext(mimetype)
2145                 return f
2146             return {}
2147
2148         def _media_formats(src, cur_media_type, type_info={}):
2149             full_url = absolute_url(src)
2150             ext = type_info.get('ext') or determine_ext(full_url)
2151             if ext == 'm3u8':
2152                 is_plain_url = False
2153                 formats = self._extract_m3u8_formats(
2154                     full_url, video_id, ext='mp4',
2155                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2156                     preference=preference, fatal=False)
2157             elif ext == 'mpd':
2158                 is_plain_url = False
2159                 formats = self._extract_mpd_formats(
2160                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2161             else:
2162                 is_plain_url = True
2163                 formats = [{
2164                     'url': full_url,
2165                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2166                 }]
2167             return is_plain_url, formats
2168
2169         entries = []
2170         # amp-video and amp-audio are very similar to their HTML5 counterparts
2171         # so we wll include them right here (see
2172         # https://www.ampproject.org/docs/reference/components/amp-video)
2173         media_tags = [(media_tag, media_type, '')
2174                       for media_tag, media_type
2175                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2176         media_tags.extend(re.findall(
2177             # We only allow video|audio followed by a whitespace or '>'.
2178             # Allowing more characters may end up in significant slow down (see
2179             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2180             # http://www.porntrex.com/maps/videositemap.xml).
2181             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2182         for media_tag, media_type, media_content in media_tags:
2183             media_info = {
2184                 'formats': [],
2185                 'subtitles': {},
2186             }
2187             media_attributes = extract_attributes(media_tag)
2188             src = media_attributes.get('src')
2189             if src:
2190                 _, formats = _media_formats(src, media_type)
2191                 media_info['formats'].extend(formats)
2192             media_info['thumbnail'] = media_attributes.get('poster')
2193             if media_content:
2194                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2195                     source_attributes = extract_attributes(source_tag)
2196                     src = source_attributes.get('src')
2197                     if not src:
2198                         continue
2199                     f = parse_content_type(source_attributes.get('type'))
2200                     is_plain_url, formats = _media_formats(src, media_type, f)
2201                     if is_plain_url:
2202                         # res attribute is not standard but seen several times
2203                         # in the wild
2204                         f.update({
2205                             'height': int_or_none(source_attributes.get('res')),
2206                             'format_id': source_attributes.get('label'),
2207                         })
2208                         f.update(formats[0])
2209                         media_info['formats'].append(f)
2210                     else:
2211                         media_info['formats'].extend(formats)
2212                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2213                     track_attributes = extract_attributes(track_tag)
2214                     kind = track_attributes.get('kind')
2215                     if not kind or kind in ('subtitles', 'captions'):
2216                         src = track_attributes.get('src')
2217                         if not src:
2218                             continue
2219                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2220                         media_info['subtitles'].setdefault(lang, []).append({
2221                             'url': absolute_url(src),
2222                         })
2223             if media_info['formats'] or media_info['subtitles']:
2224                 entries.append(media_info)
2225         return entries
2226
2227     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2228         formats = []
2229         hdcore_sign = 'hdcore=3.7.0'
2230         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2231         hds_host = hosts.get('hds')
2232         if hds_host:
2233             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2234         if 'hdcore=' not in f4m_url:
2235             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2236         f4m_formats = self._extract_f4m_formats(
2237             f4m_url, video_id, f4m_id='hds', fatal=False)
2238         for entry in f4m_formats:
2239             entry.update({'extra_param_to_segment_url': hdcore_sign})
2240         formats.extend(f4m_formats)
2241         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2242         hls_host = hosts.get('hls')
2243         if hls_host:
2244             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2245         formats.extend(self._extract_m3u8_formats(
2246             m3u8_url, video_id, 'mp4', 'm3u8_native',
2247             m3u8_id='hls', fatal=False))
2248         return formats
2249
2250     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2251         query = compat_urlparse.urlparse(url).query
2252         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2253         url_base = self._search_regex(
2254             r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
2255         http_base_url = '%s:%s' % ('http', url_base)
2256         formats = []
2257
2258         def manifest_url(manifest):
2259             m_url = '%s/%s' % (http_base_url, manifest)
2260             if query:
2261                 m_url += '?%s' % query
2262             return m_url
2263
2264         if 'm3u8' not in skip_protocols:
2265             formats.extend(self._extract_m3u8_formats(
2266                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2267                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2268         if 'f4m' not in skip_protocols:
2269             formats.extend(self._extract_f4m_formats(
2270                 manifest_url('manifest.f4m'),
2271                 video_id, f4m_id='hds', fatal=False))
2272         if 'dash' not in skip_protocols:
2273             formats.extend(self._extract_mpd_formats(
2274                 manifest_url('manifest.mpd'),
2275                 video_id, mpd_id='dash', fatal=False))
2276         if re.search(r'(?:/smil:|\.smil)', url_base):
2277             if 'smil' not in skip_protocols:
2278                 rtmp_formats = self._extract_smil_formats(
2279                     manifest_url('jwplayer.smil'),
2280                     video_id, fatal=False)
2281                 for rtmp_format in rtmp_formats:
2282                     rtsp_format = rtmp_format.copy()
2283                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2284                     del rtsp_format['play_path']
2285                     del rtsp_format['ext']
2286                     rtsp_format.update({
2287                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2288                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2289                         'protocol': 'rtsp',
2290                     })
2291                     formats.extend([rtmp_format, rtsp_format])
2292         else:
2293             for protocol in ('rtmp', 'rtsp'):
2294                 if protocol not in skip_protocols:
2295                     formats.append({
2296                         'url': '%s:%s' % (protocol, url_base),
2297                         'format_id': protocol,
2298                         'protocol': protocol,
2299                     })
2300         return formats
2301
2302     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2303         mobj = re.search(
2304             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2305             webpage)
2306         if mobj:
2307             try:
2308                 jwplayer_data = self._parse_json(mobj.group('options'),
2309                                                  video_id=video_id,
2310                                                  transform_source=transform_source)
2311             except ExtractorError:
2312                 pass
2313             else:
2314                 if isinstance(jwplayer_data, dict):
2315                     return jwplayer_data
2316
2317     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2318         jwplayer_data = self._find_jwplayer_data(
2319             webpage, video_id, transform_source=js_to_json)
2320         return self._parse_jwplayer_data(
2321             jwplayer_data, video_id, *args, **kwargs)
2322
2323     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2324                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2325         # JWPlayer backward compatibility: flattened playlists
2326         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2327         if 'playlist' not in jwplayer_data:
2328             jwplayer_data = {'playlist': [jwplayer_data]}
2329
2330         entries = []
2331
2332         # JWPlayer backward compatibility: single playlist item
2333         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2334         if not isinstance(jwplayer_data['playlist'], list):
2335             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2336
2337         for video_data in jwplayer_data['playlist']:
2338             # JWPlayer backward compatibility: flattened sources
2339             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2340             if 'sources' not in video_data:
2341                 video_data['sources'] = [video_data]
2342
2343             this_video_id = video_id or video_data['mediaid']
2344
2345             formats = self._parse_jwplayer_formats(
2346                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2347                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2348
2349             subtitles = {}
2350             tracks = video_data.get('tracks')
2351             if tracks and isinstance(tracks, list):
2352                 for track in tracks:
2353                     if not isinstance(track, dict):
2354                         continue
2355                     if track.get('kind') != 'captions':
2356                         continue
2357                     track_url = urljoin(base_url, track.get('file'))
2358                     if not track_url:
2359                         continue
2360                     subtitles.setdefault(track.get('label') or 'en', []).append({
2361                         'url': self._proto_relative_url(track_url)
2362                     })
2363
2364             entry = {
2365                 'id': this_video_id,
2366                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2367                 'description': video_data.get('description'),
2368                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2369                 'timestamp': int_or_none(video_data.get('pubdate')),
2370                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2371                 'subtitles': subtitles,
2372             }
2373             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2374             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2375                 entry.update({
2376                     '_type': 'url_transparent',
2377                     'url': formats[0]['url'],
2378                 })
2379             else:
2380                 self._sort_formats(formats)
2381                 entry['formats'] = formats
2382             entries.append(entry)
2383         if len(entries) == 1:
2384             return entries[0]
2385         else:
2386             return self.playlist_result(entries)
2387
2388     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2389                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2390         urls = []
2391         formats = []
2392         for source in jwplayer_sources_data:
2393             if not isinstance(source, dict):
2394                 continue
2395             source_url = self._proto_relative_url(source.get('file'))
2396             if not source_url:
2397                 continue
2398             if base_url:
2399                 source_url = compat_urlparse.urljoin(base_url, source_url)
2400             if source_url in urls:
2401                 continue
2402             urls.append(source_url)
2403             source_type = source.get('type') or ''
2404             ext = mimetype2ext(source_type) or determine_ext(source_url)
2405             if source_type == 'hls' or ext == 'm3u8':
2406                 formats.extend(self._extract_m3u8_formats(
2407                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2408                     m3u8_id=m3u8_id, fatal=False))
2409             elif source_type == 'dash' or ext == 'mpd':
2410                 formats.extend(self._extract_mpd_formats(
2411                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2412             elif ext == 'smil':
2413                 formats.extend(self._extract_smil_formats(
2414                     source_url, video_id, fatal=False))
2415             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2416             elif source_type.startswith('audio') or ext in (
2417                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2418                 formats.append({
2419                     'url': source_url,
2420                     'vcodec': 'none',
2421                     'ext': ext,
2422                 })
2423             else:
2424                 height = int_or_none(source.get('height'))
2425                 if height is None:
2426                     # Often no height is provided but there is a label in
2427                     # format like "1080p", "720p SD", or 1080.
2428                     height = int_or_none(self._search_regex(
2429                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2430                         'height', default=None))
2431                 a_format = {
2432                     'url': source_url,
2433                     'width': int_or_none(source.get('width')),
2434                     'height': height,
2435                     'tbr': int_or_none(source.get('bitrate')),
2436                     'ext': ext,
2437                 }
2438                 if source_url.startswith('rtmp'):
2439                     a_format['ext'] = 'flv'
2440                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2441                     # of jwplayer.flash.swf
2442                     rtmp_url_parts = re.split(
2443                         r'((?:mp4|mp3|flv):)', source_url, 1)
2444                     if len(rtmp_url_parts) == 3:
2445                         rtmp_url, prefix, play_path = rtmp_url_parts
2446                         a_format.update({
2447                             'url': rtmp_url,
2448                             'play_path': prefix + play_path,
2449                         })
2450                     if rtmp_params:
2451                         a_format.update(rtmp_params)
2452                 formats.append(a_format)
2453         return formats
2454
2455     def _live_title(self, name):
2456         """ Generate the title for a live video """
2457         now = datetime.datetime.now()
2458         now_str = now.strftime('%Y-%m-%d %H:%M')
2459         return name + ' ' + now_str
2460
2461     def _int(self, v, name, fatal=False, **kwargs):
2462         res = int_or_none(v, **kwargs)
2463         if 'get_attr' in kwargs:
2464             print(getattr(v, kwargs['get_attr']))
2465         if res is None:
2466             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2467             if fatal:
2468                 raise ExtractorError(msg)
2469             else:
2470                 self._downloader.report_warning(msg)
2471         return res
2472
2473     def _float(self, v, name, fatal=False, **kwargs):
2474         res = float_or_none(v, **kwargs)
2475         if res is None:
2476             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2477             if fatal:
2478                 raise ExtractorError(msg)
2479             else:
2480                 self._downloader.report_warning(msg)
2481         return res
2482
2483     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2484                     path='/', secure=False, discard=False, rest={}, **kwargs):
2485         cookie = compat_cookiejar.Cookie(
2486             0, name, value, port, port is not None, domain, True,
2487             domain.startswith('.'), path, True, secure, expire_time,
2488             discard, None, None, rest)
2489         self._downloader.cookiejar.set_cookie(cookie)
2490
2491     def _get_cookies(self, url):
2492         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2493         req = sanitized_Request(url)
2494         self._downloader.cookiejar.add_cookie_header(req)
2495         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2496
2497     def get_testcases(self, include_onlymatching=False):
2498         t = getattr(self, '_TEST', None)
2499         if t:
2500             assert not hasattr(self, '_TESTS'), \
2501                 '%s has _TEST and _TESTS' % type(self).__name__
2502             tests = [t]
2503         else:
2504             tests = getattr(self, '_TESTS', [])
2505         for t in tests:
2506             if not include_onlymatching and t.get('only_matching', False):
2507                 continue
2508             t['name'] = type(self).__name__[:-len('IE')]
2509             yield t
2510
2511     def is_suitable(self, age_limit):
2512         """ Test whether the extractor is generally suitable for the given
2513         age limit (i.e. pornographic sites are not, all others usually are) """
2514
2515         any_restricted = False
2516         for tc in self.get_testcases(include_onlymatching=False):
2517             if tc.get('playlist', []):
2518                 tc = tc['playlist'][0]
2519             is_restricted = age_restricted(
2520                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2521             if not is_restricted:
2522                 return True
2523             any_restricted = any_restricted or is_restricted
2524         return not any_restricted
2525
2526     def extract_subtitles(self, *args, **kwargs):
2527         if (self._downloader.params.get('writesubtitles', False) or
2528                 self._downloader.params.get('listsubtitles')):
2529             return self._get_subtitles(*args, **kwargs)
2530         return {}
2531
2532     def _get_subtitles(self, *args, **kwargs):
2533         raise NotImplementedError('This method must be implemented by subclasses')
2534
2535     @staticmethod
2536     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2537         """ Merge subtitle items for one language. Items with duplicated URLs
2538         will be dropped. """
2539         list1_urls = set([item['url'] for item in subtitle_list1])
2540         ret = list(subtitle_list1)
2541         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2542         return ret
2543
2544     @classmethod
2545     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2546         """ Merge two subtitle dictionaries, language by language. """
2547         ret = dict(subtitle_dict1)
2548         for lang in subtitle_dict2:
2549             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2550         return ret
2551
2552     def extract_automatic_captions(self, *args, **kwargs):
2553         if (self._downloader.params.get('writeautomaticsub', False) or
2554                 self._downloader.params.get('listsubtitles')):
2555             return self._get_automatic_captions(*args, **kwargs)
2556         return {}
2557
2558     def _get_automatic_captions(self, *args, **kwargs):
2559         raise NotImplementedError('This method must be implemented by subclasses')
2560
2561     def mark_watched(self, *args, **kwargs):
2562         if (self._downloader.params.get('mark_watched', False) and
2563                 (self._get_login_info()[0] is not None or
2564                     self._downloader.params.get('cookiefile') is not None)):
2565             self._mark_watched(*args, **kwargs)
2566
2567     def _mark_watched(self, *args, **kwargs):
2568         raise NotImplementedError('This method must be implemented by subclasses')
2569
2570     def geo_verification_headers(self):
2571         headers = {}
2572         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2573         if geo_verification_proxy:
2574             headers['Ytdl-request-proxy'] = geo_verification_proxy
2575         return headers
2576
2577     def _generic_id(self, url):
2578         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2579
2580     def _generic_title(self, url):
2581         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2582
2583
class SearchInfoExtractor(InfoExtractor):
    """
    Common base for extractors that handle paged search queries.
    Accepted URLs look like _SEARCH_KEY<prefix>:{query}, where <prefix> is
    empty, a positive integer without leading zeros, or "all".
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's queries."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query this extractor accepts."""
        return bool(re.match(cls._make_valid_url(), url))

    def _real_extract(self, query):
        """Run the search described by query.

        An empty prefix asks for one result, "all" for _MAX_RESULTS, and a
        number for that many results; a number above _MAX_RESULTS is lowered
        to it after a warning. Raises ExtractorError for a query that does
        not match or for a non-positive number.
        """
        match = re.match(self._make_valid_url(), query)
        if not match:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = match.group('prefix', 'query')
        if prefix == '':
            wanted = 1
        elif prefix == 'all':
            wanted = self._MAX_RESULTS
        else:
            wanted = int(prefix)
            if wanted <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (wanted, query))
            if wanted > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
                wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError(
            'This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY."""
        return self._SEARCH_KEY