from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import re
import socket
import sys
import time

from ..compat import (
    compat_cookiejar,
    compat_cookies,
    compat_getpass,
    compat_http_client,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urlparse,
    compat_str,
    compat_etree_fromstring,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    error_to_compat_str,
    ExtractorError,
    fix_xml_ampersands,
    float_or_none,
    int_or_none,
    parse_iso8601,
    RegexNotFoundError,
    sanitize_filename,
    sanitized_Request,
    unescapeHTML,
    unified_strdate,
    url_basename,
    xpath_text,
    xpath_with_ns,
    determine_protocol,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmpe",
                                 "m3u8", or "m3u8_native".
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    creator:        The main artist who created the video.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {language: subformats}. "subformats" is a list sorted from
                    lower to higher preference, each element is a dictionary
                    with the "ext" entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles', used by the YoutubeIE for
                    automatically generated captions
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to youtube-dl it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series or programme:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.
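
    For illustration, a minimal "video" result might look like this (all
    values below are hypothetical):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'formats': [{
                'url': 'https://example.com/video_low.mp4',
                'format_id': 'low',
                'height': 360,
            }, {
                'url': 'https://example.com/video_hd.mp4',
                'format_id': 'hd',
                'height': 1080,
            }],
        }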

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "title", "description" and "id" attributes
    with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return m.group('id')
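
    # A brief usage sketch (the URL pattern and URLs are hypothetical):
    #
    #   class ExampleIE(InfoExtractor):
    #       _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
    #
    #   ExampleIE.suitable('http://example.com/watch/42')   # -> True
    #   ExampleIE._match_id('http://example.com/watch/42')  # -> '42'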

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it as a list of dicts."""
        try:
            self.initialize()
            return self._real_extract(url)
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if os.name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
        """ Returns the data of the page as a string """
        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True, encoding=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
        if xml_string is False:
            return xml_string
        if transform_source:
            xml_string = transform_source(xml_string)
        return compat_etree_fromstring(xml_string.encode('utf-8'))

    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
                       fatal=True, encoding=None):
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding)
        if (not fatal) and json_string is False:
            return None
        return self._parse_json(
            json_string, video_id, transform_source=transform_source, fatal=fatal)
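
    # A typical call from a subclass's _real_extract() (the URL and keys are
    # hypothetical):
    #
    #   data = self._download_json(
    #       'https://example.com/api/video/%s' % video_id, video_id,
    #       note='Downloading video metadata')
    #   title = data['title']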

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def report_warning(self, msg, video_id=None):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)

    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
        raise ExtractorError(
            '%s. You might want to use --proxy to work around this.' % msg,
            expected=True)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info
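
    # A minimal delegation sketch (the video ID is hypothetical): an extractor
    # that only resolves an embed page might end its _real_extract() with:
    #
    #   return self.url_result(
    #       'https://www.youtube.com/watch?v=%s' % youtube_id, ie='Youtube')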

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure, return a default value, emit a warning, or raise a
        RegexNotFoundError, depending on default and fatal, specifying the
        field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
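
    # A usage sketch (the webpage snippet and patterns are hypothetical):
    #
    #   webpage = '<div data-video-id="1234">'
    #   self._search_regex(
    #       r'data-video-id="(\d+)"', webpage, 'video id')          # -> '1234'
    #   self._search_regex(
    #       r'data-missing="(\d+)"', webpage, 'foo', default=None)  # -> None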

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))

        return (username, password)

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor', None) is not None:
            return downloader_params['twofactor']

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            self._meta_regex(name),
            html, display_name, fatal=fatal, group='content', **kwargs)
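
    # A usage sketch for the OpenGraph/meta helpers (the markup is
    # hypothetical):
    #
    #   webpage = ('<meta property="og:title" content="My clip">'
    #              '<meta name="description" content="A short description">')
    #   self._og_search_title(webpage)                  # -> 'My clip'
    #   self._html_search_meta('description', webpage)  # -> 'A short description'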

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta('isFamilyFriendly', html)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower(), None)

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, **kwargs):
        json_ld = self._search_regex(
            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
            html, 'JSON-LD', group='json_ld', **kwargs)
        if not json_ld:
            return {}
        return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))

    def _json_ld(self, json_ld, video_id, fatal=True):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if json_ld.get('@context') == 'http://schema.org':
            item_type = json_ld.get('@type')
            if item_type == 'TVEpisode':
                info.update({
                    'episode': unescapeHTML(json_ld.get('name')),
                    'episode_number': int_or_none(json_ld.get('episodeNumber')),
                    'description': unescapeHTML(json_ld.get('description')),
                })
                part_of_season = json_ld.get('partOfSeason')
                if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                    info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                part_of_series = json_ld.get('partOfSeries')
                if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif item_type == 'Article':
                info.update({
                    'timestamp': parse_iso8601(json_ld.get('datePublished')),
                    'title': unescapeHTML(json_ld.get('headline')),
                    'description': unescapeHTML(json_ld.get('articleBody')),
                })
        return dict((k, v) for k, v in info.items() if v is not None)

    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)<input([^>]+)>', html):
            if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
                continue
            name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
            if not name:
                continue
            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
            if not value:
                continue
            hidden_inputs[name.group('value')] = value.group('value')
        return hidden_inputs
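
    # A sketch of what _hidden_inputs() returns (the markup is hypothetical):
    #
    #   html = '<input type="hidden" name="token" value="abc123">'
    #   InfoExtractor._hidden_inputs(html)  # -> {'token': 'abc123'}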

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

    def _sort_formats(self, formats, field_preference=None):
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
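
    # A sketch of the effect (values hypothetical): formats end up ordered
    # from worst to best, so with two formats differing only in height:
    #
    #   formats = [{'url': 'https://example.com/hd.mp4', 'height': 1080},
    #              {'url': 'https://example.com/low.mp4', 'height': 360}]
    #   self._sort_formats(formats)
    #   [f['height'] for f in formats]  # -> [360, 1080]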

    def _check_formats(self, formats, video_id):
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                formats)

    def _is_valid_url(self, url, video_id, item='video'):
        url = self._proto_relative_url(url, scheme='http:')
        # For now, assume non-HTTP(S) URLs are always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            return True
        try:
            self._request_webpage(url, video_id, 'Checking %s URL' % item)
            return True
        except ExtractorError as e:
            if isinstance(e.cause, compat_urllib_error.URLError):
                self.to_screen(
                    '%s: %s URL is invalid, skipping' % (video_id, item))
                return False
            raise

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        if url is None:
            return url
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url

    def _sleep(self, timeout, video_id, msg_template=None):
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)

    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True):
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal)

        if manifest is False:
            return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
        if base_url:
            base_url = base_url.strip()
        for i, media_el in enumerate(media_nodes):
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                if determine_ext(manifest_url) == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        manifest_url, video_id, preference, f4m_id, fatal=fatal))
                    continue
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            formats.append({
                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
                'url': manifest_url,
                'ext': 'flv',
                'tbr': tbr,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
                'preference': preference,
            })
        self._sort_formats(formats)

        return formats

    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True):

        formats = [{
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 1 if preference else -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]

        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if res is False:
            return []
        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()
        # A Media Playlist Tag MUST NOT appear in a Master Playlist
        # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlist
        # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
        if '#EXT-X-TARGETDURATION' in m3u8_doc:
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]
        last_info = None
        last_media = None
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#EXT-X-MEDIA:'):
                last_media = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_media[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                continue
            else:
                if last_info is None:
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                codecs = last_info.get('CODECS')
                if codecs:
                    # TODO: the video codec is not necessarily always first
                    va_codecs = codecs.split(',')
                    if va_codecs[0]:
                        f['vcodec'] = va_codecs[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1]
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                if last_media is not None:
                    f['m3u8_media'] = last_media
                    last_media = None
                formats.append(f)
                last_info = {}
        self._sort_formats(formats)
        return formats
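
    # A sketch of the master playlist lines this parser consumes (the
    # contents are hypothetical):
    #
    #   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360,CODECS="avc1.4d401e,mp4a.40.2"
    #   low/index.m3u8
    #
    # would yield a format with tbr=1280, width=640, height=360,
    # vcodec='avc1.4d401e' and acodec='mp4a.40.2', with the relative URL
    # resolved against the (possibly redirected) m3u8_url.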

    @staticmethod
    def _xpath_ns(path, namespace=None):
        if not namespace:
            return path
        out = []
        for c in path.split('/'):
            if not c or c == '.':
                out.append(c)
            else:
                out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)
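
    # A sketch of what _xpath_ns() produces (the namespace URI is
    # hypothetical):
    #
    #   InfoExtractor._xpath_ns('./head/meta', 'urn:example')
    #   # -> './{urn:example}head/{urn:example}meta'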

    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
        smil = self._download_smil(smil_url, video_id, fatal=fatal)

        if smil is False:
            assert not fatal
            return []

        namespace = self._parse_smil_namespace(smil)

        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)

    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        if smil is False:
            return {}
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)

    def _download_smil(self, smil_url, video_id, fatal=True):
        return self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal)

    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        video_id = os.path.splitext(url_basename(smil_url))[0]
        title = None
        description = None
        upload_date = None
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
                continue
            if not title and name == 'title':
                title = content
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

        thumbnails = [{
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        return {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _parse_smil_namespace(self, smil):
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)

    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        videos = smil.findall(self._xpath_ns('.//video', namespace))
        for video in videos:
            src = video.get('src')
            if not src:
                continue

            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            filesize = int_or_none(video.get('size') or video.get('fileSize'))
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
            ext = video.get('ext')
            src_ext = determine_ext(src)
            streamer = video.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
                continue

            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse.urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                continue

            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        self._sort_formats(formats)

        return formats

    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        subtitles = {}
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            if not src:
                continue
            ext = textstream.get('ext') or determine_ext(src)
            if not ext:
                type_ = textstream.get('type')
                SUBTITLES_TYPES = {
                    'text/vtt': 'vtt',
                    'text/srt': 'srt',
                    'application/smptett+xml': 'tt',
                }
                if type_ in SUBTITLES_TYPES:
                    ext = SUBTITLES_TYPES[type_]
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
                'url': src,
                'ext': ext,
            })
        return subtitles

    def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
        xspf = self._download_xml(
            playlist_url, playlist_id, 'Downloading xspf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        if xspf is False:
            return []
        return self._parse_xspf(xspf, playlist_id)

    def _parse_xspf(self, playlist, playlist_id):
        NS_MAP = {
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        }

        entries = []
        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            title = xpath_text(
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

            formats = [{
                'url': location.text,
                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
            self._sort_formats(formats)

            entries.append({
                'id': playlist_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
            })
        return entries

    def _parse_dash_manifest(self, dash_doc, namespace=None, formats_dict={}):
        def _add_ns(path):
            return self._xpath_ns(path, namespace)

        formats = []
        for a in dash_doc.findall('.//' + _add_ns('AdaptationSet')):
            mime_type = a.attrib.get('mimeType')
            for r in a.findall(_add_ns('Representation')):
                mime_type = r.attrib.get('mimeType') or mime_type
                url_el = r.find(_add_ns('BaseURL'))
                if mime_type == 'text/vtt':
                    # TODO implement WebVTT downloading
                    pass
                elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
                    segment_list = r.find(_add_ns('SegmentList'))
                    format_id = r.attrib['id']
                    video_url = url_el.text if url_el is not None else None
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'height': int_or_none(r.attrib.get('height')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                        'fps': int_or_none(r.attrib.get('frameRate')),
                    }
                    if segment_list is not None:
                        initialization_url = segment_list.find(_add_ns('Initialization')).attrib['sourceURL']
                        f.update({
                            'initialization_url': initialization_url,
                            'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall(_add_ns('SegmentURL'))],
                            'protocol': 'http_dash_segments',
                        })
                        if not f.get('url'):
                            f['url'] = initialization_url
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        full_info = formats_dict.get(format_id, {}).copy()
                        full_info.update(f)
                        codecs = r.attrib.get('codecs')
                        if codecs:
                            if mime_type.startswith('video/'):
                                vcodec, acodec = codecs, 'none'
                            else:  # mime_type.startswith('audio/')
                                vcodec, acodec = 'none', codecs

                            full_info.update({
                                'vcodec': vcodec,
                                'acodec': acodec,
                            })
                        formats.append(full_info)
                    else:
                        existing_format.update(f)
                else:
                    self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
        return formats

    def _live_title(self, name):
        """ Generate the title for a live video """
        now = datetime.datetime.now()
        now_str = now.strftime("%Y-%m-%d %H:%M")
        return name + ' ' + now_str

    def _int(self, v, name, fatal=False, **kwargs):
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            print(getattr(v, kwargs['get_attr']))
        if res is None:
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            if fatal:
                raise ExtractorError(msg)
            else:
                self._downloader.report_warning(msg)
        return res

    def _float(self, v, name, fatal=False, **kwargs):
        res = float_or_none(v, **kwargs)
        if res is None:
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            if fatal:
                raise ExtractorError(msg)
            else:
                self._downloader.report_warning(msg)
        return res

    def _set_cookie(self, domain, name, value, expire_time=None):
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)

    def _get_cookies(self, url):
        """ Return a compat_cookies.SimpleCookie with the cookies for the url """
        req = sanitized_Request(url)
        self._downloader.cookiejar.add_cookie_header(req)
        return compat_cookies.SimpleCookie(req.get_header('Cookie'))

    def get_testcases(self, include_onlymatching=False):
        t = getattr(self, '_TEST', None)
        if t:
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = [t]
        else:
            tests = getattr(self, '_TESTS', [])
        for t in tests:
            if not include_onlymatching and t.get('only_matching', False):
                continue
            t['name'] = type(self).__name__[:-len('IE')]
            yield t

    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """

        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            if 'playlist' in tc:
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
                return True
            any_restricted = any_restricted or is_restricted
        return not any_restricted

    def extract_subtitles(self, *args, **kwargs):
        if (self._downloader.params.get('writesubtitles', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_subtitles(*args, **kwargs)
        return {}

    def _get_subtitles(self, *args, **kwargs):
        raise NotImplementedError("This method must be implemented by subclasses")

    @staticmethod
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs
        will be dropped. """
        list1_urls = set([item['url'] for item in subtitle_list1])
        ret = list(subtitle_list1)
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
        return ret

    @classmethod
    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
        """ Merge two subtitle dictionaries, language by language. """
        ret = dict(subtitle_dict1)
        for lang in subtitle_dict2:
            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
        return ret
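
    # A sketch of the merge behaviour (values hypothetical): per language,
    # items whose URL already occurs in the first dictionary are dropped:
    #
    #   d1 = {'en': [{'url': 'http://example.com/a.vtt', 'ext': 'vtt'}]}
    #   d2 = {'en': [{'url': 'http://example.com/a.vtt', 'ext': 'vtt'},
    #                {'url': 'http://example.com/b.srt', 'ext': 'srt'}]}
    #   InfoExtractor._merge_subtitles(d1, d2)
    #   # -> {'en': [{'url': 'http://example.com/a.vtt', 'ext': 'vtt'},
    #   #            {'url': 'http://example.com/b.srt', 'ext': 'srt'}]}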

    def extract_automatic_captions(self, *args, **kwargs):
        if (self._downloader.params.get('writeautomaticsub', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_automatic_captions(*args, **kwargs)
        return {}

    def _get_automatic_captions(self, *args, **kwargs):
        raise NotImplementedError("This method must be implemented by subclasses")


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
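
    # A sketch of the accepted query forms, assuming _SEARCH_KEY = 'ytsearch'
    # (the prefix semantics are implemented in _real_extract() below):
    #
    #   'ytsearch:some query'    -> first result
    #   'ytsearch5:some query'   -> first 5 results
    #   'ytsearchall:some query' -> up to _MAX_RESULTS results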

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY