]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/common.py
fix increment operator
[yt-dlp.git] / youtube_dl / extractor / common.py
1 from __future__ import unicode_literals
2
3 import base64
4 import hashlib
5 import json
6 import netrc
7 import os
8 import re
9 import socket
10 import sys
11 import time
12 import xml.etree.ElementTree
13
14 from ..utils import (
15 compat_http_client,
16 compat_urllib_error,
17 compat_urllib_parse_urlparse,
18 compat_urlparse,
19 compat_str,
20
21 clean_html,
22 compiled_regex_type,
23 ExtractorError,
24 int_or_none,
25 RegexNotFoundError,
26 sanitize_filename,
27 unescapeHTML,
28 )
# Module-private sentinel: lets _search_regex() distinguish "caller supplied
# no default" from an explicit default of None (which is a legal default).
_NO_DEFAULT = object()
30
31
32 class InfoExtractor(object):
33 """Information Extractor class.
34
35 Information extractors are the classes that, given a URL, extract
36 information about the video (or videos) the URL refers to. This
37 information includes the real video URL, the video title, author and
38 others. The information is stored in a dictionary which is then
39 passed to the FileDownloader. The FileDownloader processes this
40 information possibly downloading the video to the file system, among
41 other possible outcomes.
42
43 The dictionaries must include the following fields:
44
45 id: Video identifier.
46 title: Video title, unescaped.
47
48 Additionally, it must contain either a formats entry or a url one:
49
50 formats: A list of dictionaries for each format available, ordered
51 from worst to best quality.
52
53 Potential fields:
54 * url Mandatory. The URL of the video file
55 * ext Will be calculated from url if missing
56 * format A human-readable description of the format
57 ("mp4 container with h264/opus").
58 Calculated from the format_id, width, height.
59 and format_note fields if missing.
60 * format_id A short description of the format
61 ("mp4_h264_opus" or "19").
62 Technically optional, but strongly recommended.
63 * format_note Additional info about the format
64 ("3D" or "DASH video")
65 * width Width of the video, if known
66 * height Height of the video, if known
67 * resolution Textual description of width and height
68 * tbr Average bitrate of audio and video in KBit/s
69 * abr Average audio bitrate in KBit/s
70 * acodec Name of the audio codec in use
71 * asr Audio sampling rate in Hertz
72 * vbr Average video bitrate in KBit/s
73 * vcodec Name of the video codec in use
74 * container Name of the container format
75 * filesize The number of bytes, if known in advance
76 * filesize_approx An estimate for the number of bytes
77 * player_url SWF Player URL (used for rtmpdump).
78 * protocol The protocol that will be used for the actual
79 download, lower-case.
80 "http", "https", "rtsp", "rtmp", "m3u8" or so.
81 * preference Order number of this format. If this field is
82 present and not None, the formats get sorted
83 by this field, regardless of all other values.
84 -1 for default (order by other properties),
85 -2 or smaller for less than default.
86 * quality Order number of the video quality of this
87 format, irrespective of the file format.
88 -1 for default (order by other properties),
89 -2 or smaller for less than default.
90 * http_referer HTTP Referer header value to set.
91 * http_method HTTP method to use for the download.
92 * http_headers A dictionary of additional HTTP headers
93 to add to the request.
94 * http_post_data Additional data to send with a POST
95 request.
96 url: Final video URL.
97 ext: Video filename extension.
98 format: The video format, defaults to ext (used for --get-format)
99 player_url: SWF Player URL (used for rtmpdump).
100
101 The following fields are optional:
102
103 display_id An alternative identifier for the video, not necessarily
104 unique, but available before title. Typically, id is
105 something like "4234987", title "Dancing naked mole rats",
106 and display_id "dancing-naked-mole-rats"
107 thumbnails: A list of dictionaries, with the following entries:
108 * "url"
109 * "width" (optional, int)
110 * "height" (optional, int)
111 * "resolution" (optional, string "{width}x{height"},
112 deprecated)
113 thumbnail: Full URL to a video thumbnail image.
114 description: One-line video description.
115 uploader: Full name of the video uploader.
116 timestamp: UNIX timestamp of the moment the video became available.
117 upload_date: Video upload date (YYYYMMDD).
118 If not explicitly set, calculated from timestamp.
119 uploader_id: Nickname or id of the video uploader.
120 location: Physical location where the video was filmed.
121 subtitles: The subtitle file contents as a dictionary in the format
122 {language: subtitles}.
123 duration: Length of the video in seconds, as an integer.
124 view_count: How many users have watched the video on the platform.
125 like_count: Number of positive ratings of the video
126 dislike_count: Number of negative ratings of the video
127 comment_count: Number of comments on the video
128 age_limit: Age restriction for the video, as an integer (years)
129 webpage_url: The url to the video webpage, if given to youtube-dl it
130 should allow to get the same result again. (It will be set
131 by YoutubeDL if it's missing)
132 categories: A list of categories that the video falls in, for example
133 ["Sports", "Berlin"]
134 is_live: True, False, or None (=unknown). Whether this video is a
135 live stream that goes on instead of a fixed-length video.
136
137 Unless mentioned otherwise, the fields should be Unicode strings.
138
139 Subclasses of this one should re-define the _real_initialize() and
140 _real_extract() methods and define a _VALID_URL regexp.
141 Probably, they should also be added to the list of extractors.
142
143 Finally, the _WORKING attribute should be set to False for broken IEs
144 in order to warn the users and skip the tests.
145 """
146
147 _ready = False
148 _downloader = None
149 _WORKING = True
150
151 def __init__(self, downloader=None):
152 """Constructor. Receives an optional downloader."""
153 self._ready = False
154 self.set_downloader(downloader)
155
156 @classmethod
157 def suitable(cls, url):
158 """Receives a URL and returns True if suitable for this IE."""
159
160 # This does not use has/getattr intentionally - we want to know whether
161 # we have cached the regexp for *this* class, whereas getattr would also
162 # match the superclass
163 if '_VALID_URL_RE' not in cls.__dict__:
164 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
165 return cls._VALID_URL_RE.match(url) is not None
166
167 @classmethod
168 def working(cls):
169 """Getter method for _WORKING."""
170 return cls._WORKING
171
172 def initialize(self):
173 """Initializes an instance (authentication, etc)."""
174 if not self._ready:
175 self._real_initialize()
176 self._ready = True
177
178 def extract(self, url):
179 """Extracts URL information and returns it in list of dicts."""
180 self.initialize()
181 return self._real_extract(url)
182
183 def set_downloader(self, downloader):
184 """Sets the downloader for this IE."""
185 self._downloader = downloader
186
187 def _real_initialize(self):
188 """Real initialization process. Redefine in subclasses."""
189 pass
190
191 def _real_extract(self, url):
192 """Real extraction process. Redefine in subclasses."""
193 pass
194
195 @classmethod
196 def ie_key(cls):
197 """A string for getting the InfoExtractor with get_info_extractor"""
198 return cls.__name__[:-2]
199
200 @property
201 def IE_NAME(self):
202 return type(self).__name__[:-2]
203
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # note semantics: None -> default "Downloading webpage" message,
        # False -> print nothing, anything else -> printed verbatim
        # (prefixed with video_id when one is given).
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # errnote semantics mirror note: False -> fail silently,
            # None -> generic message, otherwise used as the message prefix.
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'
            errmsg = '%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                # Non-fatal failure: warn and signal with the False sentinel.
                self._downloader.report_warning(errmsg)
                return False
226
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            # _request_webpage only returns False when the error is non-fatal.
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Charset detection cascade: Content-Type header, then a <meta>
        # charset declaration within the first KiB of the body, then a
        # UTF-16 LE BOM, and finally a UTF-8 fallback.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                # Byte-order mark for little-endian UTF-16.
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request is a plain string, not a Request object.
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Keep the dump filename within filesystem limits while
                # keeping distinct URLs distinct via an md5 suffix.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The detected codec name is unknown to Python; fall back to UTF-8.
            content = webpage_bytes.decode('utf-8', 'replace')

        # Surface a helpful error when the response is a Websense block page.
        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)

        return (content, urlh)
291
292 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
293 """ Returns the data of the page as a string """
294 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
295 if res is False:
296 return res
297 else:
298 content, _ = res
299 return content
300
301 def _download_xml(self, url_or_request, video_id,
302 note='Downloading XML', errnote='Unable to download XML',
303 transform_source=None, fatal=True):
304 """Return the xml as an xml.etree.ElementTree.Element"""
305 xml_string = self._download_webpage(
306 url_or_request, video_id, note, errnote, fatal=fatal)
307 if xml_string is False:
308 return xml_string
309 if transform_source:
310 xml_string = transform_source(xml_string)
311 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
312
313 def _download_json(self, url_or_request, video_id,
314 note='Downloading JSON metadata',
315 errnote='Unable to download JSON metadata',
316 transform_source=None,
317 fatal=True):
318 json_string = self._download_webpage(
319 url_or_request, video_id, note, errnote, fatal=fatal)
320 if (not fatal) and json_string is False:
321 return None
322 if transform_source:
323 json_string = transform_source(json_string)
324 try:
325 return json.loads(json_string)
326 except ValueError as ve:
327 raise ExtractorError('Failed to download JSON', cause=ve)
328
329 def report_warning(self, msg, video_id=None):
330 idstr = '' if video_id is None else '%s: ' % video_id
331 self._downloader.report_warning(
332 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
333
334 def to_screen(self, msg):
335 """Print msg to screen, prefixing it with '[ie_name]'"""
336 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
337
338 def report_extraction(self, id_or_name):
339 """Report information extraction."""
340 self.to_screen('%s: Extracting information' % id_or_name)
341
342 def report_download_webpage(self, video_id):
343 """Report webpage download."""
344 self.to_screen('%s: Downloading webpage' % video_id)
345
346 def report_age_confirmation(self):
347 """Report attempt to confirm age."""
348 self.to_screen('Confirming age')
349
350 def report_login(self):
351 """Report attempt to log in."""
352 self.to_screen('Logging in')
353
354 #Methods for following #608
355 @staticmethod
356 def url_result(url, ie=None, video_id=None):
357 """Returns a url that points to a page that should be processed"""
358 #TODO: ie should be the class used for getting the info
359 video_info = {'_type': 'url',
360 'url': url,
361 'ie_key': ie}
362 if video_id is not None:
363 video_info['id'] = video_id
364 return video_info
365 @staticmethod
366 def playlist_result(entries, playlist_id=None, playlist_title=None):
367 """Returns a playlist"""
368 video_info = {'_type': 'playlist',
369 'entries': entries}
370 if playlist_id:
371 video_info['id'] = playlist_id
372 if playlist_title:
373 video_info['title'] = playlist_title
374 return video_info
375
376 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
377 """
378 Perform a regex search on the given string, using a single or a list of
379 patterns returning the first matching group.
380 In case of failure return a default value or raise a WARNING or a
381 RegexNotFoundError, depending on fatal, specifying the field name.
382 """
383 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
384 mobj = re.search(pattern, string, flags)
385 else:
386 for p in pattern:
387 mobj = re.search(p, string, flags)
388 if mobj:
389 break
390
391 if os.name != 'nt' and sys.stderr.isatty():
392 _name = '\033[0;34m%s\033[0m' % name
393 else:
394 _name = name
395
396 if mobj:
397 # return the first matching group
398 return next(g for g in mobj.groups() if g is not None)
399 elif default is not _NO_DEFAULT:
400 return default
401 elif fatal:
402 raise RegexNotFoundError('Unable to extract %s' % _name)
403 else:
404 self._downloader.report_warning('unable to extract %s; '
405 'please report this issue on http://yt-dl.org/bug' % _name)
406 return None
407
408 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
409 """
410 Like _search_regex, but strips HTML tags and unescapes entities.
411 """
412 res = self._search_regex(pattern, string, name, default, fatal, flags)
413 if res:
414 return clean_html(res).strip()
415 else:
416 return res
417
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and fall through with
                # whatever credentials (possibly none) were gathered.
                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

        return (username, password)
447
448 def _get_tfa_info(self):
449 """
450 Get the two-factor authentication info
451 TODO - asking the user will be required for sms/phone verify
452 currently just uses the command line option
453 If there's no info available, return None
454 """
455 if self._downloader is None:
456 return None
457 downloader_params = self._downloader.params
458
459 if downloader_params.get('twofactor', None) is not None:
460 return downloader_params['twofactor']
461
462 return None
463
464 # Helper functions for extracting OpenGraph info
465 @staticmethod
466 def _og_regexes(prop):
467 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
468 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
469 template = r'<meta[^>]+?%s[^>]+?%s'
470 return [
471 template % (property_re, content_re),
472 template % (content_re, property_re),
473 ]
474
475 def _og_search_property(self, prop, html, name=None, **kargs):
476 if name is None:
477 name = 'OpenGraph %s' % prop
478 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
479 if escaped is None:
480 return None
481 return unescapeHTML(escaped)
482
483 def _og_search_thumbnail(self, html, **kargs):
484 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
485
486 def _og_search_description(self, html, **kargs):
487 return self._og_search_property('description', html, fatal=False, **kargs)
488
489 def _og_search_title(self, html, **kargs):
490 return self._og_search_property('title', html, **kargs)
491
492 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
493 regexes = self._og_regexes('video') + self._og_regexes('video:url')
494 if secure:
495 regexes = self._og_regexes('video:secure_url') + regexes
496 return self._html_search_regex(regexes, html, name, **kargs)
497
498 def _og_search_url(self, html, **kargs):
499 return self._og_search_property('url', html, **kargs)
500
501 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
502 if display_name is None:
503 display_name = name
504 return self._html_search_regex(
505 r'''(?ix)<meta
506 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
507 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
508 html, display_name, fatal=fatal, **kwargs)
509
510 def _dc_search_uploader(self, html):
511 return self._html_search_meta('dc.creator', html, 'uploader')
512
513 def _rta_search(self, html):
514 # See http://www.rtalabel.org/index.php?content=howtofaq#single
515 if re.search(r'(?ix)<meta\s+name="rating"\s+'
516 r' content="RTA-5042-1996-1400-1577-RTA"',
517 html):
518 return 18
519 return 0
520
521 def _media_rating_search(self, html):
522 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
523 rating = self._html_search_meta('rating', html)
524
525 if not rating:
526 return None
527
528 RATING_TABLE = {
529 'safe for kids': 0,
530 'general': 8,
531 '14 years': 14,
532 'mature': 17,
533 'restricted': 19,
534 }
535 return RATING_TABLE.get(rating.lower(), None)
536
537 def _twitter_search_player(self, html):
538 return self._html_search_meta('twitter:player', html,
539 'twitter card player')
540
    def _sort_formats(self, formats):
        """Sort *formats* in place from worst to best quality."""
        if not formats:
            raise ExtractorError('No video formats found')

        def _formats_key(f):
            # Build a tuple compared lexicographically; each slot uses -1 for
            # "unknown" so known values always rank above missing ones.
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Explicit 'preference' trumps everything else; otherwise derive
            # a small default from the protocol.
            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                # Audio-only entries rank by audio container preference.
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)
597
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')
604
605 def _proto_relative_url(self, url, scheme=None):
606 if url is None:
607 return url
608 if url.startswith('//'):
609 if scheme is None:
610 scheme = self.http_scheme()
611 return scheme + url
612 else:
613 return url
614
615 def _sleep(self, timeout, video_id, msg_template=None):
616 if msg_template is None:
617 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
618 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
619 self.to_screen(msg)
620 time.sleep(timeout)
621
622 def _extract_f4m_formats(self, manifest_url, video_id):
623 manifest = self._download_xml(
624 manifest_url, video_id, 'Downloading f4m manifest',
625 'Unable to download f4m manifest')
626
627 formats = []
628 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
629 for i, media_el in enumerate(media_nodes):
630 tbr = int_or_none(media_el.attrib.get('bitrate'))
631 format_id = 'f4m-%d' % (i if tbr is None else tbr)
632 formats.append({
633 'format_id': format_id,
634 'url': manifest_url,
635 'ext': 'flv',
636 'tbr': tbr,
637 'width': int_or_none(media_el.attrib.get('width')),
638 'height': int_or_none(media_el.attrib.get('height')),
639 })
640 self._sort_formats(formats)
641
642 return formats
643
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None):
        """Parse an HLS master playlist into a sorted formats list."""

        # Always expose the master playlist itself as a low-preference "meta"
        # format so quality selection can be deferred to the player.
        formats = [{
            'format_id': 'm3u8-meta',
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': -1,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }]

        # Resolve playlist-relative entries against the playlist URL.
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        m3u8_doc = self._download_webpage(m3u8_url, video_id)
        last_info = None
        # KEY=value pairs; values may be quoted (and then may contain commas).
        kv_rex = re.compile(
            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                # Attribute line describing the variant on the next URI line.
                last_info = {}
                for m in kv_rex.finditer(line):
                    v = m.group('val')
                    if v.startswith('"'):
                        v = v[1:-1]
                    last_info[m.group('key')] = v
            elif line.startswith('#') or not line.strip():
                # Other tags and blank lines carry no format information.
                continue
            else:
                # URI line: belongs to the preceding #EXT-X-STREAM-INF, if any.
                if last_info is None:
                    formats.append({'url': format_url(line)})
                    continue
                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)

                f = {
                    'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                codecs = last_info.get('CODECS')
                if codecs:
                    # TODO: looks like video codec is not always necessarily goes first
                    va_codecs = codecs.split(',')
                    if va_codecs[0]:
                        f['vcodec'] = va_codecs[0].partition('.')[0]
                    if len(va_codecs) > 1 and va_codecs[1]:
                        f['acodec'] = va_codecs[1].partition('.')[0]
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                formats.append(f)
                last_info = {}
        self._sort_formats(formats)
        return formats
707
708
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # '' -> one result; 'all' -> everything; digits -> that many results.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp to the extractor's limit, but tell the user.
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY