youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_cookiejar,
  17     compat_HTTPError,
  18     compat_http_client,
  19     compat_urllib_error,
  20     compat_urllib_parse_urlparse,
  21     compat_urlparse,
  22     compat_str,
  23 )
  24 from ..utils import (
  25     NO_DEFAULT,
  26     age_restricted,
  27     bug_reports_message,
  28     clean_html,
  29     compiled_regex_type,
  30     determine_ext,
  31     ExtractorError,
  32     fix_xml_ampersands,
  33     float_or_none,
  34     int_or_none,
  35     RegexNotFoundError,
  36     sanitize_filename,
  37     unescapeHTML,
  38 )
  39
  40
  41 class InfoExtractor(object):
  42     """Information Extractor class.
  43
  44     Information extractors are the classes that, given a URL, extract
  45     information about the video (or videos) the URL refers to. This
  46     information includes the real video URL, the video title, author and
  47     others. The information is stored in a dictionary which is then
  48     passed to the YoutubeDL. The YoutubeDL processes this
  49     information possibly downloading the video to the file system, among
  50     other possible outcomes.
  51
  52     The type field determines the type of the result.
  53     By far the most common value (and the default if _type is missing) is
  54     "video", which indicates a single video.
  55
  56     For a video, the dictionaries must include the following fields:
  57
  58     id:             Video identifier.
  59     title:          Video title, unescaped.
  60
  61     Additionally, it must contain either a formats entry or a url one:
  62
  63     formats:        A list of dictionaries for each format available, ordered
  64                     from worst to best quality.
  65
  66                     Potential fields:
  67                     * url        Mandatory. The URL of the video file
  68                     * ext        Will be calculated from url if missing
  69                     * format     A human-readable description of the format
  70                                  ("mp4 container with h264/opus").
  71                                  Calculated from the format_id, width, height.
  72                                  and format_note fields if missing.
  73                     * format_id  A short description of the format
  74                                  ("mp4_h264_opus" or "19").
  75                                 Technically optional, but strongly recommended.
  76                     * format_note Additional info about the format
  77                                  ("3D" or "DASH video")
  78                     * width      Width of the video, if known
  79                     * height     Height of the video, if known
  80                     * resolution Textual description of width and height
  81                     * tbr        Average bitrate of audio and video in KBit/s
  82                     * abr        Average audio bitrate in KBit/s
  83                     * acodec     Name of the audio codec in use
  84                     * asr        Audio sampling rate in Hertz
  85                     * vbr        Average video bitrate in KBit/s
  86                     * fps        Frame rate
  87                     * vcodec     Name of the video codec in use
  88                     * container  Name of the container format
  89                     * filesize   The number of bytes, if known in advance
  90                     * filesize_approx  An estimate for the number of bytes
  91                     * player_url SWF Player URL (used for rtmpdump).
  92                     * protocol   The protocol that will be used for the actual
  93                                  download, lower-case.
  94                                  "http", "https", "rtsp", "rtmp", "rtmpe",
  95                                  "m3u8", or "m3u8_native".
  96                     * preference Order number of this format. If this field is
  97                                  present and not None, the formats get sorted
  98                                  by this field, regardless of all other values.
  99                                  -1 for default (order by other properties),
 100                                  -2 or smaller for less than default.
 101                                  < -1000 to hide the format (if there is
 102                                     another one which is strictly better)
 103                     * language_preference  Is this in the correct requested
 104                                  language?
 105                                  10 if it's what the URL is about,
 106                                  -1 for default (don't know),
 107                                  -10 otherwise, other values reserved for now.
 108                     * quality    Order number of the video quality of this
 109                                  format, irrespective of the file format.
 110                                  -1 for default (order by other properties),
 111                                  -2 or smaller for less than default.
 112                     * source_preference  Order number for this video source
 113                                   (quality takes higher priority)
 114                                  -1 for default (order by other properties),
 115                                  -2 or smaller for less than default.
 116                     * http_headers  A dictionary of additional HTTP headers
 117                                  to add to the request.
 118                     * stretched_ratio  If given and not 1, indicates that the
 119                                  video's pixels are not square.
 120                                  width : height ratio as float.
 121                     * no_resume  The server does not support resuming the
 122                                  (HTTP or RTMP) download. Boolean.
 123
 124     url:            Final video URL.
 125     ext:            Video filename extension.
 126     format:         The video format, defaults to ext (used for --get-format)
 127     player_url:     SWF Player URL (used for rtmpdump).
 128
 129     The following fields are optional:
 130
 131     alt_title:      A secondary title of the video.
 132     display_id      An alternative identifier for the video, not necessarily
 133                     unique, but available before title. Typically, id is
 134                     something like "4234987", title "Dancing naked mole rats",
 135                     and display_id "dancing-naked-mole-rats"
 136     thumbnails:     A list of dictionaries, with the following entries:
 137                         * "id" (optional, string) - Thumbnail format ID
 138                         * "url"
 139                         * "preference" (optional, int) - quality of the image
 140                         * "width" (optional, int)
 141                         * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
 143                                         deprecated)
 144     thumbnail:      Full URL to a video thumbnail image.
 145     description:    Full video description.
 146     uploader:       Full name of the video uploader.
 147     creator:        The main artist who created the video.
 148     timestamp:      UNIX timestamp of the moment the video became available.
 149     upload_date:    Video upload date (YYYYMMDD).
 150                     If not explicitly set, calculated from timestamp.
 151     uploader_id:    Nickname or id of the video uploader.
 152     location:       Physical location where the video was filmed.
 153     subtitles:      The available subtitles as a dictionary in the format
 154                     {language: subformats}. "subformats" is a list sorted from
 155                     lower to higher preference, each element is a dictionary
 156                     with the "ext" entry and one of:
 157                         * "data": The subtitles file contents
 158                         * "url": A url pointing to the subtitles file
 159     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 160                     automatically generated captions
 161     duration:       Length of the video in seconds, as an integer.
 162     view_count:     How many users have watched the video on the platform.
 163     like_count:     Number of positive ratings of the video
 164     dislike_count:  Number of negative ratings of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
 166     comment_count:  Number of comments on the video
 167     comments:       A list of comments, each with one or more of the following
 168                     properties (all but one of text or html optional):
 169                         * "author" - human-readable name of the comment author
 170                         * "author_id" - user ID of the comment author
 171                         * "id" - Comment ID
 172                         * "html" - Comment as HTML
 173                         * "text" - Plain text of the comment
 174                         * "timestamp" - UNIX timestamp of comment
 175                         * "parent" - ID of the comment this one is replying to.
 176                                      Set to "root" to indicate that this is a
 177                                      comment to the original video.
 178     age_limit:      Age restriction for the video, as an integer (years)
 179     webpage_url:    The url to the video webpage, if given to youtube-dl it
 180                     should allow to get the same result again. (It will be set
 181                     by YoutubeDL if it's missing)
 182     categories:     A list of categories that the video falls in, for example
 183                     ["Sports", "Berlin"]
 184     is_live:        True, False, or None (=unknown). Whether this video is a
 185                     live stream that goes on instead of a fixed-length video.
 186     start_time:     Time in seconds where the reproduction should start, as
 187                     specified in the url.
 188     end_time:       Time in seconds where the reproduction should end, as
 189                     specified in the url.
 190
 191     Unless mentioned otherwise, the fields should be Unicode strings.
 192
 193     Unless mentioned otherwise, None is equivalent to absence of information.
 194
 195
 196     _type "playlist" indicates multiple videos.
 197     There must be a key "entries", which is a list, an iterable, or a PagedList
 198     object, each element of which is a valid dictionary by this specification.
 199
 200     Additionally, playlists can have "title" and "id" attributes with the same
 201     semantics as videos (see above).
 202
 203
 204     _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
 206     It must have an entries key like a playlist and contain all the keys
 207     required for a video at the same time.
 208
 209
 210     _type "url" indicates that the video must be extracted from another
 211     location, possibly by a different extractor. Its only required key is:
 212     "url" - the next URL to extract.
 213     The key "ie_key" can be set to the class name (minus the trailing "IE",
 214     e.g. "Youtube") if the extractor class is known in advance.
 215     Additionally, the dictionary may have any properties of the resolved entity
 216     known in advance, for example "title" if the title of the referred video is
 217     known ahead of time.
 218
 219
 220     _type "url_transparent" entities have the same specification as "url", but
 221     indicate that the given additional information is more precise than the one
 222     associated with the resolved URL.
 223     This is useful when a site employs a video service that hosts the video and
 224     its technical metadata, but that video service does not embed a useful
 225     title, description etc.
 226
 227
 228     Subclasses of this one should re-define the _real_initialize() and
 229     _real_extract() methods and define a _VALID_URL regexp.
 230     Probably, they should also be added to the list of extractors.
 231
 232     Finally, the _WORKING attribute should be set to False for broken IEs
 233     in order to warn the users and skip the tests.
 234     """
 235
    # True once initialize() has run _real_initialize(); __init__ resets it
    # to False on every new instance.
    _ready = False
    # Downloader object stored by set_downloader(); None until one is given.
    _downloader = None
    # Value returned by working(); to be set to False for broken IEs.
    _WORKING = True
 239
 240     def __init__(self, downloader=None):
 241         """Constructor. Receives an optional downloader."""
 242         self._ready = False
 243         self.set_downloader(downloader)
 244
 245     @classmethod
 246     def suitable(cls, url):
 247         """Receives a URL and returns True if suitable for this IE."""
 248
 249         # This does not use has/getattr intentionally - we want to know whether
 250         # we have cached the regexp for *this* class, whereas getattr would also
 251         # match the superclass
 252         if '_VALID_URL_RE' not in cls.__dict__:
 253             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 254         return cls._VALID_URL_RE.match(url) is not None
 255
 256     @classmethod
 257     def _match_id(cls, url):
 258         if '_VALID_URL_RE' not in cls.__dict__:
 259             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 260         m = cls._VALID_URL_RE.match(url)
 261         assert m
 262         return m.group('id')
 263
 264     @classmethod
 265     def working(cls):
 266         """Getter method for _WORKING."""
 267         return cls._WORKING
 268
 269     def initialize(self):
 270         """Initializes an instance (authentication, etc)."""
 271         if not self._ready:
 272             self._real_initialize()
 273             self._ready = True
 274
 275     def extract(self, url):
 276         """Extracts URL information and returns it in list of dicts."""
 277         try:
 278             self.initialize()
 279             return self._real_extract(url)
 280         except ExtractorError:
 281             raise
 282         except compat_http_client.IncompleteRead as e:
 283             raise ExtractorError('A network error has occured.', cause=e, expected=True)
 284         except (KeyError, StopIteration) as e:
 285             raise ExtractorError('An extractor error has occured.', cause=e)
 286
 287     def set_downloader(self, downloader):
 288         """Sets the downloader for this IE."""
 289         self._downloader = downloader
 290
 291     def _real_initialize(self):
 292         """Real initialization process. Redefine in subclasses."""
 293         pass
 294
 295     def _real_extract(self, url):
 296         """Real extraction process. Redefine in subclasses."""
 297         pass
 298
 299     @classmethod
 300     def ie_key(cls):
 301         """A string for getting the InfoExtractor with get_info_extractor"""
 302         return cls.__name__[:-2]
 303
 304     @property
 305     def IE_NAME(self):
 306         return type(self).__name__[:-2]
 307
 308     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 309         """ Returns the response handle """
 310         if note is None:
 311             self.report_download_webpage(video_id)
 312         elif note is not False:
 313             if video_id is None:
 314                 self.to_screen('%s' % (note,))
 315             else:
 316                 self.to_screen('%s: %s' % (video_id, note))
 317         try:
 318             return self._downloader.urlopen(url_or_request)
 319         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 320             if errnote is False:
 321                 return False
 322             if errnote is None:
 323                 errnote = 'Unable to download webpage'
 324             errmsg = '%s: %s' % (errnote, compat_str(err))
 325             if fatal:
 326                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 327             else:
 328                 self._downloader.report_warning(errmsg)
 329                 return False
 330
 331     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
 332         """ Returns a tuple (page content as string, URL handle) """
 333         # Strip hashes from the URL (#1038)
 334         if isinstance(url_or_request, (compat_str, str)):
 335             url_or_request = url_or_request.partition('#')[0]
 336
 337         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 338         if urlh is False:
 339             assert not fatal
 340             return False
 341         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 342         return (content, urlh)
 343
 344     @staticmethod
 345     def _guess_encoding_from_content(content_type, webpage_bytes):
 346         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 347         if m:
 348             encoding = m.group(1)
 349         else:
 350             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 351                           webpage_bytes[:1024])
 352             if m:
 353                 encoding = m.group(1).decode('ascii')
 354             elif webpage_bytes.startswith(b'\xff\xfe'):
 355                 encoding = 'utf-16'
 356             else:
 357                 encoding = 'utf-8'
 358
 359         return encoding
 360
 361     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 362         content_type = urlh.headers.get('Content-Type', '')
 363         webpage_bytes = urlh.read()
 364         if prefix is not None:
 365             webpage_bytes = prefix + webpage_bytes
 366         if not encoding:
 367             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 368         if self._downloader.params.get('dump_intermediate_pages', False):
 369             try:
 370                 url = url_or_request.get_full_url()
 371             except AttributeError:
 372                 url = url_or_request
 373             self.to_screen('Dumping request to ' + url)
 374             dump = base64.b64encode(webpage_bytes).decode('ascii')
 375             self._downloader.to_screen(dump)
 376         if self._downloader.params.get('write_pages', False):
 377             try:
 378                 url = url_or_request.get_full_url()
 379             except AttributeError:
 380                 url = url_or_request
 381             basen = '%s_%s' % (video_id, url)
 382             if len(basen) > 240:
 383                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 384                 basen = basen[:240 - len(h)] + h
 385             raw_filename = basen + '.dump'
 386             filename = sanitize_filename(raw_filename, restricted=True)
 387             self.to_screen('Saving request to ' + filename)
 388             # Working around MAX_PATH limitation on Windows (see
 389             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 390             if os.name == 'nt':
 391                 absfilepath = os.path.abspath(filename)
 392                 if len(absfilepath) > 259:
 393                     filename = '\\\\?\\' + absfilepath
 394             with open(filename, 'wb') as outf:
 395                 outf.write(webpage_bytes)
 396
 397         try:
 398             content = webpage_bytes.decode(encoding, 'replace')
 399         except LookupError:
 400             content = webpage_bytes.decode('utf-8', 'replace')
 401
 402         if ('<title>Access to this site is blocked</title>' in content and
 403                 'Websense' in content[:512]):
 404             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 405             blocked_iframe = self._html_search_regex(
 406                 r'<iframe src="([^"]+)"', content,
 407                 'Websense information URL', default=None)
 408             if blocked_iframe:
 409                 msg += ' Visit %s for more details' % blocked_iframe
 410             raise ExtractorError(msg, expected=True)
 411         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 412             msg = (
 413                 'Access to this webpage has been blocked by Indian censorship. '
 414                 'Use a VPN or proxy server (with --proxy) to route around it.')
 415             block_msg = self._html_search_regex(
 416                 r'</h1><p>(.*?)</p>',
 417                 content, 'block message', default=None)
 418             if block_msg:
 419                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 420             raise ExtractorError(msg, expected=True)
 421
 422         return content
 423
 424     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
 425         """ Returns the data of the page as a string """
 426         success = False
 427         try_count = 0
 428         while success is False:
 429             try:
 430                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 431                 success = True
 432             except compat_http_client.IncompleteRead as e:
 433                 try_count += 1
 434                 if try_count >= tries:
 435                     raise e
 436                 self._sleep(timeout, video_id)
 437         if res is False:
 438             return res
 439         else:
 440             content, _ = res
 441             return content
 442
 443     def _download_xml(self, url_or_request, video_id,
 444                       note='Downloading XML', errnote='Unable to download XML',
 445                       transform_source=None, fatal=True, encoding=None):
 446         """Return the xml as an xml.etree.ElementTree.Element"""
 447         xml_string = self._download_webpage(
 448             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
 449         if xml_string is False:
 450             return xml_string
 451         if transform_source:
 452             xml_string = transform_source(xml_string)
 453         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 454
 455     def _download_json(self, url_or_request, video_id,
 456                        note='Downloading JSON metadata',
 457                        errnote='Unable to download JSON metadata',
 458                        transform_source=None,
 459                        fatal=True, encoding=None):
 460         json_string = self._download_webpage(
 461             url_or_request, video_id, note, errnote, fatal=fatal,
 462             encoding=encoding)
 463         if (not fatal) and json_string is False:
 464             return None
 465         return self._parse_json(
 466             json_string, video_id, transform_source=transform_source, fatal=fatal)
 467
 468     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 469         if transform_source:
 470             json_string = transform_source(json_string)
 471         try:
 472             return json.loads(json_string)
 473         except ValueError as ve:
 474             errmsg = '%s: Failed to parse JSON ' % video_id
 475             if fatal:
 476                 raise ExtractorError(errmsg, cause=ve)
 477             else:
 478                 self.report_warning(errmsg + str(ve))
 479
 480     def report_warning(self, msg, video_id=None):
 481         idstr = '' if video_id is None else '%s: ' % video_id
 482         self._downloader.report_warning(
 483             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 484
 485     def to_screen(self, msg):
 486         """Print msg to screen, prefixing it with '[ie_name]'"""
 487         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 488
 489     def report_extraction(self, id_or_name):
 490         """Report information extraction."""
 491         self.to_screen('%s: Extracting information' % id_or_name)
 492
 493     def report_download_webpage(self, video_id):
 494         """Report webpage download."""
 495         self.to_screen('%s: Downloading webpage' % video_id)
 496
 497     def report_age_confirmation(self):
 498         """Report attempt to confirm age."""
 499         self.to_screen('Confirming age')
 500
 501     def report_login(self):
 502         """Report attempt to log in."""
 503         self.to_screen('Logging in')
 504
 505     # Methods for following #608
 506     @staticmethod
 507     def url_result(url, ie=None, video_id=None, video_title=None):
 508         """Returns a url that points to a page that should be processed"""
 509         # TODO: ie should be the class used for getting the info
 510         video_info = {'_type': 'url',
 511                       'url': url,
 512                       'ie_key': ie}
 513         if video_id is not None:
 514             video_info['id'] = video_id
 515         if video_title is not None:
 516             video_info['title'] = video_title
 517         return video_info
 518
 519     @staticmethod
 520     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 521         """Returns a playlist"""
 522         video_info = {'_type': 'playlist',
 523                       'entries': entries}
 524         if playlist_id:
 525             video_info['id'] = playlist_id
 526         if playlist_title:
 527             video_info['title'] = playlist_title
 528         if playlist_description:
 529             video_info['description'] = playlist_description
 530         return video_info
 531
 532     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 533         """
 534         Perform a regex search on the given string, using a single or a list of
 535         patterns returning the first matching group.
 536         In case of failure return a default value or raise a WARNING or a
 537         RegexNotFoundError, depending on fatal, specifying the field name.
 538         """
 539         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 540             mobj = re.search(pattern, string, flags)
 541         else:
 542             for p in pattern:
 543                 mobj = re.search(p, string, flags)
 544                 if mobj:
 545                     break
 546
 547         if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
 548             _name = '\033[0;34m%s\033[0m' % name
 549         else:
 550             _name = name
 551
 552         if mobj:
 553             if group is None:
 554                 # return the first matching group
 555                 return next(g for g in mobj.groups() if g is not None)
 556             else:
 557                 return mobj.group(group)
 558         elif default is not NO_DEFAULT:
 559             return default
 560         elif fatal:
 561             raise RegexNotFoundError('Unable to extract %s' % _name)
 562         else:
 563             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 564             return None
 565
 566     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 567         """
 568         Like _search_regex, but strips HTML tags and unescapes entities.
 569         """
 570         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 571         if res:
 572             return clean_html(res).strip()
 573         else:
 574             return res
 575
 576     def _get_login_info(self):
 577         """
 578         Get the login info as (username, password)
 579         It will look in the netrc file using the _NETRC_MACHINE value
 580         If there's no info available, return (None, None)
 581         """
 582         if self._downloader is None:
 583             return (None, None)
 584
 585         username = None
 586         password = None
 587         downloader_params = self._downloader.params
 588
 589         # Attempt to use provided username and password or .netrc data
 590         if downloader_params.get('username', None) is not None:
 591             username = downloader_params['username']
 592             password = downloader_params['password']
 593         elif downloader_params.get('usenetrc', False):
 594             try:
 595                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 596                 if info is not None:
 597                     username = info[0]
 598                     password = info[2]
 599                 else:
 600                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 601             except (IOError, netrc.NetrcParseError) as err:
 602                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 603
 604         return (username, password)
 605
 606     def _get_tfa_info(self):
 607         """
 608         Get the two-factor authentication info
 609         TODO - asking the user will be required for sms/phone verify
 610         currently just uses the command line option
 611         If there's no info available, return None
 612         """
 613         if self._downloader is None:
 614             return None
 615         downloader_params = self._downloader.params
 616
 617         if downloader_params.get('twofactor', None) is not None:
 618             return downloader_params['twofactor']
 619
 620         return None
 621
 622     # Helper functions for extracting OpenGraph info
 623     @staticmethod
 624     def _og_regexes(prop):
 625         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 626         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 627         template = r'<meta[^>]+?%s[^>]+?%s'
 628         return [
 629             template % (property_re, content_re),
 630             template % (content_re, property_re),
 631         ]
 632
 633     def _og_search_property(self, prop, html, name=None, **kargs):
 634         if name is None:
 635             name = 'OpenGraph %s' % prop
 636         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 637         if escaped is None:
 638             return None
 639         return unescapeHTML(escaped)
 640
 641     def _og_search_thumbnail(self, html, **kargs):
 642         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 643
 644     def _og_search_description(self, html, **kargs):
 645         return self._og_search_property('description', html, fatal=False, **kargs)
 646
 647     def _og_search_title(self, html, **kargs):
 648         return self._og_search_property('title', html, **kargs)
 649
 650     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 651         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 652         if secure:
 653             regexes = self._og_regexes('video:secure_url') + regexes
 654         return self._html_search_regex(regexes, html, name, **kargs)
 655
 656     def _og_search_url(self, html, **kargs):
 657         return self._og_search_property('url', html, **kargs)
 658
 659     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 660         if display_name is None:
 661             display_name = name
 662         return self._html_search_regex(
 663             r'''(?isx)<meta
 664                     (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
 665                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
 666             html, display_name, fatal=fatal, group='content', **kwargs)
 667
 668     def _dc_search_uploader(self, html):
 669         return self._html_search_meta('dc.creator', html, 'uploader')
 670
 671     def _rta_search(self, html):
 672         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 673         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 674                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 675                      html):
 676             return 18
 677         return 0
 678
 679     def _media_rating_search(self, html):
 680         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 681         rating = self._html_search_meta('rating', html)
 682
 683         if not rating:
 684             return None
 685
 686         RATING_TABLE = {
 687             'safe for kids': 0,
 688             'general': 8,
 689             '14 years': 14,
 690             'mature': 17,
 691             'restricted': 19,
 692         }
 693         return RATING_TABLE.get(rating.lower(), None)
 694
 695     def _family_friendly_search(self, html):
 696         # See http://schema.org/VideoObject
 697         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 698
 699         if not family_friendly:
 700             return None
 701
 702         RATING_TABLE = {
 703             '1': 0,
 704             'true': 0,
 705             '0': 18,
 706             'false': 18,
 707         }
 708         return RATING_TABLE.get(family_friendly.lower(), None)
 709
 710     def _twitter_search_player(self, html):
 711         return self._html_search_meta('twitter:player', html,
 712                                       'twitter card player')
 713
 714     @staticmethod
 715     def _hidden_inputs(html):
 716         return dict([
 717             (input.group('name'), input.group('value')) for input in re.finditer(
 718                 r'''(?x)
 719                     <input\s+
 720                         type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
 721                         name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
 722                         (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
 723                         value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
 724                 ''', html)
 725         ])
 726
 727     def _form_hidden_inputs(self, form_id, html):
 728         form = self._search_regex(
 729             r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
 730             html, '%s form' % form_id, group='form')
 731         return self._hidden_inputs(form)
 732
 733     def _sort_formats(self, formats, field_preference=None):
 734         if not formats:
 735             raise ExtractorError('No video formats found')
 736
 737         def _formats_key(f):
 738             # TODO remove the following workaround
 739             from ..utils import determine_ext
 740             if not f.get('ext') and 'url' in f:
 741                 f['ext'] = determine_ext(f['url'])
 742
 743             if isinstance(field_preference, (list, tuple)):
 744                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
 745
 746             preference = f.get('preference')
 747             if preference is None:
 748                 proto = f.get('protocol')
 749                 if proto is None:
 750                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 751
 752                 preference = 0 if proto in ['http', 'https'] else -0.1
 753                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 754                     preference -= 0.5
 755
 756             if f.get('vcodec') == 'none':  # audio only
 757                 if self._downloader.params.get('prefer_free_formats'):
 758                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 759                 else:
 760                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 761                 ext_preference = 0
 762                 try:
 763                     audio_ext_preference = ORDER.index(f['ext'])
 764                 except ValueError:
 765                     audio_ext_preference = -1
 766             else:
 767                 if self._downloader.params.get('prefer_free_formats'):
 768                     ORDER = ['flv', 'mp4', 'webm']
 769                 else:
 770                     ORDER = ['webm', 'flv', 'mp4']
 771                 try:
 772                     ext_preference = ORDER.index(f['ext'])
 773                 except ValueError:
 774                     ext_preference = -1
 775                 audio_ext_preference = 0
 776
 777             return (
 778                 preference,
 779                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 780                 f.get('quality') if f.get('quality') is not None else -1,
 781                 f.get('tbr') if f.get('tbr') is not None else -1,
 782                 f.get('filesize') if f.get('filesize') is not None else -1,
 783                 f.get('vbr') if f.get('vbr') is not None else -1,
 784                 f.get('height') if f.get('height') is not None else -1,
 785                 f.get('width') if f.get('width') is not None else -1,
 786                 ext_preference,
 787                 f.get('abr') if f.get('abr') is not None else -1,
 788                 audio_ext_preference,
 789                 f.get('fps') if f.get('fps') is not None else -1,
 790                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 791                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 792                 f.get('format_id') if f.get('format_id') is not None else '',
 793             )
 794         formats.sort(key=_formats_key)
 795
 796     def _check_formats(self, formats, video_id):
 797         if formats:
 798             formats[:] = filter(
 799                 lambda f: self._is_valid_url(
 800                     f['url'], video_id,
 801                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
 802                 formats)
 803
 804     def _is_valid_url(self, url, video_id, item='video'):
 805         url = self._proto_relative_url(url, scheme='http:')
 806         # For now assume non HTTP(S) URLs always valid
 807         if not (url.startswith('http://') or url.startswith('https://')):
 808             return True
 809         try:
 810             self._request_webpage(url, video_id, 'Checking %s URL' % item)
 811             return True
 812         except ExtractorError as e:
 813             if isinstance(e.cause, compat_HTTPError):
 814                 self.to_screen(
 815                     '%s: %s URL is invalid, skipping' % (video_id, item))
 816                 return False
 817             raise
 818
 819     def http_scheme(self):
 820         """ Either "http:" or "https:", depending on the user's preferences """
 821         return (
 822             'http:'
 823             if self._downloader.params.get('prefer_insecure', False)
 824             else 'https:')
 825
 826     def _proto_relative_url(self, url, scheme=None):
 827         if url is None:
 828             return url
 829         if url.startswith('//'):
 830             if scheme is None:
 831                 scheme = self.http_scheme()
 832             return scheme + url
 833         else:
 834             return url
 835
 836     def _sleep(self, timeout, video_id, msg_template=None):
 837         if msg_template is None:
 838             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 839         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 840         self.to_screen(msg)
 841         time.sleep(timeout)
 842
 843     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
 844                              transform_source=lambda s: fix_xml_ampersands(s).strip()):
 845         manifest = self._download_xml(
 846             manifest_url, video_id, 'Downloading f4m manifest',
 847             'Unable to download f4m manifest',
 848             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
 849             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
 850             transform_source=transform_source)
 851
 852         formats = []
 853         manifest_version = '1.0'
 854         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 855         if not media_nodes:
 856             manifest_version = '2.0'
 857             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
 858         for i, media_el in enumerate(media_nodes):
 859             if manifest_version == '2.0':
 860                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
 861                 if not media_url:
 862                     continue
 863                 manifest_url = (
 864                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
 865                     else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
 866                 # If media_url is itself a f4m manifest do the recursive extraction
 867                 # since bitrates in parent manifest (this one) and media_url manifest
 868                 # may differ leading to inability to resolve the format by requested
 869                 # bitrate in f4m downloader
 870                 if determine_ext(manifest_url) == 'f4m':
 871                     formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
 872                     continue
 873             tbr = int_or_none(media_el.attrib.get('bitrate'))
 874             formats.append({
 875                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
 876                 'url': manifest_url,
 877                 'ext': 'flv',
 878                 'tbr': tbr,
 879                 'width': int_or_none(media_el.attrib.get('width')),
 880                 'height': int_or_none(media_el.attrib.get('height')),
 881                 'preference': preference,
 882             })
 883         self._sort_formats(formats)
 884
 885         return formats
 886
 887     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 888                               entry_protocol='m3u8', preference=None,
 889                               m3u8_id=None, note=None, errnote=None,
 890                               fatal=True):
 891
 892         formats = [{
 893             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
 894             'url': m3u8_url,
 895             'ext': ext,
 896             'protocol': 'm3u8',
 897             'preference': preference - 1 if preference else -1,
 898             'resolution': 'multiple',
 899             'format_note': 'Quality selection URL',
 900         }]
 901
 902         format_url = lambda u: (
 903             u
 904             if re.match(r'^https?://', u)
 905             else compat_urlparse.urljoin(m3u8_url, u))
 906
 907         m3u8_doc = self._download_webpage(
 908             m3u8_url, video_id,
 909             note=note or 'Downloading m3u8 information',
 910             errnote=errnote or 'Failed to download m3u8 information',
 911             fatal=fatal)
 912         if m3u8_doc is False:
 913             return m3u8_doc
 914         last_info = None
 915         last_media = None
 916         kv_rex = re.compile(
 917             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 918         for line in m3u8_doc.splitlines():
 919             if line.startswith('#EXT-X-STREAM-INF:'):
 920                 last_info = {}
 921                 for m in kv_rex.finditer(line):
 922                     v = m.group('val')
 923                     if v.startswith('"'):
 924                         v = v[1:-1]
 925                     last_info[m.group('key')] = v
 926             elif line.startswith('#EXT-X-MEDIA:'):
 927                 last_media = {}
 928                 for m in kv_rex.finditer(line):
 929                     v = m.group('val')
 930                     if v.startswith('"'):
 931                         v = v[1:-1]
 932                     last_media[m.group('key')] = v
 933             elif line.startswith('#') or not line.strip():
 934                 continue
 935             else:
 936                 if last_info is None:
 937                     formats.append({'url': format_url(line)})
 938                     continue
 939                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 940                 format_id = []
 941                 if m3u8_id:
 942                     format_id.append(m3u8_id)
 943                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
 944                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
 945                 f = {
 946                     'format_id': '-'.join(format_id),
 947                     'url': format_url(line.strip()),
 948                     'tbr': tbr,
 949                     'ext': ext,
 950                     'protocol': entry_protocol,
 951                     'preference': preference,
 952                 }
 953                 codecs = last_info.get('CODECS')
 954                 if codecs:
 955                     # TODO: looks like video codec is not always necessarily goes first
 956                     va_codecs = codecs.split(',')
 957                     if va_codecs[0]:
 958                         f['vcodec'] = va_codecs[0].partition('.')[0]
 959                     if len(va_codecs) > 1 and va_codecs[1]:
 960                         f['acodec'] = va_codecs[1].partition('.')[0]
 961                 resolution = last_info.get('RESOLUTION')
 962                 if resolution:
 963                     width_str, height_str = resolution.split('x')
 964                     f['width'] = int(width_str)
 965                     f['height'] = int(height_str)
 966                 if last_media is not None:
 967                     f['m3u8_media'] = last_media
 968                     last_media = None
 969                 formats.append(f)
 970                 last_info = {}
 971         self._sort_formats(formats)
 972         return formats
 973
 974     # TODO: improve extraction
 975     def _extract_smil_formats(self, smil_url, video_id, fatal=True):
 976         smil = self._download_xml(
 977             smil_url, video_id, 'Downloading SMIL file',
 978             'Unable to download SMIL file', fatal=fatal)
 979         if smil is False:
 980             assert not fatal
 981             return []
 982
 983         base = smil.find('./head/meta').get('base')
 984
 985         formats = []
 986         rtmp_count = 0
 987         if smil.findall('./body/seq/video'):
 988             video = smil.findall('./body/seq/video')[0]
 989             fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
 990             formats.extend(fmts)
 991         else:
 992             for video in smil.findall('./body/switch/video'):
 993                 fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
 994                 formats.extend(fmts)
 995
 996         self._sort_formats(formats)
 997
 998         return formats
 999
1000     def _parse_smil_video(self, video, video_id, base, rtmp_count):
1001         src = video.get('src')
1002         if not src:
1003             return [], rtmp_count
1004         bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
1005         width = int_or_none(video.get('width'))
1006         height = int_or_none(video.get('height'))
1007         proto = video.get('proto')
1008         if not proto:
1009             if base:
1010                 if base.startswith('rtmp'):
1011                     proto = 'rtmp'
1012                 elif base.startswith('http'):
1013                     proto = 'http'
1014         ext = video.get('ext')
1015         if proto == 'm3u8':
1016             return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
1017         elif proto == 'rtmp':
1018             rtmp_count += 1
1019             streamer = video.get('streamer') or base
1020             return ([{
1021                 'url': streamer,
1022                 'play_path': src,
1023                 'ext': 'flv',
1024                 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1025                 'tbr': bitrate,
1026                 'width': width,
1027                 'height': height,
1028             }], rtmp_count)
1029         elif proto.startswith('http'):
1030             return ([{
1031                 'url': base + src,
1032                 'ext': ext or 'flv',
1033                 'tbr': bitrate,
1034                 'width': width,
1035                 'height': height,
1036             }], rtmp_count)
1037
1038     def _live_title(self, name):
1039         """ Generate the title for a live video """
1040         now = datetime.datetime.now()
1041         now_str = now.strftime("%Y-%m-%d %H:%M")
1042         return name + ' ' + now_str
1043
1044     def _int(self, v, name, fatal=False, **kwargs):
1045         res = int_or_none(v, **kwargs)
1046         if 'get_attr' in kwargs:
1047             print(getattr(v, kwargs['get_attr']))
1048         if res is None:
1049             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1050             if fatal:
1051                 raise ExtractorError(msg)
1052             else:
1053                 self._downloader.report_warning(msg)
1054         return res
1055
1056     def _float(self, v, name, fatal=False, **kwargs):
1057         res = float_or_none(v, **kwargs)
1058         if res is None:
1059             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1060             if fatal:
1061                 raise ExtractorError(msg)
1062             else:
1063                 self._downloader.report_warning(msg)
1064         return res
1065
1066     def _set_cookie(self, domain, name, value, expire_time=None):
1067         cookie = compat_cookiejar.Cookie(
1068             0, name, value, None, None, domain, None,
1069             None, '/', True, False, expire_time, '', None, None, None)
1070         self._downloader.cookiejar.set_cookie(cookie)
1071
1072     def get_testcases(self, include_onlymatching=False):
1073         t = getattr(self, '_TEST', None)
1074         if t:
1075             assert not hasattr(self, '_TESTS'), \
1076                 '%s has _TEST and _TESTS' % type(self).__name__
1077             tests = [t]
1078         else:
1079             tests = getattr(self, '_TESTS', [])
1080         for t in tests:
1081             if not include_onlymatching and t.get('only_matching', False):
1082                 continue
1083             t['name'] = type(self).__name__[:-len('IE')]
1084             yield t
1085
1086     def is_suitable(self, age_limit):
1087         """ Test whether the extractor is generally suitable for the given
1088         age limit (i.e. pornographic sites are not, all others usually are) """
1089
1090         any_restricted = False
1091         for tc in self.get_testcases(include_onlymatching=False):
1092             if 'playlist' in tc:
1093                 tc = tc['playlist'][0]
1094             is_restricted = age_restricted(
1095                 tc.get('info_dict', {}).get('age_limit'), age_limit)
1096             if not is_restricted:
1097                 return True
1098             any_restricted = any_restricted or is_restricted
1099         return not any_restricted
1100
1101     def extract_subtitles(self, *args, **kwargs):
1102         if (self._downloader.params.get('writesubtitles', False) or
1103                 self._downloader.params.get('listsubtitles')):
1104             return self._get_subtitles(*args, **kwargs)
1105         return {}
1106
1107     def _get_subtitles(self, *args, **kwargs):
1108         raise NotImplementedError("This method must be implemented by subclasses")
1109
1110     def extract_automatic_captions(self, *args, **kwargs):
1111         if (self._downloader.params.get('writeautomaticsub', False) or
1112                 self._downloader.params.get('listsubtitles')):
1113             return self._get_automatic_captions(*args, **kwargs)
1114         return {}
1115
1116     def _get_automatic_captions(self, *args, **kwargs):
1117         raise NotImplementedError("This method must be implemented by subclasses")
1118
1119
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching the search pseudo-URLs of this class."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search pseudo-URL of this class."""
        mobj = re.match(cls._make_valid_url(), url)
        return mobj is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS and a
        number for that many, capped (with a warning) at _MAX_RESULTS.
        Raises ExtractorError for a malformed query or a non-positive count.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY."""
        return self._SEARCH_KEY