]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
[youtube] Extract average rating (closes #2362)
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
f1a9d64e
PH
1from __future__ import unicode_literals
2
d6983cb4 3import base64
f4b1c7ad 4import datetime
3ec05685 5import hashlib
3d3538e4 6import json
4094b6e3 7import netrc
d6983cb4
PH
8import os
9import re
10import socket
11import sys
4094b6e3 12import time
267ed0c5 13import xml.etree.ElementTree
d6983cb4 14
8c25f81b 15from ..compat import (
42939b61 16 compat_cookiejar,
96a53167 17 compat_HTTPError,
d6983cb4
PH
18 compat_http_client,
19 compat_urllib_error,
c7deaa4c 20 compat_urllib_parse_urlparse,
f0b5d6af 21 compat_urlparse,
d6983cb4 22 compat_str,
8c25f81b
PH
23)
24from ..utils import (
05900629 25 age_restricted,
d6983cb4
PH
26 clean_html,
27 compiled_regex_type,
28 ExtractorError,
b14f3a4c 29 float_or_none,
96a53167 30 HEADRequest,
31bb8d3f 31 int_or_none,
55b3e45b 32 RegexNotFoundError,
d41e6efc 33 sanitize_filename,
f38de77f 34 unescapeHTML,
d6983cb4 35)
46374a56 36_NO_DEFAULT = object()
d6983cb4 37
dca08720 38
d6983cb4
PH
39class InfoExtractor(object):
40 """Information Extractor class.
41
42 Information extractors are the classes that, given a URL, extract
43 information about the video (or videos) the URL refers to. This
44 information includes the real video URL, the video title, author and
45 others. The information is stored in a dictionary which is then
5d380852 46 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
47 information possibly downloading the video to the file system, among
48 other possible outcomes.
49
fed5d032
PH
50 The type field determines the type of the result.
51 By far the most common value (and the default if _type is missing) is
52 "video", which indicates a single video.
53
54 For a video, the dictionaries must include the following fields:
d6983cb4
PH
55
56 id: Video identifier.
d6983cb4 57 title: Video title, unescaped.
d67b0b15 58
f49d89ee 59 Additionally, it must contain either a formats entry or a url one:
d67b0b15 60
f49d89ee
PH
61 formats: A list of dictionaries for each format available, ordered
62 from worst to best quality.
63
64 Potential fields:
d67b0b15
PH
65 * url Mandatory. The URL of the video file
66 * ext Will be calculated from url if missing
67 * format A human-readable description of the format
68 ("mp4 container with h264/opus").
69 Calculated from the format_id, width, height.
70 and format_note fields if missing.
71 * format_id A short description of the format
5d4f3985
PH
72 ("mp4_h264_opus" or "19").
73 Technically optional, but strongly recommended.
d67b0b15
PH
74 * format_note Additional info about the format
75 ("3D" or "DASH video")
76 * width Width of the video, if known
77 * height Height of the video, if known
f49d89ee 78 * resolution Textual description of width and height
7217e148 79 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
80 * abr Average audio bitrate in KBit/s
81 * acodec Name of the audio codec in use
dd27fd17 82 * asr Audio sampling rate in Hertz
d67b0b15 83 * vbr Average video bitrate in KBit/s
fbb21cf5 84 * fps Frame rate
d67b0b15 85 * vcodec Name of the video codec in use
1394ce65 86 * container Name of the container format
d67b0b15 87 * filesize The number of bytes, if known in advance
9732d77e 88 * filesize_approx An estimate for the number of bytes
d67b0b15 89 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
90 * protocol The protocol that will be used for the actual
91 download, lower-case.
b04b8852
PH
92 "http", "https", "rtsp", "rtmp", "rtmpe",
93 "m3u8", or "m3u8_native".
f49d89ee 94 * preference Order number of this format. If this field is
08d13955 95 present and not None, the formats get sorted
38d63d84 96 by this field, regardless of all other values.
f49d89ee
PH
97 -1 for default (order by other properties),
98 -2 or smaller for less than default.
e65566a9
PH
99 < -1000 to hide the format (if there is
100 another one which is strictly better)
aff2f4f4
PH
101 * language_preference Is this in the correct requested
102 language?
103 10 if it's what the URL is about,
104 -1 for default (don't know),
105 -10 otherwise, other values reserved for now.
5d73273f
PH
106 * quality Order number of the video quality of this
107 format, irrespective of the file format.
108 -1 for default (order by other properties),
109 -2 or smaller for less than default.
c64ed2a3
PH
110 * source_preference Order number for this video source
111 (quality takes higher priority)
112 -1 for default (order by other properties),
113 -2 or smaller for less than default.
d769be6c
PH
114 * http_method HTTP method to use for the download.
115 * http_headers A dictionary of additional HTTP headers
116 to add to the request.
117 * http_post_data Additional data to send with a POST
118 request.
6271f1ca 119 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
120 video's pixels are not square.
121 width : height ratio as float.
122 * no_resume The server does not support resuming the
123 (HTTP or RTMP) download. Boolean.
124
c0ba0f48 125 url: Final video URL.
d6983cb4 126 ext: Video filename extension.
d67b0b15
PH
127 format: The video format, defaults to ext (used for --get-format)
128 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 129
d6983cb4
PH
130 The following fields are optional:
131
f5e43bc6 132 alt_title: A secondary title of the video.
0afef30b
PH
133 display_id An alternative identifier for the video, not necessarily
134 unique, but available before title. Typically, id is
135 something like "4234987", title "Dancing naked mole rats",
136 and display_id "dancing-naked-mole-rats"
d5519808 137 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 138 * "id" (optional, string) - Thumbnail format ID
d5519808 139 * "url"
cfb56d1a 140 * "preference" (optional, int) - quality of the image
d5519808
PH
141 * "width" (optional, int)
142 * "height" (optional, int)
143 * "resolution" (optional, string "{width}x{height}",
144 deprecated)
d6983cb4 145 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 146 description: Full video description.
d6983cb4 147 uploader: Full name of the video uploader.
9bb8e0a3 148 creator: The main artist who created the video.
955c4514 149 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 150 upload_date: Video upload date (YYYYMMDD).
955c4514 151 If not explicitly set, calculated from timestamp.
d6983cb4 152 uploader_id: Nickname or id of the video uploader.
da9ec3b9 153 location: Physical location where the video was filmed.
5d51a883
JMF
154 subtitles: The subtitle file contents as a dictionary in the format
155 {language: subtitles}.
c0ba0f48 156 duration: Length of the video in seconds, as an integer.
f3d29461 157 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
158 like_count: Number of positive ratings of the video
159 dislike_count: Number of negative ratings of the video
2d30521a 160 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 161 comment_count: Number of comments on the video
dd622d7c
PH
162 comments: A list of comments, each with one or more of the following
163 properties (all but one of text or html optional):
164 * "author" - human-readable name of the comment author
165 * "author_id" - user ID of the comment author
166 * "id" - Comment ID
167 * "html" - Comment as HTML
168 * "text" - Plain text of the comment
169 * "timestamp" - UNIX timestamp of comment
170 * "parent" - ID of the comment this one is replying to.
171 Set to "root" to indicate that this is a
172 comment to the original video.
8dbe9899 173 age_limit: Age restriction for the video, as an integer (years)
9103bbc5
JMF
174 webpage_url: The url to the video webpage, if given to youtube-dl it
175 should allow to get the same result again. (It will be set
176 by YoutubeDL if it's missing)
ad3bc6ac
PH
177 categories: A list of categories that the video falls in, for example
178 ["Sports", "Berlin"]
7267bd53
PH
179 is_live: True, False, or None (=unknown). Whether this video is a
180 live stream that goes on instead of a fixed-length video.
d6983cb4 181
deefc05b 182 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 183
d838b1bd
PH
184 Unless mentioned otherwise, None is equivalent to absence of information.
185
fed5d032
PH
186
187 _type "playlist" indicates multiple videos.
b82f815f
PH
188 There must be a key "entries", which is a list, an iterable, or a PagedList
189 object, each element of which is a valid dictionary by this specification.
fed5d032
PH
190
191 Additionally, playlists can have "title" and "id" attributes with the same
192 semantics as videos (see above).
193
194
195 _type "multi_video" indicates that there are multiple videos that
196 form a single show, for example multiple acts of an opera or TV episode.
197 It must have an entries key like a playlist and contain all the keys
198 required for a video at the same time.
199
200
201 _type "url" indicates that the video must be extracted from another
202 location, possibly by a different extractor. Its only required key is:
203 "url" - the next URL to extract.
f58766ce
PH
204 The key "ie_key" can be set to the class name (minus the trailing "IE",
205 e.g. "Youtube") if the extractor class is known in advance.
206 Additionally, the dictionary may have any properties of the resolved entity
207 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
208 known ahead of time.
209
210
211 _type "url_transparent" entities have the same specification as "url", but
212 indicate that the given additional information is more precise than the one
213 associated with the resolved URL.
214 This is useful when a site employs a video service that hosts the video and
215 its technical metadata, but that video service does not embed a useful
216 title, description etc.
217
218
d6983cb4
PH
219 Subclasses of this one should re-define the _real_initialize() and
220 _real_extract() methods and define a _VALID_URL regexp.
221 Probably, they should also be added to the list of extractors.
222
d6983cb4
PH
223 Finally, the _WORKING attribute should be set to False for broken IEs
224 in order to warn the users and skip the tests.
225 """
226
# Becomes True once initialize() has run _real_initialize() for an instance.
_ready = False
# Downloader object the IE talks to; assigned through set_downloader().
_downloader = None
# Set to False in subclasses for broken IEs (warns users, skips the tests).
_WORKING = True
230
231 def __init__(self, downloader=None):
232 """Constructor. Receives an optional downloader."""
233 self._ready = False
234 self.set_downloader(downloader)
235
@classmethod
def suitable(cls, url):
    """Tell whether this IE is able to handle the given URL.

    Returns True if url matches the class's _VALID_URL pattern.
    """
    # The class __dict__ is consulted directly on purpose: hasattr/getattr
    # would also find a pattern cached on a superclass, while the compiled
    # regexp has to be cached for *this* class.
    try:
        valid_url_re = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        valid_url_re = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return valid_url_re.match(url) is not None
d6983cb4 246
ed9266db
PH
247 @classmethod
248 def _match_id(cls, url):
249 if '_VALID_URL_RE' not in cls.__dict__:
250 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
251 m = cls._VALID_URL_RE.match(url)
252 assert m
253 return m.group('id')
254
d6983cb4
PH
@classmethod
def working(cls):
    """Return the class's _WORKING flag."""
    is_working = cls._WORKING
    return is_working
259
def initialize(self):
    """Run the one-time setup (authentication, etc) of this instance.

    _real_initialize() is invoked only on the first call; later calls are
    no-ops because the instance is then flagged as ready.
    """
    if self._ready:
        return
    self._real_initialize()
    self._ready = True
265
def extract(self, url):
    """Initialize the extractor if needed and return _real_extract(url).

    ExtractorError passes through untouched.  An incomplete HTTP read is
    re-raised as an expected ExtractorError; KeyError and StopIteration
    escaping from the extraction are re-raised as (unexpected)
    ExtractorError with the original exception as cause.
    """
    try:
        self.initialize()
        info = self._real_extract(url)
    except ExtractorError:
        raise
    except compat_http_client.IncompleteRead as err:
        raise ExtractorError('A network error has occured.', cause=err, expected=True)
    except (KeyError, StopIteration) as err:
        raise ExtractorError('An extractor error has occured.', cause=err)
    else:
        return info
d6983cb4
PH
277
def set_downloader(self, downloader):
    """Attach downloader to this IE (stored as self._downloader)."""
    self._downloader = downloader
281
282 def _real_initialize(self):
283 """Real initialization process. Redefine in subclasses."""
284 pass
285
286 def _real_extract(self, url):
287 """Real extraction process. Redefine in subclasses."""
288 pass
289
56c73665
JMF
@classmethod
def ie_key(cls):
    """Return the key of this IE: the class name minus its last two characters.

    The result is the string to use for getting the InfoExtractor with
    get_info_extractor.
    """
    class_name = cls.__name__
    return class_name[:-2]
294
d6983cb4
PH
@property
def IE_NAME(self):
    """Name of this IE: the instance's class name minus its last two characters."""
    class_name = type(self).__name__
    return class_name[:-2]
298
7cc3570e 299 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4
PH
300 """ Returns the response handle """
301 if note is None:
302 self.report_download_webpage(video_id)
303 elif note is not False:
7cc3570e 304 if video_id is None:
f1a9d64e 305 self.to_screen('%s' % (note,))
7cc3570e 306 else:
f1a9d64e 307 self.to_screen('%s: %s' % (video_id, note))
d6983cb4 308 try:
dca08720 309 return self._downloader.urlopen(url_or_request)
d6983cb4 310 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3
PH
311 if errnote is False:
312 return False
d6983cb4 313 if errnote is None:
f1a9d64e
PH
314 errnote = 'Unable to download webpage'
315 errmsg = '%s: %s' % (errnote, compat_str(err))
7cc3570e
PH
316 if fatal:
317 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
318 else:
319 self._downloader.report_warning(errmsg)
320 return False
d6983cb4 321
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """Download a page and return the tuple (page content as string, URL handle).

    Returns False instead if the request failed and fatal is false.
    """
    # Strip hashes from the URL (#1038)
    string_types = (compat_str, str)
    if isinstance(url_or_request, string_types):
        url_or_request, _, _ = url_or_request.partition('#')

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        # A fatal failure would have raised inside _request_webpage
        assert not fatal
        return False
    return (
        self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal),
        urlh)
334
4e262a88 335 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
d6983cb4 336 content_type = urlh.headers.get('Content-Type', '')
f143d86a 337 webpage_bytes = urlh.read()
4e262a88
PH
338 if prefix is not None:
339 webpage_bytes = prefix + webpage_bytes
d6983cb4
PH
340 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
341 if m:
342 encoding = m.group(1)
343 else:
0d75ae2c 344 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
345 webpage_bytes[:1024])
346 if m:
347 encoding = m.group(1).decode('ascii')
b60016e8
PH
348 elif webpage_bytes.startswith(b'\xff\xfe'):
349 encoding = 'utf-16'
f143d86a
PH
350 else:
351 encoding = 'utf-8'
d6983cb4
PH
352 if self._downloader.params.get('dump_intermediate_pages', False):
353 try:
354 url = url_or_request.get_full_url()
355 except AttributeError:
356 url = url_or_request
f1a9d64e 357 self.to_screen('Dumping request to ' + url)
d6983cb4
PH
358 dump = base64.b64encode(webpage_bytes).decode('ascii')
359 self._downloader.to_screen(dump)
d41e6efc
PH
360 if self._downloader.params.get('write_pages', False):
361 try:
362 url = url_or_request.get_full_url()
363 except AttributeError:
364 url = url_or_request
5afa7f8b 365 basen = '%s_%s' % (video_id, url)
c1bce22f 366 if len(basen) > 240:
f1a9d64e 367 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
c1bce22f
PH
368 basen = basen[:240 - len(h)] + h
369 raw_filename = basen + '.dump'
d41e6efc 370 filename = sanitize_filename(raw_filename, restricted=True)
f1a9d64e 371 self.to_screen('Saving request to ' + filename)
5f58165d
S
372 # Working around MAX_PATH limitation on Windows (see
373 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
374 if os.name == 'nt':
375 absfilepath = os.path.abspath(filename)
376 if len(absfilepath) > 259:
377 filename = '\\\\?\\' + absfilepath
d41e6efc
PH
378 with open(filename, 'wb') as outf:
379 outf.write(webpage_bytes)
380
ec0fafbb
AA
381 try:
382 content = webpage_bytes.decode(encoding, 'replace')
383 except LookupError:
384 content = webpage_bytes.decode('utf-8', 'replace')
2410c43d 385
f1a9d64e
PH
386 if ('<title>Access to this site is blocked</title>' in content and
387 'Websense' in content[:512]):
388 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
2410c43d
PH
389 blocked_iframe = self._html_search_regex(
390 r'<iframe src="([^"]+)"', content,
f1a9d64e 391 'Websense information URL', default=None)
2410c43d 392 if blocked_iframe:
f1a9d64e 393 msg += ' Visit %s for more details' % blocked_iframe
2410c43d
PH
394 raise ExtractorError(msg, expected=True)
395
23be51d8 396 return content
d6983cb4 397
995ad69c 398 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
d6983cb4 399 """ Returns the data of the page as a string """
995ad69c
TF
400 success = False
401 try_count = 0
402 while success is False:
403 try:
404 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
405 success = True
406 except compat_http_client.IncompleteRead as e:
407 try_count += 1
408 if try_count >= tries:
409 raise e
410 self._sleep(timeout, video_id)
7cc3570e
PH
411 if res is False:
412 return res
413 else:
414 content, _ = res
415 return content
d6983cb4 416
2a275ab0 417 def _download_xml(self, url_or_request, video_id,
f1a9d64e 418 note='Downloading XML', errnote='Unable to download XML',
28746fbd 419 transform_source=None, fatal=True):
267ed0c5 420 """Return the xml as an xml.etree.ElementTree.Element"""
28746fbd
PH
421 xml_string = self._download_webpage(
422 url_or_request, video_id, note, errnote, fatal=fatal)
423 if xml_string is False:
424 return xml_string
e2b38da9
PH
425 if transform_source:
426 xml_string = transform_source(xml_string)
267ed0c5
JMF
427 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
428
3d3538e4 429 def _download_json(self, url_or_request, video_id,
f1a9d64e
PH
430 note='Downloading JSON metadata',
431 errnote='Unable to download JSON metadata',
b090af59
PH
432 transform_source=None,
433 fatal=True):
434 json_string = self._download_webpage(
435 url_or_request, video_id, note, errnote, fatal=fatal)
436 if (not fatal) and json_string is False:
437 return None
ebb64199
TF
438 return self._parse_json(
439 json_string, video_id, transform_source=transform_source, fatal=fatal)
440
441 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
442 if transform_source:
443 json_string = transform_source(json_string)
3d3538e4
PH
444 try:
445 return json.loads(json_string)
446 except ValueError as ve:
e7b6d122
PH
447 errmsg = '%s: Failed to parse JSON ' % video_id
448 if fatal:
449 raise ExtractorError(errmsg, cause=ve)
450 else:
451 self.report_warning(errmsg + str(ve))
3d3538e4 452
def report_warning(self, msg, video_id=None):
    """Report a warning through the downloader, prefixed with '[ie_name]'.

    If video_id is given it is put in front of the message as well.
    """
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    text = '[%s] %s%s' % (self.IE_NAME, idstr, msg)
    self._downloader.report_warning(text)
f45f96f8 457
d6983cb4
PH
def to_screen(self, msg):
    """Print msg through the downloader, prefixing it with '[ie_name]'."""
    line = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(line)
d6983cb4
PH
461
def report_extraction(self, id_or_name):
    """Announce on screen that information is being extracted for id_or_name."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
d6983cb4
PH
465
def report_download_webpage(self, video_id):
    """Announce on screen that the webpage of video_id is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
d6983cb4
PH
469
def report_age_confirmation(self):
    """Announce on screen that the age is being confirmed."""
    message = 'Confirming age'
    self.to_screen(message)
d6983cb4 473
fc79158d
JMF
def report_login(self):
    """Announce on screen that a login is being attempted."""
    message = 'Logging in'
    self.to_screen(message)
fc79158d 477
5f6a1245 478 # Methods for following #608
c0d0b01f
JMF
@staticmethod
def url_result(url, ie=None, video_id=None):
    """Build a "url" result pointing to a page that should be processed.

    ie is stored as the 'ie_key' entry; 'id' is only included when
    video_id is not None.
    """
    # TODO: ie should be the class used for getting the info
    result = {
        '_type': 'url',
        'url': url,
        'ie_key': ie,
    }
    if video_id is not None:
        result['id'] = video_id
    return result
5f6a1245 489
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Build a "playlist" result holding entries.

    'id', 'title' and 'description' are only included when the matching
    argument is truthy.
    """
    result = {
        '_type': 'playlist',
        'entries': entries,
    }
    optional_fields = (
        ('id', playlist_id),
        ('title', playlist_title),
        ('description', playlist_description),
    )
    for key, value in optional_fields:
        if value:
            result[key] = value
    return result
502
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    pattern: a pattern (string or compiled regex) or an iterable of
        patterns which are tried in order until one matches.
    group: group (number or name) to return; None returns the first
        group of the match that is not None.
    default: value to return when nothing matches.  If it is not given,
        fatal decides between raising RegexNotFoundError and reporting a
        warning and returning None.
    """
    # Must be bound up front: with an empty collection of patterns the loop
    # below never assigns it, which used to end in an UnboundLocalError
    # instead of the regular "not found" handling.
    mobj = None
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name in messages when writing to a color terminal
    if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        else:
            return mobj.group(group)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s; '
                                        'please report this issue on http://yt-dl.org/bug' % _name)
        return None
537
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Same as _search_regex, except that a truthy result is passed through
    clean_html and stripped of surrounding whitespace.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not res:
        # Nothing to clean (None, a falsy default, an empty match)
        return res
    return clean_html(res).strip()
547
fc79158d
JMF
548 def _get_login_info(self):
549 """
550 Get the the login info as (username, password)
551 It will look in the netrc file using the _NETRC_MACHINE value
552 If there's no info available, return (None, None)
553 """
554 if self._downloader is None:
555 return (None, None)
556
557 username = None
558 password = None
559 downloader_params = self._downloader.params
560
561 # Attempt to use provided username and password or .netrc data
562 if downloader_params.get('username', None) is not None:
563 username = downloader_params['username']
564 password = downloader_params['password']
565 elif downloader_params.get('usenetrc', False):
566 try:
567 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
568 if info is not None:
569 username = info[0]
570 password = info[2]
571 else:
572 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
573 except (IOError, netrc.NetrcParseError) as err:
f1a9d64e 574 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
5f6a1245 575
fc79158d
JMF
576 return (username, password)
577
83317f69 578 def _get_tfa_info(self):
579 """
580 Get the two-factor authentication info
581 TODO - asking the user will be required for sms/phone verify
582 currently just uses the command line option
583 If there's no info available, return None
584 """
585 if self._downloader is None:
83317f69 586 return None
587 downloader_params = self._downloader.params
588
589 if downloader_params.get('twofactor', None) is not None:
590 return downloader_params['twofactor']
591
83317f69 592 return None
593
46720279
JMF
594 # Helper functions for extracting OpenGraph info
595 @staticmethod
ab2d5247 596 def _og_regexes(prop):
c1206423 597 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
9887c9b2 598 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
78fb87b2 599 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 600 return [
78fb87b2
JMF
601 template % (property_re, content_re),
602 template % (content_re, property_re),
ab2d5247 603 ]
46720279 604
3c4e6d83 605 def _og_search_property(self, prop, html, name=None, **kargs):
46720279 606 if name is None:
3c4e6d83 607 name = 'OpenGraph %s' % prop
ab2d5247 608 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
609 if escaped is None:
610 return None
611 return unescapeHTML(escaped)
46720279
JMF
612
613 def _og_search_thumbnail(self, html, **kargs):
f1a9d64e 614 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
46720279
JMF
615
616 def _og_search_description(self, html, **kargs):
617 return self._og_search_property('description', html, fatal=False, **kargs)
618
619 def _og_search_title(self, html, **kargs):
620 return self._og_search_property('title', html, **kargs)
621
8ffa13e0 622 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
623 regexes = self._og_regexes('video') + self._og_regexes('video:url')
624 if secure:
625 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 626 return self._html_search_regex(regexes, html, name, **kargs)
46720279 627
78338f71
JMF
628 def _og_search_url(self, html, **kargs):
629 return self._og_search_property('url', html, **kargs)
630
40c696e5 631 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
59040888
PH
632 if display_name is None:
633 display_name = name
634 return self._html_search_regex(
6c6f1408 635 r'''(?isx)<meta
711ede6e 636 (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
bec22481 637 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
711ede6e 638 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
639
640 def _dc_search_uploader(self, html):
641 return self._html_search_meta('dc.creator', html, 'uploader')
642
8dbe9899
PH
643 def _rta_search(self, html):
644 # See http://www.rtalabel.org/index.php?content=howtofaq#single
645 if re.search(r'(?ix)<meta\s+name="rating"\s+'
646 r' content="RTA-5042-1996-1400-1577-RTA"',
647 html):
648 return 18
649 return 0
650
59040888
PH
651 def _media_rating_search(self, html):
652 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
653 rating = self._html_search_meta('rating', html)
654
655 if not rating:
656 return None
657
658 RATING_TABLE = {
659 'safe for kids': 0,
660 'general': 8,
661 '14 years': 14,
662 'mature': 17,
663 'restricted': 19,
664 }
665 return RATING_TABLE.get(rating.lower(), None)
666
69319969
NJ
667 def _family_friendly_search(self, html):
668 # See http://schema.org/VideoObj
669 family_friendly = self._html_search_meta('isFamilyFriendly', html)
670
671 if not family_friendly:
672 return None
673
674 RATING_TABLE = {
675 '1': 0,
676 'true': 0,
677 '0': 18,
678 'false': 18,
679 }
680 return RATING_TABLE.get(family_friendly.lower(), None)
681
0c708f11
JMF
682 def _twitter_search_player(self, html):
683 return self._html_search_meta('twitter:player', html,
9e1a5b84 684 'twitter card player')
0c708f11 685
4bcc7bd1 686 def _sort_formats(self, formats):
7e8caf30 687 if not formats:
f1a9d64e 688 raise ExtractorError('No video formats found')
7e8caf30 689
4bcc7bd1 690 def _formats_key(f):
e6812ac9
PH
691 # TODO remove the following workaround
692 from ..utils import determine_ext
693 if not f.get('ext') and 'url' in f:
694 f['ext'] = determine_ext(f['url'])
695
4bcc7bd1
PH
696 preference = f.get('preference')
697 if preference is None:
c7deaa4c
PH
698 proto = f.get('protocol')
699 if proto is None:
700 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
701
702 preference = 0 if proto in ['http', 'https'] else -0.1
4bcc7bd1
PH
703 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
704 preference -= 0.5
705
706 if f.get('vcodec') == 'none': # audio only
707 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 708 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
4bcc7bd1 709 else:
f1a9d64e 710 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
4bcc7bd1
PH
711 ext_preference = 0
712 try:
713 audio_ext_preference = ORDER.index(f['ext'])
714 except ValueError:
715 audio_ext_preference = -1
716 else:
717 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 718 ORDER = ['flv', 'mp4', 'webm']
4bcc7bd1 719 else:
f1a9d64e 720 ORDER = ['webm', 'flv', 'mp4']
4bcc7bd1
PH
721 try:
722 ext_preference = ORDER.index(f['ext'])
723 except ValueError:
724 ext_preference = -1
725 audio_ext_preference = 0
726
727 return (
728 preference,
aff2f4f4 729 f.get('language_preference') if f.get('language_preference') is not None else -1,
5d73273f 730 f.get('quality') if f.get('quality') is not None else -1,
9933b574 731 f.get('tbr') if f.get('tbr') is not None else -1,
4bcc7bd1 732 f.get('vbr') if f.get('vbr') is not None else -1,
1a6373ef
PH
733 f.get('height') if f.get('height') is not None else -1,
734 f.get('width') if f.get('width') is not None else -1,
1e1896f2 735 ext_preference,
4bcc7bd1
PH
736 f.get('abr') if f.get('abr') is not None else -1,
737 audio_ext_preference,
2c8e03d9 738 f.get('fps') if f.get('fps') is not None else -1,
4bcc7bd1 739 f.get('filesize') if f.get('filesize') is not None else -1,
9732d77e 740 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
c64ed2a3 741 f.get('source_preference') if f.get('source_preference') is not None else -1,
4bcc7bd1
PH
742 f.get('format_id'),
743 )
744 formats.sort(key=_formats_key)
59040888 745
96a53167
S
746 def _check_formats(self, formats, video_id):
747 if formats:
748 formats[:] = filter(
749 lambda f: self._is_valid_url(
750 f['url'], video_id,
751 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
752 formats)
753
def _is_valid_url(self, url, video_id, item='video'):
    """Check with a HEAD request whether url can be fetched.

    Returns True if the request succeeds.  If it fails with an HTTP error
    a warning is reported and False is returned; any other failure is
    re-raised.  item names the checked thing in the messages.
    """
    try:
        self._request_webpage(
            HEADRequest(url), video_id,
            'Checking %s URL' % item)
    except ExtractorError as e:
        if not isinstance(e.cause, compat_HTTPError):
            raise
        self.report_warning(
            '%s URL is invalid, skipping' % item, video_id)
        return False
    return True
766
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
773
57c7411f
PH
774 def _proto_relative_url(self, url, scheme=None):
775 if url is None:
776 return url
777 if url.startswith('//'):
778 if scheme is None:
779 scheme = self.http_scheme()
780 return scheme + url
781 else:
782 return url
783
4094b6e3
PH
784 def _sleep(self, timeout, video_id, msg_template=None):
785 if msg_template is None:
f1a9d64e 786 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
787 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
788 self.to_screen(msg)
789 time.sleep(timeout)
790
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
    """Download an f4m manifest and return the sorted list of its formats.

    manifest_url: URL of the f4m manifest.
    preference: value stored as 'preference' of every format.
    f4m_id: optional prefix for the generated format ids.

    <media> nodes are looked up in the f4m 1.0 namespace first and in the
    2.0 namespace if none are found there.  For 1.0 manifests every format
    points at the manifest itself; for 2.0 manifests it points at the
    href (or url) of the media node.
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    for i, media_el in enumerate(media_nodes):
        format_url = manifest_url
        if manifest_version == '2.0':
            media_ref = media_el.attrib.get('href') or media_el.attrib.get('url')
            if not media_ref:
                # Nothing this entry could be downloaded from
                continue
            # Always resolve against the URL of the manifest itself.
            # Reusing the URL computed for the previous media node as the
            # base breaks every reference after one containing a path.
            format_url = compat_urlparse.urljoin(manifest_url, media_ref)
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        formats.append({
            'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])),
            'url': format_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
            'preference': preference,
        })
    self._sort_formats(formats)

    return formats
819
f0b5d6af 820 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
60ca389c
S
821 entry_protocol='m3u8', preference=None,
822 m3u8_id=None):
f0b5d6af 823
704df56d 824 formats = [{
60ca389c 825 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])),
704df56d
PH
826 'url': m3u8_url,
827 'ext': ext,
828 'protocol': 'm3u8',
829 'preference': -1,
830 'resolution': 'multiple',
831 'format_note': 'Quality selection URL',
832 }]
833
f0b5d6af
PH
834 format_url = lambda u: (
835 u
836 if re.match(r'^https?://', u)
837 else compat_urlparse.urljoin(m3u8_url, u))
838
81515ad9
PH
839 m3u8_doc = self._download_webpage(
840 m3u8_url, video_id,
841 note='Downloading m3u8 information',
842 errnote='Failed to download m3u8 information')
704df56d
PH
843 last_info = None
844 kv_rex = re.compile(
845 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
846 for line in m3u8_doc.splitlines():
847 if line.startswith('#EXT-X-STREAM-INF:'):
848 last_info = {}
849 for m in kv_rex.finditer(line):
850 v = m.group('val')
851 if v.startswith('"'):
852 v = v[1:-1]
853 last_info[m.group('key')] = v
854 elif line.startswith('#') or not line.strip():
855 continue
856 else:
daebaab6 857 if last_info is None:
f0b5d6af 858 formats.append({'url': format_url(line)})
3524cc25 859 continue
704df56d 860 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
704df56d 861 f = {
60ca389c 862 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])),
f0b5d6af 863 'url': format_url(line.strip()),
704df56d
PH
864 'tbr': tbr,
865 'ext': ext,
f0b5d6af
PH
866 'protocol': entry_protocol,
867 'preference': preference,
704df56d
PH
868 }
869 codecs = last_info.get('CODECS')
870 if codecs:
9ebf22b7
S
871 # TODO: looks like video codec is not always necessarily goes first
872 va_codecs = codecs.split(',')
873 if va_codecs[0]:
874 f['vcodec'] = va_codecs[0].partition('.')[0]
875 if len(va_codecs) > 1 and va_codecs[1]:
876 f['acodec'] = va_codecs[1].partition('.')[0]
704df56d
PH
877 resolution = last_info.get('RESOLUTION')
878 if resolution:
879 width_str, height_str = resolution.split('x')
880 f['width'] = int(width_str)
881 f['height'] = int(height_str)
882 formats.append(f)
883 last_info = {}
884 self._sort_formats(formats)
885 return formats
886
e89a2aab 887 # TODO: improve extraction
995029a1 888 def _extract_smil_formats(self, smil_url, video_id, fatal=True):
e89a2aab
S
889 smil = self._download_xml(
890 smil_url, video_id, 'Downloading SMIL file',
995029a1
PH
891 'Unable to download SMIL file', fatal=fatal)
892 if smil is False:
893 assert not fatal
894 return []
e89a2aab
S
895
896 base = smil.find('./head/meta').get('base')
897
898 formats = []
899 rtmp_count = 0
900 for video in smil.findall('./body/switch/video'):
901 src = video.get('src')
902 if not src:
903 continue
904 bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
905 width = int_or_none(video.get('width'))
906 height = int_or_none(video.get('height'))
907 proto = video.get('proto')
908 if not proto:
909 if base:
910 if base.startswith('rtmp'):
911 proto = 'rtmp'
912 elif base.startswith('http'):
913 proto = 'http'
914 ext = video.get('ext')
915 if proto == 'm3u8':
916 formats.extend(self._extract_m3u8_formats(src, video_id, ext))
917 elif proto == 'rtmp':
918 rtmp_count += 1
919 streamer = video.get('streamer') or base
920 formats.append({
921 'url': streamer,
922 'play_path': src,
923 'ext': 'flv',
924 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
925 'tbr': bitrate,
926 'width': width,
927 'height': height,
928 })
929 self._sort_formats(formats)
930
931 return formats
932
f4b1c7ad
PH
933 def _live_title(self, name):
934 """ Generate the title for a live video """
935 now = datetime.datetime.now()
936 now_str = now.strftime("%Y-%m-%d %H:%M")
937 return name + ' ' + now_str
938
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """ Convert v with int_or_none, reporting a failed conversion.

    name is only used in the error message. Extra keyword arguments are
    forwarded to int_or_none unchanged. When the result is None an
    ExtractorError is raised if fatal is set, otherwise a warning is
    reported and None is returned.
    """
    # A leftover debugging statement used to print getattr(v, get_attr)
    # here; it wrote to stdout and raised AttributeError whenever v did not
    # have that attribute, so it has been removed.
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
950
def _float(self, v, name, fatal=False, **kwargs):
    """ Convert v with float_or_none, reporting a failed conversion.

    name is only used in the error message. Extra keyword arguments are
    forwarded to float_or_none unchanged. When the result is None an
    ExtractorError is raised if fatal is set, otherwise a warning is
    reported and None is returned.
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed

    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(msg)
    self._downloader.report_warning(msg)
    return parsed
960
def _set_cookie(self, domain, name, value, expire_time=None):
    """ Store a version 0 cookie for domain, valid on path "/", in the downloader's cookie jar """
    # Same values as the positional form, spelled out by parameter name
    cookie = compat_cookiejar.Cookie(
        version=0, name=name, value=value,
        port=None, port_specified=None,
        domain=domain, domain_specified=None, domain_initial_dot=None,
        path='/', path_specified=True,
        secure=False, expires=expire_time, discard='',
        comment=None, comment_url=None, rest=None)
    self._downloader.cookiejar.set_cookie(cookie)
966
05900629
PH
def get_testcases(self, include_onlymatching=False):
    """ Yield the test case dicts declared via _TEST or _TESTS.

    Each yielded dict gets a 'name' key set to the class name with its
    last two characters (the "IE" suffix) removed; the dict is modified
    in place. Cases flagged 'only_matching' are skipped unless
    include_onlymatching is true.
    """
    single = getattr(self, '_TEST', None)
    if single:
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        candidates = [single]
    else:
        candidates = getattr(self, '_TESTS', [])

    ie_name = type(self).__name__[:-len('IE')]
    for case in candidates:
        if not include_onlymatching and case.get('only_matching', False):
            continue
        case['name'] = ie_name
        yield case
980
def is_suitable(self, age_limit):
    """ Test whether the extractor is generally suitable for the given
    age limit (i.e. pornographic sites are not, all others usually are).

    Decided from the extractor's own test cases: suitable as soon as one
    test case is not age-restricted for age_limit, and also when there
    are no test cases at all. For playlist test cases only the first
    playlist entry is looked at. """

    saw_restricted = False
    for case in self.get_testcases(include_onlymatching=False):
        probe = case['playlist'][0] if 'playlist' in case else case
        declared_limit = probe.get('info_dict', {}).get('age_limit')
        if not age_restricted(declared_limit, age_limit):
            return True
        saw_restricted = True
    return not saw_restricted
995
8dbe9899 996
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs of the form _SEARCH_KEY<prefix>:<query>, where
    <prefix> is empty (one result), a positive integer (that many
    results) or "all" (_MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's search URLs"""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor"""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search URL and delegate to _get_n_results"""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp the request to what the service can deliver
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Public read-only view of _SEARCH_KEY"""
        return self._SEARCH_KEY