]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
Merge remote-tracking branch 'duncankl/airmozilla'
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
f1a9d64e
PH
1from __future__ import unicode_literals
2
d6983cb4 3import base64
f4b1c7ad 4import datetime
3ec05685 5import hashlib
3d3538e4 6import json
4094b6e3 7import netrc
d6983cb4
PH
8import os
9import re
10import socket
11import sys
4094b6e3 12import time
267ed0c5 13import xml.etree.ElementTree
d6983cb4 14
8c25f81b 15from ..compat import (
42939b61 16 compat_cookiejar,
96a53167 17 compat_HTTPError,
d6983cb4
PH
18 compat_http_client,
19 compat_urllib_error,
c7deaa4c 20 compat_urllib_parse_urlparse,
f0b5d6af 21 compat_urlparse,
d6983cb4 22 compat_str,
8c25f81b
PH
23)
24from ..utils import (
05900629 25 age_restricted,
d6983cb4
PH
26 clean_html,
27 compiled_regex_type,
28 ExtractorError,
b14f3a4c 29 float_or_none,
31bb8d3f 30 int_or_none,
55b3e45b 31 RegexNotFoundError,
d41e6efc 32 sanitize_filename,
f38de77f 33 unescapeHTML,
d6983cb4 34)
46374a56 35_NO_DEFAULT = object()
d6983cb4 36
dca08720 37
d6983cb4
PH
38class InfoExtractor(object):
39 """Information Extractor class.
40
41 Information extractors are the classes that, given a URL, extract
42 information about the video (or videos) the URL refers to. This
43 information includes the real video URL, the video title, author and
44 others. The information is stored in a dictionary which is then
5d380852 45 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
46 information possibly downloading the video to the file system, among
47 other possible outcomes.
48
fed5d032
PH
49 The type field determines the type of the result.
50 By far the most common value (and the default if _type is missing) is
51 "video", which indicates a single video.
52
53 For a video, the dictionaries must include the following fields:
d6983cb4
PH
54
55 id: Video identifier.
d6983cb4 56 title: Video title, unescaped.
d67b0b15 57
f49d89ee 58 Additionally, it must contain either a formats entry or a url one:
d67b0b15 59
f49d89ee
PH
60 formats: A list of dictionaries for each format available, ordered
61 from worst to best quality.
62
63 Potential fields:
d67b0b15
PH
64 * url Mandatory. The URL of the video file
65 * ext Will be calculated from url if missing
66 * format A human-readable description of the format
67 ("mp4 container with h264/opus").
68 Calculated from the format_id, width, height.
69 and format_note fields if missing.
70 * format_id A short description of the format
5d4f3985
PH
71 ("mp4_h264_opus" or "19").
72 Technically optional, but strongly recommended.
d67b0b15
PH
73 * format_note Additional info about the format
74 ("3D" or "DASH video")
75 * width Width of the video, if known
76 * height Height of the video, if known
f49d89ee 77 * resolution Textual description of width and height
7217e148 78 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
79 * abr Average audio bitrate in KBit/s
80 * acodec Name of the audio codec in use
dd27fd17 81 * asr Audio sampling rate in Hertz
d67b0b15 82 * vbr Average video bitrate in KBit/s
fbb21cf5 83 * fps Frame rate
d67b0b15 84 * vcodec Name of the video codec in use
1394ce65 85 * container Name of the container format
d67b0b15 86 * filesize The number of bytes, if known in advance
9732d77e 87 * filesize_approx An estimate for the number of bytes
d67b0b15 88 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
89 * protocol The protocol that will be used for the actual
90 download, lower-case.
b04b8852
PH
91 "http", "https", "rtsp", "rtmp", "rtmpe",
92 "m3u8", or "m3u8_native".
f49d89ee 93 * preference Order number of this format. If this field is
08d13955 94 present and not None, the formats get sorted
38d63d84 95 by this field, regardless of all other values.
f49d89ee
PH
96 -1 for default (order by other properties),
97 -2 or smaller for less than default.
e65566a9
PH
98 < -1000 to hide the format (if there is
99 another one which is strictly better)
aff2f4f4
PH
100 * language_preference Is this in the correct requested
101 language?
102 10 if it's what the URL is about,
103 -1 for default (don't know),
104 -10 otherwise, other values reserved for now.
5d73273f
PH
105 * quality Order number of the video quality of this
106 format, irrespective of the file format.
107 -1 for default (order by other properties),
108 -2 or smaller for less than default.
c64ed2a3
PH
109 * source_preference Order number for this video source
110 (quality takes higher priority)
111 -1 for default (order by other properties),
112 -2 or smaller for less than default.
d769be6c
PH
113 * http_method HTTP method to use for the download.
114 * http_headers A dictionary of additional HTTP headers
115 to add to the request.
116 * http_post_data Additional data to send with a POST
117 request.
6271f1ca 118 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
119 video's pixels are not square.
120 width : height ratio as float.
121 * no_resume The server does not support resuming the
122 (HTTP or RTMP) download. Boolean.
123
c0ba0f48 124 url: Final video URL.
d6983cb4 125 ext: Video filename extension.
d67b0b15
PH
126 format: The video format, defaults to ext (used for --get-format)
127 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 128
d6983cb4
PH
129 The following fields are optional:
130
f5e43bc6 131 alt_title: A secondary title of the video.
0afef30b
PH
132 display_id An alternative identifier for the video, not necessarily
133 unique, but available before title. Typically, id is
134 something like "4234987", title "Dancing naked mole rats",
135 and display_id "dancing-naked-mole-rats"
d5519808 136 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 137 * "id" (optional, string) - Thumbnail format ID
d5519808 138 * "url"
cfb56d1a 139 * "preference" (optional, int) - quality of the image
d5519808
PH
140 * "width" (optional, int)
141 * "height" (optional, int)
142 * "resolution" (optional, string "{width}x{height}",
143 deprecated)
d6983cb4 144 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 145 description: Full video description.
d6983cb4 146 uploader: Full name of the video uploader.
9bb8e0a3 147 creator: The main artist who created the video.
955c4514 148 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 149 upload_date: Video upload date (YYYYMMDD).
955c4514 150 If not explicitly set, calculated from timestamp.
d6983cb4 151 uploader_id: Nickname or id of the video uploader.
da9ec3b9 152 location: Physical location where the video was filmed.
a504ced0
JMF
153 subtitles: The available subtitles as a dictionary in the format
154 {language: subformats}. "subformats" is a list sorted from
155 lower to higher preference, each element is a dictionary
156 with the "ext" entry and one of:
157 * "data": The subtitles file contents
158 * "url": A url pointing to the subtitles file
360e1ca5
JMF
159 automatic_captions: Like 'subtitles', used by the YoutubeIE for
160 automatically generated captions
c0ba0f48 161 duration: Length of the video in seconds, as an integer.
f3d29461 162 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
163 like_count: Number of positive ratings of the video
164 dislike_count: Number of negative ratings of the video
2d30521a 165 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 166 comment_count: Number of comments on the video
dd622d7c
PH
167 comments: A list of comments, each with one or more of the following
168 properties (all but one of text or html optional):
169 * "author" - human-readable name of the comment author
170 * "author_id" - user ID of the comment author
171 * "id" - Comment ID
172 * "html" - Comment as HTML
173 * "text" - Plain text of the comment
174 * "timestamp" - UNIX timestamp of comment
175 * "parent" - ID of the comment this one is replying to.
176 Set to "root" to indicate that this is a
177 comment to the original video.
8dbe9899 178 age_limit: Age restriction for the video, as an integer (years)
9103bbc5
JMF
179 webpage_url: The url to the video webpage, if given to youtube-dl it
180 should allow to get the same result again. (It will be set
181 by YoutubeDL if it's missing)
ad3bc6ac
PH
182 categories: A list of categories that the video falls in, for example
183 ["Sports", "Berlin"]
7267bd53
PH
184 is_live: True, False, or None (=unknown). Whether this video is a
185 live stream that goes on instead of a fixed-length video.
d6983cb4 186
deefc05b 187 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 188
d838b1bd
PH
189 Unless mentioned otherwise, None is equivalent to absence of information.
190
fed5d032
PH
191
192 _type "playlist" indicates multiple videos.
b82f815f
PH
193 There must be a key "entries", which is a list, an iterable, or a PagedList
194 object, each element of which is a valid dictionary by this specification.
fed5d032
PH
195
196 Additionally, playlists can have "title" and "id" attributes with the same
197 semantics as videos (see above).
198
199
200 _type "multi_video" indicates that there are multiple videos that
201 form a single show, for example multiple acts of an opera or TV episode.
202 It must have an entries key like a playlist and contain all the keys
203 required for a video at the same time.
204
205
206 _type "url" indicates that the video must be extracted from another
207 location, possibly by a different extractor. Its only required key is:
208 "url" - the next URL to extract.
f58766ce
PH
209 The key "ie_key" can be set to the class name (minus the trailing "IE",
210 e.g. "Youtube") if the extractor class is known in advance.
211 Additionally, the dictionary may have any properties of the resolved entity
212 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
213 known ahead of time.
214
215
216 _type "url_transparent" entities have the same specification as "url", but
217 indicate that the given additional information is more precise than the one
218 associated with the resolved URL.
219 This is useful when a site employs a video service that hosts the video and
220 its technical metadata, but that video service does not embed a useful
221 title, description etc.
222
223
d6983cb4
PH
224 Subclasses of this one should re-define the _real_initialize() and
225 _real_extract() methods and define a _VALID_URL regexp.
226 Probably, they should also be added to the list of extractors.
227
d6983cb4
PH
228 Finally, the _WORKING attribute should be set to False for broken IEs
229 in order to warn the users and skip the tests.
230 """
231
232 _ready = False
233 _downloader = None
234 _WORKING = True
235
236 def __init__(self, downloader=None):
237 """Constructor. Receives an optional downloader."""
238 self._ready = False
239 self.set_downloader(downloader)
240
241 @classmethod
242 def suitable(cls, url):
243 """Receives a URL and returns True if suitable for this IE."""
79cb2577
PH
244
245 # This does not use has/getattr intentionally - we want to know whether
246 # we have cached the regexp for *this* class, whereas getattr would also
247 # match the superclass
248 if '_VALID_URL_RE' not in cls.__dict__:
249 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
250 return cls._VALID_URL_RE.match(url) is not None
d6983cb4 251
ed9266db
PH
252 @classmethod
253 def _match_id(cls, url):
254 if '_VALID_URL_RE' not in cls.__dict__:
255 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
256 m = cls._VALID_URL_RE.match(url)
257 assert m
258 return m.group('id')
259
d6983cb4
PH
260 @classmethod
261 def working(cls):
262 """Getter method for _WORKING."""
263 return cls._WORKING
264
265 def initialize(self):
266 """Initializes an instance (authentication, etc)."""
267 if not self._ready:
268 self._real_initialize()
269 self._ready = True
270
271 def extract(self, url):
272 """Extracts URL information and returns it in list of dicts."""
3a5bcd03
PH
273 try:
274 self.initialize()
275 return self._real_extract(url)
276 except ExtractorError:
277 raise
278 except compat_http_client.IncompleteRead as e:
279 raise ExtractorError('A network error has occured.', cause=e, expected=True)
9650885b 280 except (KeyError, StopIteration) as e:
3a5bcd03 281 raise ExtractorError('An extractor error has occured.', cause=e)
d6983cb4
PH
282
283 def set_downloader(self, downloader):
284 """Sets the downloader for this IE."""
285 self._downloader = downloader
286
287 def _real_initialize(self):
288 """Real initialization process. Redefine in subclasses."""
289 pass
290
291 def _real_extract(self, url):
292 """Real extraction process. Redefine in subclasses."""
293 pass
294
56c73665
JMF
295 @classmethod
296 def ie_key(cls):
297 """A string for getting the InfoExtractor with get_info_extractor"""
298 return cls.__name__[:-2]
299
d6983cb4
PH
300 @property
301 def IE_NAME(self):
302 return type(self).__name__[:-2]
303
7cc3570e 304 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4
PH
305 """ Returns the response handle """
306 if note is None:
307 self.report_download_webpage(video_id)
308 elif note is not False:
7cc3570e 309 if video_id is None:
f1a9d64e 310 self.to_screen('%s' % (note,))
7cc3570e 311 else:
f1a9d64e 312 self.to_screen('%s: %s' % (video_id, note))
d6983cb4 313 try:
dca08720 314 return self._downloader.urlopen(url_or_request)
d6983cb4 315 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3
PH
316 if errnote is False:
317 return False
d6983cb4 318 if errnote is None:
f1a9d64e
PH
319 errnote = 'Unable to download webpage'
320 errmsg = '%s: %s' % (errnote, compat_str(err))
7cc3570e
PH
321 if fatal:
322 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
323 else:
324 self._downloader.report_warning(errmsg)
325 return False
d6983cb4 326
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """Download a page and return the tuple (page content as string, URL handle).

    Returns False when the request failed and fatal is false.
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request, _, _ = url_or_request.partition('#')

    handle = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if handle is False:
        # _request_webpage only returns False for non-fatal requests
        assert not fatal
        return False
    page = self._webpage_read_content(handle, url_or_request, video_id, note, errnote, fatal)
    return (page, handle)
339
4e262a88 340 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
d6983cb4 341 content_type = urlh.headers.get('Content-Type', '')
f143d86a 342 webpage_bytes = urlh.read()
4e262a88
PH
343 if prefix is not None:
344 webpage_bytes = prefix + webpage_bytes
d6983cb4
PH
345 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
346 if m:
347 encoding = m.group(1)
348 else:
0d75ae2c 349 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
350 webpage_bytes[:1024])
351 if m:
352 encoding = m.group(1).decode('ascii')
b60016e8
PH
353 elif webpage_bytes.startswith(b'\xff\xfe'):
354 encoding = 'utf-16'
f143d86a
PH
355 else:
356 encoding = 'utf-8'
d6983cb4
PH
357 if self._downloader.params.get('dump_intermediate_pages', False):
358 try:
359 url = url_or_request.get_full_url()
360 except AttributeError:
361 url = url_or_request
f1a9d64e 362 self.to_screen('Dumping request to ' + url)
d6983cb4
PH
363 dump = base64.b64encode(webpage_bytes).decode('ascii')
364 self._downloader.to_screen(dump)
d41e6efc
PH
365 if self._downloader.params.get('write_pages', False):
366 try:
367 url = url_or_request.get_full_url()
368 except AttributeError:
369 url = url_or_request
5afa7f8b 370 basen = '%s_%s' % (video_id, url)
c1bce22f 371 if len(basen) > 240:
f1a9d64e 372 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
c1bce22f
PH
373 basen = basen[:240 - len(h)] + h
374 raw_filename = basen + '.dump'
d41e6efc 375 filename = sanitize_filename(raw_filename, restricted=True)
f1a9d64e 376 self.to_screen('Saving request to ' + filename)
5f58165d
S
377 # Working around MAX_PATH limitation on Windows (see
378 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
379 if os.name == 'nt':
380 absfilepath = os.path.abspath(filename)
381 if len(absfilepath) > 259:
382 filename = '\\\\?\\' + absfilepath
d41e6efc
PH
383 with open(filename, 'wb') as outf:
384 outf.write(webpage_bytes)
385
ec0fafbb
AA
386 try:
387 content = webpage_bytes.decode(encoding, 'replace')
388 except LookupError:
389 content = webpage_bytes.decode('utf-8', 'replace')
2410c43d 390
f1a9d64e
PH
391 if ('<title>Access to this site is blocked</title>' in content and
392 'Websense' in content[:512]):
393 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
2410c43d
PH
394 blocked_iframe = self._html_search_regex(
395 r'<iframe src="([^"]+)"', content,
f1a9d64e 396 'Websense information URL', default=None)
2410c43d 397 if blocked_iframe:
f1a9d64e 398 msg += ' Visit %s for more details' % blocked_iframe
2410c43d 399 raise ExtractorError(msg, expected=True)
77b2986b
PH
400 if '<title>The URL you requested has been blocked</title>' in content[:512]:
401 msg = (
402 'Access to this webpage has been blocked by Indian censorship. '
403 'Use a VPN or proxy server (with --proxy) to route around it.')
404 block_msg = self._html_search_regex(
405 r'</h1><p>(.*?)</p>',
406 content, 'block message', default=None)
407 if block_msg:
408 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
409 raise ExtractorError(msg, expected=True)
2410c43d 410
23be51d8 411 return content
d6983cb4 412
995ad69c 413 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
d6983cb4 414 """ Returns the data of the page as a string """
995ad69c
TF
415 success = False
416 try_count = 0
417 while success is False:
418 try:
419 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
420 success = True
421 except compat_http_client.IncompleteRead as e:
422 try_count += 1
423 if try_count >= tries:
424 raise e
425 self._sleep(timeout, video_id)
7cc3570e
PH
426 if res is False:
427 return res
428 else:
429 content, _ = res
430 return content
d6983cb4 431
2a275ab0 432 def _download_xml(self, url_or_request, video_id,
f1a9d64e 433 note='Downloading XML', errnote='Unable to download XML',
28746fbd 434 transform_source=None, fatal=True):
267ed0c5 435 """Return the xml as an xml.etree.ElementTree.Element"""
28746fbd
PH
436 xml_string = self._download_webpage(
437 url_or_request, video_id, note, errnote, fatal=fatal)
438 if xml_string is False:
439 return xml_string
e2b38da9
PH
440 if transform_source:
441 xml_string = transform_source(xml_string)
267ed0c5
JMF
442 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
443
3d3538e4 444 def _download_json(self, url_or_request, video_id,
f1a9d64e
PH
445 note='Downloading JSON metadata',
446 errnote='Unable to download JSON metadata',
b090af59
PH
447 transform_source=None,
448 fatal=True):
449 json_string = self._download_webpage(
450 url_or_request, video_id, note, errnote, fatal=fatal)
451 if (not fatal) and json_string is False:
452 return None
ebb64199
TF
453 return self._parse_json(
454 json_string, video_id, transform_source=transform_source, fatal=fatal)
455
456 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
457 if transform_source:
458 json_string = transform_source(json_string)
3d3538e4
PH
459 try:
460 return json.loads(json_string)
461 except ValueError as ve:
e7b6d122
PH
462 errmsg = '%s: Failed to parse JSON ' % video_id
463 if fatal:
464 raise ExtractorError(errmsg, cause=ve)
465 else:
466 self.report_warning(errmsg + str(ve))
3d3538e4 467
f45f96f8 468 def report_warning(self, msg, video_id=None):
f1a9d64e 469 idstr = '' if video_id is None else '%s: ' % video_id
f45f96f8 470 self._downloader.report_warning(
f1a9d64e 471 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
f45f96f8 472
d6983cb4
PH
473 def to_screen(self, msg):
474 """Print msg to screen, prefixing it with '[ie_name]'"""
f1a9d64e 475 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
d6983cb4
PH
476
477 def report_extraction(self, id_or_name):
478 """Report information extraction."""
f1a9d64e 479 self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4
PH
480
481 def report_download_webpage(self, video_id):
482 """Report webpage download."""
f1a9d64e 483 self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4
PH
484
485 def report_age_confirmation(self):
486 """Report attempt to confirm age."""
f1a9d64e 487 self.to_screen('Confirming age')
d6983cb4 488
fc79158d
JMF
489 def report_login(self):
490 """Report attempt to log in."""
f1a9d64e 491 self.to_screen('Logging in')
fc79158d 492
5f6a1245 493 # Methods for following #608
c0d0b01f
JMF
494 @staticmethod
495 def url_result(url, ie=None, video_id=None):
d6983cb4 496 """Returns a url that points to a page that should be processed"""
5f6a1245 497 # TODO: ie should be the class used for getting the info
d6983cb4
PH
498 video_info = {'_type': 'url',
499 'url': url,
500 'ie_key': ie}
7012b23c
PH
501 if video_id is not None:
502 video_info['id'] = video_id
d6983cb4 503 return video_info
5f6a1245 504
c0d0b01f 505 @staticmethod
acf5cbfe 506 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
d6983cb4
PH
507 """Returns a playlist"""
508 video_info = {'_type': 'playlist',
509 'entries': entries}
510 if playlist_id:
511 video_info['id'] = playlist_id
512 if playlist_title:
513 video_info['title'] = playlist_title
acf5cbfe
S
514 if playlist_description:
515 video_info['description'] = playlist_description
d6983cb4
PH
516 return video_info
517
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    pattern -- a string, a compiled regex, or an iterable of those; the
               patterns are tried in order and the first match wins. An
               empty iterable counts as "no match".
    group   -- group to return from the match; None returns the first
               group that is not None.
    default -- value returned when nothing matches (takes precedence over
               fatal).
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        patterns = [pattern]
    else:
        patterns = pattern

    # Start from "no match" so that an empty list of patterns falls through
    # to the default/fatal handling instead of leaving mobj unbound.
    mobj = None
    for p in patterns:
        mobj = re.search(p, string, flags)
        if mobj:
            break

    # Colorize the field name when writing to a terminal that supports it
    if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        else:
            return mobj.group(group)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s; '
                                        'please report this issue on http://yt-dl.org/bug' % _name)
        return None
552
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but the result is passed through clean_html() and
    stripped. Falsy results (None, '', a falsy default) are returned as is.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not res:
        return res
    return clean_html(res).strip()
562
fc79158d
JMF
563 def _get_login_info(self):
564 """
565 Get the the login info as (username, password)
566 It will look in the netrc file using the _NETRC_MACHINE value
567 If there's no info available, return (None, None)
568 """
569 if self._downloader is None:
570 return (None, None)
571
572 username = None
573 password = None
574 downloader_params = self._downloader.params
575
576 # Attempt to use provided username and password or .netrc data
577 if downloader_params.get('username', None) is not None:
578 username = downloader_params['username']
579 password = downloader_params['password']
580 elif downloader_params.get('usenetrc', False):
581 try:
582 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
583 if info is not None:
584 username = info[0]
585 password = info[2]
586 else:
587 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
588 except (IOError, netrc.NetrcParseError) as err:
f1a9d64e 589 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
5f6a1245 590
fc79158d
JMF
591 return (username, password)
592
83317f69 593 def _get_tfa_info(self):
594 """
595 Get the two-factor authentication info
596 TODO - asking the user will be required for sms/phone verify
597 currently just uses the command line option
598 If there's no info available, return None
599 """
600 if self._downloader is None:
83317f69 601 return None
602 downloader_params = self._downloader.params
603
604 if downloader_params.get('twofactor', None) is not None:
605 return downloader_params['twofactor']
606
83317f69 607 return None
608
46720279
JMF
609 # Helper functions for extracting OpenGraph info
610 @staticmethod
ab2d5247 611 def _og_regexes(prop):
c1206423 612 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
9887c9b2 613 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
78fb87b2 614 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 615 return [
78fb87b2
JMF
616 template % (property_re, content_re),
617 template % (content_re, property_re),
ab2d5247 618 ]
46720279 619
def _og_search_property(self, prop, html, name=None, **kargs):
    """Return the HTML-unescaped content of the OpenGraph property prop.

    name defaults to 'OpenGraph <prop>'; extra keyword arguments go to
    _search_regex(). Returns None when the search yields None.
    """
    field_name = 'OpenGraph %s' % prop if name is None else name
    escaped = self._search_regex(
        self._og_regexes(prop), html, field_name, flags=re.DOTALL, **kargs)
    return None if escaped is None else unescapeHTML(escaped)
46720279
JMF
627
628 def _og_search_thumbnail(self, html, **kargs):
f1a9d64e 629 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
46720279
JMF
630
631 def _og_search_description(self, html, **kargs):
632 return self._og_search_property('description', html, fatal=False, **kargs)
633
634 def _og_search_title(self, html, **kargs):
635 return self._og_search_property('title', html, **kargs)
636
8ffa13e0 637 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
638 regexes = self._og_regexes('video') + self._og_regexes('video:url')
639 if secure:
640 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 641 return self._html_search_regex(regexes, html, name, **kargs)
46720279 642
78338f71
JMF
643 def _og_search_url(self, html, **kargs):
644 return self._og_search_property('url', html, **kargs)
645
40c696e5 646 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
59040888
PH
647 if display_name is None:
648 display_name = name
649 return self._html_search_regex(
6c6f1408 650 r'''(?isx)<meta
711ede6e 651 (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
bec22481 652 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
711ede6e 653 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
654
655 def _dc_search_uploader(self, html):
656 return self._html_search_meta('dc.creator', html, 'uploader')
657
8dbe9899
PH
658 def _rta_search(self, html):
659 # See http://www.rtalabel.org/index.php?content=howtofaq#single
660 if re.search(r'(?ix)<meta\s+name="rating"\s+'
661 r' content="RTA-5042-1996-1400-1577-RTA"',
662 html):
663 return 18
664 return 0
665
59040888
PH
666 def _media_rating_search(self, html):
667 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
668 rating = self._html_search_meta('rating', html)
669
670 if not rating:
671 return None
672
673 RATING_TABLE = {
674 'safe for kids': 0,
675 'general': 8,
676 '14 years': 14,
677 'mature': 17,
678 'restricted': 19,
679 }
680 return RATING_TABLE.get(rating.lower(), None)
681
69319969 682 def _family_friendly_search(self, html):
6ca7732d 683 # See http://schema.org/VideoObject
69319969
NJ
684 family_friendly = self._html_search_meta('isFamilyFriendly', html)
685
686 if not family_friendly:
687 return None
688
689 RATING_TABLE = {
690 '1': 0,
691 'true': 0,
692 '0': 18,
693 'false': 18,
694 }
695 return RATING_TABLE.get(family_friendly.lower(), None)
696
0c708f11
JMF
697 def _twitter_search_player(self, html):
698 return self._html_search_meta('twitter:player', html,
9e1a5b84 699 'twitter card player')
0c708f11 700
4bcc7bd1 701 def _sort_formats(self, formats):
7e8caf30 702 if not formats:
f1a9d64e 703 raise ExtractorError('No video formats found')
7e8caf30 704
4bcc7bd1 705 def _formats_key(f):
e6812ac9
PH
706 # TODO remove the following workaround
707 from ..utils import determine_ext
708 if not f.get('ext') and 'url' in f:
709 f['ext'] = determine_ext(f['url'])
710
4bcc7bd1
PH
711 preference = f.get('preference')
712 if preference is None:
c7deaa4c
PH
713 proto = f.get('protocol')
714 if proto is None:
715 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
716
717 preference = 0 if proto in ['http', 'https'] else -0.1
4bcc7bd1
PH
718 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
719 preference -= 0.5
720
721 if f.get('vcodec') == 'none': # audio only
722 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 723 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
4bcc7bd1 724 else:
f1a9d64e 725 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
4bcc7bd1
PH
726 ext_preference = 0
727 try:
728 audio_ext_preference = ORDER.index(f['ext'])
729 except ValueError:
730 audio_ext_preference = -1
731 else:
732 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 733 ORDER = ['flv', 'mp4', 'webm']
4bcc7bd1 734 else:
f1a9d64e 735 ORDER = ['webm', 'flv', 'mp4']
4bcc7bd1
PH
736 try:
737 ext_preference = ORDER.index(f['ext'])
738 except ValueError:
739 ext_preference = -1
740 audio_ext_preference = 0
741
742 return (
743 preference,
aff2f4f4 744 f.get('language_preference') if f.get('language_preference') is not None else -1,
5d73273f 745 f.get('quality') if f.get('quality') is not None else -1,
9933b574 746 f.get('tbr') if f.get('tbr') is not None else -1,
03cd72b0 747 f.get('filesize') if f.get('filesize') is not None else -1,
4bcc7bd1 748 f.get('vbr') if f.get('vbr') is not None else -1,
1a6373ef
PH
749 f.get('height') if f.get('height') is not None else -1,
750 f.get('width') if f.get('width') is not None else -1,
1e1896f2 751 ext_preference,
4bcc7bd1
PH
752 f.get('abr') if f.get('abr') is not None else -1,
753 audio_ext_preference,
2c8e03d9 754 f.get('fps') if f.get('fps') is not None else -1,
9732d77e 755 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
c64ed2a3 756 f.get('source_preference') if f.get('source_preference') is not None else -1,
4bcc7bd1
PH
757 f.get('format_id'),
758 )
759 formats.sort(key=_formats_key)
59040888 760
96a53167
S
761 def _check_formats(self, formats, video_id):
762 if formats:
763 formats[:] = filter(
764 lambda f: self._is_valid_url(
765 f['url'], video_id,
766 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
767 formats)
768
769 def _is_valid_url(self, url, video_id, item='video'):
770 try:
4069766c 771 self._request_webpage(url, video_id, 'Checking %s URL' % item)
96a53167
S
772 return True
773 except ExtractorError as e:
774 if isinstance(e.cause, compat_HTTPError):
775 self.report_warning(
776 '%s URL is invalid, skipping' % item, video_id)
777 return False
778 raise
779
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    # 'prefer_insecure' is read from the downloader's params; unset means https
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
786
57c7411f
PH
787 def _proto_relative_url(self, url, scheme=None):
788 if url is None:
789 return url
790 if url.startswith('//'):
791 if scheme is None:
792 scheme = self.http_scheme()
793 return scheme + url
794 else:
795 return url
796
4094b6e3
PH
797 def _sleep(self, timeout, video_id, msg_template=None):
798 if msg_template is None:
f1a9d64e 799 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
800 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
801 self.to_screen(msg)
802 time.sleep(timeout)
803
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
    """Download an f4m manifest and build one format dict per <media> node.

    Nodes are looked up in the f4m 1.0 namespace first and in the 2.0
    namespace when none are found.  For 1.0 manifests every format points
    at the manifest URL itself; for 2.0 manifests each format points at
    the node's 'href' (or 'url') attribute resolved against the manifest
    URL.  2.0 nodes carrying neither attribute are skipped.

    format_id is 'f4m-<bitrate>' (or 'f4m-<node index>' when the node has
    no bitrate), prefixed with '<f4m_id>-' when f4m_id is given.

    Returns the list of formats after passing it to self._sort_formats().
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    for i, media_el in enumerate(media_nodes):
        # Resolve every node against the ORIGINAL manifest URL; reusing a
        # previously resolved media URL as the base would compound paths.
        format_url = manifest_url
        if manifest_version == '2.0':
            media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
            if not media_url:
                # Nothing to point the format at
                continue
            format_url = compat_urlparse.urljoin(manifest_url, media_url)
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        formats.append({
            'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])),
            'url': format_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
            'preference': preference,
        })
    self._sort_formats(formats)

    return formats
832
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None,
                          m3u8_id=None):
    """Download an m3u8 playlist and build a format dict per listed URI.

    The returned list always starts with a 'meta' format pointing at the
    playlist itself (preference one below the requested one, or -1).
    Every URI line that follows an #EXT-X-STREAM-INF tag becomes a format
    carrying the tag's BANDWIDTH (as tbr, divided by 1000), CODECS and
    RESOLUTION; the attributes of a preceding #EXT-X-MEDIA tag are
    attached under 'm3u8_media'.  URI lines seen before any
    #EXT-X-STREAM-INF tag are added as bare {'url': ...} formats.
    Relative URIs are resolved against m3u8_url.

    format_id is 'm3u8-<tbr>' (or 'm3u8-<position>' without a bitrate),
    prefixed with '<m3u8_id>-' when m3u8_id is given.

    Returns the list of formats after passing it to self._sort_formats().
    """
    formats = [{
        'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': preference - 1 if preference else -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    def format_url(u):
        # Absolute http(s) URIs are used as is, everything else is
        # resolved relative to the playlist URL
        if re.match(r'^https?://', u):
            return u
        return compat_urlparse.urljoin(m3u8_url, u)

    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')

    def parse_attributes(tag_line):
        # Turn a tag's KEY=VALUE attribute list into a dict, dropping the
        # quotes around quoted values
        attributes = {}
        for m in kv_rex.finditer(tag_line):
            v = m.group('val')
            if v.startswith('"'):
                v = v[1:-1]
            attributes[m.group('key')] = v
        return attributes

    m3u8_doc = self._download_webpage(
        m3u8_url, video_id,
        note='Downloading m3u8 information',
        errnote='Failed to download m3u8 information')
    last_info = None
    last_media = None
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = parse_attributes(line)
        elif line.startswith('#EXT-X-MEDIA:'):
            last_media = parse_attributes(line)
        elif line.startswith('#') or not line.strip():
            continue
        else:
            if last_info is None:
                # Strip here as well, so both branches treat URI lines
                # with surrounding whitespace the same way
                formats.append({'url': format_url(line.strip())})
                continue
            tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
            f = {
                'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])),
                'url': format_url(line.strip()),
                'tbr': tbr,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }
            codecs = last_info.get('CODECS')
            if codecs:
                # TODO: the video codec does not necessarily come first
                va_codecs = codecs.split(',')
                if va_codecs[0]:
                    f['vcodec'] = va_codecs[0].partition('.')[0]
                if len(va_codecs) > 1 and va_codecs[1]:
                    f['acodec'] = va_codecs[1].partition('.')[0]
            resolution = last_info.get('RESOLUTION')
            if resolution:
                width_str, height_str = resolution.split('x')
                f['width'] = int(width_str)
                f['height'] = int(height_str)
            if last_media is not None:
                f['m3u8_media'] = last_media
                last_media = None
            formats.append(f)
            # Attributes apply to a single URI only
            last_info = {}
    self._sort_formats(formats)
    return formats
910
e89a2aab 911 # TODO: improve extraction
def _extract_smil_formats(self, smil_url, video_id, fatal=True):
    """Download a SMIL file and build formats from its <video> entries.

    Only ./body/switch/video elements with a 'src' attribute are
    considered.  The protocol comes from the element's 'proto' attribute
    or, failing that, from the prefix ('rtmp' / 'http') of the 'base'
    attribute of ./head/meta.  'm3u8' entries are expanded through
    self._extract_m3u8_formats(), 'rtmp' entries become a single format;
    entries with any other protocol are currently ignored.

    Returns the list of formats after passing it to self._sort_formats(),
    or an empty list when the download failed and fatal is false.
    """
    smil = self._download_xml(
        smil_url, video_id, 'Downloading SMIL file',
        'Unable to download SMIL file', fatal=fatal)
    if smil is False:
        assert not fatal
        return []

    # A SMIL document without <head><meta> simply has no base URL;
    # find() returns None in that case.
    meta = smil.find('./head/meta')
    base = meta.get('base') if meta is not None else None

    formats = []
    rtmp_count = 0
    for video in smil.findall('./body/switch/video'):
        src = video.get('src')
        if not src:
            continue
        bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
        width = int_or_none(video.get('width'))
        height = int_or_none(video.get('height'))
        proto = video.get('proto')
        if not proto:
            if base:
                if base.startswith('rtmp'):
                    proto = 'rtmp'
                elif base.startswith('http'):
                    proto = 'http'
        ext = video.get('ext')
        if proto == 'm3u8':
            formats.extend(self._extract_m3u8_formats(src, video_id, ext))
        elif proto == 'rtmp':
            rtmp_count += 1
            streamer = video.get('streamer') or base
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'width': width,
                'height': height,
            })
    self._sort_formats(formats)

    return formats
956
f4b1c7ad
PH
957 def _live_title(self, name):
958 """ Generate the title for a live video """
959 now = datetime.datetime.now()
960 now_str = now.strftime("%Y-%m-%d %H:%M")
961 return name + ' ' + now_str
962
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """Convert v with int_or_none(), reporting values that fail to parse.

    Extra keyword arguments are forwarded to int_or_none().  When the
    conversion yields None, an ExtractorError mentioning name is raised
    if fatal is true; otherwise a warning is reported through the
    downloader and None is returned.
    """
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
974
def _float(self, v, name, fatal=False, **kwargs):
    """Convert v with float_or_none(), reporting values that fail to parse.

    Extra keyword arguments are forwarded to float_or_none().  When the
    conversion yields None, an ExtractorError mentioning name is raised
    if fatal is true; otherwise a warning is reported through the
    downloader and None is returned.
    """
    res = float_or_none(v, **kwargs)
    if res is not None:
        return res
    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(msg)
    self._downloader.report_warning(msg)
    return res
984
def _set_cookie(self, domain, name, value, expire_time=None):
    """Store a cookie for domain in the downloader's cookie jar.

    The cookie is created for path '/', is not marked secure and expires
    at expire_time (None means no expiry value is set).
    """
    cookie = compat_cookiejar.Cookie(
        version=0, name=name, value=value,
        port=None, port_specified=None,
        domain=domain, domain_specified=None, domain_initial_dot=None,
        path='/', path_specified=True,
        secure=False, expires=expire_time, discard='',
        comment=None, comment_url=None, rest=None)
    self._downloader.cookiejar.set_cookie(cookie)
990
05900629
PH
def get_testcases(self, include_onlymatching=False):
    """Yield the test case dicts declared on this extractor.

    Test cases come from the _TEST attribute (a single dict) or, when that
    is missing or empty, from _TESTS (a list).  Declaring both is an
    error.  Cases flagged 'only_matching' are skipped unless
    include_onlymatching is true.  Each yielded dict gets a 'name' key set
    to the class name without its trailing two characters ('IE').
    """
    class_name = type(self).__name__
    single = getattr(self, '_TEST', None)
    if single:
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % class_name
        candidates = [single]
    else:
        candidates = getattr(self, '_TESTS', [])
    for case in candidates:
        if case.get('only_matching', False) and not include_onlymatching:
            continue
        case['name'] = class_name[:-len('IE')]
        yield case
1004
def is_suitable(self, age_limit):
    """ Test whether the extractor is generally suitable for the given
    age limit (i.e. pornographic sites are not, all others usually are) """

    # The decision is based on the age_limit declared in the test cases:
    # one unrestricted test case is enough to accept the extractor, and an
    # extractor without any test case is accepted as well.
    saw_restricted_case = False
    for testcase in self.get_testcases(include_onlymatching=False):
        if 'playlist' in testcase:
            # For playlist tests only the first entry is looked at
            testcase = testcase['playlist'][0]
        declared_limit = testcase.get('info_dict', {}).get('age_limit')
        if not age_restricted(declared_limit, age_limit):
            return True
        saw_restricted_case = True
    return not saw_restricted_case
1019
def extract_subtitles(self, *args, **kwargs):
    """Return self._get_subtitles(...) if subtitles were requested.

    Subtitles count as requested when the downloader's 'writesubtitles'
    or 'listsubtitles' parameter is set; otherwise an empty dict is
    returned and _get_subtitles() is not called.
    """
    wanted = (
        self._downloader.params.get('writesubtitles', False) or
        self._downloader.params.get('listsubtitles'))
    if not wanted:
        return {}
    return self._get_subtitles(*args, **kwargs)
a504ced0
JMF
1025
1026 def _get_subtitles(self, *args, **kwargs):
1027 raise NotImplementedError("This method must be implemented by subclasses")
1028
def extract_automatic_captions(self, *args, **kwargs):
    """Return self._get_automatic_captions(...) if captions were requested.

    Captions count as requested when the downloader's 'writeautomaticsub'
    or 'listsubtitles' parameter is set; otherwise an empty dict is
    returned and _get_automatic_captions() is not called.
    """
    wanted = (
        self._downloader.params.get('writeautomaticsub', False) or
        self._downloader.params.get('listsubtitles'))
    if not wanted:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
360e1ca5
JMF
1034
1035 def _get_automatic_captions(self, *args, **kwargs):
1036 raise NotImplementedError("This method must be implemented by subclasses")
1037
8dbe9899 1038
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors that handle paged search queries.
    Queries look like _SEARCH_KEY<prefix>:<query>, where <prefix> is empty
    (one result), a positive number (that many results) or "all"
    (_MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's queries."""
        template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the query and delegate to _get_n_results().

        Raises ExtractorError for a query that does not match the expected
        format.  A requested count above _MAX_RESULTS is reported as a
        warning and clamped to _MAX_RESULTS.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix'), mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = "This method must be implemented by subclasses"
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The search key (_SEARCH_KEY) of this extractor."""
        return self._SEARCH_KEY