]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
[bliptv] Convert to new subtitles system
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
f1a9d64e
PH
1from __future__ import unicode_literals
2
d6983cb4 3import base64
f4b1c7ad 4import datetime
3ec05685 5import hashlib
3d3538e4 6import json
4094b6e3 7import netrc
d6983cb4
PH
8import os
9import re
10import socket
11import sys
4094b6e3 12import time
267ed0c5 13import xml.etree.ElementTree
d6983cb4 14
8c25f81b 15from ..compat import (
42939b61 16 compat_cookiejar,
96a53167 17 compat_HTTPError,
d6983cb4
PH
18 compat_http_client,
19 compat_urllib_error,
c7deaa4c 20 compat_urllib_parse_urlparse,
f0b5d6af 21 compat_urlparse,
d6983cb4 22 compat_str,
8c25f81b
PH
23)
24from ..utils import (
05900629 25 age_restricted,
d6983cb4
PH
26 clean_html,
27 compiled_regex_type,
28 ExtractorError,
b14f3a4c 29 float_or_none,
96a53167 30 HEADRequest,
31bb8d3f 31 int_or_none,
55b3e45b 32 RegexNotFoundError,
d41e6efc 33 sanitize_filename,
f38de77f 34 unescapeHTML,
d6983cb4 35)
46374a56 36_NO_DEFAULT = object()
d6983cb4 37
dca08720 38
d6983cb4
PH
39class InfoExtractor(object):
40 """Information Extractor class.
41
42 Information extractors are the classes that, given a URL, extract
43 information about the video (or videos) the URL refers to. This
44 information includes the real video URL, the video title, author and
45 others. The information is stored in a dictionary which is then
5d380852 46 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
47 information possibly downloading the video to the file system, among
48 other possible outcomes.
49
fed5d032
PH
50 The type field determines the type of the result.
51 By far the most common value (and the default if _type is missing) is
52 "video", which indicates a single video.
53
54 For a video, the dictionaries must include the following fields:
d6983cb4
PH
55
56 id: Video identifier.
d6983cb4 57 title: Video title, unescaped.
d67b0b15 58
f49d89ee 59 Additionally, it must contain either a formats entry or a url one:
d67b0b15 60
f49d89ee
PH
61 formats: A list of dictionaries for each format available, ordered
62 from worst to best quality.
63
64 Potential fields:
d67b0b15
PH
65 * url Mandatory. The URL of the video file
66 * ext Will be calculated from url if missing
67 * format A human-readable description of the format
68 ("mp4 container with h264/opus").
69 Calculated from the format_id, width, height,
70 and format_note fields if missing.
71 * format_id A short description of the format
5d4f3985
PH
72 ("mp4_h264_opus" or "19").
73 Technically optional, but strongly recommended.
d67b0b15
PH
74 * format_note Additional info about the format
75 ("3D" or "DASH video")
76 * width Width of the video, if known
77 * height Height of the video, if known
f49d89ee 78 * resolution Textual description of width and height
7217e148 79 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
80 * abr Average audio bitrate in KBit/s
81 * acodec Name of the audio codec in use
dd27fd17 82 * asr Audio sampling rate in Hertz
d67b0b15 83 * vbr Average video bitrate in KBit/s
fbb21cf5 84 * fps Frame rate
d67b0b15 85 * vcodec Name of the video codec in use
1394ce65 86 * container Name of the container format
d67b0b15 87 * filesize The number of bytes, if known in advance
9732d77e 88 * filesize_approx An estimate for the number of bytes
d67b0b15 89 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
90 * protocol The protocol that will be used for the actual
91 download, lower-case.
b04b8852
PH
92 "http", "https", "rtsp", "rtmp", "rtmpe",
93 "m3u8", or "m3u8_native".
f49d89ee 94 * preference Order number of this format. If this field is
08d13955 95 present and not None, the formats get sorted
38d63d84 96 by this field, regardless of all other values.
f49d89ee
PH
97 -1 for default (order by other properties),
98 -2 or smaller for less than default.
e65566a9
PH
99 < -1000 to hide the format (if there is
100 another one which is strictly better)
aff2f4f4
PH
101 * language_preference Is this in the correct requested
102 language?
103 10 if it's what the URL is about,
104 -1 for default (don't know),
105 -10 otherwise, other values reserved for now.
5d73273f
PH
106 * quality Order number of the video quality of this
107 format, irrespective of the file format.
108 -1 for default (order by other properties),
109 -2 or smaller for less than default.
c64ed2a3
PH
110 * source_preference Order number for this video source
111 (quality takes higher priority)
112 -1 for default (order by other properties),
113 -2 or smaller for less than default.
d769be6c
PH
114 * http_method HTTP method to use for the download.
115 * http_headers A dictionary of additional HTTP headers
116 to add to the request.
117 * http_post_data Additional data to send with a POST
118 request.
6271f1ca 119 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
120 video's pixels are not square.
121 width : height ratio as float.
122 * no_resume The server does not support resuming the
123 (HTTP or RTMP) download. Boolean.
124
c0ba0f48 125 url: Final video URL.
d6983cb4 126 ext: Video filename extension.
d67b0b15
PH
127 format: The video format, defaults to ext (used for --get-format)
128 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 129
d6983cb4
PH
130 The following fields are optional:
131
f5e43bc6 132 alt_title: A secondary title of the video.
0afef30b
PH
133 display_id An alternative identifier for the video, not necessarily
134 unique, but available before title. Typically, id is
135 something like "4234987", title "Dancing naked mole rats",
136 and display_id "dancing-naked-mole-rats"
d5519808 137 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 138 * "id" (optional, string) - Thumbnail format ID
d5519808 139 * "url"
cfb56d1a 140 * "preference" (optional, int) - quality of the image
d5519808
PH
141 * "width" (optional, int)
142 * "height" (optional, int)
143 * "resolution" (optional, string "{width}x{height}",
144 deprecated)
d6983cb4 145 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 146 description: Full video description.
d6983cb4 147 uploader: Full name of the video uploader.
9bb8e0a3 148 creator: The main artist who created the video.
955c4514 149 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 150 upload_date: Video upload date (YYYYMMDD).
955c4514 151 If not explicitly set, calculated from timestamp.
d6983cb4 152 uploader_id: Nickname or id of the video uploader.
da9ec3b9 153 location: Physical location where the video was filmed.
a504ced0
JMF
154 subtitles: The available subtitles as a dictionary in the format
155 {language: subformats}. "subformats" is a list sorted from
156 lower to higher preference, each element is a dictionary
157 with the "ext" entry and one of:
158 * "data": The subtitles file contents
159 * "url": A url pointing to the subtitles file
360e1ca5
JMF
160 automatic_captions: Like 'subtitles', used by the YoutubeIE for
161 automatically generated captions
c0ba0f48 162 duration: Length of the video in seconds, as an integer.
f3d29461 163 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
164 like_count: Number of positive ratings of the video
165 dislike_count: Number of negative ratings of the video
2d30521a 166 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 167 comment_count: Number of comments on the video
dd622d7c
PH
168 comments: A list of comments, each with one or more of the following
169 properties (all but one of text or html optional):
170 * "author" - human-readable name of the comment author
171 * "author_id" - user ID of the comment author
172 * "id" - Comment ID
173 * "html" - Comment as HTML
174 * "text" - Plain text of the comment
175 * "timestamp" - UNIX timestamp of comment
176 * "parent" - ID of the comment this one is replying to.
177 Set to "root" to indicate that this is a
178 comment to the original video.
8dbe9899 179 age_limit: Age restriction for the video, as an integer (years)
9103bbc5
JMF
180 webpage_url: The url to the video webpage, if given to youtube-dl it
181 should allow to get the same result again. (It will be set
182 by YoutubeDL if it's missing)
ad3bc6ac
PH
183 categories: A list of categories that the video falls in, for example
184 ["Sports", "Berlin"]
7267bd53
PH
185 is_live: True, False, or None (=unknown). Whether this video is a
186 live stream that goes on instead of a fixed-length video.
d6983cb4 187
deefc05b 188 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 189
d838b1bd
PH
190 Unless mentioned otherwise, None is equivalent to absence of information.
191
fed5d032
PH
192
193 _type "playlist" indicates multiple videos.
b82f815f
PH
194 There must be a key "entries", which is a list, an iterable, or a PagedList
195 object, each element of which is a valid dictionary by this specification.
fed5d032
PH
196
197 Additionally, playlists can have "title" and "id" attributes with the same
198 semantics as videos (see above).
199
200
201 _type "multi_video" indicates that there are multiple videos that
202 form a single show, for example multiple acts of an opera or TV episode.
203 It must have an entries key like a playlist and contain all the keys
204 required for a video at the same time.
205
206
207 _type "url" indicates that the video must be extracted from another
208 location, possibly by a different extractor. Its only required key is:
209 "url" - the next URL to extract.
f58766ce
PH
210 The key "ie_key" can be set to the class name (minus the trailing "IE",
211 e.g. "Youtube") if the extractor class is known in advance.
212 Additionally, the dictionary may have any properties of the resolved entity
213 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
214 known ahead of time.
215
216
217 _type "url_transparent" entities have the same specification as "url", but
218 indicate that the given additional information is more precise than the one
219 associated with the resolved URL.
220 This is useful when a site employs a video service that hosts the video and
221 its technical metadata, but that video service does not embed a useful
222 title, description etc.
223
224
d6983cb4
PH
225 Subclasses of this one should re-define the _real_initialize() and
226 _real_extract() methods and define a _VALID_URL regexp.
227 Probably, they should also be added to the list of extractors.
228
d6983cb4
PH
229 Finally, the _WORKING attribute should be set to False for broken IEs
230 in order to warn the users and skip the tests.
231 """
232
233 _ready = False
234 _downloader = None
235 _WORKING = True
236
237 def __init__(self, downloader=None):
238 """Constructor. Receives an optional downloader."""
239 self._ready = False
240 self.set_downloader(downloader)
241
@classmethod
def suitable(cls, url):
    """Return True if url matches this extractor's _VALID_URL pattern."""
    # Look the cache up in cls.__dict__ on purpose instead of using
    # hasattr/getattr: we need to know whether the regexp was compiled for
    # *this* class, and attribute lookup would also find a superclass's cache.
    try:
        compiled = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        compiled = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return compiled.match(url) is not None
d6983cb4 252
ed9266db
PH
253 @classmethod
254 def _match_id(cls, url):
255 if '_VALID_URL_RE' not in cls.__dict__:
256 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
257 m = cls._VALID_URL_RE.match(url)
258 assert m
259 return m.group('id')
260
d6983cb4
PH
@classmethod
def working(cls):
    """Return the class's _WORKING flag."""
    is_working = cls._WORKING
    return is_working
265
def initialize(self):
    """Run _real_initialize() (authentication, etc) once per instance."""
    if self._ready:
        # Already initialized; nothing to do.
        return
    self._real_initialize()
    self._ready = True
271
def extract(self, url):
    """Initialize the extractor if needed and extract the information for url.

    Returns whatever _real_extract(url) returns.

    Raises ExtractorError: re-raised unchanged when the extraction raises
    one; an IncompleteRead is wrapped as an expected network error; a
    KeyError or StopIteration escaping the extractor is wrapped as an
    (unexpected) extractor error.
    """
    try:
        self.initialize()
        return self._real_extract(url)
    except ExtractorError:
        raise
    except compat_http_client.IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True)
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e)
d6983cb4
PH
283
def set_downloader(self, downloader):
    """Remember downloader as the downloader used by this IE."""
    self._downloader = downloader
287
288 def _real_initialize(self):
289 """Real initialization process. Redefine in subclasses."""
290 pass
291
292 def _real_extract(self, url):
293 """Real extraction process. Redefine in subclasses."""
294 pass
295
56c73665
JMF
@classmethod
def ie_key(cls):
    """Return the key for getting this InfoExtractor with get_info_extractor.

    It is the class name without its last two characters.
    """
    class_name = cls.__name__
    return class_name[:-2]
300
d6983cb4
PH
@property
def IE_NAME(self):
    """Name of the instance's class without its last two characters."""
    class_name = type(self).__name__
    return class_name[:-2]
304
7cc3570e 305 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4
PH
306 """ Returns the response handle """
307 if note is None:
308 self.report_download_webpage(video_id)
309 elif note is not False:
7cc3570e 310 if video_id is None:
f1a9d64e 311 self.to_screen('%s' % (note,))
7cc3570e 312 else:
f1a9d64e 313 self.to_screen('%s: %s' % (video_id, note))
d6983cb4 314 try:
dca08720 315 return self._downloader.urlopen(url_or_request)
d6983cb4 316 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3
PH
317 if errnote is False:
318 return False
d6983cb4 319 if errnote is None:
f1a9d64e
PH
320 errnote = 'Unable to download webpage'
321 errmsg = '%s: %s' % (errnote, compat_str(err))
7cc3570e
PH
322 if fatal:
323 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
324 else:
325 self._downloader.report_warning(errmsg)
326 return False
d6983cb4 327
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """Download a page and return the tuple (page content as string, URL handle).

    Returns False instead when the request failed and fatal is false.
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request, _, _ = url_or_request.partition('#')

    handle = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if handle is False:
        assert not fatal
        return False
    page = self._webpage_read_content(handle, url_or_request, video_id, note, errnote, fatal)
    return (page, handle)
340
4e262a88 341 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
d6983cb4 342 content_type = urlh.headers.get('Content-Type', '')
f143d86a 343 webpage_bytes = urlh.read()
4e262a88
PH
344 if prefix is not None:
345 webpage_bytes = prefix + webpage_bytes
d6983cb4
PH
346 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
347 if m:
348 encoding = m.group(1)
349 else:
0d75ae2c 350 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
351 webpage_bytes[:1024])
352 if m:
353 encoding = m.group(1).decode('ascii')
b60016e8
PH
354 elif webpage_bytes.startswith(b'\xff\xfe'):
355 encoding = 'utf-16'
f143d86a
PH
356 else:
357 encoding = 'utf-8'
d6983cb4
PH
358 if self._downloader.params.get('dump_intermediate_pages', False):
359 try:
360 url = url_or_request.get_full_url()
361 except AttributeError:
362 url = url_or_request
f1a9d64e 363 self.to_screen('Dumping request to ' + url)
d6983cb4
PH
364 dump = base64.b64encode(webpage_bytes).decode('ascii')
365 self._downloader.to_screen(dump)
d41e6efc
PH
366 if self._downloader.params.get('write_pages', False):
367 try:
368 url = url_or_request.get_full_url()
369 except AttributeError:
370 url = url_or_request
5afa7f8b 371 basen = '%s_%s' % (video_id, url)
c1bce22f 372 if len(basen) > 240:
f1a9d64e 373 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
c1bce22f
PH
374 basen = basen[:240 - len(h)] + h
375 raw_filename = basen + '.dump'
d41e6efc 376 filename = sanitize_filename(raw_filename, restricted=True)
f1a9d64e 377 self.to_screen('Saving request to ' + filename)
5f58165d
S
378 # Working around MAX_PATH limitation on Windows (see
379 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
380 if os.name == 'nt':
381 absfilepath = os.path.abspath(filename)
382 if len(absfilepath) > 259:
383 filename = '\\\\?\\' + absfilepath
d41e6efc
PH
384 with open(filename, 'wb') as outf:
385 outf.write(webpage_bytes)
386
ec0fafbb
AA
387 try:
388 content = webpage_bytes.decode(encoding, 'replace')
389 except LookupError:
390 content = webpage_bytes.decode('utf-8', 'replace')
2410c43d 391
f1a9d64e
PH
392 if ('<title>Access to this site is blocked</title>' in content and
393 'Websense' in content[:512]):
394 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
2410c43d
PH
395 blocked_iframe = self._html_search_regex(
396 r'<iframe src="([^"]+)"', content,
f1a9d64e 397 'Websense information URL', default=None)
2410c43d 398 if blocked_iframe:
f1a9d64e 399 msg += ' Visit %s for more details' % blocked_iframe
2410c43d
PH
400 raise ExtractorError(msg, expected=True)
401
23be51d8 402 return content
d6983cb4 403
995ad69c 404 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
d6983cb4 405 """ Returns the data of the page as a string """
995ad69c
TF
406 success = False
407 try_count = 0
408 while success is False:
409 try:
410 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
411 success = True
412 except compat_http_client.IncompleteRead as e:
413 try_count += 1
414 if try_count >= tries:
415 raise e
416 self._sleep(timeout, video_id)
7cc3570e
PH
417 if res is False:
418 return res
419 else:
420 content, _ = res
421 return content
d6983cb4 422
2a275ab0 423 def _download_xml(self, url_or_request, video_id,
f1a9d64e 424 note='Downloading XML', errnote='Unable to download XML',
28746fbd 425 transform_source=None, fatal=True):
267ed0c5 426 """Return the xml as an xml.etree.ElementTree.Element"""
28746fbd
PH
427 xml_string = self._download_webpage(
428 url_or_request, video_id, note, errnote, fatal=fatal)
429 if xml_string is False:
430 return xml_string
e2b38da9
PH
431 if transform_source:
432 xml_string = transform_source(xml_string)
267ed0c5
JMF
433 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
434
3d3538e4 435 def _download_json(self, url_or_request, video_id,
f1a9d64e
PH
436 note='Downloading JSON metadata',
437 errnote='Unable to download JSON metadata',
b090af59
PH
438 transform_source=None,
439 fatal=True):
440 json_string = self._download_webpage(
441 url_or_request, video_id, note, errnote, fatal=fatal)
442 if (not fatal) and json_string is False:
443 return None
ebb64199
TF
444 return self._parse_json(
445 json_string, video_id, transform_source=transform_source, fatal=fatal)
446
447 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
448 if transform_source:
449 json_string = transform_source(json_string)
3d3538e4
PH
450 try:
451 return json.loads(json_string)
452 except ValueError as ve:
e7b6d122
PH
453 errmsg = '%s: Failed to parse JSON ' % video_id
454 if fatal:
455 raise ExtractorError(errmsg, cause=ve)
456 else:
457 self.report_warning(errmsg + str(ve))
3d3538e4 458
def report_warning(self, msg, video_id=None):
    """Report msg as a warning via the downloader, prefixed with '[ie_name]'.

    When video_id is given it is inserted before the message.
    """
    id_prefix = '%s: ' % video_id if video_id is not None else ''
    warning = '[%s] %s%s' % (self.IE_NAME, id_prefix, msg)
    self._downloader.report_warning(warning)
f45f96f8 463
d6983cb4
PH
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    line = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(line)
d6983cb4
PH
467
def report_extraction(self, id_or_name):
    """Print the "Extracting information" message for id_or_name."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
d6983cb4
PH
471
def report_download_webpage(self, video_id):
    """Print the "Downloading webpage" message for video_id."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
d6983cb4
PH
475
def report_age_confirmation(self):
    """Print the message announcing an attempt to confirm age."""
    message = 'Confirming age'
    self.to_screen(message)
d6983cb4 479
fc79158d
JMF
def report_login(self):
    """Print the message announcing an attempt to log in."""
    message = 'Logging in'
    self.to_screen(message)
fc79158d 483
5f6a1245 484 # Methods for following #608
c0d0b01f
JMF
@staticmethod
def url_result(url, ie=None, video_id=None):
    """Build a result of _type "url" pointing to a page that should be processed.

    ie is stored as 'ie_key'; video_id is stored as 'id' only when it is
    not None.
    """
    # TODO: ie should be the class used for getting the info
    result = {'_type': 'url', 'url': url, 'ie_key': ie}
    if video_id is None:
        return result
    result['id'] = video_id
    return result
5f6a1245 495
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Build a result of _type "playlist" holding entries.

    'id', 'title' and 'description' are only included for truthy arguments.
    """
    result = {'_type': 'playlist', 'entries': entries}
    optional_fields = (
        ('id', playlist_id),
        ('title', playlist_title),
        ('description', playlist_description),
    )
    for key, value in optional_fields:
        if value:
            result[key] = value
    return result
508
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    If group is given, that group of the match is returned instead of the
    first group that is not None.
    """
    # Start from "no match" so that an empty sequence of patterns is handled
    # like any other failed search instead of leaving mobj unbound.
    mobj = None
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name unless colors are disabled, we are on
    # Windows, or stderr is not a terminal
    if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        else:
            return mobj.group(group)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s; '
                                        'please report this issue on http://yt-dl.org/bug' % _name)
        return None
543
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but the result is passed through clean_html() and
    stripped. A falsy result is returned unchanged.
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not found:
        return found
    return clean_html(found).strip()
553
fc79158d
JMF
554 def _get_login_info(self):
555 """
556 Get the the login info as (username, password)
557 It will look in the netrc file using the _NETRC_MACHINE value
558 If there's no info available, return (None, None)
559 """
560 if self._downloader is None:
561 return (None, None)
562
563 username = None
564 password = None
565 downloader_params = self._downloader.params
566
567 # Attempt to use provided username and password or .netrc data
568 if downloader_params.get('username', None) is not None:
569 username = downloader_params['username']
570 password = downloader_params['password']
571 elif downloader_params.get('usenetrc', False):
572 try:
573 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
574 if info is not None:
575 username = info[0]
576 password = info[2]
577 else:
578 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
579 except (IOError, netrc.NetrcParseError) as err:
f1a9d64e 580 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
5f6a1245 581
fc79158d
JMF
582 return (username, password)
583
83317f69 584 def _get_tfa_info(self):
585 """
586 Get the two-factor authentication info
587 TODO - asking the user will be required for sms/phone verify
588 currently just uses the command line option
589 If there's no info available, return None
590 """
591 if self._downloader is None:
83317f69 592 return None
593 downloader_params = self._downloader.params
594
595 if downloader_params.get('twofactor', None) is not None:
596 return downloader_params['twofactor']
597
83317f69 598 return None
599
46720279
JMF
600 # Helper functions for extracting OpenGraph info
601 @staticmethod
ab2d5247 602 def _og_regexes(prop):
c1206423 603 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
9887c9b2 604 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
78fb87b2 605 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 606 return [
78fb87b2
JMF
607 template % (property_re, content_re),
608 template % (content_re, property_re),
ab2d5247 609 ]
46720279 610
3c4e6d83 611 def _og_search_property(self, prop, html, name=None, **kargs):
46720279 612 if name is None:
3c4e6d83 613 name = 'OpenGraph %s' % prop
ab2d5247 614 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
615 if escaped is None:
616 return None
617 return unescapeHTML(escaped)
46720279
JMF
618
619 def _og_search_thumbnail(self, html, **kargs):
f1a9d64e 620 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
46720279
JMF
621
622 def _og_search_description(self, html, **kargs):
623 return self._og_search_property('description', html, fatal=False, **kargs)
624
625 def _og_search_title(self, html, **kargs):
626 return self._og_search_property('title', html, **kargs)
627
8ffa13e0 628 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
629 regexes = self._og_regexes('video') + self._og_regexes('video:url')
630 if secure:
631 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 632 return self._html_search_regex(regexes, html, name, **kargs)
46720279 633
78338f71
JMF
634 def _og_search_url(self, html, **kargs):
635 return self._og_search_property('url', html, **kargs)
636
40c696e5 637 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
59040888
PH
638 if display_name is None:
639 display_name = name
640 return self._html_search_regex(
6c6f1408 641 r'''(?isx)<meta
711ede6e 642 (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
bec22481 643 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
711ede6e 644 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
645
646 def _dc_search_uploader(self, html):
647 return self._html_search_meta('dc.creator', html, 'uploader')
648
8dbe9899
PH
649 def _rta_search(self, html):
650 # See http://www.rtalabel.org/index.php?content=howtofaq#single
651 if re.search(r'(?ix)<meta\s+name="rating"\s+'
652 r' content="RTA-5042-1996-1400-1577-RTA"',
653 html):
654 return 18
655 return 0
656
59040888
PH
657 def _media_rating_search(self, html):
658 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
659 rating = self._html_search_meta('rating', html)
660
661 if not rating:
662 return None
663
664 RATING_TABLE = {
665 'safe for kids': 0,
666 'general': 8,
667 '14 years': 14,
668 'mature': 17,
669 'restricted': 19,
670 }
671 return RATING_TABLE.get(rating.lower(), None)
672
69319969 673 def _family_friendly_search(self, html):
6ca7732d 674 # See http://schema.org/VideoObject
69319969
NJ
675 family_friendly = self._html_search_meta('isFamilyFriendly', html)
676
677 if not family_friendly:
678 return None
679
680 RATING_TABLE = {
681 '1': 0,
682 'true': 0,
683 '0': 18,
684 'false': 18,
685 }
686 return RATING_TABLE.get(family_friendly.lower(), None)
687
0c708f11
JMF
688 def _twitter_search_player(self, html):
689 return self._html_search_meta('twitter:player', html,
9e1a5b84 690 'twitter card player')
0c708f11 691
def _sort_formats(self, formats):
    """Sort the list of format dictionaries in place, worst first.

    Formats are ordered by preference, language_preference, quality, tbr,
    vbr, height, width, extension preference, abr, audio extension
    preference, fps, filesize, filesize_approx, source_preference and
    finally format_id. A missing 'ext' is filled in from the URL as a side
    effect.

    Raises ExtractorError if formats is empty.
    """
    if not formats:
        raise ExtractorError('No video formats found')

    prefer_free_formats = self._downloader.params.get('prefer_free_formats')

    def _value(f, key, default=-1):
        # Missing and None fields sort like the default value
        value = f.get(key)
        return default if value is None else value

    def _ext_rank(order, ext):
        # Position of ext in order; extensions not listed rank lowest
        try:
            return order.index(ext)
        except ValueError:
            return -1

    def _formats_key(f):
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
            if proto is None:
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

            preference = 0 if proto in ['http', 'https'] else -0.1
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        # f.get() rather than f['ext']: a format with neither ext nor url
        # must not abort the whole sort with a KeyError
        ext = f.get('ext')
        if f.get('vcodec') == 'none':  # audio only
            if prefer_free_formats:
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            else:
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
            ext_preference = 0
            audio_ext_preference = _ext_rank(ORDER, ext)
        else:
            if prefer_free_formats:
                ORDER = ['flv', 'mp4', 'webm']
            else:
                ORDER = ['webm', 'flv', 'mp4']
            ext_preference = _ext_rank(ORDER, ext)
            audio_ext_preference = 0

        return (
            preference,
            _value(f, 'language_preference'),
            _value(f, 'quality'),
            _value(f, 'tbr'),
            _value(f, 'vbr'),
            _value(f, 'height'),
            _value(f, 'width'),
            ext_preference,
            _value(f, 'abr'),
            audio_ext_preference,
            _value(f, 'fps'),
            _value(f, 'filesize'),
            _value(f, 'filesize_approx'),
            _value(f, 'source_preference'),
            # format_id is optional; None cannot be ordered against a
            # string on Python 3, so compare missing ids as ''
            _value(f, 'format_id', ''),
        )
    formats.sort(key=_formats_key)
59040888 751
96a53167
S
752 def _check_formats(self, formats, video_id):
753 if formats:
754 formats[:] = filter(
755 lambda f: self._is_valid_url(
756 f['url'], video_id,
757 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
758 formats)
759
def _is_valid_url(self, url, video_id, item='video'):
    """Check url with a HEAD request; return True if it succeeds.

    When the request fails with an ExtractorError caused by an HTTP error,
    a warning is reported and False is returned; any other ExtractorError
    is re-raised.
    """
    try:
        self._request_webpage(
            HEADRequest(url), video_id,
            'Checking %s URL' % item)
    except ExtractorError as e:
        if not isinstance(e.cause, compat_HTTPError):
            raise
        self.report_warning(
            '%s URL is invalid, skipping' % item, video_id)
        return False
    return True
772
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
779
57c7411f
PH
780 def _proto_relative_url(self, url, scheme=None):
781 if url is None:
782 return url
783 if url.startswith('//'):
784 if scheme is None:
785 scheme = self.http_scheme()
786 return scheme + url
787 else:
788 return url
789
4094b6e3
PH
790 def _sleep(self, timeout, video_id, msg_template=None):
791 if msg_template is None:
f1a9d64e 792 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
793 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
794 self.to_screen(msg)
795 time.sleep(timeout)
796
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
    """Download an f4m manifest and build one format dict per media node.

    manifest_url -- URL of the f4m manifest
    video_id     -- identifier passed on to _download_xml
    preference   -- value stored under 'preference' in every format
    f4m_id       -- optional prefix for the generated format ids

    Media nodes are looked up in the f4m 1.0 namespace first and in the 2.0
    namespace when none are found.  For 1.0 manifests every format points at
    the manifest itself; for 2.0 manifests it points at the media node's
    'href' (or 'url') attribute.  Returns the list after _sort_formats.
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Directory of the manifest.  It is computed once from the URL that was
    # passed in, so that every media node is resolved against the manifest
    # and not against the URL of the previously processed node.
    base_url = '/'.join(manifest_url.split('/')[:-1]) + '/'
    for i, media_el in enumerate(media_nodes):
        media_url = manifest_url
        if manifest_version == '2.0':
            href = media_el.attrib.get('href') or media_el.attrib.get('url')
            if not href:
                # Nothing to point the format at
                continue
            # Absolute references are used as they are
            media_url = href if re.match(r'^https?://', href) else base_url + href
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        formats.append({
            # Fall back to the node index when no bitrate is available
            'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])),
            'url': media_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
            'preference': preference,
        })
    self._sort_formats(formats)

    return formats
825
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None,
                          m3u8_id=None):
    """Download an m3u8 playlist and turn its entries into format dicts.

    The first returned format always refers to the playlist itself
    ('m3u8-meta', preference -1).  Every URI line of the playlist adds one
    more format; bitrate, codecs and resolution are taken from the
    preceding #EXT-X-STREAM-INF line when there is one.  ext, entry_protocol
    and preference are copied into those formats and m3u8_id, when given,
    prefixes the format ids.  Returns the list after _sort_formats.
    """

    def _absolute(candidate):
        # Entries that are not absolute http(s) URLs are resolved against
        # the playlist URL
        if re.match(r'^https?://', candidate):
            return candidate
        return compat_urlparse.urljoin(m3u8_url, candidate)

    def _format_id(suffix):
        return '-'.join(filter(None, [m3u8_id, suffix]))

    formats = [{
        'format_id': _format_id('m3u8-meta'),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    playlist = self._download_webpage(
        m3u8_url, video_id,
        note='Downloading m3u8 information',
        errnote='Failed to download m3u8 information')

    attr_re = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    # None until the first #EXT-X-STREAM-INF line has been seen
    stream_attrs = None
    for raw_line in playlist.splitlines():
        if raw_line.startswith('#EXT-X-STREAM-INF:'):
            stream_attrs = {}
            for match in attr_re.finditer(raw_line):
                value = match.group('val')
                if value.startswith('"'):
                    # Drop the surrounding quotes
                    value = value[1:-1]
                stream_attrs[match.group('key')] = value
            continue
        if raw_line.startswith('#') or not raw_line.strip():
            # Other tags, comments and blank lines carry no format
            continue
        if stream_attrs is None:
            formats.append({'url': _absolute(raw_line)})
            continue

        tbr = int_or_none(stream_attrs.get('BANDWIDTH'), scale=1000)
        entry = {
            'format_id': _format_id('m3u8-%d' % (tbr if tbr else len(formats))),
            'url': _absolute(raw_line.strip()),
            'tbr': tbr,
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
        }
        codecs = stream_attrs.get('CODECS')
        if codecs:
            # TODO: the video codec does not necessarily come first
            codec_list = codecs.split(',')
            if codec_list[0]:
                entry['vcodec'] = codec_list[0].partition('.')[0]
            if len(codec_list) > 1 and codec_list[1]:
                entry['acodec'] = codec_list[1].partition('.')[0]
        resolution = stream_attrs.get('RESOLUTION')
        if resolution:
            width_str, height_str = resolution.split('x')
            entry['width'] = int(width_str)
            entry['height'] = int(height_str)
        formats.append(entry)
        # The attributes apply to a single URI line only
        stream_attrs = {}
    self._sort_formats(formats)
    return formats
892
e89a2aab 893 # TODO: improve extraction
995029a1 894 def _extract_smil_formats(self, smil_url, video_id, fatal=True):
e89a2aab
S
895 smil = self._download_xml(
896 smil_url, video_id, 'Downloading SMIL file',
995029a1
PH
897 'Unable to download SMIL file', fatal=fatal)
898 if smil is False:
899 assert not fatal
900 return []
e89a2aab
S
901
902 base = smil.find('./head/meta').get('base')
903
904 formats = []
905 rtmp_count = 0
906 for video in smil.findall('./body/switch/video'):
907 src = video.get('src')
908 if not src:
909 continue
910 bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
911 width = int_or_none(video.get('width'))
912 height = int_or_none(video.get('height'))
913 proto = video.get('proto')
914 if not proto:
915 if base:
916 if base.startswith('rtmp'):
917 proto = 'rtmp'
918 elif base.startswith('http'):
919 proto = 'http'
920 ext = video.get('ext')
921 if proto == 'm3u8':
922 formats.extend(self._extract_m3u8_formats(src, video_id, ext))
923 elif proto == 'rtmp':
924 rtmp_count += 1
925 streamer = video.get('streamer') or base
926 formats.append({
927 'url': streamer,
928 'play_path': src,
929 'ext': 'flv',
930 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
931 'tbr': bitrate,
932 'width': width,
933 'height': height,
934 })
935 self._sort_formats(formats)
936
937 return formats
938
f4b1c7ad
PH
939 def _live_title(self, name):
940 """ Generate the title for a live video """
941 now = datetime.datetime.now()
942 now_str = now.strftime("%Y-%m-%d %H:%M")
943 return name + ' ' + now_str
944
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """Convert v with int_or_none and report a failed conversion.

    v      -- value to convert
    name   -- description of the value, used in the failure message
    fatal  -- raise ExtractorError on failure instead of only warning
    kwargs -- passed through to int_or_none

    Returns the converted value, or None after a warning when the
    conversion failed and fatal is false.
    """
    # A leftover debug print of getattr(v, kwargs['get_attr']) used to live
    # here; it wrote to stdout and raised AttributeError for values lacking
    # the attribute, so it has been dropped.
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        self._downloader.report_warning(msg)
    return res
956
def _float(self, v, name, fatal=False, **kwargs):
    """Convert v with float_or_none and report a failed conversion.

    kwargs are passed through to float_or_none.  When the result is None,
    an ExtractorError is raised if fatal is true; otherwise a warning is
    reported and None is returned.  name only appears in the message.
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    failure = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(failure)
    self._downloader.report_warning(failure)
    return parsed
966
def _set_cookie(self, domain, name, value, expire_time=None):
    """Store a version 0 cookie for domain and path '/' in the downloader's
    cookie jar; expire_time is used as the cookie's expiry."""
    cookie = compat_cookiejar.Cookie(
        version=0, name=name, value=value,
        port=None, port_specified=None,
        domain=domain, domain_specified=None, domain_initial_dot=None,
        path='/', path_specified=True,
        secure=False, expires=expire_time, discard='',
        comment=None, comment_url=None, rest=None)
    self._downloader.cookiejar.set_cookie(cookie)
972
05900629
PH
def get_testcases(self, include_onlymatching=False):
    """Yield the test case dicts declared in _TEST or _TESTS.

    Test cases flagged 'only_matching' are skipped unless
    include_onlymatching is true.  Every yielded dict gets a 'name' entry
    holding the class name without its last two characters (the 'IE'
    suffix); the dict is modified in place.
    """
    single = getattr(self, '_TEST', None)
    if single:
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        candidates = [single]
    else:
        candidates = getattr(self, '_TESTS', [])
    for case in candidates:
        if include_onlymatching or not case.get('only_matching', False):
            case['name'] = type(self).__name__[:-len('IE')]
            yield case
986
def is_suitable(self, age_limit):
    """ Test whether the extractor is generally suitable for the given
    age limit (i.e. pornographic sites are not, all others usually are).

    The decision is based on the extractor's test cases: True as soon as
    one of them is not age restricted, False when all of them are, and
    True when there are no test cases at all. """
    saw_testcase = False
    for case in self.get_testcases(include_onlymatching=False):
        if 'playlist' in case:
            # Playlist tests are judged by their first entry
            case = case['playlist'][0]
        declared_limit = case.get('info_dict', {}).get('age_limit')
        if not age_restricted(declared_limit, age_limit):
            return True
        saw_testcase = True
    return not saw_testcase
1001
a504ced0
JMF
def extract_subtitles(self, *args, **kwargs):
    """Return a dict with the result of _get_subtitles(*args, **kwargs) when
    the 'writesubtitles' or 'listsubtitles' downloader parameter is set,
    and an empty dict otherwise."""
    params = self._downloader.params
    wanted = params.get('writesubtitles', False) or params.get('listsubtitles')
    found = {}
    if wanted:
        found.update(self._get_subtitles(*args, **kwargs))
    return found
1008
1009 def _get_subtitles(self, *args, **kwargs):
1010 raise NotImplementedError("This method must be implemented by subclasses")
1011
360e1ca5
JMF
def extract_automatic_captions(self, *args, **kwargs):
    """Return a dict with the result of
    _get_automatic_captions(*args, **kwargs) when the 'writeautomaticsub'
    or 'listsubtitles' downloader parameter is set, and an empty dict
    otherwise."""
    params = self._downloader.params
    wanted = params.get('writeautomaticsub', False) or params.get('listsubtitles')
    found = {}
    if wanted:
        found.update(self._get_automatic_captions(*args, **kwargs))
    return found
1018
1019 def _get_automatic_captions(self, *args, **kwargs):
1020 raise NotImplementedError("This method must be implemented by subclasses")
1021
8dbe9899 1022
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    where an empty prefix requests a single result, a number requests that
    many results and "all" requests _MAX_RESULTS results.
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's queries."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        return bool(re.match(cls._make_valid_url(), url))

    def _real_extract(self, query):
        """Parse the prefixed query and fetch the requested number of
        results through _get_n_results."""
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = match.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp the request to what the service can deliver
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY."""
        return self._SEARCH_KEY