]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
[extractor/common] Relax _hidden_inputs
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
6a3828fd 1from __future__ import unicode_literals
f1a9d64e 2
d6983cb4 3import base64
f4b1c7ad 4import datetime
3ec05685 5import hashlib
3d3538e4 6import json
4094b6e3 7import netrc
d6983cb4
PH
8import os
9import re
10import socket
11import sys
4094b6e3 12import time
1bac3455 13import math
d6983cb4 14
8c25f81b 15from ..compat import (
42939b61 16 compat_cookiejar,
799207e8 17 compat_cookies,
e9c0cdd3 18 compat_etree_fromstring,
e64b7569 19 compat_getpass,
d6983cb4 20 compat_http_client,
e9c0cdd3
YCH
21 compat_os_name,
22 compat_str,
d6983cb4 23 compat_urllib_error,
15707c7e 24 compat_urllib_parse_urlencode,
41d06b04 25 compat_urllib_request,
f0b5d6af 26 compat_urlparse,
8c25f81b 27)
b22ca762 28from ..downloader.f4m import remove_encrypted_media
8c25f81b 29from ..utils import (
c342041f 30 NO_DEFAULT,
05900629 31 age_restricted,
08f2a92c 32 bug_reports_message,
d6983cb4
PH
33 clean_html,
34 compiled_regex_type,
70f0f5a8 35 determine_ext,
9b9c5355 36 error_to_compat_str,
d6983cb4 37 ExtractorError,
97f4aecf 38 fix_xml_ampersands,
b14f3a4c 39 float_or_none,
31bb8d3f 40 int_or_none,
4ca2a3cf 41 parse_iso8601,
55b3e45b 42 RegexNotFoundError,
d41e6efc 43 sanitize_filename,
5c2266df 44 sanitized_Request,
f38de77f 45 unescapeHTML,
647eab45 46 unified_strdate,
a107193e 47 url_basename,
8d6765cf
S
48 xpath_text,
49 xpath_with_ns,
d497a201 50 determine_protocol,
1bac3455 51 parse_duration,
cafcf657 52 mimetype2ext,
41d06b04 53 update_Request,
cdfee168 54 update_url_query,
d6983cb4 55)
c342041f 56
d6983cb4
PH
57
58class InfoExtractor(object):
59 """Information Extractor class.
60
61 Information extractors are the classes that, given a URL, extract
62 information about the video (or videos) the URL refers to. This
63 information includes the real video URL, the video title, author and
64 others. The information is stored in a dictionary which is then
5d380852 65 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
66 information possibly downloading the video to the file system, among
67 other possible outcomes.
68
cf0649f8 69 The type field determines the type of the result.
fed5d032
PH
70 By far the most common value (and the default if _type is missing) is
71 "video", which indicates a single video.
72
73 For a video, the dictionaries must include the following fields:
d6983cb4
PH
74
75 id: Video identifier.
d6983cb4 76 title: Video title, unescaped.
d67b0b15 77
f49d89ee 78 Additionally, it must contain either a formats entry or a url one:
d67b0b15 79
f49d89ee
PH
80 formats: A list of dictionaries for each format available, ordered
81 from worst to best quality.
82
83 Potential fields:
d67b0b15 84 * url Mandatory. The URL of the video file
10952eb2 85 * ext Will be calculated from URL if missing
d67b0b15
PH
86 * format A human-readable description of the format
87 ("mp4 container with h264/opus").
88 Calculated from the format_id, width, height.
89 and format_note fields if missing.
90 * format_id A short description of the format
5d4f3985
PH
91 ("mp4_h264_opus" or "19").
92 Technically optional, but strongly recommended.
d67b0b15
PH
93 * format_note Additional info about the format
94 ("3D" or "DASH video")
95 * width Width of the video, if known
96 * height Height of the video, if known
f49d89ee 97 * resolution Textual description of width and height
7217e148 98 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
99 * abr Average audio bitrate in KBit/s
100 * acodec Name of the audio codec in use
dd27fd17 101 * asr Audio sampling rate in Hertz
d67b0b15 102 * vbr Average video bitrate in KBit/s
fbb21cf5 103 * fps Frame rate
d67b0b15 104 * vcodec Name of the video codec in use
1394ce65 105 * container Name of the container format
d67b0b15 106 * filesize The number of bytes, if known in advance
9732d77e 107 * filesize_approx An estimate for the number of bytes
d67b0b15 108 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
109 * protocol The protocol that will be used for the actual
110 download, lower-case.
b04b8852 111 "http", "https", "rtsp", "rtmp", "rtmpe",
af7d5a63 112 "m3u8", "m3u8_native" or "http_dash_segments".
f49d89ee 113 * preference Order number of this format. If this field is
08d13955 114 present and not None, the formats get sorted
38d63d84 115 by this field, regardless of all other values.
f49d89ee
PH
116 -1 for default (order by other properties),
117 -2 or smaller for less than default.
e65566a9
PH
118 < -1000 to hide the format (if there is
119 another one which is strictly better)
32f90364
PH
120 * language Language code, e.g. "de" or "en-US".
121 * language_preference Is this in the language mentioned in
122 the URL?
aff2f4f4
PH
123 10 if it's what the URL is about,
124 -1 for default (don't know),
125 -10 otherwise, other values reserved for now.
5d73273f
PH
126 * quality Order number of the video quality of this
127 format, irrespective of the file format.
128 -1 for default (order by other properties),
129 -2 or smaller for less than default.
c64ed2a3
PH
130 * source_preference Order number for this video source
131 (quality takes higher priority)
132 -1 for default (order by other properties),
133 -2 or smaller for less than default.
d769be6c
PH
134 * http_headers A dictionary of additional HTTP headers
135 to add to the request.
6271f1ca 136 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
137 video's pixels are not square.
138 width : height ratio as float.
139 * no_resume The server does not support resuming the
140 (HTTP or RTMP) download. Boolean.
141
c0ba0f48 142 url: Final video URL.
d6983cb4 143 ext: Video filename extension.
d67b0b15
PH
144 format: The video format, defaults to ext (used for --get-format)
145 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 146
d6983cb4
PH
147 The following fields are optional:
148
f5e43bc6 149 alt_title: A secondary title of the video.
0afef30b
PH
150 display_id An alternative identifier for the video, not necessarily
151 unique, but available before title. Typically, id is
152 something like "4234987", title "Dancing naked mole rats",
153 and display_id "dancing-naked-mole-rats"
d5519808 154 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 155 * "id" (optional, string) - Thumbnail format ID
d5519808 156 * "url"
cfb56d1a 157 * "preference" (optional, int) - quality of the image
d5519808
PH
158 * "width" (optional, int)
159 * "height" (optional, int)
160 * "resolution" (optional, string "{width}x{height"},
161 deprecated)
d6983cb4 162 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 163 description: Full video description.
d6983cb4 164 uploader: Full name of the video uploader.
2bc0c46f 165 license: License name the video is licensed under.
9bb8e0a3 166 creator: The main artist who created the video.
8aab976b 167 release_date: The date (YYYYMMDD) when the video was released.
955c4514 168 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 169 upload_date: Video upload date (YYYYMMDD).
955c4514 170 If not explicitly set, calculated from timestamp.
d6983cb4 171 uploader_id: Nickname or id of the video uploader.
7bcd2830 172 uploader_url: Full URL to a personal webpage of the video uploader.
da9ec3b9 173 location: Physical location where the video was filmed.
a504ced0
JMF
174 subtitles: The available subtitles as a dictionary in the format
175 {language: subformats}. "subformats" is a list sorted from
176 lower to higher preference, each element is a dictionary
177 with the "ext" entry and one of:
178 * "data": The subtitles file contents
10952eb2 179 * "url": A URL pointing to the subtitles file
4bba3716 180 "ext" will be calculated from URL if missing
360e1ca5
JMF
181 automatic_captions: Like 'subtitles', used by the YoutubeIE for
182 automatically generated captions
62d231c0 183 duration: Length of the video in seconds, as an integer or float.
f3d29461 184 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
185 like_count: Number of positive ratings of the video
186 dislike_count: Number of negative ratings of the video
02835c6b 187 repost_count: Number of reposts of the video
2d30521a 188 average_rating: Average rating give by users, the scale used depends on the webpage
19e3dfc9 189 comment_count: Number of comments on the video
dd622d7c
PH
190 comments: A list of comments, each with one or more of the following
191 properties (all but one of text or html optional):
192 * "author" - human-readable name of the comment author
193 * "author_id" - user ID of the comment author
194 * "id" - Comment ID
195 * "html" - Comment as HTML
196 * "text" - Plain text of the comment
197 * "timestamp" - UNIX timestamp of comment
198 * "parent" - ID of the comment this one is replying to.
199 Set to "root" to indicate that this is a
200 comment to the original video.
8dbe9899 201 age_limit: Age restriction for the video, as an integer (years)
10952eb2 202 webpage_url: The URL to the video webpage, if given to youtube-dl it
9103bbc5
JMF
203 should allow to get the same result again. (It will be set
204 by YoutubeDL if it's missing)
ad3bc6ac
PH
205 categories: A list of categories that the video falls in, for example
206 ["Sports", "Berlin"]
864f24bd 207 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
7267bd53
PH
208 is_live: True, False, or None (=unknown). Whether this video is a
209 live stream that goes on instead of a fixed-length video.
7c80519c 210 start_time: Time in seconds where the reproduction should start, as
10952eb2 211 specified in the URL.
297a564b 212 end_time: Time in seconds where the reproduction should end, as
10952eb2 213 specified in the URL.
d6983cb4 214
7109903e
S
215 The following fields should only be used when the video belongs to some logical
216 chapter or section:
217
218 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
219 chapter_number: Number of the chapter the video belongs to, as an integer.
220 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
221
222 The following fields should only be used when the video is an episode of some
223 series or programme:
224
225 series: Title of the series or programme the video episode belongs to.
226 season: Title of the season the video episode belongs to.
27bfd4e5
S
227 season_number: Number of the season the video episode belongs to, as an integer.
228 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
229 episode: Title of the video episode. Unlike mandatory video title field,
230 this field should denote the exact title of the video episode
231 without any kind of decoration.
27bfd4e5
S
232 episode_number: Number of the video episode within a season, as an integer.
233 episode_id: Id of the video episode, as a unicode string.
7109903e 234
7a93ab5f
S
235 The following fields should only be used when the media is a track or a part of
236 a music album:
237
238 track: Title of the track.
239 track_number: Number of the track within an album or a disc, as an integer.
240 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
241 as a unicode string.
242 artist: Artist(s) of the track.
243 genre: Genre(s) of the track.
244 album: Title of the album the track belongs to.
245 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
246 album_artist: List of all artists appeared on the album (e.g.
247 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
248 and compilations).
249 disc_number: Number of the disc or other physical medium the track belongs to,
250 as an integer.
251 release_year: Year (YYYY) when the album was released.
252
deefc05b 253 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 254
d838b1bd
PH
255 Unless mentioned otherwise, None is equivalent to absence of information.
256
fed5d032
PH
257
258 _type "playlist" indicates multiple videos.
b82f815f
PH
259 There must be a key "entries", which is a list, an iterable, or a PagedList
260 object, each element of which is a valid dictionary by this specification.
fed5d032 261
e0b9d78f
S
262 Additionally, playlists can have "title", "description" and "id" attributes
263 with the same semantics as videos (see above).
fed5d032
PH
264
265
266 _type "multi_video" indicates that there are multiple videos that
267 form a single show, for examples multiple acts of an opera or TV episode.
268 It must have an entries key like a playlist and contain all the keys
269 required for a video at the same time.
270
271
272 _type "url" indicates that the video must be extracted from another
273 location, possibly by a different extractor. Its only required key is:
274 "url" - the next URL to extract.
f58766ce
PH
275 The key "ie_key" can be set to the class name (minus the trailing "IE",
276 e.g. "Youtube") if the extractor class is known in advance.
277 Additionally, the dictionary may have any properties of the resolved entity
278 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
279 known ahead of time.
280
281
282 _type "url_transparent" entities have the same specification as "url", but
283 indicate that the given additional information is more precise than the one
284 associated with the resolved URL.
285 This is useful when a site employs a video service that hosts the video and
286 its technical metadata, but that video service does not embed a useful
287 title, description etc.
288
289
d6983cb4
PH
290 Subclasses of this one should re-define the _real_initialize() and
291 _real_extract() methods and define a _VALID_URL regexp.
292 Probably, they should also be added to the list of extractors.
293
d6983cb4
PH
294 Finally, the _WORKING attribute should be set to False for broken IEs
295 in order to warn the users and skip the tests.
296 """
297
298 _ready = False
299 _downloader = None
300 _WORKING = True
301
302 def __init__(self, downloader=None):
303 """Constructor. Receives an optional downloader."""
304 self._ready = False
305 self.set_downloader(downloader)
306
307 @classmethod
308 def suitable(cls, url):
309 """Receives a URL and returns True if suitable for this IE."""
79cb2577
PH
310
311 # This does not use has/getattr intentionally - we want to know whether
312 # we have cached the regexp for *this* class, whereas getattr would also
313 # match the superclass
314 if '_VALID_URL_RE' not in cls.__dict__:
315 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
316 return cls._VALID_URL_RE.match(url) is not None
d6983cb4 317
ed9266db
PH
318 @classmethod
319 def _match_id(cls, url):
320 if '_VALID_URL_RE' not in cls.__dict__:
321 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
322 m = cls._VALID_URL_RE.match(url)
323 assert m
324 return m.group('id')
325
d6983cb4
PH
326 @classmethod
327 def working(cls):
328 """Getter method for _WORKING."""
329 return cls._WORKING
330
331 def initialize(self):
332 """Initializes an instance (authentication, etc)."""
333 if not self._ready:
334 self._real_initialize()
335 self._ready = True
336
337 def extract(self, url):
338 """Extracts URL information and returns it in list of dicts."""
3a5bcd03
PH
339 try:
340 self.initialize()
341 return self._real_extract(url)
342 except ExtractorError:
343 raise
344 except compat_http_client.IncompleteRead as e:
dfb1b146 345 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
9650885b 346 except (KeyError, StopIteration) as e:
dfb1b146 347 raise ExtractorError('An extractor error has occurred.', cause=e)
d6983cb4
PH
348
349 def set_downloader(self, downloader):
350 """Sets the downloader for this IE."""
351 self._downloader = downloader
352
353 def _real_initialize(self):
354 """Real initialization process. Redefine in subclasses."""
355 pass
356
357 def _real_extract(self, url):
358 """Real extraction process. Redefine in subclasses."""
359 pass
360
56c73665
JMF
361 @classmethod
362 def ie_key(cls):
363 """A string for getting the InfoExtractor with get_info_extractor"""
dc519b54 364 return compat_str(cls.__name__[:-2])
56c73665 365
d6983cb4
PH
366 @property
367 def IE_NAME(self):
dc519b54 368 return compat_str(type(self).__name__[:-2])
d6983cb4 369
41d06b04 370 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
d6983cb4
PH
371 """ Returns the response handle """
372 if note is None:
373 self.report_download_webpage(video_id)
374 elif note is not False:
7cc3570e 375 if video_id is None:
f1a9d64e 376 self.to_screen('%s' % (note,))
7cc3570e 377 else:
f1a9d64e 378 self.to_screen('%s: %s' % (video_id, note))
cdfee168 379 # data, headers and query params will be ignored for `Request` objects
41d06b04
S
380 if isinstance(url_or_request, compat_urllib_request.Request):
381 url_or_request = update_Request(
382 url_or_request, data=data, headers=headers, query=query)
383 else:
cdfee168 384 if query:
385 url_or_request = update_url_query(url_or_request, query)
386 if data or headers:
41d06b04 387 url_or_request = sanitized_Request(url_or_request, data, headers)
d6983cb4 388 try:
dca08720 389 return self._downloader.urlopen(url_or_request)
d6983cb4 390 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3
PH
391 if errnote is False:
392 return False
d6983cb4 393 if errnote is None:
f1a9d64e 394 errnote = 'Unable to download webpage'
7f8b2714 395
9b9c5355 396 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
7cc3570e
PH
397 if fatal:
398 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
399 else:
400 self._downloader.report_warning(errmsg)
401 return False
d6983cb4 402
41d06b04 403 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
d6983cb4 404 """ Returns a tuple (page content as string, URL handle) """
b9d3e163
PH
405 # Strip hashes from the URL (#1038)
406 if isinstance(url_or_request, (compat_str, str)):
407 url_or_request = url_or_request.partition('#')[0]
408
cdfee168 409 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
7cc3570e
PH
410 if urlh is False:
411 assert not fatal
412 return False
c9a77969 413 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8
PH
414 return (content, urlh)
415
c9a77969
YCH
416 @staticmethod
417 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
418 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
419 if m:
420 encoding = m.group(1)
421 else:
0d75ae2c 422 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
423 webpage_bytes[:1024])
424 if m:
425 encoding = m.group(1).decode('ascii')
b60016e8
PH
426 elif webpage_bytes.startswith(b'\xff\xfe'):
427 encoding = 'utf-16'
f143d86a
PH
428 else:
429 encoding = 'utf-8'
c9a77969
YCH
430
431 return encoding
432
433 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
434 content_type = urlh.headers.get('Content-Type', '')
435 webpage_bytes = urlh.read()
436 if prefix is not None:
437 webpage_bytes = prefix + webpage_bytes
438 if not encoding:
439 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
d6983cb4
PH
440 if self._downloader.params.get('dump_intermediate_pages', False):
441 try:
442 url = url_or_request.get_full_url()
443 except AttributeError:
444 url = url_or_request
f1a9d64e 445 self.to_screen('Dumping request to ' + url)
d6983cb4
PH
446 dump = base64.b64encode(webpage_bytes).decode('ascii')
447 self._downloader.to_screen(dump)
d41e6efc
PH
448 if self._downloader.params.get('write_pages', False):
449 try:
450 url = url_or_request.get_full_url()
451 except AttributeError:
452 url = url_or_request
5afa7f8b 453 basen = '%s_%s' % (video_id, url)
c1bce22f 454 if len(basen) > 240:
f1a9d64e 455 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
c1bce22f
PH
456 basen = basen[:240 - len(h)] + h
457 raw_filename = basen + '.dump'
d41e6efc 458 filename = sanitize_filename(raw_filename, restricted=True)
f1a9d64e 459 self.to_screen('Saving request to ' + filename)
5f58165d
S
460 # Working around MAX_PATH limitation on Windows (see
461 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
e9c0cdd3 462 if compat_os_name == 'nt':
5f58165d
S
463 absfilepath = os.path.abspath(filename)
464 if len(absfilepath) > 259:
465 filename = '\\\\?\\' + absfilepath
d41e6efc
PH
466 with open(filename, 'wb') as outf:
467 outf.write(webpage_bytes)
468
ec0fafbb
AA
469 try:
470 content = webpage_bytes.decode(encoding, 'replace')
471 except LookupError:
472 content = webpage_bytes.decode('utf-8', 'replace')
2410c43d 473
f1a9d64e
PH
474 if ('<title>Access to this site is blocked</title>' in content and
475 'Websense' in content[:512]):
476 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
2410c43d
PH
477 blocked_iframe = self._html_search_regex(
478 r'<iframe src="([^"]+)"', content,
f1a9d64e 479 'Websense information URL', default=None)
2410c43d 480 if blocked_iframe:
f1a9d64e 481 msg += ' Visit %s for more details' % blocked_iframe
2410c43d 482 raise ExtractorError(msg, expected=True)
77b2986b
PH
483 if '<title>The URL you requested has been blocked</title>' in content[:512]:
484 msg = (
485 'Access to this webpage has been blocked by Indian censorship. '
486 'Use a VPN or proxy server (with --proxy) to route around it.')
487 block_msg = self._html_search_regex(
488 r'</h1><p>(.*?)</p>',
489 content, 'block message', default=None)
490 if block_msg:
491 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
492 raise ExtractorError(msg, expected=True)
2410c43d 493
23be51d8 494 return content
d6983cb4 495
41d06b04 496 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
d6983cb4 497 """ Returns the data of the page as a string """
995ad69c
TF
498 success = False
499 try_count = 0
500 while success is False:
501 try:
cdfee168 502 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
995ad69c
TF
503 success = True
504 except compat_http_client.IncompleteRead as e:
505 try_count += 1
506 if try_count >= tries:
507 raise e
508 self._sleep(timeout, video_id)
7cc3570e
PH
509 if res is False:
510 return res
511 else:
512 content, _ = res
513 return content
d6983cb4 514
2a275ab0 515 def _download_xml(self, url_or_request, video_id,
f1a9d64e 516 note='Downloading XML', errnote='Unable to download XML',
41d06b04 517 transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
267ed0c5 518 """Return the xml as an xml.etree.ElementTree.Element"""
28746fbd 519 xml_string = self._download_webpage(
cdfee168 520 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
28746fbd
PH
521 if xml_string is False:
522 return xml_string
e2b38da9
PH
523 if transform_source:
524 xml_string = transform_source(xml_string)
36e6f62c 525 return compat_etree_fromstring(xml_string.encode('utf-8'))
267ed0c5 526
3d3538e4 527 def _download_json(self, url_or_request, video_id,
f1a9d64e
PH
528 note='Downloading JSON metadata',
529 errnote='Unable to download JSON metadata',
b090af59 530 transform_source=None,
41d06b04 531 fatal=True, encoding=None, data=None, headers={}, query={}):
b090af59 532 json_string = self._download_webpage(
c9a77969 533 url_or_request, video_id, note, errnote, fatal=fatal,
cdfee168 534 encoding=encoding, data=data, headers=headers, query=query)
b090af59
PH
535 if (not fatal) and json_string is False:
536 return None
ebb64199
TF
537 return self._parse_json(
538 json_string, video_id, transform_source=transform_source, fatal=fatal)
539
540 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
541 if transform_source:
542 json_string = transform_source(json_string)
3d3538e4
PH
543 try:
544 return json.loads(json_string)
545 except ValueError as ve:
e7b6d122
PH
546 errmsg = '%s: Failed to parse JSON ' % video_id
547 if fatal:
548 raise ExtractorError(errmsg, cause=ve)
549 else:
550 self.report_warning(errmsg + str(ve))
3d3538e4 551
f45f96f8 552 def report_warning(self, msg, video_id=None):
f1a9d64e 553 idstr = '' if video_id is None else '%s: ' % video_id
f45f96f8 554 self._downloader.report_warning(
f1a9d64e 555 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
f45f96f8 556
d6983cb4
PH
557 def to_screen(self, msg):
558 """Print msg to screen, prefixing it with '[ie_name]'"""
f1a9d64e 559 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
d6983cb4
PH
560
561 def report_extraction(self, id_or_name):
562 """Report information extraction."""
f1a9d64e 563 self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4
PH
564
565 def report_download_webpage(self, video_id):
566 """Report webpage download."""
f1a9d64e 567 self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4
PH
568
569 def report_age_confirmation(self):
570 """Report attempt to confirm age."""
f1a9d64e 571 self.to_screen('Confirming age')
d6983cb4 572
fc79158d
JMF
573 def report_login(self):
574 """Report attempt to log in."""
f1a9d64e 575 self.to_screen('Logging in')
fc79158d 576
43e7d3c9
S
577 @staticmethod
578 def raise_login_required(msg='This video is only available for registered users'):
579 raise ExtractorError(
580 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
581 expected=True)
582
c430802e
S
583 @staticmethod
584 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
585 raise ExtractorError(
586 '%s. You might want to use --proxy to workaround.' % msg,
587 expected=True)
588
5f6a1245 589 # Methods for following #608
c0d0b01f 590 @staticmethod
830d53bf 591 def url_result(url, ie=None, video_id=None, video_title=None):
10952eb2 592 """Returns a URL that points to a page that should be processed"""
5f6a1245 593 # TODO: ie should be the class used for getting the info
d6983cb4
PH
594 video_info = {'_type': 'url',
595 'url': url,
596 'ie_key': ie}
7012b23c
PH
597 if video_id is not None:
598 video_info['id'] = video_id
830d53bf
S
599 if video_title is not None:
600 video_info['title'] = video_title
d6983cb4 601 return video_info
5f6a1245 602
c0d0b01f 603 @staticmethod
acf5cbfe 604 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
d6983cb4
PH
605 """Returns a playlist"""
606 video_info = {'_type': 'playlist',
607 'entries': entries}
608 if playlist_id:
609 video_info['id'] = playlist_id
610 if playlist_title:
611 video_info['title'] = playlist_title
acf5cbfe
S
612 if playlist_description:
613 video_info['description'] = playlist_description
d6983cb4
PH
614 return video_info
615
c342041f 616 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
617 """
618 Perform a regex search on the given string, using a single or a list of
619 patterns returning the first matching group.
620 In case of failure return a default value or raise a WARNING or a
55b3e45b 621 RegexNotFoundError, depending on fatal, specifying the field name.
d6983cb4
PH
622 """
623 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
624 mobj = re.search(pattern, string, flags)
625 else:
626 for p in pattern:
627 mobj = re.search(p, string, flags)
c3415d1b
PH
628 if mobj:
629 break
d6983cb4 630
e9c0cdd3 631 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
f1a9d64e 632 _name = '\033[0;34m%s\033[0m' % name
d6983cb4
PH
633 else:
634 _name = name
635
636 if mobj:
711ede6e
PH
637 if group is None:
638 # return the first matching group
639 return next(g for g in mobj.groups() if g is not None)
640 else:
641 return mobj.group(group)
c342041f 642 elif default is not NO_DEFAULT:
d6983cb4
PH
643 return default
644 elif fatal:
f1a9d64e 645 raise RegexNotFoundError('Unable to extract %s' % _name)
d6983cb4 646 else:
08f2a92c 647 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
d6983cb4
PH
648 return None
649
c342041f 650 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
651 """
652 Like _search_regex, but strips HTML tags and unescapes entities.
653 """
711ede6e 654 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
d6983cb4
PH
655 if res:
656 return clean_html(res).strip()
657 else:
658 return res
659
fc79158d
JMF
660 def _get_login_info(self):
661 """
cf0649f8 662 Get the login info as (username, password)
fc79158d
JMF
663 It will look in the netrc file using the _NETRC_MACHINE value
664 If there's no info available, return (None, None)
665 """
666 if self._downloader is None:
667 return (None, None)
668
669 username = None
670 password = None
671 downloader_params = self._downloader.params
672
673 # Attempt to use provided username and password or .netrc data
d800609c 674 if downloader_params.get('username') is not None:
fc79158d
JMF
675 username = downloader_params['username']
676 password = downloader_params['password']
677 elif downloader_params.get('usenetrc', False):
678 try:
679 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
680 if info is not None:
681 username = info[0]
682 password = info[2]
683 else:
684 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
685 except (IOError, netrc.NetrcParseError) as err:
9b9c5355 686 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
5f6a1245 687
fc79158d
JMF
688 return (username, password)
689
e64b7569 690 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 691 """
692 Get the two-factor authentication info
693 TODO - asking the user will be required for sms/phone verify
694 currently just uses the command line option
695 If there's no info available, return None
696 """
697 if self._downloader is None:
83317f69 698 return None
699 downloader_params = self._downloader.params
700
d800609c 701 if downloader_params.get('twofactor') is not None:
83317f69 702 return downloader_params['twofactor']
703
e64b7569 704 return compat_getpass('Type %s and press [Return]: ' % note)
83317f69 705
46720279
JMF
706 # Helper functions for extracting OpenGraph info
707 @staticmethod
ab2d5247 708 def _og_regexes(prop):
448ef1f3 709 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
7a6d76a6
S
710 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
711 % {'prop': re.escape(prop)})
78fb87b2 712 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 713 return [
78fb87b2
JMF
714 template % (property_re, content_re),
715 template % (content_re, property_re),
ab2d5247 716 ]
46720279 717
864f24bd
S
718 @staticmethod
719 def _meta_regex(prop):
720 return r'''(?isx)<meta
8b9848ac 721 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
722 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
723
3c4e6d83 724 def _og_search_property(self, prop, html, name=None, **kargs):
46720279 725 if name is None:
3c4e6d83 726 name = 'OpenGraph %s' % prop
ab2d5247 727 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
728 if escaped is None:
729 return None
730 return unescapeHTML(escaped)
46720279
JMF
731
732 def _og_search_thumbnail(self, html, **kargs):
10952eb2 733 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
734
735 def _og_search_description(self, html, **kargs):
736 return self._og_search_property('description', html, fatal=False, **kargs)
737
738 def _og_search_title(self, html, **kargs):
739 return self._og_search_property('title', html, **kargs)
740
8ffa13e0 741 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
742 regexes = self._og_regexes('video') + self._og_regexes('video:url')
743 if secure:
744 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 745 return self._html_search_regex(regexes, html, name, **kargs)
46720279 746
78338f71
JMF
747 def _og_search_url(self, html, **kargs):
748 return self._og_search_property('url', html, **kargs)
749
40c696e5 750 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
59040888
PH
751 if display_name is None:
752 display_name = name
753 return self._html_search_regex(
864f24bd 754 self._meta_regex(name),
711ede6e 755 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
756
757 def _dc_search_uploader(self, html):
758 return self._html_search_meta('dc.creator', html, 'uploader')
759
8dbe9899
PH
760 def _rta_search(self, html):
761 # See http://www.rtalabel.org/index.php?content=howtofaq#single
762 if re.search(r'(?ix)<meta\s+name="rating"\s+'
763 r' content="RTA-5042-1996-1400-1577-RTA"',
764 html):
765 return 18
766 return 0
767
59040888
PH
768 def _media_rating_search(self, html):
769 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
770 rating = self._html_search_meta('rating', html)
771
772 if not rating:
773 return None
774
775 RATING_TABLE = {
776 'safe for kids': 0,
777 'general': 8,
778 '14 years': 14,
779 'mature': 17,
780 'restricted': 19,
781 }
d800609c 782 return RATING_TABLE.get(rating.lower())
59040888 783
69319969 784 def _family_friendly_search(self, html):
6ca7732d 785 # See http://schema.org/VideoObject
69319969
NJ
786 family_friendly = self._html_search_meta('isFamilyFriendly', html)
787
788 if not family_friendly:
789 return None
790
791 RATING_TABLE = {
792 '1': 0,
793 'true': 0,
794 '0': 18,
795 'false': 18,
796 }
d800609c 797 return RATING_TABLE.get(family_friendly.lower())
69319969 798
0c708f11
JMF
799 def _twitter_search_player(self, html):
800 return self._html_search_meta('twitter:player', html,
9e1a5b84 801 'twitter card player')
0c708f11 802
0b26ba3f 803 def _search_json_ld(self, html, video_id, **kwargs):
4ca2a3cf
S
804 json_ld = self._search_regex(
805 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
0b26ba3f 806 html, 'JSON-LD', group='json_ld', **kwargs)
4ca2a3cf
S
807 if not json_ld:
808 return {}
0b26ba3f 809 return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
4ca2a3cf
S
810
811 def _json_ld(self, json_ld, video_id, fatal=True):
812 if isinstance(json_ld, compat_str):
813 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
814 if not json_ld:
815 return {}
816 info = {}
817 if json_ld.get('@context') == 'http://schema.org':
818 item_type = json_ld.get('@type')
819 if item_type == 'TVEpisode':
820 info.update({
821 'episode': unescapeHTML(json_ld.get('name')),
822 'episode_number': int_or_none(json_ld.get('episodeNumber')),
823 'description': unescapeHTML(json_ld.get('description')),
824 })
825 part_of_season = json_ld.get('partOfSeason')
826 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
827 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
828 part_of_series = json_ld.get('partOfSeries')
829 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
830 info['series'] = unescapeHTML(part_of_series.get('name'))
831 elif item_type == 'Article':
832 info.update({
833 'timestamp': parse_iso8601(json_ld.get('datePublished')),
834 'title': unescapeHTML(json_ld.get('headline')),
835 'description': unescapeHTML(json_ld.get('articleBody')),
836 })
837 return dict((k, v) for k, v in info.items() if v is not None)
838
27713812 839 @staticmethod
f8da79f8 840 def _hidden_inputs(html):
586f1cc5 841 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
201ea3ee 842 hidden_inputs = {}
73eb13df 843 for input in re.findall(r'(?i)<input([^>]+)>', html):
be0e5dbd 844 if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
201ea3ee 845 continue
bacec039 846 name = re.search(r'(?:name|id)=(["\'])(?P<value>.+?)\1', input)
201ea3ee
S
847 if not name:
848 continue
849 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
850 if not value:
851 continue
852 hidden_inputs[name.group('value')] = value.group('value')
853 return hidden_inputs
27713812 854
cf61d96d
S
855 def _form_hidden_inputs(self, form_id, html):
856 form = self._search_regex(
73eb13df 857 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
858 html, '%s form' % form_id, group='form')
859 return self._hidden_inputs(form)
860
3ded7bac 861 def _sort_formats(self, formats, field_preference=None):
7e8caf30 862 if not formats:
f1a9d64e 863 raise ExtractorError('No video formats found')
7e8caf30 864
b0d21ded
S
865 for f in formats:
866 # Automatically determine tbr when missing based on abr and vbr (improves
867 # formats sorting in some cases)
350cf045 868 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
b0d21ded
S
869 f['tbr'] = f['abr'] + f['vbr']
870
4bcc7bd1 871 def _formats_key(f):
e6812ac9
PH
872 # TODO remove the following workaround
873 from ..utils import determine_ext
874 if not f.get('ext') and 'url' in f:
875 f['ext'] = determine_ext(f['url'])
876
3ded7bac
S
877 if isinstance(field_preference, (list, tuple)):
878 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
879
4bcc7bd1
PH
880 preference = f.get('preference')
881 if preference is None:
d497a201 882 preference = 0
4bcc7bd1
PH
883 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
884 preference -= 0.5
885
d497a201 886 proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
887
4bcc7bd1 888 if f.get('vcodec') == 'none': # audio only
dd867805 889 preference -= 50
4bcc7bd1 890 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 891 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
4bcc7bd1 892 else:
f1a9d64e 893 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
4bcc7bd1
PH
894 ext_preference = 0
895 try:
896 audio_ext_preference = ORDER.index(f['ext'])
897 except ValueError:
898 audio_ext_preference = -1
899 else:
dd867805 900 if f.get('acodec') == 'none': # video only
901 preference -= 40
4bcc7bd1 902 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 903 ORDER = ['flv', 'mp4', 'webm']
4bcc7bd1 904 else:
f1a9d64e 905 ORDER = ['webm', 'flv', 'mp4']
4bcc7bd1
PH
906 try:
907 ext_preference = ORDER.index(f['ext'])
908 except ValueError:
909 ext_preference = -1
910 audio_ext_preference = 0
911
912 return (
913 preference,
aff2f4f4 914 f.get('language_preference') if f.get('language_preference') is not None else -1,
5d73273f 915 f.get('quality') if f.get('quality') is not None else -1,
9933b574 916 f.get('tbr') if f.get('tbr') is not None else -1,
03cd72b0 917 f.get('filesize') if f.get('filesize') is not None else -1,
4bcc7bd1 918 f.get('vbr') if f.get('vbr') is not None else -1,
1a6373ef
PH
919 f.get('height') if f.get('height') is not None else -1,
920 f.get('width') if f.get('width') is not None else -1,
d497a201 921 proto_preference,
1e1896f2 922 ext_preference,
4bcc7bd1
PH
923 f.get('abr') if f.get('abr') is not None else -1,
924 audio_ext_preference,
2c8e03d9 925 f.get('fps') if f.get('fps') is not None else -1,
9732d77e 926 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
c64ed2a3 927 f.get('source_preference') if f.get('source_preference') is not None else -1,
74f72824 928 f.get('format_id') if f.get('format_id') is not None else '',
4bcc7bd1
PH
929 )
930 formats.sort(key=_formats_key)
59040888 931
96a53167
S
932 def _check_formats(self, formats, video_id):
933 if formats:
934 formats[:] = filter(
935 lambda f: self._is_valid_url(
936 f['url'], video_id,
937 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
938 formats)
939
f5bdb444
S
940 @staticmethod
941 def _remove_duplicate_formats(formats):
942 format_urls = set()
943 unique_formats = []
944 for f in formats:
945 if f['url'] not in format_urls:
946 format_urls.add(f['url'])
947 unique_formats.append(f)
948 formats[:] = unique_formats
949
96a53167 950 def _is_valid_url(self, url, video_id, item='video'):
2f0f6578
S
951 url = self._proto_relative_url(url, scheme='http:')
952 # For now assume non HTTP(S) URLs always valid
953 if not (url.startswith('http://') or url.startswith('https://')):
954 return True
96a53167 955 try:
4069766c 956 self._request_webpage(url, video_id, 'Checking %s URL' % item)
96a53167
S
957 return True
958 except ExtractorError as e:
943a1e24 959 if isinstance(e.cause, compat_urllib_error.URLError):
baa43cba
S
960 self.to_screen(
961 '%s: %s URL is invalid, skipping' % (video_id, item))
96a53167
S
962 return False
963 raise
964
20991253 965 def http_scheme(self):
1ede5b24 966 """ Either "http:" or "https:", depending on the user's preferences """
20991253
PH
967 return (
968 'http:'
969 if self._downloader.params.get('prefer_insecure', False)
970 else 'https:')
971
57c7411f
PH
972 def _proto_relative_url(self, url, scheme=None):
973 if url is None:
974 return url
975 if url.startswith('//'):
976 if scheme is None:
977 scheme = self.http_scheme()
978 return scheme + url
979 else:
980 return url
981
4094b6e3
PH
982 def _sleep(self, timeout, video_id, msg_template=None):
983 if msg_template is None:
f1a9d64e 984 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
985 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
986 self.to_screen(msg)
987 time.sleep(timeout)
988
a38436e8 989 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
4de61310
S
990 transform_source=lambda s: fix_xml_ampersands(s).strip(),
991 fatal=True):
f036a632
JMF
992 manifest = self._download_xml(
993 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
994 'Unable to download f4m manifest',
995 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
996 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
4de61310
S
997 transform_source=transform_source,
998 fatal=fatal)
999
1000 if manifest is False:
8d29e47f 1001 return []
31bb8d3f 1002
0fdbb332
S
1003 return self._parse_f4m_formats(
1004 manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1005 transform_source=transform_source, fatal=fatal)
1006
1007 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1008 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1009 fatal=True):
31bb8d3f 1010 formats = []
7a47d07c 1011 manifest_version = '1.0'
b2527359 1012 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
34e48bed 1013 if not media_nodes:
7a47d07c 1014 manifest_version = '2.0'
34e48bed 1015 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
b22ca762
S
1016 # Remove unsupported DRM protected media from final formats
1017 # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1018 media_nodes = remove_encrypted_media(media_nodes)
1019 if not media_nodes:
1020 return formats
019839fa
S
1021 base_url = xpath_text(
1022 manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1023 'base URL', default=None)
1024 if base_url:
1025 base_url = base_url.strip()
b2527359 1026 for i, media_el in enumerate(media_nodes):
7a47d07c 1027 if manifest_version == '2.0':
31c746e5
S
1028 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
1029 if not media_url:
1030 continue
cc357c4d
S
1031 manifest_url = (
1032 media_url if media_url.startswith('http://') or media_url.startswith('https://')
019839fa 1033 else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
70f0f5a8
S
1034 # If media_url is itself a f4m manifest do the recursive extraction
1035 # since bitrates in parent manifest (this one) and media_url manifest
1036 # may differ leading to inability to resolve the format by requested
1037 # bitrate in f4m downloader
1038 if determine_ext(manifest_url) == 'f4m':
7e5edcfd 1039 formats.extend(self._extract_f4m_formats(
0fdbb332
S
1040 manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1041 transform_source=transform_source, fatal=fatal))
70f0f5a8 1042 continue
b2527359 1043 tbr = int_or_none(media_el.attrib.get('bitrate'))
31bb8d3f 1044 formats.append({
e21a55ab 1045 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
31bb8d3f
JMF
1046 'url': manifest_url,
1047 'ext': 'flv',
b2527359 1048 'tbr': tbr,
31bb8d3f
JMF
1049 'width': int_or_none(media_el.attrib.get('width')),
1050 'height': int_or_none(media_el.attrib.get('height')),
60ca389c 1051 'preference': preference,
31bb8d3f 1052 })
31bb8d3f
JMF
1053 return formats
1054
f0b5d6af 1055 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
60ca389c 1056 entry_protocol='m3u8', preference=None,
13af92fd
YCH
1057 m3u8_id=None, note=None, errnote=None,
1058 fatal=True):
f0b5d6af 1059
704df56d 1060 formats = [{
f207019c 1061 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
704df56d
PH
1062 'url': m3u8_url,
1063 'ext': ext,
1064 'protocol': 'm3u8',
9fe6ef7a 1065 'preference': preference - 1 if preference else -1,
704df56d
PH
1066 'resolution': 'multiple',
1067 'format_note': 'Quality selection URL',
1068 }]
1069
f0b5d6af
PH
1070 format_url = lambda u: (
1071 u
1072 if re.match(r'^https?://', u)
1073 else compat_urlparse.urljoin(m3u8_url, u))
1074
dbd82a1d 1075 res = self._download_webpage_handle(
81515ad9 1076 m3u8_url, video_id,
621ed9f5 1077 note=note or 'Downloading m3u8 information',
13af92fd
YCH
1078 errnote=errnote or 'Failed to download m3u8 information',
1079 fatal=fatal)
dbd82a1d 1080 if res is False:
8d29e47f 1081 return []
dbd82a1d 1082 m3u8_doc, urlh = res
37113045 1083 m3u8_url = urlh.geturl()
9cdffeeb
S
1084
1085 # We should try extracting formats only from master playlists [1], i.e.
1086 # playlists that describe available qualities. On the other hand media
1087 # playlists [2] should be returned as is since they contain just the media
1088 # without qualities renditions.
1089 # Fortunately, master playlist can be easily distinguished from media
1090 # playlist based on particular tags availability. As of [1, 2] master
1091 # playlist tags MUST NOT appear in a media playist and vice versa.
1092 # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1093 # and MUST NOT appear in master playlist thus we can clearly detect media
1094 # playlist with this criterion.
1095 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1096 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1097 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1098 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
7f32e5dc 1099 return [{
1100 'url': m3u8_url,
1101 'format_id': m3u8_id,
1102 'ext': ext,
1103 'protocol': entry_protocol,
1104 'preference': preference,
1105 }]
704df56d 1106 last_info = None
fa156077 1107 last_media = None
704df56d
PH
1108 kv_rex = re.compile(
1109 r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
1110 for line in m3u8_doc.splitlines():
1111 if line.startswith('#EXT-X-STREAM-INF:'):
1112 last_info = {}
1113 for m in kv_rex.finditer(line):
1114 v = m.group('val')
1115 if v.startswith('"'):
1116 v = v[1:-1]
1117 last_info[m.group('key')] = v
4cd95bcb
JMF
1118 elif line.startswith('#EXT-X-MEDIA:'):
1119 last_media = {}
1120 for m in kv_rex.finditer(line):
1121 v = m.group('val')
1122 if v.startswith('"'):
1123 v = v[1:-1]
1124 last_media[m.group('key')] = v
704df56d
PH
1125 elif line.startswith('#') or not line.strip():
1126 continue
1127 else:
daebaab6 1128 if last_info is None:
f0b5d6af 1129 formats.append({'url': format_url(line)})
3524cc25 1130 continue
704df56d 1131 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
8dc9d361
S
1132 format_id = []
1133 if m3u8_id:
1134 format_id.append(m3u8_id)
05d5392c 1135 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
8dc9d361 1136 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
704df56d 1137 f = {
8dc9d361 1138 'format_id': '-'.join(format_id),
f0b5d6af 1139 'url': format_url(line.strip()),
704df56d
PH
1140 'tbr': tbr,
1141 'ext': ext,
f0b5d6af
PH
1142 'protocol': entry_protocol,
1143 'preference': preference,
704df56d 1144 }
704df56d
PH
1145 resolution = last_info.get('RESOLUTION')
1146 if resolution:
1147 width_str, height_str = resolution.split('x')
1148 f['width'] = int(width_str)
1149 f['height'] = int(height_str)
fbb6edd2
S
1150 codecs = last_info.get('CODECS')
1151 if codecs:
1152 vcodec, acodec = [None] * 2
1153 va_codecs = codecs.split(',')
1154 if len(va_codecs) == 1:
1155 # Audio only entries usually come with single codec and
1156 # no resolution. For more robustness we also check it to
1157 # be mp4 audio.
1158 if not resolution and va_codecs[0].startswith('mp4a'):
1159 vcodec, acodec = 'none', va_codecs[0]
1160 else:
1161 vcodec = va_codecs[0]
1162 else:
1163 vcodec, acodec = va_codecs[:2]
1164 f.update({
1165 'acodec': acodec,
1166 'vcodec': vcodec,
1167 })
4cd95bcb
JMF
1168 if last_media is not None:
1169 f['m3u8_media'] = last_media
1170 last_media = None
704df56d
PH
1171 formats.append(f)
1172 last_info = {}
704df56d
PH
1173 return formats
1174
a107193e
S
1175 @staticmethod
1176 def _xpath_ns(path, namespace=None):
1177 if not namespace:
1178 return path
1179 out = []
1180 for c in path.split('/'):
1181 if not c or c == '.':
1182 out.append(c)
1183 else:
1184 out.append('{%s}%s' % (namespace, c))
1185 return '/'.join(out)
1186
09f572fb 1187 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1188 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
a107193e 1189
995029a1
PH
1190 if smil is False:
1191 assert not fatal
1192 return []
e89a2aab 1193
17712eeb 1194 namespace = self._parse_smil_namespace(smil)
a107193e
S
1195
1196 return self._parse_smil_formats(
1197 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1198
1199 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1200 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1201 if smil is False:
1202 return {}
1203 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1204
09f572fb 1205 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a107193e
S
1206 return self._download_xml(
1207 smil_url, video_id, 'Downloading SMIL file',
09f572fb 1208 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
1209
1210 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
17712eeb 1211 namespace = self._parse_smil_namespace(smil)
a107193e
S
1212
1213 formats = self._parse_smil_formats(
1214 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1215 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1216
1217 video_id = os.path.splitext(url_basename(smil_url))[0]
1218 title = None
1219 description = None
647eab45 1220 upload_date = None
a107193e
S
1221 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1222 name = meta.attrib.get('name')
1223 content = meta.attrib.get('content')
1224 if not name or not content:
1225 continue
1226 if not title and name == 'title':
1227 title = content
1228 elif not description and name in ('description', 'abstract'):
1229 description = content
647eab45
S
1230 elif not upload_date and name == 'date':
1231 upload_date = unified_strdate(content)
a107193e 1232
1e5bcdec
S
1233 thumbnails = [{
1234 'id': image.get('type'),
1235 'url': image.get('src'),
1236 'width': int_or_none(image.get('width')),
1237 'height': int_or_none(image.get('height')),
1238 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1239
a107193e
S
1240 return {
1241 'id': video_id,
1242 'title': title or video_id,
1243 'description': description,
647eab45 1244 'upload_date': upload_date,
1e5bcdec 1245 'thumbnails': thumbnails,
a107193e
S
1246 'formats': formats,
1247 'subtitles': subtitles,
1248 }
1249
17712eeb
S
1250 def _parse_smil_namespace(self, smil):
1251 return self._search_regex(
1252 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1253
f877c6ae 1254 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
a107193e
S
1255 base = smil_url
1256 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1257 b = meta.get('base') or meta.get('httpBase')
1258 if b:
1259 base = b
1260 break
e89a2aab
S
1261
1262 formats = []
1263 rtmp_count = 0
a107193e 1264 http_count = 0
7f32e5dc 1265 m3u8_count = 0
a107193e 1266
81e1c4e2 1267 srcs = []
a107193e
S
1268 videos = smil.findall(self._xpath_ns('.//video', namespace))
1269 for video in videos:
1270 src = video.get('src')
81e1c4e2 1271 if not src or src in srcs:
a107193e 1272 continue
81e1c4e2 1273 srcs.append(src)
a107193e 1274
e7d8e98a 1275 bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
a107193e
S
1276 filesize = int_or_none(video.get('size') or video.get('fileSize'))
1277 width = int_or_none(video.get('width'))
1278 height = int_or_none(video.get('height'))
1279 proto = video.get('proto')
1280 ext = video.get('ext')
1281 src_ext = determine_ext(src)
1282 streamer = video.get('streamer') or base
1283
1284 if proto == 'rtmp' or streamer.startswith('rtmp'):
1285 rtmp_count += 1
1286 formats.append({
1287 'url': streamer,
1288 'play_path': src,
1289 'ext': 'flv',
1290 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1291 'tbr': bitrate,
1292 'filesize': filesize,
1293 'width': width,
1294 'height': height,
1295 })
f877c6ae
YCH
1296 if transform_rtmp_url:
1297 streamer, src = transform_rtmp_url(streamer, src)
1298 formats[-1].update({
1299 'url': streamer,
1300 'play_path': src,
1301 })
a107193e
S
1302 continue
1303
1304 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
c349456e 1305 src_url = src_url.strip()
a107193e
S
1306
1307 if proto == 'm3u8' or src_ext == 'm3u8':
7f32e5dc 1308 m3u8_formats = self._extract_m3u8_formats(
1309 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1310 if len(m3u8_formats) == 1:
1311 m3u8_count += 1
1312 m3u8_formats[0].update({
1313 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1314 'tbr': bitrate,
1315 'width': width,
1316 'height': height,
1317 })
1318 formats.extend(m3u8_formats)
a107193e
S
1319 continue
1320
1321 if src_ext == 'f4m':
1322 f4m_url = src_url
1323 if not f4m_params:
1324 f4m_params = {
1325 'hdcore': '3.2.0',
1326 'plugin': 'flowplayer-3.2.0.1',
1327 }
1328 f4m_url += '&' if '?' in f4m_url else '?'
15707c7e 1329 f4m_url += compat_urllib_parse_urlencode(f4m_params)
7e5edcfd 1330 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
a107193e
S
1331 continue
1332
c78e4817 1333 if src_url.startswith('http') and self._is_valid_url(src, video_id):
a107193e
S
1334 http_count += 1
1335 formats.append({
1336 'url': src_url,
1337 'ext': ext or src_ext or 'flv',
1338 'format_id': 'http-%d' % (bitrate or http_count),
1339 'tbr': bitrate,
1340 'filesize': filesize,
1341 'width': width,
1342 'height': height,
1343 })
1344 continue
63757032 1345
e89a2aab
S
1346 return formats
1347
ce00af87 1348 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 1349 urls = []
a107193e
S
1350 subtitles = {}
1351 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1352 src = textstream.get('src')
d413095f 1353 if not src or src in urls:
a107193e 1354 continue
d413095f 1355 urls.append(src)
df634be2 1356 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 1357 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
1358 subtitles.setdefault(lang, []).append({
1359 'url': src,
1360 'ext': ext,
1361 })
1362 return subtitles
63757032 1363
942acef5
S
1364 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1365 xspf = self._download_xml(
8d6765cf 1366 playlist_url, playlist_id, 'Downloading xpsf playlist',
942acef5
S
1367 'Unable to download xspf manifest', fatal=fatal)
1368 if xspf is False:
1369 return []
1370 return self._parse_xspf(xspf, playlist_id)
8d6765cf 1371
942acef5 1372 def _parse_xspf(self, playlist, playlist_id):
8d6765cf
S
1373 NS_MAP = {
1374 'xspf': 'http://xspf.org/ns/0/',
1375 's1': 'http://static.streamone.nl/player/ns/0',
1376 }
1377
1378 entries = []
1379 for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1380 title = xpath_text(
98044462 1381 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
8d6765cf
S
1382 description = xpath_text(
1383 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1384 thumbnail = xpath_text(
1385 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1386 duration = float_or_none(
1387 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1388
1389 formats = [{
1390 'url': location.text,
1391 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1392 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1393 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1394 } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1395 self._sort_formats(formats)
1396
1397 entries.append({
1398 'id': playlist_id,
1399 'title': title,
1400 'description': description,
1401 'thumbnail': thumbnail,
1402 'duration': duration,
1403 'formats': formats,
1404 })
1405 return entries
1406
1bac3455 1407 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1408 res = self._download_webpage_handle(
1409 mpd_url, video_id,
1410 note=note or 'Downloading MPD manifest',
1411 errnote=errnote or 'Failed to download MPD manifest',
2d2fa82d 1412 fatal=fatal)
1bac3455 1413 if res is False:
2d2fa82d 1414 return []
1bac3455 1415 mpd, urlh = res
1416 mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
1417
91cb6b50 1418 return self._parse_mpd_formats(
1bac3455 1419 compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
2d2fa82d 1420
91cb6b50 1421 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
1bac3455 1422 if mpd_doc.get('type') == 'dynamic':
1423 return []
2d2fa82d 1424
91cb6b50 1425 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 1426
1427 def _add_ns(path):
1428 return self._xpath_ns(path, namespace)
1429
675d0016 1430 def is_drm_protected(element):
1431 return element.find(_add_ns('ContentProtection')) is not None
1432
1bac3455 1433 def extract_multisegment_info(element, ms_parent_info):
1434 ms_info = ms_parent_info.copy()
f14be228 1435 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 1436 if segment_list is not None:
f14be228 1437 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 1438 if segment_urls_e:
1439 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
f14be228 1440 initialization = segment_list.find(_add_ns('Initialization'))
1bac3455 1441 if initialization is not None:
1442 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1443 else:
f14be228 1444 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 1445 if segment_template is not None:
1446 start_number = segment_template.get('startNumber')
1447 if start_number:
1448 ms_info['start_number'] = int(start_number)
f14be228 1449 segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
1bac3455 1450 if segment_timeline is not None:
f14be228 1451 s_e = segment_timeline.findall(_add_ns('S'))
1bac3455 1452 if s_e:
1453 ms_info['total_number'] = 0
1454 for s in s_e:
1455 ms_info['total_number'] += 1 + int(s.get('r', '0'))
1456 else:
1457 timescale = segment_template.get('timescale')
1458 if timescale:
1459 ms_info['timescale'] = int(timescale)
1460 segment_duration = segment_template.get('duration')
1461 if segment_duration:
1462 ms_info['segment_duration'] = int(segment_duration)
1463 media_template = segment_template.get('media')
1464 if media_template:
1465 ms_info['media_template'] = media_template
1466 initialization = segment_template.get('initialization')
1467 if initialization:
1468 ms_info['initialization_url'] = initialization
1469 else:
f14be228 1470 initialization = segment_template.find(_add_ns('Initialization'))
1bac3455 1471 if initialization is not None:
1472 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1473 return ms_info
b323e170 1474
1bac3455 1475 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
17b598d3 1476 formats = []
f14be228 1477 for period in mpd_doc.findall(_add_ns('Period')):
1bac3455 1478 period_duration = parse_duration(period.get('duration')) or mpd_duration
1479 period_ms_info = extract_multisegment_info(period, {
1480 'start_number': 1,
1481 'timescale': 1,
1482 })
f14be228 1483 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
675d0016 1484 if is_drm_protected(adaptation_set):
1485 continue
1bac3455 1486 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 1487 for representation in adaptation_set.findall(_add_ns('Representation')):
675d0016 1488 if is_drm_protected(representation):
1489 continue
1bac3455 1490 representation_attrib = adaptation_set.attrib.copy()
1491 representation_attrib.update(representation.attrib)
a6c8b759
YCH
1492 # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
1493 mime_type = representation_attrib['mimeType']
1494 content_type = mime_type.split('/')[0]
1bac3455 1495 if content_type == 'text':
1496 # TODO implement WebVTT downloading
1497 pass
1498 elif content_type == 'video' or content_type == 'audio':
1499 base_url = ''
1500 for element in (representation, adaptation_set, period, mpd_doc):
f14be228 1501 base_url_e = element.find(_add_ns('BaseURL'))
1bac3455 1502 if base_url_e is not None:
1503 base_url = base_url_e.text + base_url
1504 if re.match(r'^https?://', base_url):
1505 break
bb20526b
S
1506 if mpd_base_url and not re.match(r'^https?://', base_url):
1507 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1508 mpd_base_url += '/'
1bac3455 1509 base_url = mpd_base_url + base_url
1510 representation_id = representation_attrib.get('id')
d577c796 1511 lang = representation_attrib.get('lang')
51e9094f 1512 url_el = representation.find(_add_ns('BaseURL'))
1513 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1bac3455 1514 f = {
154c209e 1515 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1bac3455 1516 'url': base_url,
a6c8b759 1517 'ext': mimetype2ext(mime_type),
1bac3455 1518 'width': int_or_none(representation_attrib.get('width')),
1519 'height': int_or_none(representation_attrib.get('height')),
1520 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
1521 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1522 'fps': int_or_none(representation_attrib.get('frameRate')),
1523 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
1524 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
d577c796 1525 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1bac3455 1526 'format_note': 'DASH %s' % content_type,
51e9094f 1527 'filesize': filesize,
1bac3455 1528 }
1529 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1530 if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
1531 if 'total_number' not in representation_ms_info and 'segment_duration':
6a3828fd 1532 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
1533 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1bac3455 1534 media_template = representation_ms_info['media_template']
1535 media_template = media_template.replace('$RepresentationID$', representation_id)
db8ee7ec 1536 media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template)
fb38aa8b 1537 media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template)
1bac3455 1538 media_template.replace('$$', '$')
b507cc92
S
1539 representation_ms_info['segment_urls'] = [
1540 media_template % {
1541 'Number': segment_number,
1542 'Bandwidth': representation_attrib.get('bandwidth')}
1543 for segment_number in range(
1544 representation_ms_info['start_number'],
1545 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1bac3455 1546 if 'segment_urls' in representation_ms_info:
1547 f.update({
1548 'segment_urls': representation_ms_info['segment_urls'],
1549 'protocol': 'http_dash_segments',
df374b52 1550 })
1bac3455 1551 if 'initialization_url' in representation_ms_info:
1552 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
1553 f.update({
1554 'initialization_url': initialization_url,
1555 })
1556 if not f.get('url'):
1557 f['url'] = initialization_url
1558 try:
1559 existing_format = next(
1560 fo for fo in formats
1561 if fo['format_id'] == representation_id)
1562 except StopIteration:
1563 full_info = formats_dict.get(representation_id, {}).copy()
1564 full_info.update(f)
1565 formats.append(full_info)
1566 else:
1567 existing_format.update(f)
17b598d3 1568 else:
1bac3455 1569 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
17b598d3
YCH
1570 return formats
1571
f4b1c7ad
PH
1572 def _live_title(self, name):
1573 """ Generate the title for a live video """
1574 now = datetime.datetime.now()
611c1dd9 1575 now_str = now.strftime('%Y-%m-%d %H:%M')
f4b1c7ad
PH
1576 return name + ' ' + now_str
1577
b14f3a4c
PH
1578 def _int(self, v, name, fatal=False, **kwargs):
1579 res = int_or_none(v, **kwargs)
1580 if 'get_attr' in kwargs:
1581 print(getattr(v, kwargs['get_attr']))
1582 if res is None:
1583 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1584 if fatal:
1585 raise ExtractorError(msg)
1586 else:
1587 self._downloader.report_warning(msg)
1588 return res
1589
1590 def _float(self, v, name, fatal=False, **kwargs):
1591 res = float_or_none(v, **kwargs)
1592 if res is None:
1593 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
1594 if fatal:
1595 raise ExtractorError(msg)
1596 else:
1597 self._downloader.report_warning(msg)
1598 return res
1599
42939b61 1600 def _set_cookie(self, domain, name, value, expire_time=None):
810fb84d
PH
1601 cookie = compat_cookiejar.Cookie(
1602 0, name, value, None, None, domain, None,
42939b61
JMF
1603 None, '/', True, False, expire_time, '', None, None, None)
1604 self._downloader.cookiejar.set_cookie(cookie)
1605
799207e8 1606 def _get_cookies(self, url):
1607 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
5c2266df 1608 req = sanitized_Request(url)
799207e8 1609 self._downloader.cookiejar.add_cookie_header(req)
1610 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
1611
05900629
PH
1612 def get_testcases(self, include_onlymatching=False):
1613 t = getattr(self, '_TEST', None)
1614 if t:
1615 assert not hasattr(self, '_TESTS'), \
1616 '%s has _TEST and _TESTS' % type(self).__name__
1617 tests = [t]
1618 else:
1619 tests = getattr(self, '_TESTS', [])
1620 for t in tests:
1621 if not include_onlymatching and t.get('only_matching', False):
1622 continue
1623 t['name'] = type(self).__name__[:-len('IE')]
1624 yield t
1625
1626 def is_suitable(self, age_limit):
1627 """ Test whether the extractor is generally suitable for the given
1628 age limit (i.e. pornographic sites are not, all others usually are) """
1629
1630 any_restricted = False
1631 for tc in self.get_testcases(include_onlymatching=False):
1632 if 'playlist' in tc:
1633 tc = tc['playlist'][0]
1634 is_restricted = age_restricted(
1635 tc.get('info_dict', {}).get('age_limit'), age_limit)
1636 if not is_restricted:
1637 return True
1638 any_restricted = any_restricted or is_restricted
1639 return not any_restricted
1640
a504ced0 1641 def extract_subtitles(self, *args, **kwargs):
9868ea49
JMF
1642 if (self._downloader.params.get('writesubtitles', False) or
1643 self._downloader.params.get('listsubtitles')):
1644 return self._get_subtitles(*args, **kwargs)
1645 return {}
a504ced0
JMF
1646
1647 def _get_subtitles(self, *args, **kwargs):
611c1dd9 1648 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 1649
912e0b7e
YCH
1650 @staticmethod
1651 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1652 """ Merge subtitle items for one language. Items with duplicated URLs
1653 will be dropped. """
1654 list1_urls = set([item['url'] for item in subtitle_list1])
1655 ret = list(subtitle_list1)
1656 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1657 return ret
1658
1659 @classmethod
8c97f819 1660 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
912e0b7e 1661 """ Merge two subtitle dictionaries, language by language. """
912e0b7e
YCH
1662 ret = dict(subtitle_dict1)
1663 for lang in subtitle_dict2:
8c97f819 1664 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
912e0b7e
YCH
1665 return ret
1666
360e1ca5 1667 def extract_automatic_captions(self, *args, **kwargs):
9868ea49
JMF
1668 if (self._downloader.params.get('writeautomaticsub', False) or
1669 self._downloader.params.get('listsubtitles')):
1670 return self._get_automatic_captions(*args, **kwargs)
1671 return {}
360e1ca5
JMF
1672
1673 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 1674 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 1675
d77ab8e2
S
1676 def mark_watched(self, *args, **kwargs):
1677 if (self._downloader.params.get('mark_watched', False) and
1678 (self._get_login_info()[0] is not None or
1679 self._downloader.params.get('cookiefile') is not None)):
1680 self._mark_watched(*args, **kwargs)
1681
1682 def _mark_watched(self, *args, **kwargs):
1683 raise NotImplementedError('This method must be implemented by subclasses')
1684
8dbe9899 1685
d6983cb4
PH
1686class SearchInfoExtractor(InfoExtractor):
1687 """
1688 Base class for paged search queries extractors.
10952eb2 1689 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
d6983cb4
PH
1690 Instances should define _SEARCH_KEY and _MAX_RESULTS.
1691 """
1692
1693 @classmethod
1694 def _make_valid_url(cls):
1695 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
1696
1697 @classmethod
1698 def suitable(cls, url):
1699 return re.match(cls._make_valid_url(), url) is not None
1700
1701 def _real_extract(self, query):
1702 mobj = re.match(self._make_valid_url(), query)
1703 if mobj is None:
f1a9d64e 1704 raise ExtractorError('Invalid search query "%s"' % query)
d6983cb4
PH
1705
1706 prefix = mobj.group('prefix')
1707 query = mobj.group('query')
1708 if prefix == '':
1709 return self._get_n_results(query, 1)
1710 elif prefix == 'all':
1711 return self._get_n_results(query, self._MAX_RESULTS)
1712 else:
1713 n = int(prefix)
1714 if n <= 0:
f1a9d64e 1715 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
d6983cb4 1716 elif n > self._MAX_RESULTS:
f1a9d64e 1717 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
d6983cb4
PH
1718 n = self._MAX_RESULTS
1719 return self._get_n_results(query, n)
1720
1721 def _get_n_results(self, query, n):
1722 """Get a specified number of results for a query"""
611c1dd9 1723 raise NotImplementedError('This method must be implemented by subclasses')
0f818663
PH
1724
1725 @property
1726 def SEARCH_KEY(self):
1727 return self._SEARCH_KEY